rmv max_dist as internal field
parent
d0ac1b48e9
commit
cf1d578bf6
|
@ -4,6 +4,8 @@ using Distances
|
|||
import StatsAPI: pairwise, pairwise!
|
||||
abstract type StringSemiMetric <: SemiMetric end
|
||||
abstract type StringMetric <: Metric end
|
||||
const StringDistance = Union{StringSemiMetric, StringMetric}
|
||||
# Fallback call method for any string distance: accepts the `max_dist` keyword
# and ignores it, forwarding to the two-argument call `dist(s1, s2)`.
# NOTE(review): keyword arguments do not take part in dispatch, so if a concrete
# distance type defines no more specific `(s1, s2)` method, this call would
# presumably re-enter this same method — confirm every distance defines its own.
(dist::StringDistance)(s1, s2; max_dist = nothing) = dist(s1, s2)
|
||||
function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::Type, s2::Type)
|
||||
T = typeof(dist("", ""))
|
||||
if (Missing <: s1) | (Missing <: s2)
|
||||
|
@ -17,10 +19,8 @@ Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1, s2) = res
|
|||
include("distances/utils.jl")
|
||||
include("distances/edit.jl")
|
||||
include("distances/qgram.jl")
|
||||
|
||||
|
||||
include("normalize.jl")
|
||||
include("pairwise.jl")
|
||||
include("normalize.jl")
|
||||
include("find.jl")
|
||||
include("fuzzywuzzy.jl")
|
||||
|
||||
|
@ -32,7 +32,8 @@ include("fuzzywuzzy.jl")
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
export
|
||||
export
|
||||
StringDistance,
|
||||
StringSemiMetric,
|
||||
StringMetric,
|
||||
# edit distances
|
||||
|
|
|
@ -5,18 +5,15 @@ Creates the Hamming distance
|
|||
|
||||
The Hamming distance is defined as the number of characters that do not match
|
||||
"""
|
||||
struct Hamming{V <: Union{Int, Nothing}} <: StringMetric
|
||||
max_dist::V
|
||||
end
|
||||
Hamming() = Hamming(nothing)
|
||||
struct Hamming <: StringMetric end
|
||||
|
||||
function (dist::Hamming{T})(s1, s2) where {T}
|
||||
function (dist::Hamming)(s1, s2; max_dist::Union{Integer, Nothing} = nothing)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
out = abs(length(s2) - length(s1))
|
||||
for (ch1, ch2) in zip(s1, s2)
|
||||
out += ch1 != ch2
|
||||
if T <: Int
|
||||
out > dist.max_dist && return dist.max_dist + 1
|
||||
if max_dist !== nothing
|
||||
out > max_dist && return Int(max_dist + 1)
|
||||
end
|
||||
end
|
||||
return out
|
||||
|
@ -118,22 +115,20 @@ Creates the Levenshtein distance
|
|||
The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions,
|
||||
substitutions of a single character) required to change one string into the other.
|
||||
"""
|
||||
struct Levenshtein{V <: Union{Int, Nothing}} <: StringMetric
|
||||
max_dist::V
|
||||
end
|
||||
Levenshtein() = Levenshtein(nothing)
|
||||
struct Levenshtein <: StringMetric end
|
||||
|
||||
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
||||
# Return max_dist + 1 if distance higher than max_dist
|
||||
# to distinguish a distance equal to max_dist from a larger one, which is important for the find functions.
|
||||
function (dist::Levenshtein{T})(s1, s2) where {T}
|
||||
function (dist::Levenshtein)(s1, s2; max_dist::Union{Integer, Nothing} = nothing)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
len1, len2 = length(s1), length(s2)
|
||||
if len1 > len2
|
||||
s1, s2 = s2, s1
|
||||
len1, len2 = len2, len1
|
||||
end
|
||||
if T <: Int
|
||||
len2 - len1 > dist.max_dist && return dist.max_dist + 1
|
||||
if max_dist !== nothing
|
||||
len2 - len1 > max_dist && return Int(max_dist + 1)
|
||||
end
|
||||
# prefix common to both strings can be ignored
|
||||
k = common_prefix(s1, s2)
|
||||
|
@ -144,7 +139,7 @@ function (dist::Levenshtein{T})(s1, s2) where {T}
|
|||
for (i1, ch1) in enumerate(s1)
|
||||
i1 > k || continue
|
||||
left = current = i1 - k - 1
|
||||
if T <: Int
|
||||
if max_dist !== nothing
|
||||
value_lb = left - 1
|
||||
end
|
||||
for (i2, ch2) in enumerate(s2)
|
||||
|
@ -153,17 +148,17 @@ function (dist::Levenshtein{T})(s1, s2) where {T}
|
|||
if ch1 != ch2
|
||||
current = min(current, above, left) + 1
|
||||
end
|
||||
if T <: Int
|
||||
if max_dist !== nothing
|
||||
value_lb = min(value_lb, left)
|
||||
end
|
||||
@inbounds v[i2 - k] = current
|
||||
end
|
||||
if T <: Int
|
||||
value_lb > dist.max_dist && return dist.max_dist + 1
|
||||
if max_dist !== nothing
|
||||
value_lb > max_dist && return Int(max_dist + 1)
|
||||
end
|
||||
end
|
||||
if T <: Int
|
||||
current > dist.max_dist && return dist.max_dist + 1
|
||||
if max_dist !== nothing
|
||||
current > max_dist && return Int(max_dist + 1 )
|
||||
end
|
||||
return current
|
||||
end
|
||||
|
@ -183,22 +178,19 @@ end
|
|||
uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy
|
||||
the triangle inequality.
|
||||
"""
|
||||
struct OptimalStringAlignement{V <: Union{Int, Nothing}} <: StringSemiMetric
|
||||
max_dist::V
|
||||
end
|
||||
OptimalStringAlignement() = OptimalStringAlignement(nothing)
|
||||
struct OptimalStringAlignement <: StringSemiMetric end
|
||||
|
||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
# Return max_dist + 1 if distance higher than max_dist
|
||||
function (dist::OptimalStringAlignement{T})(s1, s2) where {T}
|
||||
function (dist::OptimalStringAlignement)(s1, s2; max_dist::Union{Integer, Nothing} = nothing)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
len1, len2 = length(s1), length(s2)
|
||||
if len1 > len2
|
||||
s1, s2 = s2, s1
|
||||
len1, len2 = len2, len1
|
||||
end
|
||||
if T <: Int
|
||||
len2 - len1 > dist.max_dist && return dist.max_dist + 1
|
||||
if max_dist !== nothing
|
||||
len2 - len1 > max_dist && return Int(max_dist + 1)
|
||||
end
|
||||
k = common_prefix(s1, s2)
|
||||
k == len1 && return len2 - k
|
||||
|
@ -206,28 +198,25 @@ function (dist::OptimalStringAlignement{T})(s1, s2) where {T}
|
|||
w = similar(v)
|
||||
prevch1, prevch2 = first(s1), first(s2)
|
||||
current = 0
|
||||
if T <: Int
|
||||
if max_dist !== nothing
|
||||
i2_start = 0
|
||||
i2_end = dist.max_dist
|
||||
i2_end = max_dist
|
||||
end
|
||||
for (i1, ch1) in enumerate(s1)
|
||||
i1 > k || (prevch1 = ch1 ; continue)
|
||||
left = i1 - k - 1
|
||||
current = left + 1
|
||||
nextTransCost = 0
|
||||
if T <: Int
|
||||
i2_start += i1 - k - 1 + len2 - len1 > dist.max_dist
|
||||
if max_dist !== nothing
|
||||
i2_start += i1 - k - 1 + len2 - len1 > max_dist
|
||||
i2_end += i2_end < len2
|
||||
end
|
||||
for (i2, ch2) in enumerate(s2)
|
||||
i2 > k || (prevch2 = ch2 ; continue)
|
||||
# no need to look beyond window of lower right diagonal - max distance cells
|
||||
# lower right diag is i1 - (len2 - len1)) and the upper left diagonal + dist.max_dist cells (upper left is i1)
|
||||
if T <: Int
|
||||
if !(i2_start <= i2 - k - 1 < i2_end)
|
||||
prevch2 = ch2
|
||||
continue
|
||||
end
|
||||
# (lower right diag is i1 - (len2 - len1)) and the upper left diagonal + max_dist cells (upper left is i1)
|
||||
if max_dist !== nothing
|
||||
(i2_start <= i2 - k - 1 < i2_end) || (prevch2 = ch2 ; continue)
|
||||
end
|
||||
@inbounds above, current, left = current, left, v[i2 - k]
|
||||
@inbounds w[i2 - k], nextTransCost, thisTransCost = current, w[i2 - k], nextTransCost
|
||||
|
@ -241,13 +230,13 @@ function (dist::OptimalStringAlignement{T})(s1, s2) where {T}
|
|||
@inbounds v[i2 - k] = current
|
||||
prevch2 = ch2
|
||||
end
|
||||
if T <: Int
|
||||
v[i1 - k + len2 - len1] > dist.max_dist && return dist.max_dist + 1
|
||||
if max_dist !== nothing
|
||||
v[i1 - k + len2 - len1] > max_dist && return Int(max_dist + 1)
|
||||
end
|
||||
prevch1 = ch1
|
||||
end
|
||||
if T <: Int
|
||||
current > dist.max_dist && return dist.max_dist + 1
|
||||
if max_dist !== nothing
|
||||
current > max_dist && return Int(max_dist + 1)
|
||||
end
|
||||
return current
|
||||
end
|
||||
|
|
26
src/find.jl
26
src/find.jl
|
@ -1,3 +1,19 @@
|
|||
"""
    compare(s1, s2, dist; min_score = 0.0)

Return a similarity score between 0 and 1 for the strings `s1` and `s2`,
computed as one minus the normalized distance `Normalized(dist)`.

`min_score` is forwarded to the normalized distance as the cutoff
`max_dist = 1 - min_score`.

### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
    normalized = Normalized(dist)
    # a minimum similarity translates into a maximum normalized distance
    threshold = 1 - min_score
    return 1 - normalized(s1, s2; max_dist = threshold)
end
|
||||
|
||||
"""
|
||||
findnearest(s, itr, dist::Union{StringMetric, StringSemiMetric}) -> (x, index)
|
||||
|
||||
|
@ -18,7 +34,7 @@ julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
|
|||
(nothing, nothing)
|
||||
```
|
||||
"""
|
||||
function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
|
||||
function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
||||
min_score_atomic = Threads.Atomic{Float64}(min_score)
|
||||
scores = [0.0 for _ in 1:Threads.nthreads()]
|
||||
is = [0 for _ in 1:Threads.nthreads()]
|
||||
|
@ -37,15 +53,15 @@ function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_sc
|
|||
end
|
||||
_preprocess(dist::AbstractQGramDistance, ::Missing) = missing
|
||||
_preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
|
||||
_preprocess(dist::Union{StringSemiMetric, StringMetric}, s) = s
|
||||
_preprocess(dist::StringDistance, s) = s
|
||||
|
||||
function Base.findmax(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
|
||||
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
|
||||
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
|
||||
findnearest(s, itr, dist; min_score = min_score)
|
||||
end
|
||||
|
||||
"""
|
||||
findall(s, itr , dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8)
|
||||
findall(s, itr , dist::StringDistance; min_score = 0.8)
|
||||
|
||||
`findall` returns the vector of indices for elements of `itr` that have a
|
||||
similarity score higher than or equal to `min_score` according to the distance `dist`.
|
||||
|
@ -66,7 +82,7 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9)
|
|||
0-element Array{Int64,1}
|
||||
```
|
||||
"""
|
||||
function Base.findall(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8)
|
||||
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
|
||||
out = [Int[] for _ in 1:Threads.nthreads()]
|
||||
s = _preprocess(dist, s)
|
||||
# need collect since @threads requires a length method
|
||||
|
|
|
@ -19,37 +19,37 @@ struct Partial{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
|||
dist::S
|
||||
end
|
||||
|
||||
function (dist::Partial)(s1, s2)
|
||||
function (dist::Partial)(s1, s2; max_dist = nothing)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
out = dist.dist(s1, s2)
|
||||
out = dist.dist(s1, s2; max_dist = max_dist)
|
||||
max_dist0 = (max_dist !== nothing) ? min(max_dist, out) : out
|
||||
((len1 == 0) | (len1 == len2)) && return out
|
||||
for x in qgrams(s2, len1)
|
||||
curr = dist.dist(s1, x)
|
||||
curr = dist.dist(s1, x; max_dist = max_dist0)
|
||||
out = min(out, curr)
|
||||
max_dist0 = min(max_dist0, curr)
|
||||
end
|
||||
return out
|
||||
end
|
||||
|
||||
function (dist::Partial{RatcliffObershelp})(s1, s2)
|
||||
function (dist::Partial{T})(s1, s2; max_dist = nothing) where {T <: Union{RatcliffObershelp, Normalized{RatcliffObershelp}}}
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return dist.dist(s1, s2)
|
||||
out = 1.0
|
||||
for r in matching_blocks(s1, s2, 1, 1, len1, len2)
|
||||
# Make sure the substring of s2 has length len1
|
||||
s2_start = r[2] - r[1] + 1
|
||||
s2_end = s2_start + len1 - 1
|
||||
if s2_start < 1
|
||||
s2_end += 1 - s2_start
|
||||
s2_start += 1 - s2_start
|
||||
elseif s2_end > len2
|
||||
s2_start += len2 - s2_end
|
||||
s2_end += len2 - s2_end
|
||||
s2_start = 1
|
||||
elseif s2_start + len1 - 1 > len2
|
||||
s2_start += len2 - (s2_start + len1 - 1)
|
||||
end
|
||||
n_matched = length_matching_blocks(s1, s2, 1, s2_start, len1, s2_end)
|
||||
curr = 1 - 2 * n_matched / (len1 + s2_end - s2_start + 1)
|
||||
n_matched = length_matching_blocks(s1, s2, 1, s2_start, len1, s2_start + len1 - 1)
|
||||
curr = 1 - 2 * n_matched / (len1 + len1)
|
||||
out = min(out, curr)
|
||||
end
|
||||
return out
|
||||
|
@ -74,10 +74,6 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, p::Vector{Int}, s1, s2,
|
|||
return x
|
||||
end
|
||||
|
||||
function normalize(dist::Partial; max_dist = 1.0)
|
||||
Partial(normalize(dist.dist; max_dist = max_dist))
|
||||
end
|
||||
|
||||
"""
|
||||
TokenSort(dist)
|
||||
|
||||
|
@ -101,15 +97,10 @@ struct TokenSort{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
|||
dist::S
|
||||
end
|
||||
|
||||
function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1 = join(sort!(split(s1)), " ")
|
||||
s2 = join(sort!(split(s2)), " ")
|
||||
out = dist.dist(s1, s2)
|
||||
end
|
||||
|
||||
function normalize(dist::TokenSort; max_dist = 1.0)
|
||||
TokenSort(normalize(dist.dist; max_dist = max_dist))
|
||||
f = s -> join(sort!(split(s)), " ")
|
||||
dist.dist(f(s1), f(s2); max_dist = max_dist)
|
||||
end
|
||||
|
||||
"""
|
||||
|
@ -135,7 +126,7 @@ struct TokenSet{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
|||
dist::S
|
||||
end
|
||||
|
||||
function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
v1 = unique!(sort!(split(s1)))
|
||||
v2 = unique!(sort!(split(s2)))
|
||||
|
@ -143,18 +134,13 @@ function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{Abstract
|
|||
s0 = join(v0, " ")
|
||||
s1 = join(v1, " ")
|
||||
s2 = join(v2, " ")
|
||||
isempty(s0) && return dist.dist(s1, s2)
|
||||
score_01 = dist.dist(s0, s1)
|
||||
score_02 = dist.dist(s0, s2)
|
||||
score_12 = dist.dist(s1, s2)
|
||||
min(score_01, score_02, score_12)
|
||||
isempty(s0) && return dist.dist(s1, s2; max_dist = max_dist)
|
||||
out_01 = dist.dist(s0, s1; max_dist = max_dist)
|
||||
out_02 = dist.dist(s0, s2; max_dist = max_dist)
|
||||
out_12 = dist.dist(s1, s2; max_dist = max_dist)
|
||||
min(out_01, out_02, out_12)
|
||||
end
|
||||
|
||||
function normalize(dist::TokenSet; max_dist = 1.0)
|
||||
TokenSet(normalize(dist.dist; max_dist = max_dist))
|
||||
end
|
||||
|
||||
|
||||
"""
|
||||
TokenMax(dist)
|
||||
|
||||
|
@ -173,44 +159,34 @@ julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2)
|
|||
0.05
|
||||
```
|
||||
"""
|
||||
struct TokenMax{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||
struct TokenMax{S <: Normalized} <: StringSemiMetric
|
||||
dist::S
|
||||
max_dist::Float64
|
||||
end
|
||||
TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist)
|
||||
TokenMax(dist::Normalized) = TokenMax{typeof(dist)}(dist)
|
||||
TokenMax(dist::Union{StringSemiMetric, StringMetric}) = TokenMax(Normalized(dist))
|
||||
|
||||
function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = 1.0)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
max_dist = dist.max_dist
|
||||
dist0 = normalize(dist.dist; max_dist = max_dist)
|
||||
score = dist0(s1, s2)
|
||||
min_score = min(max_dist, score)
|
||||
unbase_scale = 0.95
|
||||
dist0 = dist.dist
|
||||
out = dist0(s1, s2; max_dist = max_dist)
|
||||
max_dist = min(max_dist, out)
|
||||
scale = 0.95
|
||||
# if one string is much shorter than the other, use partial
|
||||
if length(s2) >= 1.5 * length(s1)
|
||||
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
|
||||
dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / partial_scale)
|
||||
score_partial = 1 - partial_scale * (1 - Partial(dist0)(s1, s2))
|
||||
min_score = min(max_dist, score_partial)
|
||||
dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / (unbase_scale * partial_scale))
|
||||
score_sort = 1 - unbase_scale * partial_scale * (1 - TokenSort(Partial(dist0))(s1, s2))
|
||||
max_dist = min(max_dist, score_sort)
|
||||
dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / (unbase_scale * partial_scale))
|
||||
score_set = 1 - unbase_scale * partial_scale * (1 - TokenSet(Partial(dist0))(s1, s2))
|
||||
out = min(score, score_partial, score_sort, score_set)
|
||||
else
|
||||
dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / unbase_scale)
|
||||
score_sort = 1 - unbase_scale * (1 - TokenSort(dist0)(s1, s2))
|
||||
max_dist = min(max_dist, score_sort)
|
||||
dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / unbase_scale)
|
||||
score_set = 1 - unbase_scale * (1 - TokenSet(dist0)(s1, s2))
|
||||
out = min(score, score_sort, score_set)
|
||||
if len2 >= 1.5 * len1
|
||||
dist0 = Partial(dist0)
|
||||
pscale = 0.9
|
||||
pout = 1 - pscale * (1 - dist0(s1, s2; max_dist = 1 - (1 - max_dist) / pscale))
|
||||
out = min(out, pout)
|
||||
max_dist = min(max_dist, pout)
|
||||
scale *= pscale
|
||||
end
|
||||
out_sort = 1 - scale * (1 - TokenSort(dist0)(s1, s2; max_dist = 1 - (1 - max_dist) / scale))
|
||||
max_dist = min(max_dist, out_sort)
|
||||
out_set = 1 - scale * (1 - TokenSet(dist0)(s1, s2; max_dist = 1 - (1 - max_dist) / scale))
|
||||
out = min(out, out_sort, out_set)
|
||||
out > max_dist ? 1.0 : out
|
||||
end
|
||||
|
||||
function normalize(dist::TokenMax; max_dist = 1.0)
|
||||
TokenMax(dist.dist, max_dist)
|
||||
end
|
||||
Normalized(dist::TokenMax) = TokenMax(dist.dist)
|
||||
|
|
|
@ -1,35 +1,52 @@
|
|||
struct Normalized{V <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||
dist::V
|
||||
max_dist::Float64
|
||||
end
|
||||
"""
|
||||
Normalized(dist)
|
||||
|
||||
function (dist::Normalized{<: Union{Jaro, JaroWinkler, RatcliffObershelp}})(s1, s2)
|
||||
Creates a normalized distance. The distance always returns a Float64 between 0.0 and 1.0 (or `missing` if one of the arguments is missing)
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> s1 = "New York Mets vs Atlanta"
|
||||
julia> s2 = "Atlanta Braves vs New York Mets"
|
||||
julia> Levenshtein()(s1, s2)
|
||||
25
|
||||
julia> StringDistances.Normalized(Levenshtein())(s1, s2)
|
||||
0.8064
|
||||
```
|
||||
"""
|
||||
struct Normalized{T <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||
dist::T
|
||||
end
|
||||
# Wrapping a distance that is already a `Normalized` returns that same object
# unchanged rather than nesting a second `Normalized` layer around it.
Normalized(dist::Normalized) = dist
|
||||
function (dist::Normalized)(s1, s2; max_dist = 1.0)
|
||||
out = dist.dist(s1, s2)
|
||||
out > dist.max_dist ? 1.0 : out
|
||||
max_dist !== nothing && out > max_dist && return 1.0
|
||||
return out
|
||||
end
|
||||
|
||||
function (dist::Normalized{<:Union{Hamming, DamerauLevenshtein}})(s1, s2)
|
||||
function (dist::Normalized{<:Union{Hamming, DamerauLevenshtein}})(s1, s2; max_dist = 1.0)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
isempty(s1) && isempty(s2) && return 0.0
|
||||
out = dist.dist(s1, s2) / length(s2)
|
||||
out > dist.max_dist ? 1.0 : out
|
||||
max_dist !== nothing && out > max_dist && return 1.0
|
||||
return out
|
||||
end
|
||||
|
||||
function (dist::Normalized{<:Union{Levenshtein{Nothing}, OptimalStringAlignement{Nothing}}})(s1, s2)
|
||||
function (dist::Normalized{<:Union{Levenshtein, OptimalStringAlignement}})(s1, s2; max_dist = 1.0)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
isempty(s1) && isempty(s2) && return 0.0
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
if dist.dist isa Levenshtein
|
||||
d = Levenshtein(ceil(Int, len2 * dist.max_dist))(s1, s2)
|
||||
if max_dist === nothing || max_dist == 1.0
|
||||
d = dist.dist(s1, s2)
|
||||
else
|
||||
d = OptimalStringAlignement(ceil(Int, len2 * dist.max_dist))(s1, s2)
|
||||
d = dist.dist(s1, s2; max_dist = ceil(Int, len2 * max_dist))
|
||||
end
|
||||
out = d / len2
|
||||
out > dist.max_dist ? 1.0 : out
|
||||
max_dist !== nothing && out > max_dist && return 1.0
|
||||
return out
|
||||
end
|
||||
|
||||
function (dist::Normalized{<:AbstractQGramDistance})(s1, s2)
|
||||
function (dist::Normalized{<:AbstractQGramDistance})(s1, s2; max_dist = 1.0)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
# When string length < q for qgram distance, returns s1 == s2
|
||||
s1, s2 = reorder(s1, s2)
|
||||
|
@ -40,41 +57,6 @@ function (dist::Normalized{<:AbstractQGramDistance})(s1, s2)
|
|||
else
|
||||
out = dist.dist(s1, s2)
|
||||
end
|
||||
out > dist.max_dist ? 1.0 : out
|
||||
max_dist !== nothing && out > max_dist && return 1.0
|
||||
return out
|
||||
end
|
||||
|
||||
|
||||
"""
|
||||
normalize(dist)
|
||||
|
||||
Creates a normalized distance. The distance always returns a Float64 between 0.0 and 1.0 (or `missing` if one of the arguments is missing)
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> s1 = "New York Mets vs Atlanta"
|
||||
julia> s2 = "Atlanta Braves vs New York Mets"
|
||||
julia> Levenshtein()(s1, s2)
|
||||
25
|
||||
julia> StringDistances.normalize(Levenshtein())(s1, s2)
|
||||
0.8064
|
||||
```
|
||||
"""
|
||||
normalize(dist::Union{StringSemiMetric, StringMetric}; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
|
||||
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
|
||||
|
||||
|
||||
"""
|
||||
compare(s1, s2, dist)
|
||||
|
||||
return a similarity score between 0 and 1 for the strings `s1` and
|
||||
`s2` based on the distance `dist`.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> compare("martha", "marhta", Levenshtein())
|
||||
0.6666666666666667
|
||||
```
|
||||
"""
|
||||
function compare(s1, s2, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
|
||||
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
|
||||
end
|
||||
|
|
|
@ -2,161 +2,162 @@ using StringDistances, Unicode, Test, Random
|
|||
|
||||
@testset "Distances" begin
|
||||
@testset "Hamming" begin
|
||||
@test evaluate(Hamming(), "martha", "marhta") ≈ 2
|
||||
@test evaluate(Hamming(), "es an ", " vs an") ≈ 6
|
||||
@test evaluate(Hamming(), [1, 2, 3], [1,2, 4]) ≈ 1
|
||||
@inferred evaluate(Hamming(), "", "")
|
||||
@test ismissing(evaluate(Hamming(), "", missing))
|
||||
@test Hamming()("martha", "marhta") ≈ 2
|
||||
@test Hamming()("es an ", " vs an") ≈ 6
|
||||
@test Hamming()([1, 2, 3], [1,2, 4]) ≈ 1
|
||||
@inferred Hamming()("", "")
|
||||
@test ismissing(Hamming()("", missing))
|
||||
end
|
||||
|
||||
@testset "Jaro" begin
|
||||
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
||||
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
|
||||
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
|
||||
@test evaluate(Jaro(), [1, 2, 3], [1,2, 4]) ≈ 0.2222222222222222
|
||||
@test evaluate(Jaro(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Jaro(), "alborgów", "amoniak")
|
||||
@test Jaro()("martha", "marhta") ≈ 0.05555555555555547
|
||||
@test Jaro()("es an ", " vs an") ≈ 0.2777777777777777
|
||||
@test Jaro()(" vs an", "es an ") ≈ 0.2777777777777777
|
||||
@test Jaro()([1, 2, 3], [1,2, 4]) ≈ 0.2222222222222222
|
||||
@test Jaro()(graphemes("alborgów"), graphemes("amoniak")) == Jaro()("alborgów", "amoniak")
|
||||
@test Jaro()(" vs an", "es an ") ≈ 0.2777777777777777
|
||||
@test result_type(Jaro(), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(Jaro(), "", "")
|
||||
@test ismissing(evaluate(Jaro(), "", missing))
|
||||
@inferred Jaro()("", "")
|
||||
@test ismissing(Jaro()("", missing))
|
||||
end
|
||||
|
||||
@testset "Levenshtein" begin
|
||||
@test evaluate(Levenshtein(), "", "") == 0
|
||||
@test evaluate(Levenshtein(), "abc", "") == 3
|
||||
@test evaluate(Levenshtein(), "", "abc") == 3
|
||||
@test evaluate(Levenshtein(), "bc", "abc") == 1
|
||||
@test evaluate(Levenshtein(), "kitten", "sitting") == 3
|
||||
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
|
||||
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
|
||||
@test evaluate(Levenshtein(), "a cat", "an act") == 3
|
||||
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
|
||||
@test evaluate(Levenshtein(), [1, 2, 3], [1, 2, 4]) == 1
|
||||
@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak")
|
||||
@test Levenshtein()("", "") == 0
|
||||
@test Levenshtein()("abc", "") == 3
|
||||
@test Levenshtein()("", "abc") == 3
|
||||
@test Levenshtein()("bc", "abc") == 1
|
||||
@test Levenshtein()("kitten", "sitting") == 3
|
||||
@test Levenshtein()("saturday", "sunday") == 3
|
||||
@test Levenshtein()("hi, my name is", "my name is") == 4
|
||||
@test Levenshtein()("a cat", "an act") == 3
|
||||
@test Levenshtein()("alborgów", "amoniak") == 6
|
||||
@test Levenshtein()([1, 2, 3], [1, 2, 4]) == 1
|
||||
@test Levenshtein()(graphemes("alborgów"), graphemes("amoniak")) == Levenshtein()("alborgów", "amoniak")
|
||||
@test Levenshtein()("", "abc") == 3
|
||||
@test result_type(Levenshtein(), "hello", "world") == Int
|
||||
@inferred evaluate(Levenshtein(), "", "")
|
||||
@test ismissing(evaluate(Levenshtein(), "", missing))
|
||||
@inferred Levenshtein()("", "")
|
||||
@test ismissing(Levenshtein()("", missing))
|
||||
end
|
||||
|
||||
@testset "OptimalStringAlignement" begin
|
||||
@test evaluate(OptimalStringAlignement(), "", "") == 0
|
||||
@test evaluate(OptimalStringAlignement(), "abc", "") == 3
|
||||
@test evaluate(OptimalStringAlignement(), "bc", "abc") == 1
|
||||
@test evaluate(OptimalStringAlignement(), "fuor", "four") == 1
|
||||
@test evaluate(OptimalStringAlignement(), "abcd", "acb") == 2
|
||||
@test evaluate(OptimalStringAlignement(), "cape sand recycling ", "edith ann graham") == 17
|
||||
@test evaluate(OptimalStringAlignement(), "jellyifhs", "jellyfish") == 2
|
||||
@test evaluate(OptimalStringAlignement(), "ifhs", "fish") == 2
|
||||
@test evaluate(OptimalStringAlignement(), "a cat", "an act") == 2
|
||||
@test evaluate(OptimalStringAlignement(), "a cat", "an abct") == 4
|
||||
@test evaluate(OptimalStringAlignement(), "a cat", "a tc") == 3
|
||||
@test OptimalStringAlignement(2)("abcdef", "abcxyf") == 2
|
||||
@test OptimalStringAlignement()("", "") == 0
|
||||
@test OptimalStringAlignement()("abc", "") == 3
|
||||
@test OptimalStringAlignement()("bc", "abc") == 1
|
||||
@test OptimalStringAlignement()("fuor", "four") == 1
|
||||
@test OptimalStringAlignement()("abcd", "acb") == 2
|
||||
@test OptimalStringAlignement()("cape sand recycling ", "edith ann graham") == 17
|
||||
@test OptimalStringAlignement()("jellyifhs", "jellyfish") == 2
|
||||
@test OptimalStringAlignement()("ifhs", "fish") == 2
|
||||
@test OptimalStringAlignement()("a cat", "an act") == 2
|
||||
@test OptimalStringAlignement()("a cat", "an abct") == 4
|
||||
@test OptimalStringAlignement()("a cat", "a tc") == 3
|
||||
@test OptimalStringAlignement()("abcdef", "abcxyf") == 2
|
||||
@test OptimalStringAlignement()("abcdef", "abcxyf"; max_dist = 2) == 2
|
||||
|
||||
@test evaluate(OptimalStringAlignement(), [1, 2, 3], [1,2, 4]) == 1
|
||||
@test evaluate(OptimalStringAlignement(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(OptimalStringAlignement(), "alborgów", "amoniak")
|
||||
@test OptimalStringAlignement()([1, 2, 3], [1,2, 4]) == 1
|
||||
@test OptimalStringAlignement()(graphemes("alborgów"), graphemes("amoniak")) == OptimalStringAlignement()("alborgów", "amoniak")
|
||||
@test OptimalStringAlignement()("bc", "abc") == 1
|
||||
@test result_type(OptimalStringAlignement(), "hello", "world") == Int
|
||||
@inferred evaluate(OptimalStringAlignement(), "", "")
|
||||
@test ismissing(evaluate(OptimalStringAlignement(), "", missing))
|
||||
@inferred OptimalStringAlignement()("", "")
|
||||
@test ismissing(OptimalStringAlignement()("", missing))
|
||||
end
|
||||
|
||||
@testset "DamerauLevenshtein" begin
|
||||
@test evaluate(DamerauLevenshtein(), "", "") == 0
|
||||
@test evaluate(DamerauLevenshtein(), "CA", "ABC") == 2
|
||||
@test evaluate(DamerauLevenshtein(), "ABCDEF", "ABDCEF") == 1
|
||||
@test evaluate(DamerauLevenshtein(), "ABCDEF", "BACDFE") == 2
|
||||
@test evaluate(DamerauLevenshtein(), "ABCDEF", "ABCDE") == 1
|
||||
@test evaluate(DamerauLevenshtein(), "a cat", "an act") == 2
|
||||
@test evaluate(DamerauLevenshtein(), "a cat", "an abct") == 3
|
||||
@test evaluate(DamerauLevenshtein(), "a cat", "a tc") == 2
|
||||
@test DamerauLevenshtein()("", "") == 0
|
||||
@test DamerauLevenshtein()("CA", "ABC") == 2
|
||||
@test DamerauLevenshtein()("ABCDEF", "ABDCEF") == 1
|
||||
@test DamerauLevenshtein()("ABCDEF", "BACDFE") == 2
|
||||
@test DamerauLevenshtein()("ABCDEF", "ABCDE") == 1
|
||||
@test DamerauLevenshtein()("a cat", "an act") == 2
|
||||
@test DamerauLevenshtein()("a cat", "an abct") == 3
|
||||
@test DamerauLevenshtein()("a cat", "a tc") == 2
|
||||
@test result_type(DamerauLevenshtein(), "hello", "world") == Int
|
||||
@inferred evaluate(DamerauLevenshtein(), "", "")
|
||||
@test ismissing(evaluate(DamerauLevenshtein(), "", missing))
|
||||
@inferred DamerauLevenshtein()("", "")
|
||||
@test ismissing(DamerauLevenshtein()("", missing))
|
||||
end
|
||||
|
||||
@testset "RatcliffObershelp" begin
|
||||
@test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
|
||||
@test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
|
||||
@test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
|
||||
@test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
|
||||
@test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
|
||||
@test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
|
||||
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
|
||||
@test evaluate(RatcliffObershelp(), [1, 2, 3], [1,2, 4]) ≈ 1/3
|
||||
@test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak")
|
||||
@test RatcliffObershelp()("dixon", "dicksonx") ≈ 1 - 0.6153846153846154
|
||||
@test RatcliffObershelp()("alexandre", "aleksander") ≈ 1 - 0.7368421052631579
|
||||
@test RatcliffObershelp()("pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
|
||||
@test RatcliffObershelp()("", "pencilvaneya") ≈ 1.0
|
||||
@test RatcliffObershelp()("NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
|
||||
@test RatcliffObershelp()("Yankees", "New York Yankees") ≈ 0.3913043478260869
|
||||
@test RatcliffObershelp()("New York Mets", "New York Yankees") ≈ 0.24137931034482762
|
||||
@test RatcliffObershelp()([1, 2, 3], [1,2, 4]) ≈ 1/3
|
||||
@test RatcliffObershelp()(graphemes("alborgów"), graphemes("amoniak")) == RatcliffObershelp()("alborgów", "amoniak")
|
||||
@test RatcliffObershelp()("pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
|
||||
@test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(RatcliffObershelp(), "", "")
|
||||
@test ismissing(evaluate(RatcliffObershelp(), "", missing))
|
||||
@inferred RatcliffObershelp()("", "")
|
||||
@test ismissing(RatcliffObershelp()("", missing))
|
||||
end
|
||||
|
||||
# QGram(q) distance checks. Every scenario is asserted through both
# `evaluate(dist, a, b)` and a direct call `dist(a, b)`.
# NOTE(review): the paired spellings and the `|` / `||||` lines look like both
# sides of a rendered diff plus table residue — confirm against the real test file.
@testset "QGram" begin
|
||||
@test evaluate(QGram(1), "abc", "abc") == 0
|
||||
@test evaluate(QGram(1), "", "abc") == 3
|
||||
# Same characters in a different order: distance 0 is expected for q = 1.
@test evaluate(QGram(1), "abc", "cba") == 0
|
||||
@test evaluate(QGram(1), "abc", "ccc") == 4
|
||||
# Inputs containing non-ASCII characters.
@test evaluate(QGram(4), "aü☃", "aüaüafs") == 4
|
||||
# SubString arguments are exercised as well as String.
@test evaluate(QGram(2), SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2
|
||||
# Grapheme iterators must give the same distance as the plain strings.
@test evaluate(QGram(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(QGram(2), "alborgów", "amoniak")
|
||||
@test QGram(1)("abc", "abc") == 0
|
||||
@test QGram(1)("", "abc") == 3
|
||||
@test QGram(1)("abc", "cba") == 0
|
||||
@test QGram(1)("abc", "ccc") == 4
|
||||
@test QGram(4)("aü☃", "aüaüafs") == 4
|
||||
@test QGram(2)(SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2
|
||||
@test QGram(2)(graphemes("alborgów"), graphemes("amoniak")) ≈ QGram(2)("alborgów", "amoniak")
|
||||
@test QGram(1)("abc", "cba") == 0
|
||||
# Type-level checks: result_type is Int, a `missing` argument yields `missing`,
# and `@inferred` requires the return type to be inferrable.
@test result_type(QGram(1), "hello", "world") == Int
|
||||
@test ismissing(evaluate(QGram(1), "", missing))
|
||||
@inferred evaluate(QGram(1), "", "")
|
||||
@test ismissing(QGram(1)("", missing))
|
||||
@inferred QGram(1)("", "")
|
||||
end
|
||||
|
||||
# Cosine(q) distance checks, asserted through both `evaluate(dist, a, b)` and a
# direct call `dist(a, b)`.
# NOTE(review): the paired spellings and the `|` / `||||` lines look like both
# sides of a rendered diff plus table residue — confirm against the real test file.
@testset "Cosine" begin
|
||||
# An empty string against "abc" is expected to produce NaN for q = 2.
@test isnan(evaluate(Cosine(2), "", "abc"))
|
||||
@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
|
||||
@test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
|
||||
# Integer vectors are passed in place of strings.
@test evaluate(Cosine(2), [1, 2, 3], [1, 2, 4]) ≈ 0.5
|
||||
# Grapheme iterators must give the same distance as the plain strings.
@test evaluate(Cosine(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Cosine(2), "alborgów", "amoniak")
|
||||
@test isnan(Cosine(2)("", "abc"))
|
||||
@test Cosine(2)("abc", "ccc") ≈ 1 atol = 1e-4
|
||||
@test Cosine(2)("leia", "leela") ≈ 0.7113249 atol = 1e-4
|
||||
@test Cosine(2)([1, 2, 3], [1, 2, 4]) ≈ 0.5
|
||||
@test Cosine(2)(graphemes("alborgów"), graphemes("amoniak")) ≈ Cosine(2)("alborgów", "amoniak")
|
||||
@test Cosine(2)("leia", "leela") ≈ 0.7113249 atol = 1e-4
|
||||
# Type-level checks: floating-point result_type, `missing` propagation, and an
# inferrable return type.
@test result_type(Cosine(2), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(Cosine(2), "", "")
|
||||
@test ismissing(evaluate(Cosine(2), "", missing))
|
||||
@inferred Cosine(2)("", "")
|
||||
@test ismissing(Cosine(2)("", missing))
|
||||
end
|
||||
|
||||
# Jaccard(q) distance checks, asserted through both `evaluate(dist, a, b)` and a
# direct call `dist(a, b)`.
# NOTE(review): the paired spellings and the `|` / `||||` lines look like both
# sides of a rendered diff plus table residue — confirm against the real test file.
@testset "Jaccard" begin
|
||||
# An empty string against "abc" is expected to give the distance 1.0 for q = 1.
@test evaluate(Jaccard(1), "", "abc") ≈ 1.0
|
||||
@test evaluate(Jaccard(1), "abc", "ccc") ≈ 2/3 atol = 1e-4
|
||||
@test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
|
||||
# Integer vectors are passed in place of strings.
@test evaluate(Jaccard(2), [1, 2, 3], [1, 2, 4]) ≈ 2/3 atol = 1e-4
|
||||
# Grapheme iterators must give the same distance as the plain strings.
@test evaluate(Jaccard(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Jaccard(2), "alborgów", "amoniak")
|
||||
@test Jaccard(1)("", "abc") ≈ 1.0
|
||||
@test Jaccard(1)("abc", "ccc") ≈ 2/3 atol = 1e-4
|
||||
@test Jaccard(2)("leia", "leela") ≈ 0.83333 atol = 1e-4
|
||||
@test Jaccard(2)([1, 2, 3], [1, 2, 4]) ≈ 2/3 atol = 1e-4
|
||||
@test Jaccard(2)(graphemes("alborgów"), graphemes("amoniak")) ≈ Jaccard(2)("alborgów", "amoniak")
|
||||
@test Jaccard(2)("leia", "leela") ≈ 0.83333 atol = 1e-4
|
||||
# Type-level checks: floating-point result_type, `missing` propagation, and an
# inferrable return type.
@test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(Jaccard(1), "", "")
|
||||
@test ismissing(evaluate(Jaccard(1), "", missing))
|
||||
@inferred Jaccard(1)("", "")
|
||||
@test ismissing(Jaccard(1)("", missing))
|
||||
end
|
||||
|
||||
# SorensenDice(q) distance checks, asserted through both `evaluate(dist, a, b)`
# and a direct call `dist(a, b)`.
# NOTE(review): the paired spellings and the `|` / `||||` lines look like both
# sides of a rendered diff plus table residue — confirm against the real test file.
@testset "SorensenDice" begin
|
||||
# The same pair of words is checked with q = 1 and q = 2.
@test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
|
||||
# Grapheme iterators must give the same distance as the plain strings.
@test evaluate(SorensenDice(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(SorensenDice(2), "alborgów", "amoniak")
|
||||
@test SorensenDice(1)("night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test SorensenDice(2)("night", "nacht") ≈ 0.75 atol = 1e-4
|
||||
@test SorensenDice(2)(graphemes("alborgów"), graphemes("amoniak")) ≈ SorensenDice(2)("alborgów", "amoniak")
|
||||
@test SorensenDice(2)("night", "nacht") ≈ 0.75 atol = 1e-4
|
||||
# Type-level checks: floating-point result_type, `missing` propagation, and an
# inferrable return type.
@test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(SorensenDice(1), "", "")
|
||||
@test ismissing(evaluate(SorensenDice(1), "", missing))
|
||||
@inferred SorensenDice(1)("", "")
|
||||
@test ismissing(SorensenDice(1)("", missing))
|
||||
end
|
||||
|
||||
# Overlap(q) distance checks, asserted through both `evaluate(dist, a, b)` and a
# direct call `dist(a, b)`.
# NOTE(review): the paired spellings and the `|` / `||||` lines look like both
# sides of a rendered diff plus table residue — confirm against the real test file.
@testset "Overlap" begin
|
||||
@test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4
|
||||
@test Overlap(1)("night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test Overlap(1)("context", "contact") ≈ .2 atol = 1e-4
|
||||
@test Overlap(1)("context", "contact") ≈ .2 atol = 1e-4
|
||||
# Type-level checks: floating-point result_type, `missing` propagation, and an
# inferrable return type.
@test result_type(Overlap(1), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(Overlap(1), "", "")
|
||||
@test ismissing(evaluate(Overlap(1), "", missing))
|
||||
@inferred Overlap(1)("", "")
|
||||
@test ismissing(Overlap(1)("", missing))
|
||||
end
|
||||
|
||||
# MorisitaOverlap(q) distance checks with the expected values worked out by hand
# in the trailing comments; each case is asserted through `evaluate(dist, a, b)`
# and/or a direct call `dist(a, b)`.
# NOTE(review): the paired spellings, the `|` / `||||` lines and the `@ -166,...`
# hunk header look like a rendered diff (some lines of this testset are elided) —
# confirm against the real test file.
@testset "MorisitaOverlap" begin
|
||||
# overlap for 'n', 'h', and 't' and 5 q-grams per string:
|
||||
@test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.4 # 1.0-((2*3)/(5*5/5 + 5*5/5))
|
||||
@test MorisitaOverlap(1)("night", "nacht") == 0.4 # 1.0-((2*3)/(5*5/5 + 5*5/5))
|
||||
|
||||
# overlap for 'o', 'n', 2-overlap for 'c' and 't' and 7 unique q-grams in total so multiplicity vectors
|
||||
# ms1 = [1, 1, 1, 2, 1, 1, 0]
|
||||
# ms2 = [2, 1, 1, 2, 0, 0, 1]
|
||||
# sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7
|
||||
@test evaluate(MorisitaOverlap(1), "context", "contact") ≈ .2 atol = 1e-4 # 1.0-((2*8)/(9*7/7 + 11*7/7)) = 1 - 16/20
|
||||
@test MorisitaOverlap(1)("context", "contact") ≈ .2 atol = 1e-4 # 1.0-((2*8)/(9*7/7 + 11*7/7)) = 1 - 16/20
|
||||
@test MorisitaOverlap(1)("context", "contact") ≈ .2 atol = 1e-4
|
||||
|
||||
# Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct"
|
||||
|
@ -166,17 +167,17 @@ using StringDistances, Unicode, Test, Random
|
|||
@test MorisitaOverlap(2)("context", "contact") == 0.5 # 1.0-((2*3)/(6*6/6 + 6*6/6))
|
||||
|
||||
# Type-level checks: floating-point result_type, `missing` propagation, and an
# inferrable return type.
@test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(MorisitaOverlap(1), "", "")
|
||||
@test ismissing(evaluate(MorisitaOverlap(1), "", missing))
|
||||
@inferred MorisitaOverlap(1)("", "")
|
||||
@test ismissing(MorisitaOverlap(1)("", missing))
|
||||
end
|
||||
|
||||
# NMD(q) distance checks with the expected values worked out by hand in the
# trailing comments; each case is asserted through `evaluate(dist, a, b)` and/or
# a direct call `dist(a, b)`.
# NOTE(review): the paired spellings, the `|` / `||||` lines and the `@ -184,...`
# hunk header look like a rendered diff (some lines of this testset are elided) —
# confirm against the real test file.
@testset "NMD" begin
|
||||
# m(s1) = [1, 1, 1, 1, 1, 0, 0], m(s2) = [1, 0, 0, 1, 1, 1, 1]
|
||||
@test evaluate(NMD(1), "night", "nacht") == 0.4 # (7-5)/5
|
||||
@test NMD(1)("night", "nacht") == 0.4 # (7-5)/5
|
||||
|
||||
# ms1 = [1, 1, 1, 2, 1, 1, 0]
|
||||
# ms2 = [2, 1, 1, 2, 0, 0, 1]
|
||||
@test evaluate(NMD(1), "context", "contact") ≈ 0.2857 atol = 1e-4 # ((2+1+1+2+1+1+1)-7)/(7)
|
||||
@test NMD(1)("context", "contact") ≈ 0.2857 atol = 1e-4 # ((2+1+1+2+1+1+1)-7)/(7)
|
||||
@test NMD(1)("context", "contact") ≈ 0.2857 atol = 1e-4
|
||||
|
||||
# ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
|
||||
|
@ -184,8 +185,8 @@ using StringDistances, Unicode, Test, Random
|
|||
@test NMD(2)("context", "contact") == 0.5 # ((1+1+1+1+1+1+1+1+1)-6)/6
|
||||
|
||||
# Type-level checks: floating-point result_type, `missing` propagation, and an
# inferrable return type.
@test result_type(NMD(1), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(NMD(1), "", "")
|
||||
@test ismissing(evaluate(NMD(1), "", missing))
|
||||
@inferred NMD(1)("", "")
|
||||
@test ismissing(NMD(1)("", missing))
|
||||
end
|
||||
|
||||
@testset "QGramDict and QGramSortedVector counts qgrams" begin
|
||||
|
@ -236,37 +237,37 @@ using StringDistances, Unicode, Test, Random
|
|||
# Randomized check (100 rounds): a Jaccard distance computed on raw strings must
# equal the one computed on precomputed q-gram containers built from the same
# strings, for both QGramDict and QGramSortedVector, and for containers built
# from grapheme iterators.
# NOTE(review): `d`/`dist` and the `evaluate(d, ...)`/`dist(...)` pairs look like
# the two sides of a rendered diff; `partlyoverlappingstrings` and `Chars` are
# defined outside this view — confirm against the real test file.
for _ in 1:100
|
||||
qlen = rand(2:5)
|
||||
str1, str2 = partlyoverlappingstrings(6:100, Chars)
|
||||
d = Jaccard(qlen)
|
||||
dist = Jaccard(qlen)
|
||||
|
||||
# QGramDict built from the strings.
qd1 = QGramDict(str1, qlen)
|
||||
qd2 = QGramDict(str2, qlen)
|
||||
@test evaluate(d, str1, str2) == evaluate(d, qd1, qd2)
|
||||
@test dist(str1, str2) == dist(qd1, qd2)
|
||||
|
||||
# QGramDict built from grapheme iterators.
qd1b = QGramDict(graphemes(str1), qlen)
|
||||
qd2b = QGramDict(graphemes(str2), qlen)
|
||||
@test evaluate(d, str1, str2) == evaluate(d, qd1b, qd2b)
|
||||
@test dist(str1, str2) == dist(qd1b, qd2b)
|
||||
|
||||
# QGramSortedVector built from the strings.
qc1 = QGramSortedVector(str1, qlen)
|
||||
qc2 = QGramSortedVector(str2, qlen)
|
||||
@test evaluate(d, str1, str2) == evaluate(d, qc1, qc2)
|
||||
@test dist(str1, str2) == dist(qc1, qc2)
|
||||
|
||||
# QGramSortedVector built from grapheme iterators.
qc1b = QGramSortedVector(graphemes(str1), qlen)
|
||||
qc2b = QGramSortedVector(graphemes(str2), qlen)
|
||||
@test evaluate(d, str1, str2) == evaluate(d, qc1b, qc2b)
|
||||
@test dist(str1, str2) == dist(qc1b, qc2b)
|
||||
end
|
||||
end
|
||||
|
||||
# Behaviour of q-gram based distances when the inputs are short relative to q,
# asserted through both `evaluate(dist, a, b)` and a direct call `dist(a, b)`.
# NOTE(review): the paired spellings and the `|` / `||||` lines look like both
# sides of a rendered diff plus table residue — confirm against the real test file.
@testset "QGram distance on short strings" begin
|
||||
# Both strings are shorter than q: NaN is expected.
@test isnan(evaluate(Overlap(2), "1", "2"))
|
||||
@test isnan(evaluate(Jaccard(3), "s1", "s2"))
|
||||
@test isnan(evaluate(Cosine(5), "s1", "s2"))
|
||||
@test isnan(Overlap(2)( "1", "2"))
|
||||
@test isnan(Jaccard(3)("s1", "s2"))
|
||||
@test isnan(Cosine(5)( "s1", "s2"))
|
||||
|
||||
# Both strings are exactly q characters long: a non-NaN value is expected.
@test !isnan(evaluate(Overlap(2), "s1", "s2"))
|
||||
@test !isnan(evaluate(Jaccard(3), "st1", "st2"))
|
||||
@test !isnan(evaluate(Cosine(5), "stri1", "stri2"))
|
||||
@test !isnan(Overlap(2)( "s1", "s2"))
|
||||
@test !isnan(Jaccard(3)("st1", "st2"))
|
||||
@test !isnan(Cosine(5)( "stri1", "stri2"))
|
||||
|
||||
# Strings of different lengths, each at least q characters long.
@test !isnan(evaluate(Jaccard(3), "st1", "str2"))
|
||||
@test !isnan(evaluate(Jaccard(3), "str1", "st2"))
|
||||
@test !isnan(Jaccard(3)("st1", "str2"))
|
||||
@test !isnan(Jaccard(3)("str1", "st2"))
|
||||
end
|
||||
|
||||
@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
|
||||
|
@ -279,13 +280,13 @@ using StringDistances, Unicode, Test, Random
|
|||
# QGramDict gets same result as for standard string
|
||||
qd1 = QGramDict(str1, qlen)
|
||||
qd2 = QGramDict(str2, qlen)
|
||||
expected = evaluate(dist, str1, str2)
|
||||
@test expected == evaluate(dist, qd1, qd2)
|
||||
expected = dist(str1, str2)
|
||||
@test expected == dist(qd1, qd2)
|
||||
|
||||
# QGramSortedVector gets same result as for standard string
|
||||
qc1 = QGramSortedVector(str1, qlen)
|
||||
qc2 = QGramSortedVector(str2, qlen)
|
||||
@test expected == evaluate(dist, qc1, qc2)
|
||||
@test expected == dist(qc1, qc2)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -319,33 +320,30 @@ using StringDistances, Unicode, Test, Random
|
|||
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
|
||||
# Compare every (distance, expected-values) pair in `solutions` against the
# string pairs in `strings`; the reference values are attributed to the R
# package stringdist. A NaN result must correspond to a NaN reference value,
# anything else must match within atol = 1e-4.
# NOTE(review): `t`/`dist`, the two consecutive `if` headers and the
# `evaluate(t, ...)`/`dist(...)` pairs look like the two sides of a rendered
# diff; `solutions` and `strings` are defined outside this view — confirm
# against the real test file.
|
||||
for x in solutions
|
||||
t, solution = x
|
||||
dist, solution = x
|
||||
for i in eachindex(solution)
|
||||
if isnan(evaluate(t, strings[i]...))
|
||||
if isnan(dist(strings[i]...))
|
||||
@test isnan(solution[i])
|
||||
else
|
||||
@test evaluate(t, strings[i]...) ≈ solution[i] atol = 1e-4
|
||||
@test dist(strings[i]...) ≈ solution[i] atol = 1e-4
|
||||
end
|
||||
end
|
||||
end
|
||||
# RatcliffObershelp reference check: the distance is converted to a similarity
# (1 - distance), scaled to 0..100, rounded to an Int, and compared with the
# expected integer scores in `solution`, one per entry of `strings`.
# NOTE(review): the `evaluate(...)` / direct-call pair looks like the two sides
# of a rendered diff; `strings` is defined outside this view.
|
||||
solution = [83, 73, 62, 93, 0, 100, 0, 33, 62, 71, 83, 27, 33, 78, 50, 67]
|
||||
for i in eachindex(strings)
|
||||
@test round(Int, (1 - evaluate(RatcliffObershelp(), strings[i]...)) * 100) ≈ solution[i] atol = 1e-4
|
||||
@test round(Int, (1 - RatcliffObershelp()(strings[i]...)) * 100) ≈ solution[i] atol = 1e-4
|
||||
end
|
||||
|
||||
# max_dist check: when the bound equals the true distance `d`, the bounded
# computation must still return `d`, for Levenshtein and OptimalStringAlignement.
# NOTE(review): `Levenshtein(d)` supplies the bound through the constructor while
# `Levenshtein()(...; max_dist = d)` supplies it as a call keyword. Given the
# commit title in the page header ("rmv max_dist as internal field"), these look
# like the before/after sides of a rendered diff — confirm which form the file keeps.
|
||||
for i in eachindex(strings)
|
||||
d = Levenshtein()(strings[i]...)
|
||||
@test Levenshtein(d)(strings[i]...) == d
|
||||
@test Levenshtein()(strings[i]...; max_dist = d) == d
|
||||
d = OptimalStringAlignement()(strings[i]...)
|
||||
@test OptimalStringAlignement(d)(strings[i]...) == d
|
||||
@test OptimalStringAlignement()(strings[i]...; max_dist = d) == d
|
||||
end
|
||||
end
|
||||
|
||||
# Computes the unbounded OptimalStringAlignement distance for one fixed pair and
# asserts that a distance constructed with that value as its argument returns
# the same result.
# NOTE(review): this passes `d` positionally to the constructor; the loop before
# it also uses the `; max_dist = d` keyword form — presumably this is a removed
# line of the rendered diff; confirm against the real test file.
d = OptimalStringAlignement()("abcdef", "abcxyf")
|
||||
@test OptimalStringAlignement(d)("abcdef", "abcxyf") == d
|
||||
|
||||
|
||||
|
||||
#= R test
|
||||
|
|
|
@ -61,7 +61,7 @@ end
|
|||
compare("aüa", "aua", Levenshtein())
|
||||
@test compare("ok", missing, Levenshtein()) === missing
|
||||
compare("aüa", "aua", OptimalStringAlignement())
|
||||
@test StringDistances.normalize(Partial(OptimalStringAlignement()))("ab", "cde") == 1.0
|
||||
@test StringDistances.Normalized(Partial(OptimalStringAlignement()))("ab", "cde") == 1.0
|
||||
@test compare("ab", "de", Partial(OptimalStringAlignement())) == 0
|
||||
|
||||
# RatcliffObershelp
|
||||
|
|
Loading…
Reference in New Issue