rmv max_dist as internal field

pull/57/head
matthieugomez 2021-09-13 09:14:02 -04:00
parent d0ac1b48e9
commit cf1d578bf6
7 changed files with 256 additions and 294 deletions

View File

@ -4,6 +4,8 @@ using Distances
import StatsAPI: pairwise, pairwise! import StatsAPI: pairwise, pairwise!
abstract type StringSemiMetric <: SemiMetric end abstract type StringSemiMetric <: SemiMetric end
abstract type StringMetric <: Metric end abstract type StringMetric <: Metric end
# Alias for the union of the two abstract string-distance supertypes.
const StringDistance = Union{StringSemiMetric, StringMetric}
# Generic call that accepts a `max_dist` keyword: the keyword is dropped and the
# call is forwarded to `dist(s1, s2)`.
# NOTE(review): keyword arguments do not participate in dispatch, so `dist(s1, s2)`
# re-dispatches on the same positional types — confirm every concrete distance has
# its own, more specific call method so this cannot recurse into itself.
(dist::StringDistance)(s1, s2; max_dist = nothing) = dist(s1, s2)
function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::Type, s2::Type) function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::Type, s2::Type)
T = typeof(dist("", "")) T = typeof(dist("", ""))
if (Missing <: s1) | (Missing <: s2) if (Missing <: s1) | (Missing <: s2)
@ -17,10 +19,8 @@ Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1, s2) = res
include("distances/utils.jl") include("distances/utils.jl")
include("distances/edit.jl") include("distances/edit.jl")
include("distances/qgram.jl") include("distances/qgram.jl")
include("normalize.jl")
include("pairwise.jl") include("pairwise.jl")
include("normalize.jl")
include("find.jl") include("find.jl")
include("fuzzywuzzy.jl") include("fuzzywuzzy.jl")
@ -32,7 +32,8 @@ include("fuzzywuzzy.jl")
## ##
############################################################################## ##############################################################################
export export
StringDistance,
StringSemiMetric, StringSemiMetric,
StringMetric, StringMetric,
# edit distances # edit distances

View File

@ -5,18 +5,15 @@ Creates the Hamming distance
The Hamming distance is defined as the number of characters that do not match The Hamming distance is defined as the number of characters that do not match
""" """
struct Hamming{V <: Union{Int, Nothing}} <: StringMetric struct Hamming <: StringMetric end
max_dist::V
end
Hamming() = Hamming(nothing)
function (dist::Hamming{T})(s1, s2) where {T} function (dist::Hamming)(s1, s2; max_dist::Union{Integer, Nothing} = nothing)
(s1 === missing) | (s2 === missing) && return missing (s1 === missing) | (s2 === missing) && return missing
out = abs(length(s2) - length(s1)) out = abs(length(s2) - length(s1))
for (ch1, ch2) in zip(s1, s2) for (ch1, ch2) in zip(s1, s2)
out += ch1 != ch2 out += ch1 != ch2
if T <: Int if max_dist !== nothing
out > dist.max_dist && return dist.max_dist + 1 out > max_dist && return Int(max_dist + 1)
end end
end end
return out return out
@ -118,22 +115,20 @@ Creates the Levenshtein distance
The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions,
substitutions of a single character) required to change one string into the other. substitutions of a single character) required to change one string into the other.
""" """
struct Levenshtein{V <: Union{Int, Nothing}} <: StringMetric struct Levenshtein <: StringMetric end
max_dist::V
end
Levenshtein() = Levenshtein(nothing)
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
# Return max_dist + 1 if distance higher than max_dist # Return max_dist + 1 if distance higher than max_dist
# to differentiate distance equal to max_dist or not, which is important for find functions. # to differentiate distance equal to max_dist or not, which is important for find functions.
function (dist::Levenshtein{T})(s1, s2) where {T} function (dist::Levenshtein)(s1, s2; max_dist::Union{Integer, Nothing} = nothing)
(s1 === missing) | (s2 === missing) && return missing (s1 === missing) | (s2 === missing) && return missing
len1, len2 = length(s1), length(s2) len1, len2 = length(s1), length(s2)
if len1 > len2 if len1 > len2
s1, s2 = s2, s1 s1, s2 = s2, s1
len1, len2 = len2, len1 len1, len2 = len2, len1
end end
if T <: Int if max_dist !== nothing
len2 - len1 > dist.max_dist && return dist.max_dist + 1 len2 - len1 > max_dist && return Int(max_dist + 1)
end end
# prefix common to both strings can be ignored # prefix common to both strings can be ignored
k = common_prefix(s1, s2) k = common_prefix(s1, s2)
@ -144,7 +139,7 @@ function (dist::Levenshtein{T})(s1, s2) where {T}
for (i1, ch1) in enumerate(s1) for (i1, ch1) in enumerate(s1)
i1 > k || continue i1 > k || continue
left = current = i1 - k - 1 left = current = i1 - k - 1
if T <: Int if max_dist !== nothing
value_lb = left - 1 value_lb = left - 1
end end
for (i2, ch2) in enumerate(s2) for (i2, ch2) in enumerate(s2)
@ -153,17 +148,17 @@ function (dist::Levenshtein{T})(s1, s2) where {T}
if ch1 != ch2 if ch1 != ch2
current = min(current, above, left) + 1 current = min(current, above, left) + 1
end end
if T <: Int if max_dist !== nothing
value_lb = min(value_lb, left) value_lb = min(value_lb, left)
end end
@inbounds v[i2 - k] = current @inbounds v[i2 - k] = current
end end
if T <: Int if max_dist !== nothing
value_lb > dist.max_dist && return dist.max_dist + 1 value_lb > max_dist && return Int(max_dist + 1)
end end
end end
if T <: Int if max_dist !== nothing
current > dist.max_dist && return dist.max_dist + 1 current > max_dist && return Int(max_dist + 1 )
end end
return current return current
end end
@ -183,22 +178,19 @@ end
uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy
the triangle inequality. the triangle inequality.
""" """
struct OptimalStringAlignement{V <: Union{Int, Nothing}} <: StringSemiMetric struct OptimalStringAlignement <: StringSemiMetric end
max_dist::V
end
OptimalStringAlignement() = OptimalStringAlignement(nothing)
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
# Return max_dist + 1 if distance higher than max_dist # Return max_dist + 1 if distance higher than max_dist
function (dist::OptimalStringAlignement{T})(s1, s2) where {T} function (dist::OptimalStringAlignement)(s1, s2; max_dist::Union{Integer, Nothing} = nothing)
(s1 === missing) | (s2 === missing) && return missing (s1 === missing) | (s2 === missing) && return missing
len1, len2 = length(s1), length(s2) len1, len2 = length(s1), length(s2)
if len1 > len2 if len1 > len2
s1, s2 = s2, s1 s1, s2 = s2, s1
len1, len2 = len2, len1 len1, len2 = len2, len1
end end
if T <: Int if max_dist !== nothing
len2 - len1 > dist.max_dist && return dist.max_dist + 1 len2 - len1 > max_dist && return Int(max_dist + 1)
end end
k = common_prefix(s1, s2) k = common_prefix(s1, s2)
k == len1 && return len2 - k k == len1 && return len2 - k
@ -206,28 +198,25 @@ function (dist::OptimalStringAlignement{T})(s1, s2) where {T}
w = similar(v) w = similar(v)
prevch1, prevch2 = first(s1), first(s2) prevch1, prevch2 = first(s1), first(s2)
current = 0 current = 0
if T <: Int if max_dist !== nothing
i2_start = 0 i2_start = 0
i2_end = dist.max_dist i2_end = max_dist
end end
for (i1, ch1) in enumerate(s1) for (i1, ch1) in enumerate(s1)
i1 > k || (prevch1 = ch1 ; continue) i1 > k || (prevch1 = ch1 ; continue)
left = i1 - k - 1 left = i1 - k - 1
current = left + 1 current = left + 1
nextTransCost = 0 nextTransCost = 0
if T <: Int if max_dist !== nothing
i2_start += i1 - k - 1 + len2 - len1 > dist.max_dist i2_start += i1 - k - 1 + len2 - len1 > max_dist
i2_end += i2_end < len2 i2_end += i2_end < len2
end end
for (i2, ch2) in enumerate(s2) for (i2, ch2) in enumerate(s2)
i2 > k || (prevch2 = ch2 ; continue) i2 > k || (prevch2 = ch2 ; continue)
# no need to look beyond window of lower right diagonal - max distance cells # no need to look beyond window of lower right diagonal - max distance cells
# lower right diag is i1 - (len2 - len1)) and the upper left diagonal + dist.max_dist cells (upper left is i1) # lower right diag is i1 - (len2 - len1)) and the upper left diagonal + max_dist cells (upper left is i1)
if T <: Int if max_dist !== nothing
if !(i2_start <= i2 - k - 1 < i2_end) (i2_start <= i2 - k - 1 < i2_end) || (prevch2 = ch2 ; continue)
prevch2 = ch2
continue
end
end end
@inbounds above, current, left = current, left, v[i2 - k] @inbounds above, current, left = current, left, v[i2 - k]
@inbounds w[i2 - k], nextTransCost, thisTransCost = current, w[i2 - k], nextTransCost @inbounds w[i2 - k], nextTransCost, thisTransCost = current, w[i2 - k], nextTransCost
@ -241,13 +230,13 @@ function (dist::OptimalStringAlignement{T})(s1, s2) where {T}
@inbounds v[i2 - k] = current @inbounds v[i2 - k] = current
prevch2 = ch2 prevch2 = ch2
end end
if T <: Int if max_dist !== nothing
v[i1 - k + len2 - len1] > dist.max_dist && return dist.max_dist + 1 v[i1 - k + len2 - len1] > max_dist && return Int(max_dist + 1)
end end
prevch1 = ch1 prevch1 = ch1
end end
if T <: Int if max_dist !== nothing
current > dist.max_dist && return dist.max_dist + 1 current > max_dist && return Int(max_dist + 1)
end end
return current return current
end end

View File

@ -1,3 +1,19 @@
"""
    compare(s1, s2, dist; min_score = 0.0)

Return a similarity score between 0 and 1 for the strings `s1` and `s2`,
computed as one minus the normalized version of the distance `dist`.

`min_score` is passed on to the normalized distance as `max_dist = 1 - min_score`.

### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
    normalized = Normalized(dist)
    # A similarity threshold maps onto a distance cutoff of 1 - min_score.
    distance = normalized(s1, s2; max_dist = 1 - min_score)
    return 1 - distance
end
""" """
findnearest(s, itr, dist::Union{StringMetric, StringSemiMetric}) -> (x, index) findnearest(s, itr, dist::Union{StringMetric, StringSemiMetric}) -> (x, index)
@ -18,7 +34,7 @@ julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
(nothing, nothing) (nothing, nothing)
``` ```
""" """
function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0) function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
min_score_atomic = Threads.Atomic{Float64}(min_score) min_score_atomic = Threads.Atomic{Float64}(min_score)
scores = [0.0 for _ in 1:Threads.nthreads()] scores = [0.0 for _ in 1:Threads.nthreads()]
is = [0 for _ in 1:Threads.nthreads()] is = [0 for _ in 1:Threads.nthreads()]
@ -37,15 +53,15 @@ function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_sc
end end
_preprocess(dist::AbstractQGramDistance, ::Missing) = missing _preprocess(dist::AbstractQGramDistance, ::Missing) = missing
_preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q) _preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
_preprocess(dist::Union{StringSemiMetric, StringMetric}, s) = s _preprocess(dist::StringDistance, s) = s
function Base.findmax(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0) function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)" @warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
findnearest(s, itr, dist; min_score = min_score) findnearest(s, itr, dist; min_score = min_score)
end end
""" """
findall(s, itr , dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8) findall(s, itr , dist::StringDistance; min_score = 0.8)
`findall` returns the vector of indices for elements of `itr` that have a `findall` returns the vector of indices for elements of `itr` that have a
similarity score greater than or equal to `min_score` according to the distance `dist`. similarity score greater than or equal to `min_score` according to the distance `dist`.
@ -66,7 +82,7 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9)
0-element Array{Int64,1} 0-element Array{Int64,1}
``` ```
""" """
function Base.findall(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8) function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
out = [Int[] for _ in 1:Threads.nthreads()] out = [Int[] for _ in 1:Threads.nthreads()]
s = _preprocess(dist, s) s = _preprocess(dist, s)
# need collect since @threads requires a length method # need collect since @threads requires a length method

View File

@ -19,37 +19,37 @@ struct Partial{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::S dist::S
end end
function (dist::Partial)(s1, s2) function (dist::Partial)(s1, s2; max_dist = nothing)
(s1 === missing) | (s2 === missing) && return missing (s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2) s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2) len1, len2 = length(s1), length(s2)
out = dist.dist(s1, s2) out = dist.dist(s1, s2; max_dist = max_dist)
max_dist0 = (max_dist !== nothing) ? min(max_dist, out) : out
((len1 == 0) | (len1 == len2)) && return out ((len1 == 0) | (len1 == len2)) && return out
for x in qgrams(s2, len1) for x in qgrams(s2, len1)
curr = dist.dist(s1, x) curr = dist.dist(s1, x; max_dist = max_dist0)
out = min(out, curr) out = min(out, curr)
max_dist0 = min(max_dist0, curr)
end end
return out return out
end end
function (dist::Partial{RatcliffObershelp})(s1, s2) function (dist::Partial{T})(s1, s2; max_dist = nothing) where {T <: Union{RatcliffObershelp, Normalized{RatcliffObershelp}}}
(s1 === missing) | (s2 === missing) && return missing (s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2) len1, len2 = length(s1), length(s2)
len1 == len2 && return dist.dist(s1, s2) len1 == len2 && return dist.dist(s1, s2)
out = 1.0 out = 1.0
for r in matching_blocks(s1, s2, 1, 1, len1, len2) for r in matching_blocks(s1, s2, 1, 1, len1, len2)
# Make sure the substring of s2 has length len1 # Make sure the substring of s2 has length len1
s2_start = r[2] - r[1] + 1 s2_start = r[2] - r[1] + 1
s2_end = s2_start + len1 - 1
if s2_start < 1 if s2_start < 1
s2_end += 1 - s2_start s2_start = 1
s2_start += 1 - s2_start elseif s2_start + len1 - 1 > len2
elseif s2_end > len2 s2_start += len2 - (s2_start + len1 - 1)
s2_start += len2 - s2_end
s2_end += len2 - s2_end
end end
n_matched = length_matching_blocks(s1, s2, 1, s2_start, len1, s2_end) n_matched = length_matching_blocks(s1, s2, 1, s2_start, len1, s2_start + len1 - 1)
curr = 1 - 2 * n_matched / (len1 + s2_end - s2_start + 1) curr = 1 - 2 * n_matched / (len1 + len1)
out = min(out, curr) out = min(out, curr)
end end
return out return out
@ -74,10 +74,6 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, p::Vector{Int}, s1, s2,
return x return x
end end
function normalize(dist::Partial; max_dist = 1.0)
Partial(normalize(dist.dist; max_dist = max_dist))
end
""" """
TokenSort(dist) TokenSort(dist)
@ -101,15 +97,10 @@ struct TokenSort{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::S dist::S
end end
function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing)
(s1 === missing) | (s2 === missing) && return missing (s1 === missing) | (s2 === missing) && return missing
s1 = join(sort!(split(s1)), " ") f = s -> join(sort!(split(s)), " ")
s2 = join(sort!(split(s2)), " ") dist.dist(f(s1), f(s2); max_dist = max_dist)
out = dist.dist(s1, s2)
end
function normalize(dist::TokenSort; max_dist = 1.0)
TokenSort(normalize(dist.dist; max_dist = max_dist))
end end
""" """
@ -135,7 +126,7 @@ struct TokenSet{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::S dist::S
end end
function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing)
(s1 === missing) | (s2 === missing) && return missing (s1 === missing) | (s2 === missing) && return missing
v1 = unique!(sort!(split(s1))) v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2))) v2 = unique!(sort!(split(s2)))
@ -143,18 +134,13 @@ function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{Abstract
s0 = join(v0, " ") s0 = join(v0, " ")
s1 = join(v1, " ") s1 = join(v1, " ")
s2 = join(v2, " ") s2 = join(v2, " ")
isempty(s0) && return dist.dist(s1, s2) isempty(s0) && return dist.dist(s1, s2; max_dist = max_dist)
score_01 = dist.dist(s0, s1) out_01 = dist.dist(s0, s1; max_dist = max_dist)
score_02 = dist.dist(s0, s2) out_02 = dist.dist(s0, s2; max_dist = max_dist)
score_12 = dist.dist(s1, s2) out_12 = dist.dist(s1, s2; max_dist = max_dist)
min(score_01, score_02, score_12) min(out_01, out_02, out_12)
end end
function normalize(dist::TokenSet; max_dist = 1.0)
TokenSet(normalize(dist.dist; max_dist = max_dist))
end
""" """
TokenMax(dist) TokenMax(dist)
@ -173,44 +159,34 @@ julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2)
0.05 0.05
``` ```
""" """
struct TokenMax{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric struct TokenMax{S <: Normalized} <: StringSemiMetric
dist::S dist::S
max_dist::Float64
end end
TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist) TokenMax(dist::Normalized) = TokenMax{typeof(dist)}(dist)
TokenMax(dist::Union{StringSemiMetric, StringMetric}) = TokenMax(Normalized(dist))
function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = 1.0)
(s1 === missing) | (s2 === missing) && return missing (s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2) s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2) len1, len2 = length(s1), length(s2)
max_dist = dist.max_dist dist0 = dist.dist
dist0 = normalize(dist.dist; max_dist = max_dist) out = dist0(s1, s2; max_dist = max_dist)
score = dist0(s1, s2) max_dist = min(max_dist, out)
min_score = min(max_dist, score) scale = 0.95
unbase_scale = 0.95
# if one string is much shorter than the other, use partial # if one string is much shorter than the other, use partial
if length(s2) >= 1.5 * length(s1) if len2 >= 1.5 * len1
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9 dist0 = Partial(dist0)
dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / partial_scale) pscale = 0.9
score_partial = 1 - partial_scale * (1 - Partial(dist0)(s1, s2)) pout = 1 - pscale * (1 - dist0(s1, s2; max_dist = 1 - (1 - max_dist) / pscale))
min_score = min(max_dist, score_partial) out = min(out, pout)
dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / (unbase_scale * partial_scale)) max_dist = min(max_dist, pout)
score_sort = 1 - unbase_scale * partial_scale * (1 - TokenSort(Partial(dist0))(s1, s2)) scale *= pscale
max_dist = min(max_dist, score_sort)
dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / (unbase_scale * partial_scale))
score_set = 1 - unbase_scale * partial_scale * (1 - TokenSet(Partial(dist0))(s1, s2))
out = min(score, score_partial, score_sort, score_set)
else
dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / unbase_scale)
score_sort = 1 - unbase_scale * (1 - TokenSort(dist0)(s1, s2))
max_dist = min(max_dist, score_sort)
dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / unbase_scale)
score_set = 1 - unbase_scale * (1 - TokenSet(dist0)(s1, s2))
out = min(score, score_sort, score_set)
end end
out_sort = 1 - scale * (1 - TokenSort(dist0)(s1, s2; max_dist = 1 - (1 - max_dist) / scale))
max_dist = min(max_dist, out_sort)
out_set = 1 - scale * (1 - TokenSet(dist0)(s1, s2; max_dist = 1 - (1 - max_dist) / scale))
out = min(out, out_sort, out_set)
out > max_dist ? 1.0 : out out > max_dist ? 1.0 : out
end end
function normalize(dist::TokenMax; max_dist = 1.0) Normalized(dist::TokenMax) = TokenMax(dist.dist)
TokenMax(dist.dist, max_dist)
end

View File

@ -1,35 +1,52 @@
struct Normalized{V <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric """
dist::V Normalized(dist)
max_dist::Float64
end
function (dist::Normalized{<: Union{Jaro, JaroWinkler, RatcliffObershelp}})(s1, s2) Creates a normalized distance. The distance always returns a Float64 between 0.0 and 1.0 (or `missing` if one of the arguments is missing)
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> Levenshtein()(s1, s2)
25
julia> StringDistances.Normalized(Levenshtein())(s1, s2)
0.8064
```
"""
struct Normalized{T <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::T
end
Normalized(dist::Normalized) = dist
function (dist::Normalized)(s1, s2; max_dist = 1.0)
out = dist.dist(s1, s2) out = dist.dist(s1, s2)
out > dist.max_dist ? 1.0 : out max_dist !== nothing && out > max_dist && return 1.0
return out
end end
function (dist::Normalized{<:Union{Hamming, DamerauLevenshtein}})(s1, s2) function (dist::Normalized{<:Union{Hamming, DamerauLevenshtein}})(s1, s2; max_dist = 1.0)
(s1 === missing) | (s2 === missing) && return missing (s1 === missing) | (s2 === missing) && return missing
isempty(s1) && isempty(s2) && return 0.0 isempty(s1) && isempty(s2) && return 0.0
out = dist.dist(s1, s2) / length(s2) out = dist.dist(s1, s2) / length(s2)
out > dist.max_dist ? 1.0 : out max_dist !== nothing && out > max_dist && return 1.0
return out
end end
function (dist::Normalized{<:Union{Levenshtein{Nothing}, OptimalStringAlignement{Nothing}}})(s1, s2) function (dist::Normalized{<:Union{Levenshtein, OptimalStringAlignement}})(s1, s2; max_dist = 1.0)
(s1 === missing) | (s2 === missing) && return missing (s1 === missing) | (s2 === missing) && return missing
isempty(s1) && isempty(s2) && return 0.0 isempty(s1) && isempty(s2) && return 0.0
s1, s2 = reorder(s1, s2) s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2) len1, len2 = length(s1), length(s2)
if dist.dist isa Levenshtein if max_dist === nothing || max_dist == 1.0
d = Levenshtein(ceil(Int, len2 * dist.max_dist))(s1, s2) d = dist.dist(s1, s2)
else else
d = OptimalStringAlignement(ceil(Int, len2 * dist.max_dist))(s1, s2) d = dist.dist(s1, s2; max_dist = ceil(Int, len2 * max_dist))
end end
out = d / len2 out = d / len2
out > dist.max_dist ? 1.0 : out max_dist !== nothing && out > max_dist && return 1.0
return out
end end
function (dist::Normalized{<:AbstractQGramDistance})(s1, s2) function (dist::Normalized{<:AbstractQGramDistance})(s1, s2; max_dist = 1.0)
(s1 === missing) | (s2 === missing) && return missing (s1 === missing) | (s2 === missing) && return missing
# When string length < q for qgram distance, returns s1 == s2 # When string length < q for qgram distance, returns s1 == s2
s1, s2 = reorder(s1, s2) s1, s2 = reorder(s1, s2)
@ -40,41 +57,6 @@ function (dist::Normalized{<:AbstractQGramDistance})(s1, s2)
else else
out = dist.dist(s1, s2) out = dist.dist(s1, s2)
end end
out > dist.max_dist ? 1.0 : out max_dist !== nothing && out > max_dist && return 1.0
return out
end end
"""
normalize(dist)
Creates a normalized distance. The distance always returns a Float64 between 0.0 and 1.0 (or `missing` if one of the arguments is missing)
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> Levenshtein()(s1, s2)
25
julia> StringDistances.normalize(Levenshtein())(s1, s2)
0.8064
```
"""
normalize(dist::Union{StringSemiMetric, StringMetric}; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
"""
compare(s1, s2, dist)
return a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the distance `dist`.
### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
end

View File

@ -2,161 +2,162 @@ using StringDistances, Unicode, Test, Random
@testset "Distances" begin @testset "Distances" begin
@testset "Hamming" begin @testset "Hamming" begin
@test evaluate(Hamming(), "martha", "marhta") 2 @test Hamming()("martha", "marhta") 2
@test evaluate(Hamming(), "es an ", " vs an") 6 @test Hamming()("es an ", " vs an") 6
@test evaluate(Hamming(), [1, 2, 3], [1,2, 4]) 1 @test Hamming()([1, 2, 3], [1,2, 4]) 1
@inferred evaluate(Hamming(), "", "") @inferred Hamming()("", "")
@test ismissing(evaluate(Hamming(), "", missing)) @test ismissing(Hamming()("", missing))
end end
@testset "Jaro" begin @testset "Jaro" begin
@test evaluate(Jaro(), "martha", "marhta") 0.05555555555555547 @test Jaro()("martha", "marhta") 0.05555555555555547
@test evaluate(Jaro(), "es an ", " vs an") 0.2777777777777777 @test Jaro()("es an ", " vs an") 0.2777777777777777
@test evaluate(Jaro(), " vs an", "es an ") 0.2777777777777777 @test Jaro()(" vs an", "es an ") 0.2777777777777777
@test evaluate(Jaro(), [1, 2, 3], [1,2, 4]) 0.2222222222222222 @test Jaro()([1, 2, 3], [1,2, 4]) 0.2222222222222222
@test evaluate(Jaro(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Jaro(), "alborgów", "amoniak") @test Jaro()(graphemes("alborgów"), graphemes("amoniak")) == Jaro()("alborgów", "amoniak")
@test Jaro()(" vs an", "es an ") 0.2777777777777777 @test Jaro()(" vs an", "es an ") 0.2777777777777777
@test result_type(Jaro(), "hello", "world") == typeof(float(1)) @test result_type(Jaro(), "hello", "world") == typeof(float(1))
@inferred evaluate(Jaro(), "", "") @inferred Jaro()("", "")
@test ismissing(evaluate(Jaro(), "", missing)) @test ismissing(Jaro()("", missing))
end end
@testset "Levenshtein" begin @testset "Levenshtein" begin
@test evaluate(Levenshtein(), "", "") == 0 @test Levenshtein()("", "") == 0
@test evaluate(Levenshtein(), "abc", "") == 3 @test Levenshtein()("abc", "") == 3
@test evaluate(Levenshtein(), "", "abc") == 3 @test Levenshtein()("", "abc") == 3
@test evaluate(Levenshtein(), "bc", "abc") == 1 @test Levenshtein()("bc", "abc") == 1
@test evaluate(Levenshtein(), "kitten", "sitting") == 3 @test Levenshtein()("kitten", "sitting") == 3
@test evaluate(Levenshtein(), "saturday", "sunday") == 3 @test Levenshtein()("saturday", "sunday") == 3
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4 @test Levenshtein()("hi, my name is", "my name is") == 4
@test evaluate(Levenshtein(), "a cat", "an act") == 3 @test Levenshtein()("a cat", "an act") == 3
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6 @test Levenshtein()("alborgów", "amoniak") == 6
@test evaluate(Levenshtein(), [1, 2, 3], [1, 2, 4]) == 1 @test Levenshtein()([1, 2, 3], [1, 2, 4]) == 1
@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak") @test Levenshtein()(graphemes("alborgów"), graphemes("amoniak")) == Levenshtein()("alborgów", "amoniak")
@test Levenshtein()("", "abc") == 3 @test Levenshtein()("", "abc") == 3
@test result_type(Levenshtein(), "hello", "world") == Int @test result_type(Levenshtein(), "hello", "world") == Int
@inferred evaluate(Levenshtein(), "", "") @inferred Levenshtein()("", "")
@test ismissing(evaluate(Levenshtein(), "", missing)) @test ismissing(Levenshtein()("", missing))
end end
@testset "OptimalStringAlignement" begin @testset "OptimalStringAlignement" begin
@test evaluate(OptimalStringAlignement(), "", "") == 0 @test OptimalStringAlignement()("", "") == 0
@test evaluate(OptimalStringAlignement(), "abc", "") == 3 @test OptimalStringAlignement()("abc", "") == 3
@test evaluate(OptimalStringAlignement(), "bc", "abc") == 1 @test OptimalStringAlignement()("bc", "abc") == 1
@test evaluate(OptimalStringAlignement(), "fuor", "four") == 1 @test OptimalStringAlignement()("fuor", "four") == 1
@test evaluate(OptimalStringAlignement(), "abcd", "acb") == 2 @test OptimalStringAlignement()("abcd", "acb") == 2
@test evaluate(OptimalStringAlignement(), "cape sand recycling ", "edith ann graham") == 17 @test OptimalStringAlignement()("cape sand recycling ", "edith ann graham") == 17
@test evaluate(OptimalStringAlignement(), "jellyifhs", "jellyfish") == 2 @test OptimalStringAlignement()("jellyifhs", "jellyfish") == 2
@test evaluate(OptimalStringAlignement(), "ifhs", "fish") == 2 @test OptimalStringAlignement()("ifhs", "fish") == 2
@test evaluate(OptimalStringAlignement(), "a cat", "an act") == 2 @test OptimalStringAlignement()("a cat", "an act") == 2
@test evaluate(OptimalStringAlignement(), "a cat", "an abct") == 4 @test OptimalStringAlignement()("a cat", "an abct") == 4
@test evaluate(OptimalStringAlignement(), "a cat", "a tc") == 3 @test OptimalStringAlignement()("a cat", "a tc") == 3
@test OptimalStringAlignement(2)("abcdef", "abcxyf") == 2 @test OptimalStringAlignement()("abcdef", "abcxyf") == 2
@test OptimalStringAlignement()("abcdef", "abcxyf"; max_dist = 2) == 2
@test evaluate(OptimalStringAlignement(), [1, 2, 3], [1,2, 4]) == 1 @test OptimalStringAlignement()([1, 2, 3], [1,2, 4]) == 1
@test evaluate(OptimalStringAlignement(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(OptimalStringAlignement(), "alborgów", "amoniak") @test OptimalStringAlignement()(graphemes("alborgów"), graphemes("amoniak")) == OptimalStringAlignement()("alborgów", "amoniak")
@test OptimalStringAlignement()("bc", "abc") == 1 @test OptimalStringAlignement()("bc", "abc") == 1
@test result_type(OptimalStringAlignement(), "hello", "world") == Int @test result_type(OptimalStringAlignement(), "hello", "world") == Int
@inferred evaluate(OptimalStringAlignement(), "", "") @inferred OptimalStringAlignement()("", "")
@test ismissing(evaluate(OptimalStringAlignement(), "", missing)) @test ismissing(OptimalStringAlignement()("", missing))
end end
@testset "DamerauLevenshtein" begin @testset "DamerauLevenshtein" begin
@test evaluate(DamerauLevenshtein(), "", "") == 0 @test DamerauLevenshtein()("", "") == 0
@test evaluate(DamerauLevenshtein(), "CA", "ABC") == 2 @test DamerauLevenshtein()("CA", "ABC") == 2
@test evaluate(DamerauLevenshtein(), "ABCDEF", "ABDCEF") == 1 @test DamerauLevenshtein()("ABCDEF", "ABDCEF") == 1
@test evaluate(DamerauLevenshtein(), "ABCDEF", "BACDFE") == 2 @test DamerauLevenshtein()("ABCDEF", "BACDFE") == 2
@test evaluate(DamerauLevenshtein(), "ABCDEF", "ABCDE") == 1 @test DamerauLevenshtein()("ABCDEF", "ABCDE") == 1
@test evaluate(DamerauLevenshtein(), "a cat", "an act") == 2 @test DamerauLevenshtein()("a cat", "an act") == 2
@test evaluate(DamerauLevenshtein(), "a cat", "an abct") == 3 @test DamerauLevenshtein()("a cat", "an abct") == 3
@test evaluate(DamerauLevenshtein(), "a cat", "a tc") == 2 @test DamerauLevenshtein()("a cat", "a tc") == 2
@test result_type(DamerauLevenshtein(), "hello", "world") == Int @test result_type(DamerauLevenshtein(), "hello", "world") == Int
@inferred evaluate(DamerauLevenshtein(), "", "") @inferred DamerauLevenshtein()("", "")
@test ismissing(evaluate(DamerauLevenshtein(), "", missing)) @test ismissing(DamerauLevenshtein()("", missing))
end end
@testset "RatcliffObershelp" begin @testset "RatcliffObershelp" begin
@test evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154 @test RatcliffObershelp()("dixon", "dicksonx") 1 - 0.6153846153846154
@test evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579 @test RatcliffObershelp()("alexandre", "aleksander") 1 - 0.7368421052631579
@test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666 @test RatcliffObershelp()("pennsylvania", "pencilvaneya") 1 - 0.6666666666666
@test evaluate(RatcliffObershelp(), "", "pencilvaneya") 1.0 @test RatcliffObershelp()("", "pencilvaneya") 1.0
@test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963 @test RatcliffObershelp()("NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963
@test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") 0.3913043478260869 @test RatcliffObershelp()("Yankees", "New York Yankees") 0.3913043478260869
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") 0.24137931034482762 @test RatcliffObershelp()("New York Mets", "New York Yankees") 0.24137931034482762
@test evaluate(RatcliffObershelp(), [1, 2, 3], [1,2, 4]) 1/3 @test RatcliffObershelp()([1, 2, 3], [1,2, 4]) 1/3
@test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak") @test RatcliffObershelp()(graphemes("alborgów"), graphemes("amoniak")) == RatcliffObershelp()("alborgów", "amoniak")
@test RatcliffObershelp()("pennsylvania", "pencilvaneya") 1 - 0.6666666666666 @test RatcliffObershelp()("pennsylvania", "pencilvaneya") 1 - 0.6666666666666
@test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1)) @test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
@inferred evaluate(RatcliffObershelp(), "", "") @inferred RatcliffObershelp()("", "")
@test ismissing(evaluate(RatcliffObershelp(), "", missing)) @test ismissing(RatcliffObershelp()("", missing))
end end
@testset "QGram" begin @testset "QGram" begin
@test evaluate(QGram(1), "abc", "abc") == 0 @test QGram(1)("abc", "abc") == 0
@test evaluate(QGram(1), "", "abc") == 3 @test QGram(1)("", "abc") == 3
@test evaluate(QGram(1), "abc", "cba") == 0 @test QGram(1)("abc", "cba") == 0
@test evaluate(QGram(1), "abc", "ccc") == 4 @test QGram(1)("abc", "ccc") == 4
@test evaluate(QGram(4), "aü☃", "aüaüafs") == 4 @test QGram(4)("aü☃", "aüaüafs") == 4
@test evaluate(QGram(2), SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2 @test QGram(2)(SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2
@test evaluate(QGram(2), graphemes("alborgów"), graphemes("amoniak")) evaluate(QGram(2), "alborgów", "amoniak") @test QGram(2)(graphemes("alborgów"), graphemes("amoniak")) QGram(2)("alborgów", "amoniak")
@test QGram(1)("abc", "cba") == 0 @test QGram(1)("abc", "cba") == 0
@test result_type(QGram(1), "hello", "world") == Int @test result_type(QGram(1), "hello", "world") == Int
@test ismissing(evaluate(QGram(1), "", missing)) @test ismissing(QGram(1)("", missing))
@inferred evaluate(QGram(1), "", "") @inferred QGram(1)("", "")
end end
@testset "Cosine" begin @testset "Cosine" begin
@test isnan(evaluate(Cosine(2), "", "abc")) @test isnan(Cosine(2)("", "abc"))
@test evaluate(Cosine(2), "abc", "ccc") 1 atol = 1e-4 @test Cosine(2)("abc", "ccc") 1 atol = 1e-4
@test evaluate(Cosine(2), "leia", "leela") 0.7113249 atol = 1e-4 @test Cosine(2)("leia", "leela") 0.7113249 atol = 1e-4
@test evaluate(Cosine(2), [1, 2, 3], [1, 2, 4]) 0.5 @test Cosine(2)([1, 2, 3], [1, 2, 4]) 0.5
@test evaluate(Cosine(2), graphemes("alborgów"), graphemes("amoniak")) evaluate(Cosine(2), "alborgów", "amoniak") @test Cosine(2)(graphemes("alborgów"), graphemes("amoniak")) Cosine(2)("alborgów", "amoniak")
@test Cosine(2)("leia", "leela") 0.7113249 atol = 1e-4 @test Cosine(2)("leia", "leela") 0.7113249 atol = 1e-4
@test result_type(Cosine(2), "hello", "world") == typeof(float(1)) @test result_type(Cosine(2), "hello", "world") == typeof(float(1))
@inferred evaluate(Cosine(2), "", "") @inferred Cosine(2)("", "")
@test ismissing(evaluate(Cosine(2), "", missing)) @test ismissing(Cosine(2)("", missing))
end end
@testset "Jaccard" begin @testset "Jaccard" begin
@test evaluate(Jaccard(1), "", "abc") 1.0 @test Jaccard(1)("", "abc") 1.0
@test evaluate(Jaccard(1), "abc", "ccc") 2/3 atol = 1e-4 @test Jaccard(1)("abc", "ccc") 2/3 atol = 1e-4
@test evaluate(Jaccard(2), "leia", "leela") 0.83333 atol = 1e-4 @test Jaccard(2)("leia", "leela") 0.83333 atol = 1e-4
@test evaluate(Jaccard(2), [1, 2, 3], [1, 2, 4]) 2/3 atol = 1e-4 @test Jaccard(2)([1, 2, 3], [1, 2, 4]) 2/3 atol = 1e-4
@test evaluate(Jaccard(2), graphemes("alborgów"), graphemes("amoniak")) evaluate(Jaccard(2), "alborgów", "amoniak") @test Jaccard(2)(graphemes("alborgów"), graphemes("amoniak")) Jaccard(2)("alborgów", "amoniak")
@test Jaccard(2)("leia", "leela") 0.83333 atol = 1e-4 @test Jaccard(2)("leia", "leela") 0.83333 atol = 1e-4
@test result_type(Jaccard(1), "hello", "world") == typeof(float(1)) @test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
@inferred evaluate(Jaccard(1), "", "") @inferred Jaccard(1)("", "")
@test ismissing(evaluate(Jaccard(1), "", missing)) @test ismissing(Jaccard(1)("", missing))
end end
@testset "SorensenDice" begin @testset "SorensenDice" begin
@test evaluate(SorensenDice(1), "night", "nacht") 0.4 atol = 1e-4 @test SorensenDice(1)("night", "nacht") 0.4 atol = 1e-4
@test evaluate(SorensenDice(2), "night", "nacht") 0.75 atol = 1e-4 @test SorensenDice(2)("night", "nacht") 0.75 atol = 1e-4
@test evaluate(SorensenDice(2), graphemes("alborgów"), graphemes("amoniak")) evaluate(SorensenDice(2), "alborgów", "amoniak") @test SorensenDice(2)(graphemes("alborgów"), graphemes("amoniak")) SorensenDice(2)("alborgów", "amoniak")
@test SorensenDice(2)("night", "nacht") 0.75 atol = 1e-4 @test SorensenDice(2)("night", "nacht") 0.75 atol = 1e-4
@test result_type(SorensenDice(1), "hello", "world") == typeof(float(1)) @test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
@inferred evaluate(SorensenDice(1), "", "") @inferred SorensenDice(1)("", "")
@test ismissing(evaluate(SorensenDice(1), "", missing)) @test ismissing(SorensenDice(1)("", missing))
end end
@testset "Overlap" begin @testset "Overlap" begin
@test evaluate(Overlap(1), "night", "nacht") 0.4 atol = 1e-4 @test Overlap(1)("night", "nacht") 0.4 atol = 1e-4
@test evaluate(Overlap(1), "context", "contact") .2 atol = 1e-4 @test Overlap(1)("context", "contact") .2 atol = 1e-4
@test Overlap(1)("context", "contact") .2 atol = 1e-4 @test Overlap(1)("context", "contact") .2 atol = 1e-4
@test result_type(Overlap(1), "hello", "world") == typeof(float(1)) @test result_type(Overlap(1), "hello", "world") == typeof(float(1))
@inferred evaluate(Overlap(1), "", "") @inferred Overlap(1)("", "")
@test ismissing(evaluate(Overlap(1), "", missing)) @test ismissing(Overlap(1)("", missing))
end end
@testset "MorisitaOverlap" begin @testset "MorisitaOverlap" begin
# overlap for 'n', 'h', and 't' and 5 q-grams per string: # overlap for 'n', 'h', and 't' and 5 q-grams per string:
@test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.4 # 1.0-((2*3)/(5*5/5 + 5*5/5)) @test MorisitaOverlap(1)("night", "nacht") == 0.4 # 1.0-((2*3)/(5*5/5 + 5*5/5))
# overlap for 'o', 'n', 2-overlap for 'c' and 't' and 7 unique q-grams in total so multiplicity vectors # overlap for 'o', 'n', 2-overlap for 'c' and 't' and 7 unique q-grams in total so multiplicity vectors
# ms1 = [1, 1, 1, 2, 1, 1, 0] # ms1 = [1, 1, 1, 2, 1, 1, 0]
# ms2 = [2, 1, 1, 2, 0, 0, 1] # ms2 = [2, 1, 1, 2, 0, 0, 1]
# sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7 # sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7
@test evaluate(MorisitaOverlap(1), "context", "contact") .2 atol = 1e-4 # 1.0-((2*8)/(9*7/7 + 11*7/7)) = 16/20 @test MorisitaOverlap(1)("context", "contact") .2 atol = 1e-4 # 1.0-((2*8)/(9*7/7 + 11*7/7)) = 16/20
@test MorisitaOverlap(1)("context", "contact") .2 atol = 1e-4 @test MorisitaOverlap(1)("context", "contact") .2 atol = 1e-4
# Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct" # Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct"
@ -166,17 +167,17 @@ using StringDistances, Unicode, Test, Random
@test MorisitaOverlap(2)("context", "contact") == 0.5 # 1.0-((2*3)/(6*6/6 + 6*6/6)) @test MorisitaOverlap(2)("context", "contact") == 0.5 # 1.0-((2*3)/(6*6/6 + 6*6/6))
@test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1)) @test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))
@inferred evaluate(MorisitaOverlap(1), "", "") @inferred MorisitaOverlap(1)("", "")
@test ismissing(evaluate(MorisitaOverlap(1), "", missing)) @test ismissing(MorisitaOverlap(1)("", missing))
end end
@testset "NMD" begin @testset "NMD" begin
# m(s1) = [1, 1, 1, 1, 1, 0, 0], m(s2) = [1, 0, 0, 1, 1, 1, 1] # m(s1) = [1, 1, 1, 1, 1, 0, 0], m(s2) = [1, 0, 0, 1, 1, 1, 1]
@test evaluate(NMD(1), "night", "nacht") == 0.4 # (7-5)/5 @test NMD(1)("night", "nacht") == 0.4 # (7-5)/5
# ms1 = [1, 1, 1, 2, 1, 1, 0] # ms1 = [1, 1, 1, 2, 1, 1, 0]
# ms2 = [2, 1, 1, 2, 0, 0, 1] # ms2 = [2, 1, 1, 2, 0, 0, 1]
@test evaluate(NMD(1), "context", "contact") 0.2857 atol = 1e-4 # ((2+1+1+2+1+1+1)-7)/(7) @test NMD(1)("context", "contact") 0.2857 atol = 1e-4 # ((2+1+1+2+1+1+1)-7)/(7)
@test NMD(1)("context", "contact") 0.2857 atol = 1e-4 @test NMD(1)("context", "contact") 0.2857 atol = 1e-4
# ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0] # ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
@ -184,8 +185,8 @@ using StringDistances, Unicode, Test, Random
@test NMD(2)("context", "contact") == 0.5 # ((1+1+1+1+1+1+1+1+1)-6)/6 @test NMD(2)("context", "contact") == 0.5 # ((1+1+1+1+1+1+1+1+1)-6)/6
@test result_type(NMD(1), "hello", "world") == typeof(float(1)) @test result_type(NMD(1), "hello", "world") == typeof(float(1))
@inferred evaluate(NMD(1), "", "") @inferred NMD(1)("", "")
@test ismissing(evaluate(NMD(1), "", missing)) @test ismissing(NMD(1)("", missing))
end end
@testset "QGramDict and QGramSortedVector counts qgrams" begin @testset "QGramDict and QGramSortedVector counts qgrams" begin
@ -236,37 +237,37 @@ using StringDistances, Unicode, Test, Random
for _ in 1:100 for _ in 1:100
qlen = rand(2:5) qlen = rand(2:5)
str1, str2 = partlyoverlappingstrings(6:100, Chars) str1, str2 = partlyoverlappingstrings(6:100, Chars)
d = Jaccard(qlen) dist = Jaccard(qlen)
qd1 = QGramDict(str1, qlen) qd1 = QGramDict(str1, qlen)
qd2 = QGramDict(str2, qlen) qd2 = QGramDict(str2, qlen)
@test evaluate(d, str1, str2) == evaluate(d, qd1, qd2) @test dist(str1, str2) == dist(qd1, qd2)
qd1b = QGramDict(graphemes(str1), qlen) qd1b = QGramDict(graphemes(str1), qlen)
qd2b = QGramDict(graphemes(str2), qlen) qd2b = QGramDict(graphemes(str2), qlen)
@test evaluate(d, str1, str2) == evaluate(d, qd1b, qd2b) @test dist(str1, str2) == dist(qd1b, qd2b)
qc1 = QGramSortedVector(str1, qlen) qc1 = QGramSortedVector(str1, qlen)
qc2 = QGramSortedVector(str2, qlen) qc2 = QGramSortedVector(str2, qlen)
@test evaluate(d, str1, str2) == evaluate(d, qc1, qc2) @test dist(str1, str2) == dist(qc1, qc2)
qc1b = QGramSortedVector(graphemes(str1), qlen) qc1b = QGramSortedVector(graphemes(str1), qlen)
qc2b = QGramSortedVector(graphemes(str2), qlen) qc2b = QGramSortedVector(graphemes(str2), qlen)
@test evaluate(d, str1, str2) == evaluate(d, qc1b, qc2b) @test dist(str1, str2) == dist(qc1b, qc2b)
end end
end end
@testset "QGram distance on short strings" begin @testset "QGram distance on short strings" begin
@test isnan(evaluate(Overlap(2), "1", "2")) @test isnan(Overlap(2)( "1", "2"))
@test isnan(evaluate(Jaccard(3), "s1", "s2")) @test isnan(Jaccard(3)("s1", "s2"))
@test isnan(evaluate(Cosine(5), "s1", "s2")) @test isnan(Cosine(5)( "s1", "s2"))
@test !isnan(evaluate(Overlap(2), "s1", "s2")) @test !isnan(Overlap(2)( "s1", "s2"))
@test !isnan(evaluate(Jaccard(3), "st1", "st2")) @test !isnan(Jaccard(3)("st1", "st2"))
@test !isnan(evaluate(Cosine(5), "stri1", "stri2")) @test !isnan(Cosine(5)( "stri1", "stri2"))
@test !isnan(evaluate(Jaccard(3), "st1", "str2")) @test !isnan(Jaccard(3)("st1", "str2"))
@test !isnan(evaluate(Jaccard(3), "str1", "st2")) @test !isnan(Jaccard(3)("str1", "st2"))
end end
@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin @testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
@ -279,13 +280,13 @@ using StringDistances, Unicode, Test, Random
# QGramDict gets same result as for standard string # QGramDict gets same result as for standard string
qd1 = QGramDict(str1, qlen) qd1 = QGramDict(str1, qlen)
qd2 = QGramDict(str2, qlen) qd2 = QGramDict(str2, qlen)
expected = evaluate(dist, str1, str2) expected = dist(str1, str2)
@test expected == evaluate(dist, qd1, qd2) @test expected == dist(qd1, qd2)
# QGramSortedVector gets same result as for standard string # QGramSortedVector gets same result as for standard string
qc1 = QGramSortedVector(str1, qlen) qc1 = QGramSortedVector(str1, qlen)
qc2 = QGramSortedVector(str2, qlen) qc2 = QGramSortedVector(str2, qlen)
@test expected == evaluate(dist, qc1, qc2) @test expected == dist(qc1, qc2)
end end
end end
end end
@ -319,33 +320,30 @@ using StringDistances, Unicode, Test, Random
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249])) (Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
# Test with R package StringDist # Test with R package StringDist
for x in solutions for x in solutions
t, solution = x dist, solution = x
for i in eachindex(solution) for i in eachindex(solution)
if isnan(evaluate(t, strings[i]...)) if isnan(dist(strings[i]...))
@test isnan(solution[i]) @test isnan(solution[i])
else else
@test evaluate(t, strings[i]...) solution[i] atol = 1e-4 @test dist(strings[i]...) solution[i] atol = 1e-4
end end
end end
end end
# test RatcliffObershelp # test RatcliffObershelp
solution = [83, 73, 62, 93, 0, 100, 0, 33, 62, 71, 83, 27, 33, 78, 50, 67] solution = [83, 73, 62, 93, 0, 100, 0, 33, 62, 71, 83, 27, 33, 78, 50, 67]
for i in eachindex(strings) for i in eachindex(strings)
@test round(Int, (1 - evaluate(RatcliffObershelp(), strings[i]...)) * 100) solution[i] atol = 1e-4 @test round(Int, (1 - RatcliffObershelp()(strings[i]...)) * 100) solution[i] atol = 1e-4
end end
# test max_dist # test max_dist
for i in eachindex(strings) for i in eachindex(strings)
d = Levenshtein()(strings[i]...) d = Levenshtein()(strings[i]...)
@test Levenshtein(d)(strings[i]...) == d @test Levenshtein()(strings[i]...; max_dist = d) == d
d = OptimalStringAlignement()(strings[i]...) d = OptimalStringAlignement()(strings[i]...)
@test OptimalStringAlignement(d)(strings[i]...) == d @test OptimalStringAlignement()(strings[i]...; max_dist = d) == d
end end
end end
d = OptimalStringAlignement()("abcdef", "abcxyf")
@test OptimalStringAlignement(d)("abcdef", "abcxyf") == d
#= R test #= R test

View File

@ -61,7 +61,7 @@ end
compare("aüa", "aua", Levenshtein()) compare("aüa", "aua", Levenshtein())
@test compare("ok", missing, Levenshtein()) === missing @test compare("ok", missing, Levenshtein()) === missing
compare("aüa", "aua", OptimalStringAlignement()) compare("aüa", "aua", OptimalStringAlignement())
@test StringDistances.normalize(Partial(OptimalStringAlignement()))("ab", "cde") == 1.0 @test StringDistances.Normalized(Partial(OptimalStringAlignement()))("ab", "cde") == 1.0
@test compare("ab", "de", Partial(OptimalStringAlignement())) == 0 @test compare("ab", "de", Partial(OptimalStringAlignement())) == 0
# RatcliffObershelp # RatcliffObershelp