diff --git a/Project.toml b/Project.toml index 02ca65b..5db4854 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,7 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" [compat] julia = "1" -Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8" +Distances = "0.8.1" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 2688665..03b5d22 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -8,8 +8,11 @@ include("qgram.jl") include("modifiers.jl") const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize} -Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", "")) -Distances.evaluate(dist::StringDistance, args...) = dist(args...) +Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", "")) + + + + include("find.jl") ############################################################################## diff --git a/src/edit.jl b/src/edit.jl index 7a35a5f..983b806 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -12,12 +12,11 @@ where ``m`` is the number of matching characters and ``t`` is half the number of transpositions. 
""" struct Jaro <: SemiMetric end -isnormalized(::Jaro) = true ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html ## accepts any iterator, including AbstractString -function (dist::Jaro)(s1, s2, max_dist = nothing) - (ismissing(s1) | ismissing(s2)) && return missing +function (dist::Jaro)(s1, s2) + ((s1 === missing) | (s2 === missing)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) # If both are empty, the formula in Wikipedia gives 0 @@ -25,6 +24,7 @@ function (dist::Jaro)(s1, s2, max_dist = nothing) len2 == 0 && return 0.0 maxdist = max(0, div(len2, 2) - 1) flag = fill(false, len2) + prevstate1 = firstindex(s1) ch1_match = Vector{eltype(s1)}(undef, len1) # m counts number matching characters m = 0 @@ -55,6 +55,7 @@ function (dist::Jaro)(s1, s2, max_dist = nothing) end x1 = iterate(s1, state1) i1 += 1 + prevstate1 = state1 end m == 0 && return 1.0 # t counts number of transpositions @@ -82,11 +83,11 @@ substitutions of a single character) required to change one string into the othe struct Levenshtein <: Metric end ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html -# Return max_dist +1 if distance higher than max_dist +# Return max_dist + 1 if distance higher than max_dist # This makes it possible to differentiate distance equalt to max_dist vs strictly higher # This is important for find_all function (dist::Levenshtein)(s1, s2, max_dist = nothing) - (ismissing(s1) | ismissing(s2)) && return missing + ((s1 === missing) | (s2 === missing)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 @@ -125,8 +126,6 @@ function (dist::Levenshtein)(s1, s2, max_dist = nothing) return current end - - """ DamerauLevenshtein() @@ -139,8 +138,9 @@ required to change one string into the other. 
struct DamerauLevenshtein <: SemiMetric end ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html +# Return max_dist + 1 if distance higher than max_dist function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing) - (ismissing(s1) | ismissing(s2)) && return missing + ((s1 === missing) | (s2 === missing)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 @@ -223,10 +223,8 @@ region on either side of the longest common subsequence. """ struct RatcliffObershelp <: SemiMetric end -isnormalized(::RatcliffObershelp) = true - -function (dist::RatcliffObershelp)(s1, s2, max_dist = nothing) - (ismissing(s1) | ismissing(s2)) && return missing +function (dist::RatcliffObershelp)(s1, s2) + ((s1 === missing) | (s2 === missing)) && return missing n_matched = sum(last.(matching_blocks(s1, s2))) len1, len2 = length(s1), length(s2) len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2) diff --git a/src/modifiers.jl b/src/modifiers.jl index 7a93b13..325881a 100755 --- a/src/modifiers.jl +++ b/src/modifiers.jl @@ -6,16 +6,12 @@ end Normalize a metric, so that `evaluate` always return a Float64 between 0 and 1 (or a `missing` if one element is missing) """ -function normalize(dist::SemiMetric) - isnormalized(dist) ? dist : Normalize{typeof(dist)}(dist) -end - -isnormalized(dist::SemiMetric) = false -isnormalized(dist::Normalize) = true +# Also, a normalized distance always accepts a third argument, max_dist. 
+normalize(dist::SemiMetric) = Normalize{typeof(dist)}(dist) function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0) - (ismissing(s1) | ismissing(s2)) && return missing + ((s1 === missing) | (s2 === missing)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len2 == 0 && return 1.0 @@ -25,7 +21,7 @@ function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, ma end function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0) - (ismissing(s1) | ismissing(s2)) && return missing + ((s1 === missing) | (s2 === missing)) && return missing # When string length < q for qgram distance, returns s1 == s2 s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) @@ -37,6 +33,9 @@ function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0) end end +function (dist::Normalize)(s1, s2, max_dist = 1.0) + dist.dist(s1, s2) +end """ Winkler(dist; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4) @@ -55,15 +54,15 @@ struct Winkler{S <: SemiMetric} <: SemiMetric maxlength::Integer # max length of common prefix. 
Default to 4 Winkler{S}(dist::S, p, threshold, maxlength) where {S <: SemiMetric} = new(dist, p, threshold, maxlength) end -isnormalized(dist::Winkler) = true function Winkler(dist::SemiMetric; p = 0.1, threshold = 0.7, maxlength = 4) p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one") - Winkler{typeof(normalize(dist))}(normalize(dist), 0.1, 0.7, 4) + Winkler{typeof(normalize(dist))}(normalize(dist), p, threshold, maxlength) end +normalize(dist::Winkler) = dist function (dist::Winkler)(s1, s2, max_dist = 1.0) - # cannot do min_score because of boosting threshold + # cannot do max_dist because of boosting threshold score = dist.dist(s1, s2) if score <= 1 - dist.threshold l = common_prefix(s1, s2)[1] @@ -94,13 +93,13 @@ struct Partial{S <: SemiMetric} <: SemiMetric Partial{S}(dist::S) where {S <: SemiMetric} = new(dist) end Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist)) -isnormalized(dist::Partial) = true +normalize(dist::Partial) = dist function (dist::Partial)(s1, s2, max_dist = 1.0) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len1 == len2 && return dist.dist(s1, s2, max_dist) - len1 == 0 && return 0.0 + len1 == 0 && return 1.0 out = 1.0 for x in qgrams(s2, len1) curr = dist.dist(s1, x, max_dist) @@ -110,7 +109,7 @@ function (dist::Partial)(s1, s2, max_dist = 1.0) return out end -function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0) +function (dist::Partial{Normalize{RatcliffObershelp}})(s1, s2, max_dist = 1.0) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len1 == len2 && return dist.dist(s1, s2) @@ -127,7 +126,6 @@ function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0) s2_end += len2 - s2_end end curr = dist.dist(s1, _slice(s2, s2_start - 1, s2_end)) - out = min(out, curr) end return out @@ -155,7 +153,7 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist) end TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist)) 
-isnormalized(dist::TokenSort) = true +normalize(dist::TokenSort) = dist # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ function (dist::TokenSort)(s1::AbstractString, s2::AbstractString, max_dist = 1.0) @@ -187,6 +185,6 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric end TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist)) -isnormalized(dist::TokenSet) = true +normalize(dist::TokenSet) = dist # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ function (dist::TokenSet)(s1::AbstractString, s2::AbstractString, max_dist = 1.0) @@ -229,7 +227,7 @@ struct TokenMax{S <: SemiMetric} <: SemiMetric end TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist)) -isnormalized(dist::TokenMax) = true +normalize(dist::TokenMax) = dist function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0) s1, s2 = reorder(s1, s2) diff --git a/src/qgram.jl b/src/qgram.jl index c07407b..f3587a7 100755 --- a/src/qgram.jl +++ b/src/qgram.jl @@ -97,7 +97,7 @@ struct QGram <: QGramDistance end function (dist::QGram)(s1, s2) - (ismissing(s1) | ismissing(s2)) && return missing + ((s1 === missing) | (s2 === missing)) && return missing itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) n = 0 for (n1, n2) in itr @@ -122,8 +122,8 @@ struct Cosine <: QGramDistance q::Int end -function (dist::Cosine)(s1, s2, max_dist = nothing) - (ismissing(s1) | ismissing(s2)) && return missing +function (dist::Cosine)(s1, s2) + ((s1 === missing) | (s2 === missing)) && return missing itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) norm1, norm2, prodnorm = 0, 0, 0 for (n1, n2) in itr @@ -149,8 +149,8 @@ struct Jaccard <: QGramDistance q::Int end -function (dist::Jaccard)(s1, s2, max_dist = nothing) - (ismissing(s1) | ismissing(s2)) && return missing +function (dist::Jaccard)(s1, s2) + ((s1 === missing) | (s2 === missing)) && return missing itr = 
values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) ndistinct1, ndistinct2, nintersect = 0, 0, 0 for (n1, n2) in itr @@ -176,8 +176,8 @@ struct SorensenDice <: QGramDistance q::Int end -function (dist::SorensenDice)(s1, s2, max_dist = nothing) - (ismissing(s1) | ismissing(s2)) && return missing +function (dist::SorensenDice)(s1, s2) + ((s1 === missing) | (s2 === missing)) && return missing itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) ndistinct1, ndistinct2, nintersect = 0, 0, 0 for (n1, n2) in itr @@ -203,8 +203,8 @@ struct Overlap <: QGramDistance q::Int end -function (dist::Overlap)(s1, s2, max_dist = nothing) - (ismissing(s1) | ismissing(s2)) && return missing +function (dist::Overlap)(s1, s2) + ((s1 === missing) | (s2 === missing)) && return missing itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) ndistinct1, ndistinct2, nintersect = 0, 0, 0 for (n1, n2) in itr diff --git a/src/utils.jl b/src/utils.jl index d7aa513..a2d84bf 100755 --- a/src/utils.jl +++ b/src/utils.jl @@ -23,6 +23,8 @@ struct StringWithLength{T <: AbstractString} <: AbstractString l::Int end string_with_length(s::AbstractString) = StringWithLength(s, length(s)) +# Not really needed but avoid multi-encapsulation +string_with_length(s::StringWithLength) = s Base.length(s::StringWithLength) = s.l Base.iterate(s::StringWithLength, i::Integer = firstindex(s.s)) = iterate(s.s, i) Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n) diff --git a/test/modifiers.jl b/test/modifiers.jl index 3baa6f0..7edead5 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -37,6 +37,7 @@ using StringDistances, Unicode, Test @test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5 compare("aüa", "aua", TokenMax(RatcliffObershelp())) + @test compare("New York Yankees", "", Partial(Jaro())) ≈ 0.0 @test compare("New York Yankees", "Yankees", 
Partial(RatcliffObershelp())) ≈ 1.0 @test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0 @test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444