diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 981ce93..f146a97 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -30,10 +30,10 @@ DamerauLevenshtein, Jaro, JaroWinkler, RatcliffObershelp, -QGramDistance, -QGram, +AbstractQGramDistance, QGramDict, QGramSortedVector, +QGram, Cosine, Jaccard, SorensenDice, @@ -48,7 +48,6 @@ evaluate, compare, result_type, qgrams, -normalize, findnearest, pairwise, pairwise! diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index 098a3a7..3acb5ec 100755 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -237,9 +237,9 @@ function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K, end end -abstract type QGramDistance <: SemiMetric end +abstract type AbstractQGramDistance <: SemiMetric end -function (dist::QGramDistance)(s1, s2) +function (dist::AbstractQGramDistance)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing counter = newcounter(dist) for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) @@ -248,7 +248,7 @@ function (dist::QGramDistance)(s1, s2) calculate(dist, counter) end -function (dist::QGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts} +function (dist::AbstractQGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts} @assert dist.q == q(qc1) @assert dist.q == q(qc2) counter = newcounter(dist) @@ -268,11 +268,11 @@ The distance corresponds to where ``v(s, q)`` denotes the vector on the space of q-grams of length q, that contains the number of times a q-gram appears for the string s """ -struct QGram <: QGramDistance +struct QGram <: AbstractQGramDistance q::Int end -mutable struct SingleCounter{T, QD<:QGramDistance} <: AbstractQGramMatchCounter +mutable struct SingleCounter{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter n::T end @@ -296,11 +296,11 @@ The distance corresponds to where ``v(s, q)`` denotes the vector on the space of q-grams of length q, that contains the number of times a q-gram appears for the string s """ -struct Cosine <: QGramDistance +struct Cosine <: AbstractQGramDistance q::Int end -mutable struct ThreeCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter +mutable struct ThreeCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter left::T right::T shared::T @@ -326,7 +326,7 @@ The distance corresponds to where ``Q(s, q)`` denotes the set of q-grams of length n for the string s """ -struct Jaccard <: QGramDistance +struct Jaccard <: AbstractQGramDistance q::Int end @@ -344,7 +344,7 @@ The distance corresponds to where ``Q(s, q)`` denotes the set of q-grams of length n for the string s """ -struct SorensenDice <: QGramDistance +struct SorensenDice <: AbstractQGramDistance q::Int end @@ -362,7 +362,7 @@ The distance corresponds to where ``Q(s, q)`` denotes the set of q-grams of length n for the string s """ -struct Overlap <: QGramDistance +struct Overlap <: AbstractQGramDistance q::Int end @@ -396,11 +396,11 @@ The distance corresponds to where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the sum of those counts. """ -struct MorisitaOverlap <: QGramDistance +struct MorisitaOverlap <: AbstractQGramDistance q::Int end -mutable struct FiveCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter +mutable struct FiveCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter leftsum::T # sum(m(s1)) rightsum::T # sum(m(s2)) leftsq::T # sum(m(s1).^2) @@ -445,7 +445,7 @@ sum of those counts. For details see: https://www.sciencedirect.com/science/article/pii/S1047320313001417 """ -struct NMD <: QGramDistance +struct NMD <: AbstractQGramDistance q::Int end diff --git a/src/modifiers.jl b/src/modifiers.jl index 98287c4..0ee8338 100755 --- a/src/modifiers.jl +++ b/src/modifiers.jl @@ -11,7 +11,7 @@ See http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ ```julia-repl julia> s1 = "New York Mets vs Atlanta Braves" julia> s2 = "Atlanta Braves vs New York Mets" -julia> evaluate(Partial(RatcliffObershelp()), s1, s2) +julia> Partial(RatcliffObershelp())(s1, s2) 0.5483870967741935 ``` """ @@ -20,6 +20,7 @@ struct Partial{S <: SemiMetric} <: SemiMetric end function (dist::Partial)(s1, s2) + ((s1 === missing) | (s2 === missing)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) out = dist.dist(s1, s2) @@ -32,6 +33,7 @@ function (dist::Partial)(s1, s2) end function (dist::Partial{RatcliffObershelp})(s1, s2) + ((s1 === missing) | (s2 === missing)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len1 == len2 && return dist.dist(s1, s2) @@ -68,7 +70,7 @@ It is only defined on AbstractStrings. julia> s1 = "New York Mets vs Atlanta Braves" julia> s1 = "New York Mets vs Atlanta Braves" julia> s2 = "Atlanta Braves vs New York Mets" -julia> evaluate(TokenSort(RatcliffObershelp()), s1, s2) +julia> TokenSort(RatcliffObershelp())(s1, s2) 0.0 ``` """ @@ -76,8 +78,8 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric dist::S end -# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ -function (dist::TokenSort)(s1::AbstractString, s2::AbstractString) +function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) + ((s1 === missing) | (s2 === missing)) && return missing s1 = join(sort!(split(s1)), " ") s2 = join(sort!(split(s2)), " ") out = dist.dist(s1, s2) @@ -89,9 +91,9 @@ end Creates the `TokenSet{dist}` distance. `TokenSet{dist}` returns the minimum the distances between: -t0 = [SORTED_INTERSECTION] -t1 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1] -t2 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2] +[SORTED_INTERSECTION] +[SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1] +[SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2] See: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ It is only defined on AbstractStrings. @@ -100,7 +102,7 @@ It is only defined on AbstractStrings. ```julia-repl julia> s1 = "New York Mets vs Atlanta" julia> s2 = "Atlanta Braves vs New York Mets" -julia> evaluate(TokenSet(RatcliffObershelp()), s1, s2) +julia> TokenSet(RatcliffObershelp())(s1, s2) 0.0 ``` """ @@ -108,8 +110,8 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric dist::S end -# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ -function (dist::TokenSet)(s1::AbstractString, s2::AbstractString) +function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) + ((s1 === missing) | (s2 === missing)) && return missing v1 = unique!(sort!(split(s1))) v2 = unique!(sort!(split(s2))) v0 = intersect(v1, v2) diff --git a/src/normalize.jl b/src/normalize.jl index 161a0e3..cf52697 100755 --- a/src/normalize.jl +++ b/src/normalize.jl @@ -26,7 +26,7 @@ function (dist::Normalized{<:Union{Levenshtein{Nothing}, DamerauLevenshtein{Noth out > dist.max_dist ? 1.0 : out end -function (dist::Normalized{<:QGramDistance})(s1, s2) +function (dist::Normalized{<:AbstractQGramDistance})(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing # When string length < q for qgram distance, returns s1 == s2 s1, s2 = reorder(s1, s2) @@ -94,7 +94,8 @@ end TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist) normalize(dist::TokenMax; max_dist = 1.0) = TokenMax(dist.dist, max_dist) -function (dist::TokenMax)(s1::AbstractString, s2::AbstractString) +function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) + ((s1 === missing) | (s2 === missing)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) max_dist = dist.max_dist @@ -128,7 +129,7 @@ end -const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized} +const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized} """ compare(s1, s2, dist) @@ -184,7 +185,7 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0) imax == 0 ? (nothing, nothing) : (itr[imax], imax) end -function _helper(s, dist::QGramDistance) +function _helper(s, dist::AbstractQGramDistance) s !== missing ? QGramSortedVector(s, dist.q) : s end _helper(s, dist::StringDistance) = s diff --git a/src/pairwise.jl b/src/pairwise.jl index 6d734e1..3fd6aa2 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -5,7 +5,7 @@ Compute distances between all pairs of elements in `xs` and `ys` according to the `StringDistance` `dist`. -For QGramDistances preprocessing will be used either if `preprocess` is set +For AbstractQGramDistances preprocessing will be used either if `preprocess` is set to true or if there are more than 5 elements in `xs`. Set `preprocess` to false if no preprocessing should be used, regardless of length. @@ -53,7 +53,7 @@ end Compute distances between all pairs of elements in `xs` and `ys` according to the `StringDistance` `dist` and write the result in `R`. -For QGramDistances preprocessing will be used either if `preprocess` is set +For AbstractQGramDistances preprocessing will be used either if `preprocess` is set to true or if there are more than 5 elements in `xs`. Set `preprocess` to false if no preprocessing should be used, regardless of length. """ @@ -73,7 +73,7 @@ function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abstra _asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess) end -function _preprocess(xs, dist::QGramDistance, preprocess) +function _preprocess(xs, dist::AbstractQGramDistance, preprocess) if preprocess === nothing ? length(xs) >= 5 : preprocess return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs) else diff --git a/test/modifiers.jl b/test/modifiers.jl index 214d18b..7c2f73d 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -24,7 +24,7 @@ using StringDistances, Unicode, Test compare("aüa", "aua", Levenshtein()) compare("aüa", "aua", DamerauLevenshtein()) @test compare("ab", "de", Partial(DamerauLevenshtein())) == 0 - @test normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0 + @test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0 # Winkler @test compare("martha", "marhta", JaroWinkler()) ≈ 0.9611 atol = 1e-4 @test compare("dwayne", "duane", JaroWinkler()) ≈ 0.84 atol = 1e-4 @@ -106,7 +106,7 @@ using StringDistances, Unicode, Test @test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1) - @test findnearest("New York", ["NewYork", "Newark", "San Francisco"], normalize(QGram(2))) == ("NewYork", 1) + @test findnearest("New York", ["NewYork", "Newark", "San Francisco"], StringDistances.normalize(QGram(2))) == ("NewYork", 1) @test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1] diff --git a/test/pairwise.jl b/test/pairwise.jl index 94e61ca..ae4010e 100644 --- a/test/pairwise.jl +++ b/test/pairwise.jl @@ -11,7 +11,7 @@ TestStrings2missing = ["mew", missing] for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGram, Cosine, Jaccard, SorensenDice, Overlap] - d = (DT <: QGramDistance) ? DT(2) : DT() + d = (DT <: AbstractQGramDistance) ? DT(2) : DT() R = pairwise(d, TestStrings1) @test size(R) == (4, 4) @@ -70,7 +70,7 @@ TestStrings2missing = ["mew", missing] end # Ensure same result if preprocessing for QGramDistances - if DT <: QGramDistance + if DT <: AbstractQGramDistance R4 = pairwise(d, TestStrings1; preprocess = true) @test typeof(R4) == typeof(R) @test size(R4) == size(R)