update

2020-11-14 11:40:44 -08:00 · 2020-11-14 11:40:44 -08:00 · f9675fd110
parent 1cc89f0827
commit f9675fd110
7 changed files with 39 additions and 37 deletions
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -30,10 +30,10 @@ DamerauLevenshtein,
 Jaro,
 JaroWinkler,
 RatcliffObershelp,
-QGramDistance,
-QGram,
+AbstractQGramDistance,
 QGramDict,
 QGramSortedVector,
+QGram,
 Cosine,
 Jaccard,
 SorensenDice,
@ -48,7 +48,6 @@ evaluate,
 compare,
 result_type,
 qgrams,
-normalize,
 findnearest,
 pairwise,
 pairwise!
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -237,9 +237,9 @@ function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K,
    end
 end

-abstract type QGramDistance <: SemiMetric end
+abstract type AbstractQGramDistance <: SemiMetric end

-function (dist::QGramDistance)(s1, s2)
+function (dist::AbstractQGramDistance)(s1, s2)
 	((s1 === missing) | (s2 === missing)) && return missing
 	counter = newcounter(dist)
 	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@ -248,7 +248,7 @@ function (dist::QGramDistance)(s1, s2)
 	calculate(dist, counter)
 end

-function (dist::QGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
+function (dist::AbstractQGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
    @assert dist.q == q(qc1)
 	@assert dist.q == q(qc2)
 	counter = newcounter(dist)
@ -268,11 +268,11 @@ The distance corresponds to
 where ``v(s, q)`` denotes the vector on the space of q-grams of length q, 
 that contains the number of times a q-gram appears for the string s
 """
-struct QGram <: QGramDistance
+struct QGram <: AbstractQGramDistance
 	q::Int
 end

-mutable struct SingleCounter{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
+mutable struct SingleCounter{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
 	n::T
 end

@ -296,11 +296,11 @@ The distance corresponds to
 where ``v(s, q)`` denotes the vector on the space of q-grams of length q, 
 that contains the number of times a q-gram appears for the string s
 """
-struct Cosine <: QGramDistance
+struct Cosine <: AbstractQGramDistance
 	q::Int
 end

-mutable struct ThreeCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
+mutable struct ThreeCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
 	left::T
 	right::T
 	shared::T
@ -326,7 +326,7 @@ The distance corresponds to

 where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
 """
-struct Jaccard <: QGramDistance
+struct Jaccard <: AbstractQGramDistance
 	q::Int
 end

@ -344,7 +344,7 @@ The distance corresponds to

 where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
 """
-struct SorensenDice <: QGramDistance
+struct SorensenDice <: AbstractQGramDistance
 	q::Int
 end

@ -362,7 +362,7 @@ The distance corresponds to

 where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
 """
-struct Overlap <: QGramDistance
+struct Overlap <: AbstractQGramDistance
 	q::Int
 end

@ -396,11 +396,11 @@ The distance corresponds to
 where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
 sum of those counts.
 """
-struct MorisitaOverlap <: QGramDistance
+struct MorisitaOverlap <: AbstractQGramDistance
 	q::Int
 end

-mutable struct FiveCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
+mutable struct FiveCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
 	leftsum::T    # sum(m(s1))
 	rightsum::T   # sum(m(s2))
 	leftsq::T     # sum(m(s1).^2)
@ -445,7 +445,7 @@ sum of those counts.
 For details see:
 https://www.sciencedirect.com/science/article/pii/S1047320313001417
 """
-struct NMD <: QGramDistance
+struct NMD <: AbstractQGramDistance
 	q::Int
 end

--- a/src/modifiers.jl
+++ b/src/modifiers.jl
@ -11,7 +11,7 @@ See http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
 ```julia-repl
 julia> s1 = "New York Mets vs Atlanta Braves"
 julia> s2 = "Atlanta Braves vs New York Mets"
-julia> evaluate(Partial(RatcliffObershelp()), s1, s2)
+julia> Partial(RatcliffObershelp())(s1, s2)
 0.5483870967741935
 ```
 """
@ -20,6 +20,7 @@ struct Partial{S <: SemiMetric} <: SemiMetric
 end

 function (dist::Partial)(s1, s2)
+    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    out = dist.dist(s1, s2)
@ -32,6 +33,7 @@ function (dist::Partial)(s1, s2)
 end

 function (dist::Partial{RatcliffObershelp})(s1, s2)
+    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    len1 == len2 && return dist.dist(s1, s2)
@ -68,7 +70,7 @@ It is only defined on AbstractStrings.
 julia> s1 = "New York Mets vs Atlanta Braves"
 julia> s1 = "New York Mets vs Atlanta Braves"
 julia> s2 = "Atlanta Braves vs New York Mets"
-julia> evaluate(TokenSort(RatcliffObershelp()), s1, s2)
+julia> TokenSort(RatcliffObershelp())(s1, s2)
 0.0
 ```
 """
@ -76,8 +78,8 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric
    dist::S
 end

-# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
-function (dist::TokenSort)(s1::AbstractString, s2::AbstractString)
+function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
+    ((s1 === missing) | (s2 === missing)) && return missing
    s1 = join(sort!(split(s1)), " ")
    s2 = join(sort!(split(s2)), " ")
    out = dist.dist(s1, s2)
@ -89,9 +91,9 @@ end
 Creates the `TokenSet{dist}` distance.

 `TokenSet{dist}` returns the minimum of the distances between:
-t0 = [SORTED_INTERSECTION]
-t1 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1]
-t2 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
+[SORTED_INTERSECTION]
+[SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1]
+[SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
 See: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/

 It is only defined on AbstractStrings; if either argument is `missing`, it returns `missing`.
@ -100,7 +102,7 @@ It is only defined on AbstractStrings.
 ```julia-repl
 julia> s1 = "New York Mets vs Atlanta"
 julia> s2 = "Atlanta Braves vs New York Mets"
-julia> evaluate(TokenSet(RatcliffObershelp()), s1, s2)
+julia> TokenSet(RatcliffObershelp())(s1, s2)
 0.0
 ```
 """
@ -108,8 +110,8 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric
    dist::S
 end

-# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
-function (dist::TokenSet)(s1::AbstractString, s2::AbstractString)
+function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
+    ((s1 === missing) | (s2 === missing)) && return missing
    v1 = unique!(sort!(split(s1)))
    v2 = unique!(sort!(split(s2)))
    v0 = intersect(v1, v2)
--- a/src/normalize.jl
+++ b/src/normalize.jl
@ -26,7 +26,7 @@ function (dist::Normalized{<:Union{Levenshtein{Nothing}, DamerauLevenshtein{Noth
    out > dist.max_dist ? 1.0 : out
 end

-function (dist::Normalized{<:QGramDistance})(s1, s2)
+function (dist::Normalized{<:AbstractQGramDistance})(s1, s2)
    ((s1 === missing) | (s2 === missing)) && return missing
    # When string length < q for qgram distance, returns s1 == s2
    s1, s2 = reorder(s1, s2)
@ -94,7 +94,8 @@ end
 TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist)
 normalize(dist::TokenMax; max_dist = 1.0) = TokenMax(dist.dist, max_dist)

-function (dist::TokenMax)(s1::AbstractString, s2::AbstractString)
+function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
+    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    max_dist = dist.max_dist
@ -128,7 +129,7 @@ end



-const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
+const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}

 """
    compare(s1, s2, dist)
@ -184,7 +185,7 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
    imax == 0 ? (nothing, nothing) : (itr[imax], imax)
 end

-function _helper(s, dist::QGramDistance)
+function _helper(s, dist::AbstractQGramDistance)
    s !== missing ? QGramSortedVector(s, dist.q) : s
 end
 _helper(s, dist::StringDistance) = s
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@ -5,7 +5,7 @@
 Compute distances between all pairs of elements in `xs`  and `ys` according to the
 `StringDistance` `dist`.

-For QGramDistances preprocessing will be used either if `preprocess` is set 
+For AbstractQGramDistances preprocessing will be used either if `preprocess` is set 
 to true or if there are at least 5 elements in `xs`. Set `preprocess` to
 false if no preprocessing should be used, regardless of length.

@ -53,7 +53,7 @@ end
 Compute distances between all pairs of elements in `xs` and `ys` according to the
 `StringDistance` `dist` and write the result in `R`.

-For QGramDistances preprocessing will be used either if `preprocess` is set 
+For AbstractQGramDistances preprocessing will be used either if `preprocess` is set 
 to true or if there are at least 5 elements in `xs`. Set `preprocess` to
 false if no preprocessing should be used, regardless of length.
 """
@ -73,7 +73,7 @@ function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abstra
        _asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess)
 end

-function _preprocess(xs, dist::QGramDistance, preprocess)
+function _preprocess(xs, dist::AbstractQGramDistance, preprocess)
    if preprocess === nothing ? length(xs) >= 5 : preprocess 
        return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
    else
--- a/test/modifiers.jl
+++ b/test/modifiers.jl
@ -24,7 +24,7 @@ using StringDistances, Unicode, Test
 	compare("aüa", "aua", Levenshtein())
 	compare("aüa", "aua", DamerauLevenshtein())
 	@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
-	@test normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
+	@test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
 	# Winkler
 	@test compare("martha", "marhta", JaroWinkler()) ≈ 0.9611 atol = 1e-4
 	@test compare("dwayne", "duane", JaroWinkler()) ≈ 0.84 atol = 1e-4
@ -106,7 +106,7 @@ using StringDistances, Unicode, Test


 	@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
-	@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], normalize(QGram(2))) == ("NewYork", 1)
+	@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], StringDistances.normalize(QGram(2))) == ("NewYork", 1)


 	@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]
--- a/test/pairwise.jl
+++ b/test/pairwise.jl
@ -11,7 +11,7 @@ TestStrings2missing = ["mew", missing]
 	for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
 				QGram, Cosine, Jaccard, SorensenDice, Overlap]

-		d = (DT <: QGramDistance) ? DT(2) : DT()
+		d = (DT <: AbstractQGramDistance) ? DT(2) : DT()
 		R = pairwise(d, TestStrings1)

 		@test size(R) == (4, 4)
@ -70,7 +70,7 @@ TestStrings2missing = ["mew", missing]
 		end

 		# Ensure same result if preprocessing for AbstractQGramDistances
-		if DT <: QGramDistance
+		if DT <: AbstractQGramDistance
 			R4 = pairwise(d, TestStrings1; preprocess = true)
 			@test typeof(R4) == typeof(R)
 			@test size(R4) == size(R)