pull/44/head
matthieugomez 2020-11-14 11:40:44 -08:00
parent 1cc89f0827
commit f9675fd110
7 changed files with 39 additions and 37 deletions

View File

@ -30,10 +30,10 @@ DamerauLevenshtein,
Jaro, Jaro,
JaroWinkler, JaroWinkler,
RatcliffObershelp, RatcliffObershelp,
QGramDistance, AbstractQGramDistance,
QGram,
QGramDict, QGramDict,
QGramSortedVector, QGramSortedVector,
QGram,
Cosine, Cosine,
Jaccard, Jaccard,
SorensenDice, SorensenDice,
@ -48,7 +48,6 @@ evaluate,
compare, compare,
result_type, result_type,
qgrams, qgrams,
normalize,
findnearest, findnearest,
pairwise, pairwise,
pairwise! pairwise!

View File

@ -237,9 +237,9 @@ function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K,
end end
end end
abstract type QGramDistance <: SemiMetric end abstract type AbstractQGramDistance <: SemiMetric end
function (dist::QGramDistance)(s1, s2) function (dist::AbstractQGramDistance)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing ((s1 === missing) | (s2 === missing)) && return missing
counter = newcounter(dist) counter = newcounter(dist)
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@ -248,7 +248,7 @@ function (dist::QGramDistance)(s1, s2)
calculate(dist, counter) calculate(dist, counter)
end end
function (dist::QGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts} function (dist::AbstractQGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
@assert dist.q == q(qc1) @assert dist.q == q(qc1)
@assert dist.q == q(qc2) @assert dist.q == q(qc2)
counter = newcounter(dist) counter = newcounter(dist)
@ -268,11 +268,11 @@ The distance corresponds to
where ``v(s, q)`` denotes the vector on the space of q-grams of length q, where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s that contains the number of times a q-gram appears for the string s
""" """
struct QGram <: QGramDistance struct QGram <: AbstractQGramDistance
q::Int q::Int
end end
mutable struct SingleCounter{T, QD<:QGramDistance} <: AbstractQGramMatchCounter mutable struct SingleCounter{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
n::T n::T
end end
@ -296,11 +296,11 @@ The distance corresponds to
where ``v(s, q)`` denotes the vector on the space of q-grams of length q, where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s that contains the number of times a q-gram appears for the string s
""" """
struct Cosine <: QGramDistance struct Cosine <: AbstractQGramDistance
q::Int q::Int
end end
mutable struct ThreeCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter mutable struct ThreeCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
left::T left::T
right::T right::T
shared::T shared::T
@ -326,7 +326,7 @@ The distance corresponds to
where ``Q(s, q)`` denotes the set of q-grams of length n for the string s where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
""" """
struct Jaccard <: QGramDistance struct Jaccard <: AbstractQGramDistance
q::Int q::Int
end end
@ -344,7 +344,7 @@ The distance corresponds to
where ``Q(s, q)`` denotes the set of q-grams of length n for the string s where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
""" """
struct SorensenDice <: QGramDistance struct SorensenDice <: AbstractQGramDistance
q::Int q::Int
end end
@ -362,7 +362,7 @@ The distance corresponds to
where ``Q(s, q)`` denotes the set of q-grams of length n for the string s where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
""" """
struct Overlap <: QGramDistance struct Overlap <: AbstractQGramDistance
q::Int q::Int
end end
@ -396,11 +396,11 @@ The distance corresponds to
where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
sum of those counts. sum of those counts.
""" """
struct MorisitaOverlap <: QGramDistance struct MorisitaOverlap <: AbstractQGramDistance
q::Int q::Int
end end
mutable struct FiveCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter mutable struct FiveCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
leftsum::T # sum(m(s1)) leftsum::T # sum(m(s1))
rightsum::T # sum(m(s2)) rightsum::T # sum(m(s2))
leftsq::T # sum(m(s1).^2) leftsq::T # sum(m(s1).^2)
@ -445,7 +445,7 @@ sum of those counts.
For details see: For details see:
https://www.sciencedirect.com/science/article/pii/S1047320313001417 https://www.sciencedirect.com/science/article/pii/S1047320313001417
""" """
struct NMD <: QGramDistance struct NMD <: AbstractQGramDistance
q::Int q::Int
end end

View File

@ -11,7 +11,7 @@ See http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
```julia-repl ```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves" julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets" julia> s2 = "Atlanta Braves vs New York Mets"
julia> evaluate(Partial(RatcliffObershelp()), s1, s2) julia> Partial(RatcliffObershelp())(s1, s2)
0.5483870967741935 0.5483870967741935
``` ```
""" """
@ -20,6 +20,7 @@ struct Partial{S <: SemiMetric} <: SemiMetric
end end
function (dist::Partial)(s1, s2) function (dist::Partial)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2) s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2) len1, len2 = length(s1), length(s2)
out = dist.dist(s1, s2) out = dist.dist(s1, s2)
@ -32,6 +33,7 @@ function (dist::Partial)(s1, s2)
end end
function (dist::Partial{RatcliffObershelp})(s1, s2) function (dist::Partial{RatcliffObershelp})(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2) s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2) len1, len2 = length(s1), length(s2)
len1 == len2 && return dist.dist(s1, s2) len1 == len2 && return dist.dist(s1, s2)
@ -68,7 +70,7 @@ It is only defined on AbstractStrings.
julia> s1 = "New York Mets vs Atlanta Braves" julia> s1 = "New York Mets vs Atlanta Braves"
julia> s1 = "New York Mets vs Atlanta Braves" julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets" julia> s2 = "Atlanta Braves vs New York Mets"
julia> evaluate(TokenSort(RatcliffObershelp()), s1, s2) julia> TokenSort(RatcliffObershelp())(s1, s2)
0.0 0.0
``` ```
""" """
@ -76,8 +78,8 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric
dist::S dist::S
end end
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
function (dist::TokenSort)(s1::AbstractString, s2::AbstractString) ((s1 === missing) | (s2 === missing)) && return missing
s1 = join(sort!(split(s1)), " ") s1 = join(sort!(split(s1)), " ")
s2 = join(sort!(split(s2)), " ") s2 = join(sort!(split(s2)), " ")
out = dist.dist(s1, s2) out = dist.dist(s1, s2)
@ -89,9 +91,9 @@ end
Creates the `TokenSet{dist}` distance. Creates the `TokenSet{dist}` distance.
`TokenSet{dist}` returns the minimum of the distances between: `TokenSet{dist}` returns the minimum of the distances between:
t0 = [SORTED_INTERSECTION] [SORTED_INTERSECTION]
t1 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1] [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1]
t2 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2] [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
See: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ See: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
It is only defined on AbstractStrings. It is only defined on AbstractStrings.
@ -100,7 +102,7 @@ It is only defined on AbstractStrings.
```julia-repl ```julia-repl
julia> s1 = "New York Mets vs Atlanta" julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets" julia> s2 = "Atlanta Braves vs New York Mets"
julia> evaluate(TokenSet(RatcliffObershelp()), s1, s2) julia> TokenSet(RatcliffObershelp())(s1, s2)
0.0 0.0
``` ```
""" """
@ -108,8 +110,8 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric
dist::S dist::S
end end
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
function (dist::TokenSet)(s1::AbstractString, s2::AbstractString) ((s1 === missing) | (s2 === missing)) && return missing
v1 = unique!(sort!(split(s1))) v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2))) v2 = unique!(sort!(split(s2)))
v0 = intersect(v1, v2) v0 = intersect(v1, v2)

View File

@ -26,7 +26,7 @@ function (dist::Normalized{<:Union{Levenshtein{Nothing}, DamerauLevenshtein{Noth
out > dist.max_dist ? 1.0 : out out > dist.max_dist ? 1.0 : out
end end
function (dist::Normalized{<:QGramDistance})(s1, s2) function (dist::Normalized{<:AbstractQGramDistance})(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing ((s1 === missing) | (s2 === missing)) && return missing
# When string length < q for qgram distance, returns s1 == s2 # When string length < q for qgram distance, returns s1 == s2
s1, s2 = reorder(s1, s2) s1, s2 = reorder(s1, s2)
@ -94,7 +94,8 @@ end
TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist) TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist)
normalize(dist::TokenMax; max_dist = 1.0) = TokenMax(dist.dist, max_dist) normalize(dist::TokenMax; max_dist = 1.0) = TokenMax(dist.dist, max_dist)
function (dist::TokenMax)(s1::AbstractString, s2::AbstractString) function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2) s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2) len1, len2 = length(s1), length(s2)
max_dist = dist.max_dist max_dist = dist.max_dist
@ -128,7 +129,7 @@ end
const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized} const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
""" """
compare(s1, s2, dist) compare(s1, s2, dist)
@ -184,7 +185,7 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
imax == 0 ? (nothing, nothing) : (itr[imax], imax) imax == 0 ? (nothing, nothing) : (itr[imax], imax)
end end
function _helper(s, dist::QGramDistance) function _helper(s, dist::AbstractQGramDistance)
s !== missing ? QGramSortedVector(s, dist.q) : s s !== missing ? QGramSortedVector(s, dist.q) : s
end end
_helper(s, dist::StringDistance) = s _helper(s, dist::StringDistance) = s

View File

@ -5,7 +5,7 @@
Compute distances between all pairs of elements in `xs` and `ys` according to the Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist`. `StringDistance` `dist`.
For QGramDistances preprocessing will be used either if `preprocess` is set For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `xs`. Set `preprocess` to to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length. false if no preprocessing should be used, regardless of length.
@ -53,7 +53,7 @@ end
Compute distances between all pairs of elements in `xs` and `ys` according to the Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist` and write the result in `R`. `StringDistance` `dist` and write the result in `R`.
For QGramDistances preprocessing will be used either if `preprocess` is set For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `xs`. Set `preprocess` to to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length. false if no preprocessing should be used, regardless of length.
""" """
@ -73,7 +73,7 @@ function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abstra
_asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess) _asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess)
end end
function _preprocess(xs, dist::QGramDistance, preprocess) function _preprocess(xs, dist::AbstractQGramDistance, preprocess)
if preprocess === nothing ? length(xs) >= 5 : preprocess if preprocess === nothing ? length(xs) >= 5 : preprocess
return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs) return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
else else

View File

@ -24,7 +24,7 @@ using StringDistances, Unicode, Test
compare("aüa", "aua", Levenshtein()) compare("aüa", "aua", Levenshtein())
compare("aüa", "aua", DamerauLevenshtein()) compare("aüa", "aua", DamerauLevenshtein())
@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0 @test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
@test normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0 @test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
# Winkler # Winkler
@test compare("martha", "marhta", JaroWinkler()) ≈ 0.9611 atol = 1e-4 @test compare("martha", "marhta", JaroWinkler()) ≈ 0.9611 atol = 1e-4
@test compare("dwayne", "duane", JaroWinkler()) ≈ 0.84 atol = 1e-4 @test compare("dwayne", "duane", JaroWinkler()) ≈ 0.84 atol = 1e-4
@ -106,7 +106,7 @@ using StringDistances, Unicode, Test
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1) @test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], normalize(QGram(2))) == ("NewYork", 1) @test findnearest("New York", ["NewYork", "Newark", "San Francisco"], StringDistances.normalize(QGram(2))) == ("NewYork", 1)
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1] @test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]

View File

@ -11,7 +11,7 @@ TestStrings2missing = ["mew", missing]
for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
QGram, Cosine, Jaccard, SorensenDice, Overlap] QGram, Cosine, Jaccard, SorensenDice, Overlap]
d = (DT <: QGramDistance) ? DT(2) : DT() d = (DT <: AbstractQGramDistance) ? DT(2) : DT()
R = pairwise(d, TestStrings1) R = pairwise(d, TestStrings1)
@test size(R) == (4, 4) @test size(R) == (4, 4)
@ -70,7 +70,7 @@ TestStrings2missing = ["mew", missing]
end end
# Ensure same result if preprocessing for QGramDistances # Ensure same result if preprocessing for QGramDistances
if DT <: QGramDistance if DT <: AbstractQGramDistance
R4 = pairwise(d, TestStrings1; preprocess = true) R4 = pairwise(d, TestStrings1; preprocess = true)
@test typeof(R4) == typeof(R) @test typeof(R4) == typeof(R)
@test size(R4) == size(R) @test size(R4) == size(R)