pull/44/head
matthieugomez 2020-11-14 11:40:44 -08:00
parent 1cc89f0827
commit f9675fd110
7 changed files with 39 additions and 37 deletions

View File

@ -30,10 +30,10 @@ DamerauLevenshtein,
Jaro,
JaroWinkler,
RatcliffObershelp,
QGramDistance,
QGram,
AbstractQGramDistance,
QGramDict,
QGramSortedVector,
QGram,
Cosine,
Jaccard,
SorensenDice,
@ -48,7 +48,6 @@ evaluate,
compare,
result_type,
qgrams,
normalize,
findnearest,
pairwise,
pairwise!

View File

@ -237,9 +237,9 @@ function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K,
end
end
abstract type QGramDistance <: SemiMetric end
abstract type AbstractQGramDistance <: SemiMetric end
function (dist::QGramDistance)(s1, s2)
function (dist::AbstractQGramDistance)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
counter = newcounter(dist)
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@ -248,7 +248,7 @@ function (dist::QGramDistance)(s1, s2)
calculate(dist, counter)
end
function (dist::QGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
function (dist::AbstractQGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
@assert dist.q == q(qc1)
@assert dist.q == q(qc2)
counter = newcounter(dist)
@ -268,11 +268,11 @@ The distance corresponds to
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s
"""
struct QGram <: QGramDistance
struct QGram <: AbstractQGramDistance
q::Int
end
mutable struct SingleCounter{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
mutable struct SingleCounter{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
n::T
end
@ -296,11 +296,11 @@ The distance corresponds to
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s
"""
struct Cosine <: QGramDistance
struct Cosine <: AbstractQGramDistance
q::Int
end
mutable struct ThreeCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
mutable struct ThreeCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
left::T
right::T
shared::T
@ -326,7 +326,7 @@ The distance corresponds to
where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
"""
struct Jaccard <: QGramDistance
struct Jaccard <: AbstractQGramDistance
q::Int
end
@ -344,7 +344,7 @@ The distance corresponds to
where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
"""
struct SorensenDice <: QGramDistance
struct SorensenDice <: AbstractQGramDistance
q::Int
end
@ -362,7 +362,7 @@ The distance corresponds to
where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
"""
struct Overlap <: QGramDistance
struct Overlap <: AbstractQGramDistance
q::Int
end
@ -396,11 +396,11 @@ The distance corresponds to
where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
sum of those counts.
"""
struct MorisitaOverlap <: QGramDistance
struct MorisitaOverlap <: AbstractQGramDistance
q::Int
end
mutable struct FiveCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
mutable struct FiveCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
leftsum::T # sum(m(s1))
rightsum::T # sum(m(s2))
leftsq::T # sum(m(s1).^2)
@ -445,7 +445,7 @@ sum of those counts.
For details see:
https://www.sciencedirect.com/science/article/pii/S1047320313001417
"""
struct NMD <: QGramDistance
struct NMD <: AbstractQGramDistance
q::Int
end

View File

@ -11,7 +11,7 @@ See http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> evaluate(Partial(RatcliffObershelp()), s1, s2)
julia> Partial(RatcliffObershelp())(s1, s2)
0.5483870967741935
```
"""
@ -20,6 +20,7 @@ struct Partial{S <: SemiMetric} <: SemiMetric
end
function (dist::Partial)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
out = dist.dist(s1, s2)
@ -32,6 +33,7 @@ function (dist::Partial)(s1, s2)
end
function (dist::Partial{RatcliffObershelp})(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return dist.dist(s1, s2)
@ -68,7 +70,7 @@ It is only defined on AbstractStrings.
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> evaluate(TokenSort(RatcliffObershelp()), s1, s2)
julia> TokenSort(RatcliffObershelp())(s1, s2)
0.0
```
"""
@ -76,8 +78,8 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric
dist::S
end
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function (dist::TokenSort)(s1::AbstractString, s2::AbstractString)
function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
((s1 === missing) | (s2 === missing)) && return missing
s1 = join(sort!(split(s1)), " ")
s2 = join(sort!(split(s2)), " ")
out = dist.dist(s1, s2)
@ -89,9 +91,9 @@ end
Creates the `TokenSet{dist}` distance.
`TokenSet{dist}` returns the minimum of the distances between:
t0 = [SORTED_INTERSECTION]
t1 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1]
t2 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
[SORTED_INTERSECTION]
[SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1]
[SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
See: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
It is only defined on AbstractStrings.
@ -100,7 +102,7 @@ It is only defined on AbstractStrings.
```julia-repl
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> evaluate(TokenSet(RatcliffObershelp()), s1, s2)
julia> TokenSet(RatcliffObershelp())(s1, s2)
0.0
```
"""
@ -108,8 +110,8 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric
dist::S
end
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function (dist::TokenSet)(s1::AbstractString, s2::AbstractString)
function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
((s1 === missing) | (s2 === missing)) && return missing
v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2)))
v0 = intersect(v1, v2)

View File

@ -26,7 +26,7 @@ function (dist::Normalized{<:Union{Levenshtein{Nothing}, DamerauLevenshtein{Noth
out > dist.max_dist ? 1.0 : out
end
function (dist::Normalized{<:QGramDistance})(s1, s2)
function (dist::Normalized{<:AbstractQGramDistance})(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
# When string length < q for qgram distance, returns s1 == s2
s1, s2 = reorder(s1, s2)
@ -94,7 +94,8 @@ end
TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist)
normalize(dist::TokenMax; max_dist = 1.0) = TokenMax(dist.dist, max_dist)
function (dist::TokenMax)(s1::AbstractString, s2::AbstractString)
function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
max_dist = dist.max_dist
@ -128,7 +129,7 @@ end
const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
"""
compare(s1, s2, dist)
@ -184,7 +185,7 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
imax == 0 ? (nothing, nothing) : (itr[imax], imax)
end
function _helper(s, dist::QGramDistance)
function _helper(s, dist::AbstractQGramDistance)
s !== missing ? QGramSortedVector(s, dist.q) : s
end
_helper(s, dist::StringDistance) = s

View File

@ -5,7 +5,7 @@
Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist`.
For QGramDistances preprocessing will be used either if `preprocess` is set
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length.
@ -53,7 +53,7 @@ end
Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist` and write the result in `R`.
For QGramDistances preprocessing will be used either if `preprocess` is set
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length.
"""
@ -73,7 +73,7 @@ function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abstra
_asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess)
end
function _preprocess(xs, dist::QGramDistance, preprocess)
function _preprocess(xs, dist::AbstractQGramDistance, preprocess)
if preprocess === nothing ? length(xs) >= 5 : preprocess
return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
else

View File

@ -24,7 +24,7 @@ using StringDistances, Unicode, Test
compare("aüa", "aua", Levenshtein())
compare("aüa", "aua", DamerauLevenshtein())
@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
@test normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
@test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
# Winkler
@test compare("martha", "marhta", JaroWinkler()) ≈ 0.9611 atol = 1e-4
@test compare("dwayne", "duane", JaroWinkler()) ≈ 0.84 atol = 1e-4
@ -106,7 +106,7 @@ using StringDistances, Unicode, Test
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], normalize(QGram(2))) == ("NewYork", 1)
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], StringDistances.normalize(QGram(2))) == ("NewYork", 1)
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]

View File

@ -11,7 +11,7 @@ TestStrings2missing = ["mew", missing]
for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
QGram, Cosine, Jaccard, SorensenDice, Overlap]
d = (DT <: QGramDistance) ? DT(2) : DT()
d = (DT <: AbstractQGramDistance) ? DT(2) : DT()
R = pairwise(d, TestStrings1)
@test size(R) == (4, 4)
@ -70,7 +70,7 @@ TestStrings2missing = ["mew", missing]
end
# Ensure same result if preprocessing for AbstractQGramDistances
if DT <: QGramDistance
if DT <: AbstractQGramDistance
R4 = pairwise(d, TestStrings1; preprocess = true)
@test typeof(R4) == typeof(R)
@test size(R4) == size(R)