update
parent
1cc89f0827
commit
f9675fd110
|
@ -30,10 +30,10 @@ DamerauLevenshtein,
|
|||
Jaro,
|
||||
JaroWinkler,
|
||||
RatcliffObershelp,
|
||||
QGramDistance,
|
||||
QGram,
|
||||
AbstractQGramDistance,
|
||||
QGramDict,
|
||||
QGramSortedVector,
|
||||
QGram,
|
||||
Cosine,
|
||||
Jaccard,
|
||||
SorensenDice,
|
||||
|
@ -48,7 +48,6 @@ evaluate,
|
|||
compare,
|
||||
result_type,
|
||||
qgrams,
|
||||
normalize,
|
||||
findnearest,
|
||||
pairwise,
|
||||
pairwise!
|
||||
|
|
|
@ -237,9 +237,9 @@ function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K,
|
|||
end
|
||||
end
|
||||
|
||||
abstract type QGramDistance <: SemiMetric end
|
||||
abstract type AbstractQGramDistance <: SemiMetric end
|
||||
|
||||
function (dist::QGramDistance)(s1, s2)
|
||||
function (dist::AbstractQGramDistance)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
counter = newcounter(dist)
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
|
@ -248,7 +248,7 @@ function (dist::QGramDistance)(s1, s2)
|
|||
calculate(dist, counter)
|
||||
end
|
||||
|
||||
function (dist::QGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
|
||||
function (dist::AbstractQGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
|
||||
@assert dist.q == q(qc1)
|
||||
@assert dist.q == q(qc2)
|
||||
counter = newcounter(dist)
|
||||
|
@ -268,11 +268,11 @@ The distance corresponds to
|
|||
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
|
||||
that contains the number of times a q-gram appears for the string s
|
||||
"""
|
||||
struct QGram <: QGramDistance
|
||||
struct QGram <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
mutable struct SingleCounter{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
|
||||
mutable struct SingleCounter{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
|
||||
n::T
|
||||
end
|
||||
|
||||
|
@ -296,11 +296,11 @@ The distance corresponds to
|
|||
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
|
||||
that contains the number of times a q-gram appears for the string s
|
||||
"""
|
||||
struct Cosine <: QGramDistance
|
||||
struct Cosine <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
mutable struct ThreeCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
|
||||
mutable struct ThreeCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
|
||||
left::T
|
||||
right::T
|
||||
shared::T
|
||||
|
@ -326,7 +326,7 @@ The distance corresponds to
|
|||
|
||||
where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
|
||||
"""
|
||||
struct Jaccard <: QGramDistance
|
||||
struct Jaccard <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
|
@ -344,7 +344,7 @@ The distance corresponds to
|
|||
|
||||
where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
|
||||
"""
|
||||
struct SorensenDice <: QGramDistance
|
||||
struct SorensenDice <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
|
@ -362,7 +362,7 @@ The distance corresponds to
|
|||
|
||||
where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
|
||||
"""
|
||||
struct Overlap <: QGramDistance
|
||||
struct Overlap <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
|
@ -396,11 +396,11 @@ The distance corresponds to
|
|||
where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
|
||||
sum of those counts.
|
||||
"""
|
||||
struct MorisitaOverlap <: QGramDistance
|
||||
struct MorisitaOverlap <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
mutable struct FiveCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
|
||||
mutable struct FiveCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
|
||||
leftsum::T # sum(m(s1))
|
||||
rightsum::T # sum(m(s2))
|
||||
leftsq::T # sum(m(s1).^2)
|
||||
|
@ -445,7 +445,7 @@ sum of those counts.
|
|||
For details see:
|
||||
https://www.sciencedirect.com/science/article/pii/S1047320313001417
|
||||
"""
|
||||
struct NMD <: QGramDistance
|
||||
struct NMD <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ See http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
|||
```julia-repl
|
||||
julia> s1 = "New York Mets vs Atlanta Braves"
|
||||
julia> s2 = "Atlanta Braves vs New York Mets"
|
||||
julia> evaluate(Partial(RatcliffObershelp()), s1, s2)
|
||||
julia> Partial(RatcliffObershelp())(s1, s2)
|
||||
0.5483870967741935
|
||||
```
|
||||
"""
|
||||
|
@ -20,6 +20,7 @@ struct Partial{S <: SemiMetric} <: SemiMetric
|
|||
end
|
||||
|
||||
function (dist::Partial)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
out = dist.dist(s1, s2)
|
||||
|
@ -32,6 +33,7 @@ function (dist::Partial)(s1, s2)
|
|||
end
|
||||
|
||||
function (dist::Partial{RatcliffObershelp})(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return dist.dist(s1, s2)
|
||||
|
@ -68,7 +70,7 @@ It is only defined on AbstractStrings.
|
|||
julia> s1 = "New York Mets vs Atlanta Braves"
|
||||
julia> s1 = "New York Mets vs Atlanta Braves"
|
||||
julia> s2 = "Atlanta Braves vs New York Mets"
|
||||
julia> evaluate(TokenSort(RatcliffObershelp()), s1, s2)
|
||||
julia> TokenSort(RatcliffObershelp())(s1, s2)
|
||||
0.0
|
||||
```
|
||||
"""
|
||||
|
@ -76,8 +78,8 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric
|
|||
dist::S
|
||||
end
|
||||
|
||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
function (dist::TokenSort)(s1::AbstractString, s2::AbstractString)
|
||||
function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
s1 = join(sort!(split(s1)), " ")
|
||||
s2 = join(sort!(split(s2)), " ")
|
||||
out = dist.dist(s1, s2)
|
||||
|
@ -89,9 +91,9 @@ end
|
|||
Creates the `TokenSet{dist}` distance.
|
||||
|
||||
`TokenSet{dist}` returns the minimum of the distances between:
|
||||
t0 = [SORTED_INTERSECTION]
|
||||
t1 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1]
|
||||
t2 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
|
||||
[SORTED_INTERSECTION]
|
||||
[SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1]
|
||||
[SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
|
||||
See: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
|
||||
It is only defined on AbstractStrings.
|
||||
|
@ -100,7 +102,7 @@ It is only defined on AbstractStrings.
|
|||
```julia-repl
|
||||
julia> s1 = "New York Mets vs Atlanta"
|
||||
julia> s2 = "Atlanta Braves vs New York Mets"
|
||||
julia> evaluate(TokenSet(RatcliffObershelp()), s1, s2)
|
||||
julia> TokenSet(RatcliffObershelp())(s1, s2)
|
||||
0.0
|
||||
```
|
||||
"""
|
||||
|
@ -108,8 +110,8 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric
|
|||
dist::S
|
||||
end
|
||||
|
||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
function (dist::TokenSet)(s1::AbstractString, s2::AbstractString)
|
||||
function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
v1 = unique!(sort!(split(s1)))
|
||||
v2 = unique!(sort!(split(s2)))
|
||||
v0 = intersect(v1, v2)
|
||||
|
|
|
@ -26,7 +26,7 @@ function (dist::Normalized{<:Union{Levenshtein{Nothing}, DamerauLevenshtein{Noth
|
|||
out > dist.max_dist ? 1.0 : out
|
||||
end
|
||||
|
||||
function (dist::Normalized{<:QGramDistance})(s1, s2)
|
||||
function (dist::Normalized{<:AbstractQGramDistance})(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
# When string length < q for qgram distance, returns s1 == s2
|
||||
s1, s2 = reorder(s1, s2)
|
||||
|
@ -94,7 +94,8 @@ end
|
|||
TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist)
|
||||
normalize(dist::TokenMax; max_dist = 1.0) = TokenMax(dist.dist, max_dist)
|
||||
|
||||
function (dist::TokenMax)(s1::AbstractString, s2::AbstractString)
|
||||
function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
max_dist = dist.max_dist
|
||||
|
@ -128,7 +129,7 @@ end
|
|||
|
||||
|
||||
|
||||
const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
|
||||
const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
|
||||
|
||||
"""
|
||||
compare(s1, s2, dist)
|
||||
|
@ -184,7 +185,7 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
|||
imax == 0 ? (nothing, nothing) : (itr[imax], imax)
|
||||
end
|
||||
|
||||
function _helper(s, dist::QGramDistance)
|
||||
function _helper(s, dist::AbstractQGramDistance)
|
||||
s !== missing ? QGramSortedVector(s, dist.q) : s
|
||||
end
|
||||
_helper(s, dist::StringDistance) = s
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||
`StringDistance` `dist`.
|
||||
|
||||
For QGramDistances preprocessing will be used either if `preprocess` is set
|
||||
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
|
||||
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
|
||||
false if no preprocessing should be used, regardless of length.
|
||||
|
||||
|
@ -53,7 +53,7 @@ end
|
|||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||
`StringDistance` `dist` and write the result in `R`.
|
||||
|
||||
For QGramDistances preprocessing will be used either if `preprocess` is set
|
||||
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
|
||||
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
|
||||
false if no preprocessing should be used, regardless of length.
|
||||
"""
|
||||
|
@ -73,7 +73,7 @@ function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abstra
|
|||
_asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess)
|
||||
end
|
||||
|
||||
function _preprocess(xs, dist::QGramDistance, preprocess)
|
||||
function _preprocess(xs, dist::AbstractQGramDistance, preprocess)
|
||||
if preprocess === nothing ? length(xs) >= 5 : preprocess
|
||||
return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
|
||||
else
|
||||
|
|
|
@ -24,7 +24,7 @@ using StringDistances, Unicode, Test
|
|||
compare("aüa", "aua", Levenshtein())
|
||||
compare("aüa", "aua", DamerauLevenshtein())
|
||||
@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
|
||||
@test normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
|
||||
@test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
|
||||
# Winkler
|
||||
@test compare("martha", "marhta", JaroWinkler()) ≈ 0.9611 atol = 1e-4
|
||||
@test compare("dwayne", "duane", JaroWinkler()) ≈ 0.84 atol = 1e-4
|
||||
|
@ -106,7 +106,7 @@ using StringDistances, Unicode, Test
|
|||
|
||||
|
||||
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
|
||||
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], normalize(QGram(2))) == ("NewYork", 1)
|
||||
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], StringDistances.normalize(QGram(2))) == ("NewYork", 1)
|
||||
|
||||
|
||||
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]
|
||||
|
|
|
@ -11,7 +11,7 @@ TestStrings2missing = ["mew", missing]
|
|||
for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
|
||||
QGram, Cosine, Jaccard, SorensenDice, Overlap]
|
||||
|
||||
d = (DT <: QGramDistance) ? DT(2) : DT()
|
||||
d = (DT <: AbstractQGramDistance) ? DT(2) : DT()
|
||||
R = pairwise(d, TestStrings1)
|
||||
|
||||
@test size(R) == (4, 4)
|
||||
|
@ -70,7 +70,7 @@ TestStrings2missing = ["mew", missing]
|
|||
end
|
||||
|
||||
# Ensure same result if preprocessing for AbstractQGramDistances
|
||||
if DT <: QGramDistance
|
||||
if DT <: AbstractQGramDistance
|
||||
R4 = pairwise(d, TestStrings1; preprocess = true)
|
||||
@test typeof(R4) == typeof(R)
|
||||
@test size(R4) == size(R)
|
||||
|
|
Loading…
Reference in New Issue