update
parent
1cc89f0827
commit
f9675fd110
|
@ -30,10 +30,10 @@ DamerauLevenshtein,
|
||||||
Jaro,
|
Jaro,
|
||||||
JaroWinkler,
|
JaroWinkler,
|
||||||
RatcliffObershelp,
|
RatcliffObershelp,
|
||||||
QGramDistance,
|
AbstractQGramDistance,
|
||||||
QGram,
|
|
||||||
QGramDict,
|
QGramDict,
|
||||||
QGramSortedVector,
|
QGramSortedVector,
|
||||||
|
QGram,
|
||||||
Cosine,
|
Cosine,
|
||||||
Jaccard,
|
Jaccard,
|
||||||
SorensenDice,
|
SorensenDice,
|
||||||
|
@ -48,7 +48,6 @@ evaluate,
|
||||||
compare,
|
compare,
|
||||||
result_type,
|
result_type,
|
||||||
qgrams,
|
qgrams,
|
||||||
normalize,
|
|
||||||
findnearest,
|
findnearest,
|
||||||
pairwise,
|
pairwise,
|
||||||
pairwise!
|
pairwise!
|
||||||
|
|
|
@ -237,9 +237,9 @@ function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K,
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
abstract type QGramDistance <: SemiMetric end
|
abstract type AbstractQGramDistance <: SemiMetric end
|
||||||
|
|
||||||
function (dist::QGramDistance)(s1, s2)
|
function (dist::AbstractQGramDistance)(s1, s2)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
counter = newcounter(dist)
|
counter = newcounter(dist)
|
||||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||||
|
@ -248,7 +248,7 @@ function (dist::QGramDistance)(s1, s2)
|
||||||
calculate(dist, counter)
|
calculate(dist, counter)
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::QGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
|
function (dist::AbstractQGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
|
||||||
@assert dist.q == q(qc1)
|
@assert dist.q == q(qc1)
|
||||||
@assert dist.q == q(qc2)
|
@assert dist.q == q(qc2)
|
||||||
counter = newcounter(dist)
|
counter = newcounter(dist)
|
||||||
|
@ -268,11 +268,11 @@ The distance corresponds to
|
||||||
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
|
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
|
||||||
that contains the number of times a q-gram appears for the string s
|
that contains the number of times a q-gram appears for the string s
|
||||||
"""
|
"""
|
||||||
struct QGram <: QGramDistance
|
struct QGram <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
mutable struct SingleCounter{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
|
mutable struct SingleCounter{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
|
||||||
n::T
|
n::T
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -296,11 +296,11 @@ The distance corresponds to
|
||||||
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
|
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
|
||||||
that contains the number of times a q-gram appears for the string s
|
that contains the number of times a q-gram appears for the string s
|
||||||
"""
|
"""
|
||||||
struct Cosine <: QGramDistance
|
struct Cosine <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
mutable struct ThreeCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
|
mutable struct ThreeCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
|
||||||
left::T
|
left::T
|
||||||
right::T
|
right::T
|
||||||
shared::T
|
shared::T
|
||||||
|
@ -326,7 +326,7 @@ The distance corresponds to
|
||||||
|
|
||||||
where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
|
where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
|
||||||
"""
|
"""
|
||||||
struct Jaccard <: QGramDistance
|
struct Jaccard <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -344,7 +344,7 @@ The distance corresponds to
|
||||||
|
|
||||||
where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
|
where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
|
||||||
"""
|
"""
|
||||||
struct SorensenDice <: QGramDistance
|
struct SorensenDice <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -362,7 +362,7 @@ The distance corresponds to
|
||||||
|
|
||||||
where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
|
where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
|
||||||
"""
|
"""
|
||||||
struct Overlap <: QGramDistance
|
struct Overlap <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -396,11 +396,11 @@ The distance corresponds to
|
||||||
where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
|
where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
|
||||||
sum of those counts.
|
sum of those counts.
|
||||||
"""
|
"""
|
||||||
struct MorisitaOverlap <: QGramDistance
|
struct MorisitaOverlap <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
mutable struct FiveCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
|
mutable struct FiveCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
|
||||||
leftsum::T # sum(m(s1))
|
leftsum::T # sum(m(s1))
|
||||||
rightsum::T # sum(m(s2))
|
rightsum::T # sum(m(s2))
|
||||||
leftsq::T # sum(m(s1).^2)
|
leftsq::T # sum(m(s1).^2)
|
||||||
|
@ -445,7 +445,7 @@ sum of those counts.
|
||||||
For details see:
|
For details see:
|
||||||
https://www.sciencedirect.com/science/article/pii/S1047320313001417
|
https://www.sciencedirect.com/science/article/pii/S1047320313001417
|
||||||
"""
|
"""
|
||||||
struct NMD <: QGramDistance
|
struct NMD <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,7 @@ See http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||||
```julia-repl
|
```julia-repl
|
||||||
julia> s1 = "New York Mets vs Atlanta Braves"
|
julia> s1 = "New York Mets vs Atlanta Braves"
|
||||||
julia> s2 = "Atlanta Braves vs New York Mets"
|
julia> s2 = "Atlanta Braves vs New York Mets"
|
||||||
julia> evaluate(Partial(RatcliffObershelp()), s1, s2)
|
julia> Partial(RatcliffObershelp())(s1, s2)
|
||||||
0.5483870967741935
|
0.5483870967741935
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
|
@ -20,6 +20,7 @@ struct Partial{S <: SemiMetric} <: SemiMetric
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::Partial)(s1, s2)
|
function (dist::Partial)(s1, s2)
|
||||||
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
out = dist.dist(s1, s2)
|
out = dist.dist(s1, s2)
|
||||||
|
@ -32,6 +33,7 @@ function (dist::Partial)(s1, s2)
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::Partial{RatcliffObershelp})(s1, s2)
|
function (dist::Partial{RatcliffObershelp})(s1, s2)
|
||||||
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 == len2 && return dist.dist(s1, s2)
|
len1 == len2 && return dist.dist(s1, s2)
|
||||||
|
@ -68,7 +70,7 @@ It is only defined on AbstractStrings.
|
||||||
julia> s1 = "New York Mets vs Atlanta Braves"
|
julia> s1 = "New York Mets vs Atlanta Braves"
|
||||||
julia> s1 = "New York Mets vs Atlanta Braves"
|
julia> s1 = "New York Mets vs Atlanta Braves"
|
||||||
julia> s2 = "Atlanta Braves vs New York Mets"
|
julia> s2 = "Atlanta Braves vs New York Mets"
|
||||||
julia> evaluate(TokenSort(RatcliffObershelp()), s1, s2)
|
julia> TokenSort(RatcliffObershelp())(s1, s2)
|
||||||
0.0
|
0.0
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
|
@ -76,8 +78,8 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
end
|
end
|
||||||
|
|
||||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||||
function (dist::TokenSort)(s1::AbstractString, s2::AbstractString)
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
s1 = join(sort!(split(s1)), " ")
|
s1 = join(sort!(split(s1)), " ")
|
||||||
s2 = join(sort!(split(s2)), " ")
|
s2 = join(sort!(split(s2)), " ")
|
||||||
out = dist.dist(s1, s2)
|
out = dist.dist(s1, s2)
|
||||||
|
@ -89,9 +91,9 @@ end
|
||||||
Creates the `TokenSet{dist}` distance.
|
Creates the `TokenSet{dist}` distance.
|
||||||
|
|
||||||
`TokenSet{dist}` returns the minimum of the distances between:
|
`TokenSet{dist}` returns the minimum of the distances between:
|
||||||
t0 = [SORTED_INTERSECTION]
|
[SORTED_INTERSECTION]
|
||||||
t1 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1]
|
[SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1]
|
||||||
t2 = [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
|
[SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
|
||||||
See: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
See: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||||
|
|
||||||
It is only defined on AbstractStrings.
|
It is only defined on AbstractStrings.
|
||||||
|
@ -100,7 +102,7 @@ It is only defined on AbstractStrings.
|
||||||
```julia-repl
|
```julia-repl
|
||||||
julia> s1 = "New York Mets vs Atlanta"
|
julia> s1 = "New York Mets vs Atlanta"
|
||||||
julia> s2 = "Atlanta Braves vs New York Mets"
|
julia> s2 = "Atlanta Braves vs New York Mets"
|
||||||
julia> evaluate(TokenSet(RatcliffObershelp()), s1, s2)
|
julia> TokenSet(RatcliffObershelp())(s1, s2)
|
||||||
0.0
|
0.0
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
|
@ -108,8 +110,8 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
end
|
end
|
||||||
|
|
||||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||||
function (dist::TokenSet)(s1::AbstractString, s2::AbstractString)
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
v1 = unique!(sort!(split(s1)))
|
v1 = unique!(sort!(split(s1)))
|
||||||
v2 = unique!(sort!(split(s2)))
|
v2 = unique!(sort!(split(s2)))
|
||||||
v0 = intersect(v1, v2)
|
v0 = intersect(v1, v2)
|
||||||
|
|
|
@ -26,7 +26,7 @@ function (dist::Normalized{<:Union{Levenshtein{Nothing}, DamerauLevenshtein{Noth
|
||||||
out > dist.max_dist ? 1.0 : out
|
out > dist.max_dist ? 1.0 : out
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::Normalized{<:QGramDistance})(s1, s2)
|
function (dist::Normalized{<:AbstractQGramDistance})(s1, s2)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
# When string length < q for qgram distance, returns s1 == s2
|
# When string length < q for qgram distance, returns s1 == s2
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
|
@ -94,7 +94,8 @@ end
|
||||||
TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist)
|
TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist)
|
||||||
normalize(dist::TokenMax; max_dist = 1.0) = TokenMax(dist.dist, max_dist)
|
normalize(dist::TokenMax; max_dist = 1.0) = TokenMax(dist.dist, max_dist)
|
||||||
|
|
||||||
function (dist::TokenMax)(s1::AbstractString, s2::AbstractString)
|
function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||||
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
max_dist = dist.max_dist
|
max_dist = dist.max_dist
|
||||||
|
@ -128,7 +129,7 @@ end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
|
const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
compare(s1, s2, dist)
|
compare(s1, s2, dist)
|
||||||
|
@ -184,7 +185,7 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
||||||
imax == 0 ? (nothing, nothing) : (itr[imax], imax)
|
imax == 0 ? (nothing, nothing) : (itr[imax], imax)
|
||||||
end
|
end
|
||||||
|
|
||||||
function _helper(s, dist::QGramDistance)
|
function _helper(s, dist::AbstractQGramDistance)
|
||||||
s !== missing ? QGramSortedVector(s, dist.q) : s
|
s !== missing ? QGramSortedVector(s, dist.q) : s
|
||||||
end
|
end
|
||||||
_helper(s, dist::StringDistance) = s
|
_helper(s, dist::StringDistance) = s
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||||
`StringDistance` `dist`.
|
`StringDistance` `dist`.
|
||||||
|
|
||||||
For QGramDistances preprocessing will be used either if `preprocess` is set
|
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
|
||||||
to true or if there are at least 5 elements in `xs`. Set `preprocess` to
|
to true or if there are at least 5 elements in `xs`. Set `preprocess` to
|
||||||
false if no preprocessing should be used, regardless of length.
|
false if no preprocessing should be used, regardless of length.
|
||||||
|
|
||||||
|
@ -53,7 +53,7 @@ end
|
||||||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||||
`StringDistance` `dist` and write the result in `R`.
|
`StringDistance` `dist` and write the result in `R`.
|
||||||
|
|
||||||
For QGramDistances preprocessing will be used either if `preprocess` is set
|
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
|
||||||
to true or if there are at least 5 elements in `xs`. Set `preprocess` to
|
to true or if there are at least 5 elements in `xs`. Set `preprocess` to
|
||||||
false if no preprocessing should be used, regardless of length.
|
false if no preprocessing should be used, regardless of length.
|
||||||
"""
|
"""
|
||||||
|
@ -73,7 +73,7 @@ function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abstra
|
||||||
_asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess)
|
_asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess)
|
||||||
end
|
end
|
||||||
|
|
||||||
function _preprocess(xs, dist::QGramDistance, preprocess)
|
function _preprocess(xs, dist::AbstractQGramDistance, preprocess)
|
||||||
if preprocess === nothing ? length(xs) >= 5 : preprocess
|
if preprocess === nothing ? length(xs) >= 5 : preprocess
|
||||||
return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
|
return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
|
||||||
else
|
else
|
||||||
|
|
|
@ -24,7 +24,7 @@ using StringDistances, Unicode, Test
|
||||||
compare("aüa", "aua", Levenshtein())
|
compare("aüa", "aua", Levenshtein())
|
||||||
compare("aüa", "aua", DamerauLevenshtein())
|
compare("aüa", "aua", DamerauLevenshtein())
|
||||||
@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
|
@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
|
||||||
@test normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
|
@test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
|
||||||
# Winkler
|
# Winkler
|
||||||
@test compare("martha", "marhta", JaroWinkler()) ≈ 0.9611 atol = 1e-4
|
@test compare("martha", "marhta", JaroWinkler()) ≈ 0.9611 atol = 1e-4
|
||||||
@test compare("dwayne", "duane", JaroWinkler()) ≈ 0.84 atol = 1e-4
|
@test compare("dwayne", "duane", JaroWinkler()) ≈ 0.84 atol = 1e-4
|
||||||
|
@ -106,7 +106,7 @@ using StringDistances, Unicode, Test
|
||||||
|
|
||||||
|
|
||||||
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
|
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
|
||||||
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], normalize(QGram(2))) == ("NewYork", 1)
|
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], StringDistances.normalize(QGram(2))) == ("NewYork", 1)
|
||||||
|
|
||||||
|
|
||||||
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]
|
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]
|
||||||
|
|
|
@ -11,7 +11,7 @@ TestStrings2missing = ["mew", missing]
|
||||||
for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
|
for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
|
||||||
QGram, Cosine, Jaccard, SorensenDice, Overlap]
|
QGram, Cosine, Jaccard, SorensenDice, Overlap]
|
||||||
|
|
||||||
d = (DT <: QGramDistance) ? DT(2) : DT()
|
d = (DT <: AbstractQGramDistance) ? DT(2) : DT()
|
||||||
R = pairwise(d, TestStrings1)
|
R = pairwise(d, TestStrings1)
|
||||||
|
|
||||||
@test size(R) == (4, 4)
|
@test size(R) == (4, 4)
|
||||||
|
@ -70,7 +70,7 @@ TestStrings2missing = ["mew", missing]
|
||||||
end
|
end
|
||||||
|
|
||||||
# Ensure same result if preprocessing for QGramDistances
|
# Ensure same result if preprocessing for QGramDistances
|
||||||
if DT <: QGramDistance
|
if DT <: AbstractQGramDistance
|
||||||
R4 = pairwise(d, TestStrings1; preprocess = true)
|
R4 = pairwise(d, TestStrings1; preprocess = true)
|
||||||
@test typeof(R4) == typeof(R)
|
@test typeof(R4) == typeof(R)
|
||||||
@test size(R4) == size(R)
|
@test size(R4) == size(R)
|
||||||
|
|
Loading…
Reference in New Issue