parent
2aff23fd6c
commit
254e5e15f6
|
@ -11,7 +11,7 @@ end
|
|||
Hamming() = Hamming(nothing)
|
||||
|
||||
function (dist::Hamming)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
out = abs(length(s2) - length(s1))
|
||||
dist.max_dist !== nothing && out > dist.max_dist && return dist.max_dist + 1
|
||||
for (ch1, ch2) in zip(s1, s2)
|
||||
|
@ -39,7 +39,7 @@ struct Jaro <: SemiMetric end
|
|||
|
||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||
function (dist::Jaro)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
# If both are empty, the formula in Wikipedia gives 0
|
||||
|
@ -92,7 +92,7 @@ JaroWinkler(; p = 0.1, threshold = 0.3, maxlength = 4) = JaroWinkler(p, threshol
|
|||
|
||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||
function (dist::JaroWinkler)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
out = Jaro()(s1, s2)
|
||||
|
@ -120,7 +120,7 @@ Levenshtein() = Levenshtein(nothing)
|
|||
# Return max_dist + 1 if distance higher than max_dist
|
||||
# to differentiate distance equal to max_dist or not, which is important for find functions.
|
||||
function (dist::Levenshtein)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
dist.max_dist !== nothing && len2 - len1 > dist.max_dist && return dist.max_dist + 1
|
||||
|
@ -174,7 +174,7 @@ DamerauLevenshtein() = DamerauLevenshtein(nothing)
|
|||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
# Return max_dist + 1 if distance higher than max_dist
|
||||
function (dist::DamerauLevenshtein)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
dist.max_dist !== nothing && len2 - len1 > dist.max_dist && return dist.max_dist + 1
|
||||
|
@ -241,7 +241,7 @@ region on either side of the longest common subsequence.
|
|||
struct RatcliffObershelp <: SemiMetric end
|
||||
|
||||
function (dist::RatcliffObershelp)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
|
|
@ -90,12 +90,12 @@ abstract type AbstractQGramMatchCounter end
|
|||
abstract type AbstractQGramDistance <: SemiMetric end
|
||||
|
||||
function (dist::AbstractQGramDistance)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
counter = newcounter(dist)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
counter = eval_start(dist)
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
count!(dist, counter, n1, n2)
|
||||
counter = eval_op(dist, counter, n1, n2)
|
||||
end
|
||||
calculate(dist, counter)
|
||||
eval_reduce(dist, counter)
|
||||
end
|
||||
|
||||
|
||||
|
@ -114,16 +114,9 @@ that contains the number of times a q-gram appears for the string s
|
|||
struct QGram <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
mutable struct SingleCounter <: AbstractQGramMatchCounter
|
||||
shared::Int
|
||||
end
|
||||
|
||||
newcounter(::QGram) = SingleCounter(0)
|
||||
@inline function count!(::QGram, c::SingleCounter, n1::Integer, n2::Integer)
|
||||
c.shared += abs(n1 - n2)
|
||||
end
|
||||
calculate(::QGram, c::SingleCounter) = c.shared
|
||||
eval_start(::QGram) = 0
|
||||
@inline eval_op(::QGram, c, n1::Integer, n2::Integer) = c + abs(n1 - n2)
|
||||
eval_reduce(::QGram, c) = c
|
||||
|
||||
"""
|
||||
Cosine(q::Int)
|
||||
|
@ -140,21 +133,9 @@ that contains the number of times a q-gram appears for the string s
|
|||
struct Cosine <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
mutable struct ThreeCounters <: AbstractQGramMatchCounter
|
||||
left::Int
|
||||
right::Int
|
||||
shared::Int
|
||||
end
|
||||
|
||||
newcounter(::Cosine) = ThreeCounters(0, 0, 0)
|
||||
@inline function count!(::Cosine, c::ThreeCounters, n1::Integer, n2::Integer)
|
||||
c.left += n1^2
|
||||
c.right += n2^2
|
||||
c.shared += n1 * n2
|
||||
end
|
||||
calculate(::Cosine, c::ThreeCounters) =
|
||||
1.0 - c.shared / (sqrt(c.left) * sqrt(c.right))
|
||||
eval_start(::Cosine) = (0, 0, 0)
|
||||
@inline eval_op(::Cosine, c, n1::Integer, n2::Integer) = (c[1] + n1^2, c[2] + n2^2, c[3] + n1 * n2)
|
||||
# Turn the accumulated tuple `c` into the cosine distance. As built up by
# `eval_op(::Cosine, ...)`, c[1] and c[2] are the sums of squared q-gram counts
# of the two inputs and c[3] is the sum of the products of their counts.
# Both factors are converted with `float` before multiplying: the integer
# product c[1] * c[2] can wrap around for long, repetitive inputs, which would
# make `sqrt` throw a DomainError (or silently yield a wrong distance).
eval_reduce(::Cosine, c) = 1 - c[3] / sqrt(float(c[1]) * float(c[2]))
|
||||
|
||||
"""
|
||||
Jaccard(q::Int)
|
||||
|
@ -170,14 +151,9 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
|
|||
struct Jaccard <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
newcounter(::Jaccard) = ThreeCounters(0, 0, 0)
|
||||
@inline function count!(::Jaccard, c::ThreeCounters, n1::Integer, n2::Integer)
|
||||
c.left += n1 > 0
|
||||
c.right += n2 > 0
|
||||
c.shared += (n1 > 0) & (n2 > 0)
|
||||
end
|
||||
calculate(::Jaccard, c::ThreeCounters) =
|
||||
1.0 - c.shared / (c.left + c.right - c.shared)
|
||||
eval_start(::Jaccard) = (0, 0, 0)
|
||||
@inline eval_op(::Jaccard, c, n1::Integer, n2::Integer) = (c[1] + (n1 > 0), c[2] + (n2 > 0), c[3] + (n1 > 0) * (n2 > 0))
|
||||
eval_reduce(::Jaccard, c) = 1 - c[3] / (c[1] + c[2] - c[3])
|
||||
|
||||
"""
|
||||
SorensenDice(q::Int)
|
||||
|
@ -193,14 +169,9 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
|
|||
struct SorensenDice <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
newcounter(::SorensenDice) = ThreeCounters(0, 0, 0)
|
||||
@inline function count!(::SorensenDice, c::ThreeCounters, n1::Integer, n2::Integer)
|
||||
c.left += n1 > 0
|
||||
c.right += n2 > 0
|
||||
c.shared += (n1 > 0) & (n2 > 0)
|
||||
end
|
||||
calculate(::SorensenDice, c::ThreeCounters) =
|
||||
1.0 - 2.0 * c.shared / (c.left + c.right)
|
||||
eval_start(::SorensenDice) = (0, 0, 0)
|
||||
@inline eval_op(::SorensenDice, c, n1::Integer, n2::Integer) = (c[1] + (n1 > 0), c[2] + (n2 > 0), c[3] + (n1 > 0) * (n2 > 0))
|
||||
eval_reduce(::SorensenDice, c) = 1 - 2 * c[3] / (c[1] + c[2])
|
||||
|
||||
"""
|
||||
Overlap(q::Int)
|
||||
|
@ -216,14 +187,9 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
|
|||
struct Overlap <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
newcounter(::Overlap) = ThreeCounters(0, 0, 0)
|
||||
@inline function count!(::Overlap, c::ThreeCounters, n1::Integer, n2::Integer)
|
||||
c.left += n1 > 0
|
||||
c.right += n2 > 0
|
||||
c.shared += (n1 > 0) & (n2 > 0)
|
||||
end
|
||||
calculate(::Overlap, c::ThreeCounters) =
|
||||
1.0 - c.shared / min(c.left, c.right)
|
||||
eval_start(::Overlap) = (0, 0, 0)
|
||||
@inline eval_op(::Overlap, c, n1::Integer, n2::Integer) = (c[1] + (n1 > 0), c[2] + (n2 > 0), c[3] + (n1 > 0) * (n2 > 0))
|
||||
eval_reduce(::Overlap, c) = 1 - c[3] / min(c[1], c[2])
|
||||
|
||||
"""
|
||||
NMD(q::Int)
|
||||
|
@ -247,16 +213,9 @@ https://www.sciencedirect.com/science/article/pii/S1047320313001417
|
|||
struct NMD <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
newcounter(::NMD) = ThreeCounters(0, 0, 0)
|
||||
@inline function count!(::NMD, c::ThreeCounters, n1::Integer, n2::Integer)
|
||||
c.left += n1
|
||||
c.right += n2
|
||||
c.shared += max(n1, n2)
|
||||
end
|
||||
calculate(::NMD, c::ThreeCounters) =
|
||||
(c.shared - min(c.left, c.right)) / max(c.left, c.right)
|
||||
|
||||
eval_start(::NMD) = (0, 0, 0)
|
||||
@inline eval_op(::NMD, c, n1::Integer, n2::Integer) = (c[1] + n1, c[2] + n2, c[3] + max(n1, n2))
|
||||
eval_reduce(::NMD, c) = (c[3] - min(c[1], c[2])) / max(c[1], c[2])
|
||||
|
||||
"""
|
||||
MorisitaOverlap(q::Int)
|
||||
|
@ -278,23 +237,6 @@ sum of those counts.
|
|||
struct MorisitaOverlap <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
mutable struct FiveCounters <: AbstractQGramMatchCounter
|
||||
leftsum::Int # sum(m(s1))
|
||||
rightsum::Int # sum(m(s2))
|
||||
leftsq::Int # sum(m(s1).^2)
|
||||
rightsq::Int # sum(m(s2).^2)
|
||||
shared::Int # sum(m(s1) .* m(s2))
|
||||
end
|
||||
|
||||
newcounter(::MorisitaOverlap) = FiveCounters(0, 0, 0, 0, 0)
|
||||
@inline function count!(::MorisitaOverlap, c::FiveCounters, n1::Integer, n2::Integer)
|
||||
c.leftsum += n1
|
||||
c.rightsum += n2
|
||||
c.leftsq += n1^2
|
||||
c.rightsq += n2^2
|
||||
c.shared += n1 * n2
|
||||
end
|
||||
calculate(::MorisitaOverlap, c::FiveCounters) =
|
||||
1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum))
|
||||
|
||||
eval_start(::MorisitaOverlap) = (0, 0, 0, 0, 0)
|
||||
@inline eval_op(::MorisitaOverlap, c, n1::Integer, n2::Integer) = (c[1] + n1, c[2] + n2, c[3] + n1^2, c[4] + n2^2, c[5] + n1 * n2)
|
||||
eval_reduce(::MorisitaOverlap, c) = 1 - 2 * c[5] / (c[3] * c[2] / c[1] + c[4] * c[1] / c[2])
|
||||
|
|
|
@ -51,23 +51,23 @@ end
|
|||
|
||||
function (dist::AbstractQGramDistance)(qc1::QGramDict, qc2::QGramDict)
|
||||
dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramDict must have the same qgram length"))
|
||||
counter = newcounter(dist)
|
||||
counter = eval_start(dist)
|
||||
d1, d2 = qc1.counts, qc2.counts
|
||||
for (k1, c1) in d1
|
||||
index = Base.ht_keyindex2!(d2, k1)
|
||||
for (s1, n1) in d1
|
||||
index = Base.ht_keyindex2!(d2, s1)
|
||||
if index > 0
|
||||
count!(dist, counter, c1, d2.vals[index])
|
||||
counter = eval_op(dist, counter, n1, d2.vals[index])
|
||||
else
|
||||
count!(dist, counter, c1, 0)
|
||||
counter = eval_op(dist, counter, n1, 0)
|
||||
end
|
||||
end
|
||||
for (k2, c2) in d2
|
||||
index = Base.ht_keyindex2!(d1, k2)
|
||||
for (s2, n2) in d2
|
||||
index = Base.ht_keyindex2!(d1, s2)
|
||||
if index <= 0
|
||||
count!(dist, counter, 0, c2)
|
||||
counter = eval_op(dist, counter, 0, n2)
|
||||
end
|
||||
end
|
||||
calculate(dist, counter)
|
||||
eval_reduce(dist, counter)
|
||||
end
|
||||
|
||||
"""
|
||||
|
@ -118,37 +118,37 @@ end
|
|||
# specialized by subtypes for best performance.
|
||||
function (dist::AbstractQGramDistance)(qc1::QGramSortedVector, qc2::QGramSortedVector)
|
||||
dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramSortedVectors must have the same qgram length"))
|
||||
counter = newcounter(dist)
|
||||
counter = eval_start(dist)
|
||||
d1, d2 = qc1.counts, qc2.counts
|
||||
i1 = i2 = 1
|
||||
while true
|
||||
# length can be zero
|
||||
if i2 > length(d2)
|
||||
for i in i1:length(d1)
|
||||
@inbounds count!(dist, counter, d1[i][2], 0)
|
||||
@inbounds counter = eval_op(dist, counter, d1[i][2], 0)
|
||||
end
|
||||
break
|
||||
elseif i1 > length(d1)
|
||||
for i in i2:length(d2)
|
||||
@inbounds count!(dist, counter, 0, d2[i][2])
|
||||
@inbounds counter = eval_op(dist, counter, 0, d2[i][2])
|
||||
end
|
||||
break
|
||||
end
|
||||
@inbounds k1, n1 = d1[i1]
|
||||
@inbounds k2, n2 = d2[i2]
|
||||
cmpval = Base.cmp(k1, k2)
|
||||
@inbounds s1, n1 = d1[i1]
|
||||
@inbounds s2, n2 = d2[i2]
|
||||
cmpval = Base.cmp(s1, s2)
|
||||
if cmpval == -1 # k1 < k2
|
||||
count!(dist, counter, n1, 0)
|
||||
counter = eval_op(dist, counter, n1, 0)
|
||||
i1 += 1
|
||||
elseif cmpval == +1 # k2 < k1
|
||||
count!(dist, counter, 0, n2)
|
||||
elseif cmpval == 1 # k2 < k1
|
||||
counter = eval_op(dist, counter, 0, n2)
|
||||
i2 += 1
|
||||
else
|
||||
count!(dist, counter, n1, n2)
|
||||
counter = eval_op(dist, counter, n1, n2)
|
||||
i1 += 1
|
||||
i2 += 1
|
||||
end
|
||||
end
|
||||
calculate(dist, counter)
|
||||
eval_reduce(dist, counter)
|
||||
end
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ struct Partial{S <: SemiMetric} <: SemiMetric
|
|||
end
|
||||
|
||||
function (dist::Partial)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
out = dist.dist(s1, s2)
|
||||
|
@ -33,7 +33,7 @@ function (dist::Partial)(s1, s2)
|
|||
end
|
||||
|
||||
function (dist::Partial{RatcliffObershelp})(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return dist.dist(s1, s2)
|
||||
|
@ -79,7 +79,7 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric
|
|||
end
|
||||
|
||||
function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1 = join(sort!(split(s1)), " ")
|
||||
s2 = join(sort!(split(s2)), " ")
|
||||
out = dist.dist(s1, s2)
|
||||
|
@ -111,7 +111,7 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric
|
|||
end
|
||||
|
||||
function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
v1 = unique!(sort!(split(s1)))
|
||||
v2 = unique!(sort!(split(s2)))
|
||||
v0 = intersect(v1, v2)
|
||||
|
|
Loading…
Reference in New Issue