simplify code by removing counters
parent
f9c328e1c7
commit
12a4eabb5f
|
@ -11,7 +11,7 @@ end
|
||||||
Hamming() = Hamming(nothing)
|
Hamming() = Hamming(nothing)
|
||||||
|
|
||||||
function (dist::Hamming)(s1, s2)
|
function (dist::Hamming)(s1, s2)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
(s1 === missing) | (s2 === missing) && return missing
|
||||||
out = abs(length(s2) - length(s1))
|
out = abs(length(s2) - length(s1))
|
||||||
dist.max_dist !== nothing && out > dist.max_dist && return dist.max_dist + 1
|
dist.max_dist !== nothing && out > dist.max_dist && return dist.max_dist + 1
|
||||||
for (ch1, ch2) in zip(s1, s2)
|
for (ch1, ch2) in zip(s1, s2)
|
||||||
|
@ -39,7 +39,7 @@ struct Jaro <: SemiMetric end
|
||||||
|
|
||||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||||
function (dist::Jaro)(s1, s2)
|
function (dist::Jaro)(s1, s2)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
(s1 === missing) | (s2 === missing) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
# If both are empty, the formula in Wikipedia gives 0
|
# If both are empty, the formula in Wikipedia gives 0
|
||||||
|
@ -92,7 +92,7 @@ JaroWinkler(; p = 0.1, threshold = 0.3, maxlength = 4) = JaroWinkler(p, threshol
|
||||||
|
|
||||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||||
function (dist::JaroWinkler)(s1, s2)
|
function (dist::JaroWinkler)(s1, s2)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
(s1 === missing) | (s2 === missing) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
out = Jaro()(s1, s2)
|
out = Jaro()(s1, s2)
|
||||||
|
@ -120,7 +120,7 @@ Levenshtein() = Levenshtein(nothing)
|
||||||
# Return max_dist + 1 if distance higher than max_dist
|
# Return max_dist + 1 if distance higher than max_dist
|
||||||
# to differentiate distance equal to max_dist or not, which is important for find functions.
|
# to differentiate distance equal to max_dist or not, which is important for find functions.
|
||||||
function (dist::Levenshtein)(s1, s2)
|
function (dist::Levenshtein)(s1, s2)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
(s1 === missing) | (s2 === missing) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
dist.max_dist !== nothing && len2 - len1 > dist.max_dist && return dist.max_dist + 1
|
dist.max_dist !== nothing && len2 - len1 > dist.max_dist && return dist.max_dist + 1
|
||||||
|
@ -174,7 +174,7 @@ DamerauLevenshtein() = DamerauLevenshtein(nothing)
|
||||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||||
# Return max_dist + 1 if distance higher than max_dist
|
# Return max_dist + 1 if distance higher than max_dist
|
||||||
function (dist::DamerauLevenshtein)(s1, s2)
|
function (dist::DamerauLevenshtein)(s1, s2)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
(s1 === missing) | (s2 === missing) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
dist.max_dist !== nothing && len2 - len1 > dist.max_dist && return dist.max_dist + 1
|
dist.max_dist !== nothing && len2 - len1 > dist.max_dist && return dist.max_dist + 1
|
||||||
|
@ -241,7 +241,7 @@ region on either side of the longest common subsequence.
|
||||||
struct RatcliffObershelp <: SemiMetric end
|
struct RatcliffObershelp <: SemiMetric end
|
||||||
|
|
||||||
function (dist::RatcliffObershelp)(s1, s2)
|
function (dist::RatcliffObershelp)(s1, s2)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
(s1 === missing) | (s2 === missing) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
|
|
|
@ -90,12 +90,12 @@ abstract type AbstractQGramMatchCounter end
|
||||||
abstract type AbstractQGramDistance <: SemiMetric end
|
abstract type AbstractQGramDistance <: SemiMetric end
|
||||||
|
|
||||||
function (dist::AbstractQGramDistance)(s1, s2)
|
function (dist::AbstractQGramDistance)(s1, s2)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
(s1 === missing) | (s2 === missing) && return missing
|
||||||
counter = newcounter(dist)
|
counter = eval_start(dist)
|
||||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||||
count!(dist, counter, n1, n2)
|
counter = eval_op(dist, counter, n1, n2)
|
||||||
end
|
end
|
||||||
calculate(dist, counter)
|
eval_reduce(dist, counter)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
@ -115,15 +115,10 @@ struct QGram <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
mutable struct SingleCounter <: AbstractQGramMatchCounter
|
|
||||||
shared::Int
|
|
||||||
end
|
|
||||||
|
|
||||||
newcounter(::QGram) = SingleCounter(0)
|
# Initial accumulator for QGram: a single integer total, starting at zero.
function eval_start(::QGram)
    return 0
end
|
||||||
@inline function count!(::QGram, c::SingleCounter, n1::Integer, n2::Integer)
|
# Fold one pair of values into the QGram accumulator: the running total `c`
# grows by the absolute difference between `n1` and `n2`.
@inline function eval_op(::QGram, c, n1::Integer, n2::Integer)
    difference = abs(n1 - n2)
    return c + difference
end
|
||||||
c.shared += abs(n1 - n2)
|
# The QGram accumulator already is the result, so it is returned unchanged.
function eval_reduce(::QGram, c)
    return c
end
|
||||||
end
|
|
||||||
calculate(::QGram, c::SingleCounter) = c.shared
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Cosine(q::Int)
|
Cosine(q::Int)
|
||||||
|
@ -141,20 +136,9 @@ struct Cosine <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
mutable struct ThreeCounters <: AbstractQGramMatchCounter
|
# Initial accumulator for Cosine: a tuple of three integer sums, all zero.
# `eval_op(::Cosine, ...)` fills them with (sum of n1^2, sum of n2^2, sum of n1 * n2).
function eval_start(::Cosine)
    return (0, 0, 0)
end
|
||||||
left::Int
|
# Fold one pair of values into the Cosine accumulator `c`, returning a new tuple:
# slot 1 gains n1^2, slot 2 gains n2^2 and slot 3 gains the product n1 * n2.
@inline function eval_op(::Cosine, c, n1::Integer, n2::Integer)
    sq1, sq2, cross = c
    return (sq1 + n1^2, sq2 + n2^2, cross + n1 * n2)
end
|
||||||
right::Int
|
# Turn the Cosine accumulator into the final distance.
#
# `c` is the tuple produced by `eval_start(::Cosine)` / `eval_op(::Cosine, ...)`:
# (sum of n1^2, sum of n2^2, sum of n1 * n2). The result is
# 1 - c[3] / sqrt(c[1] * c[2]).
#
# If either sum of squares is zero the division is 0 / 0 and the result is NaN.
function eval_reduce(::Cosine, c)
    sq1, sq2, cross = c
    # Convert to floating point *before* multiplying: both factors are sums of
    # squares, so their integer product can wrap around for long, repetitive
    # inputs, which would yield a wrong value or a DomainError from `sqrt` on a
    # negative number. For products that fit in an Int the result is the same as
    # multiplying first.
    return 1 - cross / sqrt(float(sq1) * float(sq2))
end
|
||||||
shared::Int
|
|
||||||
end
|
|
||||||
|
|
||||||
newcounter(::Cosine) = ThreeCounters(0, 0, 0)
|
|
||||||
@inline function count!(::Cosine, c::ThreeCounters, n1::Integer, n2::Integer)
|
|
||||||
c.left += n1^2
|
|
||||||
c.right += n2^2
|
|
||||||
c.shared += n1 * n2
|
|
||||||
end
|
|
||||||
calculate(::Cosine, c::ThreeCounters) =
|
|
||||||
1.0 - c.shared / (sqrt(c.left) * sqrt(c.right))
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Jaccard(q::Int)
|
Jaccard(q::Int)
|
||||||
|
@ -170,14 +154,10 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
|
||||||
struct Jaccard <: AbstractQGramDistance
|
struct Jaccard <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
newcounter(::Jaccard) = ThreeCounters(0, 0, 0)
|
|
||||||
@inline function count!(::Jaccard, c::ThreeCounters, n1::Integer, n2::Integer)
|
# Initial accumulator for Jaccard: a tuple of three integer tallies, all zero.
# `eval_op(::Jaccard, ...)` fills them with (pairs with n1 > 0, pairs with n2 > 0,
# pairs where both are > 0).
function eval_start(::Jaccard)
    return (0, 0, 0)
end
|
||||||
c.left += n1 > 0
|
# Fold one pair of values into the Jaccard accumulator `c`, returning a new tuple:
# slot 1 is incremented when n1 > 0, slot 2 when n2 > 0, and slot 3 only when
# both are positive (the product of the two Bool flags).
@inline function eval_op(::Jaccard, c, n1::Integer, n2::Integer)
    in_left = n1 > 0
    in_right = n2 > 0
    return (c[1] + in_left, c[2] + in_right, c[3] + in_left * in_right)
end
|
||||||
c.right += n2 > 0
|
# Turn the Jaccard accumulator (left tally, right tally, shared tally) into the
# distance: one minus the shared tally divided by (left + right - shared).
function eval_reduce(::Jaccard, c)
    n_left, n_right, n_shared = c
    n_union = n_left + n_right - n_shared
    return 1 - n_shared / n_union
end
|
||||||
c.shared += (n1 > 0) & (n2 > 0)
|
|
||||||
end
|
|
||||||
calculate(::Jaccard, c::ThreeCounters) =
|
|
||||||
1.0 - c.shared / (c.left + c.right - c.shared)
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
SorensenDice(q::Int)
|
SorensenDice(q::Int)
|
||||||
|
@ -193,14 +173,10 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
|
||||||
struct SorensenDice <: AbstractQGramDistance
|
struct SorensenDice <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
newcounter(::SorensenDice) = ThreeCounters(0, 0, 0)
|
|
||||||
@inline function count!(::SorensenDice, c::ThreeCounters, n1::Integer, n2::Integer)
|
# Initial accumulator for SorensenDice: a tuple of three integer tallies, all zero.
# `eval_op(::SorensenDice, ...)` fills them with (pairs with n1 > 0,
# pairs with n2 > 0, pairs where both are > 0).
function eval_start(::SorensenDice)
    return (0, 0, 0)
end
|
||||||
c.left += n1 > 0
|
# Fold one pair of values into the SorensenDice accumulator `c`, returning a new
# tuple: slot 1 is incremented when n1 > 0, slot 2 when n2 > 0, and slot 3 only
# when both are positive (the product of the two Bool flags).
@inline function eval_op(::SorensenDice, c, n1::Integer, n2::Integer)
    in_left = n1 > 0
    in_right = n2 > 0
    return (c[1] + in_left, c[2] + in_right, c[3] + in_left * in_right)
end
|
||||||
c.right += n2 > 0
|
# Turn the SorensenDice accumulator (left tally, right tally, shared tally) into
# the distance: one minus twice the shared tally divided by (left + right).
function eval_reduce(::SorensenDice, c)
    n_left, n_right, n_shared = c
    n_total = n_left + n_right
    return 1 - 2 * n_shared / n_total
end
|
||||||
c.shared += (n1 > 0) & (n2 > 0)
|
|
||||||
end
|
|
||||||
calculate(::SorensenDice, c::ThreeCounters) =
|
|
||||||
1.0 - 2.0 * c.shared / (c.left + c.right)
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Overlap(q::Int)
|
Overlap(q::Int)
|
||||||
|
@ -216,14 +192,9 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
|
||||||
struct Overlap <: AbstractQGramDistance
|
struct Overlap <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
newcounter(::Overlap) = ThreeCounters(0, 0, 0)
|
# Initial accumulator for Overlap: a tuple of three integer tallies, all zero.
# `eval_op(::Overlap, ...)` fills them with (pairs with n1 > 0, pairs with n2 > 0,
# pairs where both are > 0).
function eval_start(::Overlap)
    return (0, 0, 0)
end
|
||||||
@inline function count!(::Overlap, c::ThreeCounters, n1::Integer, n2::Integer)
|
# Fold one pair of values into the Overlap accumulator `c`, returning a new tuple:
# slot 1 is incremented when n1 > 0, slot 2 when n2 > 0, and slot 3 only when
# both are positive (the product of the two Bool flags).
@inline function eval_op(::Overlap, c, n1::Integer, n2::Integer)
    in_left = n1 > 0
    in_right = n2 > 0
    return (c[1] + in_left, c[2] + in_right, c[3] + in_left * in_right)
end
|
||||||
c.left += n1 > 0
|
# Turn the Overlap accumulator (left tally, right tally, shared tally) into the
# distance: one minus the shared tally divided by the smaller of the two tallies.
function eval_reduce(::Overlap, c)
    n_left, n_right, n_shared = c
    smaller = min(n_left, n_right)
    return 1 - n_shared / smaller
end
|
||||||
c.right += n2 > 0
|
|
||||||
c.shared += (n1 > 0) & (n2 > 0)
|
|
||||||
end
|
|
||||||
calculate(::Overlap, c::ThreeCounters) =
|
|
||||||
1.0 - c.shared / min(c.left, c.right)
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
NMD(q::Int)
|
NMD(q::Int)
|
||||||
|
@ -247,15 +218,9 @@ https://www.sciencedirect.com/science/article/pii/S1047320313001417
|
||||||
struct NMD <: AbstractQGramDistance
|
struct NMD <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
# Initial accumulator for NMD: a tuple of three integer sums, all zero.
# `eval_op(::NMD, ...)` fills them with (sum of n1, sum of n2, sum of max(n1, n2)).
function eval_start(::NMD)
    return (0, 0, 0)
end
|
||||||
newcounter(::NMD) = ThreeCounters(0, 0, 0)
|
# Fold one pair of values into the NMD accumulator `c`, returning a new tuple:
# slot 1 gains n1, slot 2 gains n2 and slot 3 gains the larger of the two.
@inline function eval_op(::NMD, c, n1::Integer, n2::Integer)
    sum1, sum2, sum_max = c
    return (sum1 + n1, sum2 + n2, sum_max + max(n1, n2))
end
|
||||||
@inline function count!(::NMD, c::ThreeCounters, n1::Integer, n2::Integer)
|
# Turn the NMD accumulator (sum of n1, sum of n2, sum of max(n1, n2)) into the
# distance: (sum of maxima - smaller sum) divided by the larger sum.
function eval_reduce(::NMD, c)
    sum1, sum2, sum_max = c
    smaller = min(sum1, sum2)
    larger = max(sum1, sum2)
    return (sum_max - smaller) / larger
end
|
||||||
c.left += n1
|
|
||||||
c.right += n2
|
|
||||||
c.shared += max(n1, n2)
|
|
||||||
end
|
|
||||||
calculate(::NMD, c::ThreeCounters) =
|
|
||||||
(c.shared - min(c.left, c.right)) / max(c.left, c.right)
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
@ -279,22 +244,7 @@ struct MorisitaOverlap <: AbstractQGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
mutable struct FiveCounters <: AbstractQGramMatchCounter
|
# Initial accumulator for MorisitaOverlap: a tuple of five integer sums, all zero.
# `eval_op(::MorisitaOverlap, ...)` fills them with
# (sum of n1, sum of n2, sum of n1^2, sum of n2^2, sum of n1 * n2).
function eval_start(::MorisitaOverlap)
    return (0, 0, 0, 0, 0)
end
|
||||||
leftsum::Int # sum(m(s1))
|
# Fold one pair of values into the MorisitaOverlap accumulator `c`, returning a
# new tuple: slots 1 and 2 gain n1 and n2, slots 3 and 4 gain their squares, and
# slot 5 gains the product n1 * n2.
@inline function eval_op(::MorisitaOverlap, c, n1::Integer, n2::Integer)
    sum1, sum2, sq1, sq2, cross = c
    return (sum1 + n1, sum2 + n2, sq1 + n1^2, sq2 + n2^2, cross + n1 * n2)
end
|
||||||
rightsum::Int # sum(m(s2))
|
# Turn the MorisitaOverlap accumulator
# (sum of n1, sum of n2, sum of n1^2, sum of n2^2, sum of n1 * n2)
# into the distance: one minus twice the cross sum divided by
# (sq1 * sum2 / sum1 + sq2 * sum1 / sum2).
function eval_reduce(::MorisitaOverlap, c)
    sum1, sum2, sq1, sq2, cross = c
    denominator = sq1 * sum2 / sum1 + sq2 * sum1 / sum2
    return 1 - 2 * cross / denominator
end
|
||||||
leftsq::Int # sum(m(s1).^2)
|
|
||||||
rightsq::Int # sum(m(s2).^2)
|
|
||||||
shared::Int # sum(m(s1) .* m(s2))
|
|
||||||
end
|
|
||||||
|
|
||||||
newcounter(::MorisitaOverlap) = FiveCounters(0, 0, 0, 0, 0)
|
|
||||||
@inline function count!(::MorisitaOverlap, c::FiveCounters, n1::Integer, n2::Integer)
|
|
||||||
c.leftsum += n1
|
|
||||||
c.rightsum += n2
|
|
||||||
c.leftsq += n1^2
|
|
||||||
c.rightsq += n2^2
|
|
||||||
c.shared += n1 * n2
|
|
||||||
end
|
|
||||||
calculate(::MorisitaOverlap, c::FiveCounters) =
|
|
||||||
1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum))
|
|
||||||
|
|
||||||
|
|
|
@ -51,23 +51,23 @@ end
|
||||||
|
|
||||||
function (dist::AbstractQGramDistance)(qc1::QGramDict, qc2::QGramDict)
|
function (dist::AbstractQGramDistance)(qc1::QGramDict, qc2::QGramDict)
|
||||||
dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramDict must have the same qgram length"))
|
dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramDict must have the same qgram length"))
|
||||||
counter = newcounter(dist)
|
counter = eval_start(dist)
|
||||||
d1, d2 = qc1.counts, qc2.counts
|
d1, d2 = qc1.counts, qc2.counts
|
||||||
for (k1, c1) in d1
|
for (k1, c1) in d1
|
||||||
index = Base.ht_keyindex2!(d2, k1)
|
index = Base.ht_keyindex2!(d2, k1)
|
||||||
if index > 0
|
if index > 0
|
||||||
count!(dist, counter, c1, d2.vals[index])
|
counter = eval_op(dist, counter, c1, d2.vals[index])
|
||||||
else
|
else
|
||||||
count!(dist, counter, c1, 0)
|
counter = eval_op(dist, counter, c1, 0)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
for (k2, c2) in d2
|
for (k2, c2) in d2
|
||||||
index = Base.ht_keyindex2!(d1, k2)
|
index = Base.ht_keyindex2!(d1, k2)
|
||||||
if index <= 0
|
if index <= 0
|
||||||
count!(dist, counter, 0, c2)
|
counter = eval_op(dist, counter, 0, c2)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
calculate(dist, counter)
|
eval_reduce(dist, counter)
|
||||||
end
|
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
@ -118,19 +118,19 @@ end
|
||||||
# specialized by subtypes for best performance.
|
# specialized by subtypes for best performance.
|
||||||
function (dist::AbstractQGramDistance)(qc1::QGramSortedVector, qc2::QGramSortedVector)
|
function (dist::AbstractQGramDistance)(qc1::QGramSortedVector, qc2::QGramSortedVector)
|
||||||
dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramSortedVectors must have the same qgram length"))
|
dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramSortedVectors must have the same qgram length"))
|
||||||
counter = newcounter(dist)
|
counter = eval_start(dist)
|
||||||
d1, d2 = qc1.counts, qc2.counts
|
d1, d2 = qc1.counts, qc2.counts
|
||||||
i1 = i2 = 1
|
i1 = i2 = 1
|
||||||
while true
|
while true
|
||||||
# length can be zero
|
# length can be zero
|
||||||
if i2 > length(d2)
|
if i2 > length(d2)
|
||||||
for i in i1:length(d1)
|
for i in i1:length(d1)
|
||||||
@inbounds count!(dist, counter, d1[i][2], 0)
|
@inbounds counter = eval_op(dist, counter, d1[i][2], 0)
|
||||||
end
|
end
|
||||||
break
|
break
|
||||||
elseif i1 > length(d1)
|
elseif i1 > length(d1)
|
||||||
for i in i2:length(d2)
|
for i in i2:length(d2)
|
||||||
@inbounds count!(dist, counter, 0, d2[i][2])
|
@inbounds counter = eval_op(dist, counter, 0, d2[i][2])
|
||||||
end
|
end
|
||||||
break
|
break
|
||||||
end
|
end
|
||||||
|
@ -138,17 +138,17 @@ function (dist::AbstractQGramDistance)(qc1::QGramSortedVector, qc2::QGramSortedV
|
||||||
@inbounds k2, n2 = d2[i2]
|
@inbounds k2, n2 = d2[i2]
|
||||||
cmpval = Base.cmp(k1, k2)
|
cmpval = Base.cmp(k1, k2)
|
||||||
if cmpval == -1 # k1 < k2
|
if cmpval == -1 # k1 < k2
|
||||||
count!(dist, counter, n1, 0)
|
counter = eval_op(dist, counter, n1, 0)
|
||||||
i1 += 1
|
i1 += 1
|
||||||
elseif cmpval == +1 # k2 < k1
|
elseif cmpval == 1 # k2 < k1
|
||||||
count!(dist, counter, 0, n2)
|
counter = eval_op(dist, counter, 0, n2)
|
||||||
i2 += 1
|
i2 += 1
|
||||||
else
|
else
|
||||||
count!(dist, counter, n1, n2)
|
counter = eval_op(dist, counter, n1, n2)
|
||||||
i1 += 1
|
i1 += 1
|
||||||
i2 += 1
|
i2 += 1
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
calculate(dist, counter)
|
eval_reduce(dist, counter)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,7 @@ struct Partial{S <: SemiMetric} <: SemiMetric
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::Partial)(s1, s2)
|
function (dist::Partial)(s1, s2)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
(s1 === missing) | (s2 === missing) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
out = dist.dist(s1, s2)
|
out = dist.dist(s1, s2)
|
||||||
|
@ -33,7 +33,7 @@ function (dist::Partial)(s1, s2)
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::Partial{RatcliffObershelp})(s1, s2)
|
function (dist::Partial{RatcliffObershelp})(s1, s2)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
(s1 === missing) | (s2 === missing) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 == len2 && return dist.dist(s1, s2)
|
len1 == len2 && return dist.dist(s1, s2)
|
||||||
|
@ -79,7 +79,7 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
(s1 === missing) | (s2 === missing) && return missing
|
||||||
s1 = join(sort!(split(s1)), " ")
|
s1 = join(sort!(split(s1)), " ")
|
||||||
s2 = join(sort!(split(s2)), " ")
|
s2 = join(sort!(split(s2)), " ")
|
||||||
out = dist.dist(s1, s2)
|
out = dist.dist(s1, s2)
|
||||||
|
@ -111,7 +111,7 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
(s1 === missing) | (s2 === missing) && return missing
|
||||||
v1 = unique!(sort!(split(s1)))
|
v1 = unique!(sort!(split(s1)))
|
||||||
v2 = unique!(sort!(split(s2)))
|
v2 = unique!(sort!(split(s2)))
|
||||||
v0 = intersect(v1, v2)
|
v0 = intersect(v1, v2)
|
||||||
|
|
Loading…
Reference in New Issue