Simplify Counter (#53)

simplify code by removing counters + use Ints everywhere
parametric^2
Matthieu Gomez 2021-09-06 10:19:29 -04:00 committed by GitHub
parent 2aff23fd6c
commit 254e5e15f6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 55 additions and 113 deletions

View File

@ -11,7 +11,7 @@ end
Hamming() = Hamming(nothing)
function (dist::Hamming)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
(s1 === missing) | (s2 === missing) && return missing
out = abs(length(s2) - length(s1))
dist.max_dist !== nothing && out > dist.max_dist && return dist.max_dist + 1
for (ch1, ch2) in zip(s1, s2)
@ -39,7 +39,7 @@ struct Jaro <: SemiMetric end
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
function (dist::Jaro)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
# If both are empty, the formula in Wikipedia gives 0
@ -92,7 +92,7 @@ JaroWinkler(; p = 0.1, threshold = 0.3, maxlength = 4) = JaroWinkler(p, threshol
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
function (dist::JaroWinkler)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
out = Jaro()(s1, s2)
@ -120,7 +120,7 @@ Levenshtein() = Levenshtein(nothing)
# Return max_dist + 1 if distance higher than max_dist
# to differentiate distance equal to max_dist or not, which is important for find functions.
function (dist::Levenshtein)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
dist.max_dist !== nothing && len2 - len1 > dist.max_dist && return dist.max_dist + 1
@ -174,7 +174,7 @@ DamerauLevenshtein() = DamerauLevenshtein(nothing)
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
# Return max_dist + 1 if distance higher than max_dist
function (dist::DamerauLevenshtein)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
dist.max_dist !== nothing && len2 - len1 > dist.max_dist && return dist.max_dist + 1
@ -241,7 +241,7 @@ region on either side of the longest common subsequence.
struct RatcliffObershelp <: SemiMetric end
function (dist::RatcliffObershelp)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
n_matched = sum(last.(matching_blocks(s1, s2)))
len1, len2 = length(s1), length(s2)

View File

@ -90,12 +90,12 @@ abstract type AbstractQGramMatchCounter end
abstract type AbstractQGramDistance <: SemiMetric end
function (dist::AbstractQGramDistance)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
counter = newcounter(dist)
(s1 === missing) | (s2 === missing) && return missing
counter = eval_start(dist)
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
count!(dist, counter, n1, n2)
counter = eval_op(dist, counter, n1, n2)
end
calculate(dist, counter)
eval_reduce(dist, counter)
end
@ -114,16 +114,9 @@ that contains the number of times a q-gram appears for the string s
struct QGram <: AbstractQGramDistance
q::Int
end
mutable struct SingleCounter <: AbstractQGramMatchCounter
shared::Int
end
newcounter(::QGram) = SingleCounter(0)
@inline function count!(::QGram, c::SingleCounter, n1::Integer, n2::Integer)
c.shared += abs(n1 - n2)
end
calculate(::QGram, c::SingleCounter) = c.shared
eval_start(::QGram) = 0
@inline eval_op(::QGram, c, n1::Integer, n2::Integer) = c + abs(n1 - n2)
eval_reduce(::QGram, c) = c
"""
Cosine(q::Int)
@ -140,21 +133,9 @@ that contains the number of times a q-gram appears for the string s
struct Cosine <: AbstractQGramDistance
q::Int
end
mutable struct ThreeCounters <: AbstractQGramMatchCounter
left::Int
right::Int
shared::Int
end
newcounter(::Cosine) = ThreeCounters(0, 0, 0)
@inline function count!(::Cosine, c::ThreeCounters, n1::Integer, n2::Integer)
c.left += n1^2
c.right += n2^2
c.shared += n1 * n2
end
calculate(::Cosine, c::ThreeCounters) =
1.0 - c.shared / (sqrt(c.left) * sqrt(c.right))
eval_start(::Cosine) = (0, 0, 0)
@inline eval_op(::Cosine, c, n1::Integer, n2::Integer) = (c[1] + n1^2, c[2] + n2^2, c[3] + n1 * n2)
eval_reduce(::Cosine, c) = 1 - c[3] / sqrt(c[1] * c[2])
"""
Jaccard(q::Int)
@ -170,14 +151,9 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
struct Jaccard <: AbstractQGramDistance
q::Int
end
newcounter(::Jaccard) = ThreeCounters(0, 0, 0)
@inline function count!(::Jaccard, c::ThreeCounters, n1::Integer, n2::Integer)
c.left += n1 > 0
c.right += n2 > 0
c.shared += (n1 > 0) & (n2 > 0)
end
calculate(::Jaccard, c::ThreeCounters) =
1.0 - c.shared / (c.left + c.right - c.shared)
eval_start(::Jaccard) = (0, 0, 0)
@inline eval_op(::Jaccard, c, n1::Integer, n2::Integer) = (c[1] + (n1 > 0), c[2] + (n2 > 0), c[3] + (n1 > 0) * (n2 > 0))
eval_reduce(::Jaccard, c) = 1 - c[3] / (c[1] + c[2] - c[3])
"""
SorensenDice(q::Int)
@ -193,14 +169,9 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
struct SorensenDice <: AbstractQGramDistance
q::Int
end
newcounter(::SorensenDice) = ThreeCounters(0, 0, 0)
@inline function count!(::SorensenDice, c::ThreeCounters, n1::Integer, n2::Integer)
c.left += n1 > 0
c.right += n2 > 0
c.shared += (n1 > 0) & (n2 > 0)
end
calculate(::SorensenDice, c::ThreeCounters) =
1.0 - 2.0 * c.shared / (c.left + c.right)
eval_start(::SorensenDice) = (0, 0, 0)
@inline eval_op(::SorensenDice, c, n1::Integer, n2::Integer) = (c[1] + (n1 > 0), c[2] + (n2 > 0), c[3] + (n1 > 0) * (n2 > 0))
eval_reduce(::SorensenDice, c) = 1 - 2 * c[3] / (c[1] + c[2])
"""
Overlap(q::Int)
@ -216,14 +187,9 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
struct Overlap <: AbstractQGramDistance
q::Int
end
newcounter(::Overlap) = ThreeCounters(0, 0, 0)
@inline function count!(::Overlap, c::ThreeCounters, n1::Integer, n2::Integer)
c.left += n1 > 0
c.right += n2 > 0
c.shared += (n1 > 0) & (n2 > 0)
end
calculate(::Overlap, c::ThreeCounters) =
1.0 - c.shared / min(c.left, c.right)
eval_start(::Overlap) = (0, 0, 0)
@inline eval_op(::Overlap, c, n1::Integer, n2::Integer) = (c[1] + (n1 > 0), c[2] + (n2 > 0), c[3] + (n1 > 0) * (n2 > 0))
eval_reduce(::Overlap, c) = 1 - c[3] / min(c[1], c[2])
"""
NMD(q::Int)
@ -247,16 +213,9 @@ https://www.sciencedirect.com/science/article/pii/S1047320313001417
struct NMD <: AbstractQGramDistance
q::Int
end
newcounter(::NMD) = ThreeCounters(0, 0, 0)
@inline function count!(::NMD, c::ThreeCounters, n1::Integer, n2::Integer)
c.left += n1
c.right += n2
c.shared += max(n1, n2)
end
calculate(::NMD, c::ThreeCounters) =
(c.shared - min(c.left, c.right)) / max(c.left, c.right)
eval_start(::NMD) = (0, 0, 0)
@inline eval_op(::NMD, c, n1::Integer, n2::Integer) = (c[1] + n1, c[2] + n2, c[3] + max(n1, n2))
eval_reduce(::NMD, c) = (c[3] - min(c[1], c[2])) / max(c[1], c[2])
"""
MorisitaOverlap(q::Int)
@ -278,23 +237,6 @@ sum of those counts.
struct MorisitaOverlap <: AbstractQGramDistance
q::Int
end
mutable struct FiveCounters <: AbstractQGramMatchCounter
leftsum::Int # sum(m(s1))
rightsum::Int # sum(m(s2))
leftsq::Int # sum(m(s1).^2)
rightsq::Int # sum(m(s2).^2)
shared::Int # sum(m(s1) .* m(s2))
end
newcounter(::MorisitaOverlap) = FiveCounters(0, 0, 0, 0, 0)
@inline function count!(::MorisitaOverlap, c::FiveCounters, n1::Integer, n2::Integer)
c.leftsum += n1
c.rightsum += n2
c.leftsq += n1^2
c.rightsq += n2^2
c.shared += n1 * n2
end
calculate(::MorisitaOverlap, c::FiveCounters) =
1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum))
eval_start(::MorisitaOverlap) = (0, 0, 0, 0, 0)
@inline eval_op(::MorisitaOverlap, c, n1::Integer, n2::Integer) = (c[1] + n1, c[2] + n2, c[3] + n1^2, c[4] + n2^2, c[5] + n1 * n2)
eval_reduce(::MorisitaOverlap, c) = 1 - 2 * c[5] / (c[3] * c[2] / c[1] + c[4] * c[1] / c[2])

View File

@ -51,23 +51,23 @@ end
function (dist::AbstractQGramDistance)(qc1::QGramDict, qc2::QGramDict)
dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramDict must have the same qgram length"))
counter = newcounter(dist)
counter = eval_start(dist)
d1, d2 = qc1.counts, qc2.counts
for (k1, c1) in d1
index = Base.ht_keyindex2!(d2, k1)
for (s1, n1) in d1
index = Base.ht_keyindex2!(d2, s1)
if index > 0
count!(dist, counter, c1, d2.vals[index])
counter = eval_op(dist, counter, n1, d2.vals[index])
else
count!(dist, counter, c1, 0)
counter = eval_op(dist, counter, n1, 0)
end
end
for (k2, c2) in d2
index = Base.ht_keyindex2!(d1, k2)
for (s2, n2) in d2
index = Base.ht_keyindex2!(d1, s2)
if index <= 0
count!(dist, counter, 0, c2)
counter = eval_op(dist, counter, 0, n2)
end
end
calculate(dist, counter)
eval_reduce(dist, counter)
end
"""
@ -118,37 +118,37 @@ end
# specialized by subtypes for best performance.
function (dist::AbstractQGramDistance)(qc1::QGramSortedVector, qc2::QGramSortedVector)
dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramSortedVectors must have the same qgram length"))
counter = newcounter(dist)
counter = eval_start(dist)
d1, d2 = qc1.counts, qc2.counts
i1 = i2 = 1
while true
# length can be zero
if i2 > length(d2)
for i in i1:length(d1)
@inbounds count!(dist, counter, d1[i][2], 0)
@inbounds counter = eval_op(dist, counter, d1[i][2], 0)
end
break
elseif i1 > length(d1)
for i in i2:length(d2)
@inbounds count!(dist, counter, 0, d2[i][2])
@inbounds counter = eval_op(dist, counter, 0, d2[i][2])
end
break
end
@inbounds k1, n1 = d1[i1]
@inbounds k2, n2 = d2[i2]
cmpval = Base.cmp(k1, k2)
@inbounds s1, n1 = d1[i1]
@inbounds s2, n2 = d2[i2]
cmpval = Base.cmp(s1, s2)
if cmpval == -1 # k1 < k2
count!(dist, counter, n1, 0)
counter = eval_op(dist, counter, n1, 0)
i1 += 1
elseif cmpval == +1 # k2 < k1
count!(dist, counter, 0, n2)
elseif cmpval == 1 # k2 < k1
counter = eval_op(dist, counter, 0, n2)
i2 += 1
else
count!(dist, counter, n1, n2)
counter = eval_op(dist, counter, n1, n2)
i1 += 1
i2 += 1
end
end
calculate(dist, counter)
eval_reduce(dist, counter)
end

View File

@ -20,7 +20,7 @@ struct Partial{S <: SemiMetric} <: SemiMetric
end
function (dist::Partial)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
out = dist.dist(s1, s2)
@ -33,7 +33,7 @@ function (dist::Partial)(s1, s2)
end
function (dist::Partial{RatcliffObershelp})(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return dist.dist(s1, s2)
@ -79,7 +79,7 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric
end
function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
((s1 === missing) | (s2 === missing)) && return missing
(s1 === missing) | (s2 === missing) && return missing
s1 = join(sort!(split(s1)), " ")
s2 = join(sort!(split(s2)), " ")
out = dist.dist(s1, s2)
@ -111,7 +111,7 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric
end
function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
((s1 === missing) | (s2 === missing)) && return missing
(s1 === missing) | (s2 === missing) && return missing
v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2)))
v0 = intersect(v1, v2)