Simplify Counter (#53)

Simplify the code by removing the mutable counter structs and using plain Ints (or tuples of Ints) as accumulators everywhere
2021-09-06 10:19:29 -04:00 · 2021-09-06 10:19:29 -04:00 · 254e5e15f6
parent 2aff23fd6c
commit 254e5e15f6
4 changed files with 55 additions and 113 deletions
--- a/src/distances/edit.jl
+++ b/src/distances/edit.jl
@ -11,7 +11,7 @@ end
 Hamming() = Hamming(nothing)

 function (dist::Hamming)(s1, s2)
-    ((s1 === missing) | (s2 === missing)) && return missing
+    (s1 === missing) | (s2 === missing) && return missing
    out = abs(length(s2) - length(s1))
    dist.max_dist !== nothing && out > dist.max_dist && return dist.max_dist + 1
    for (ch1, ch2) in zip(s1, s2)
@ -39,7 +39,7 @@ struct Jaro <: SemiMetric end

 ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
 function (dist::Jaro)(s1, s2)
-    ((s1 === missing) | (s2 === missing)) && return missing
+    (s1 === missing) | (s2 === missing) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # If both are empty, the formula in Wikipedia gives 0
@ -92,7 +92,7 @@ JaroWinkler(; p = 0.1, threshold = 0.3, maxlength = 4) = JaroWinkler(p, threshol

 ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
 function (dist::JaroWinkler)(s1, s2)
-    ((s1 === missing) | (s2 === missing)) && return missing
+    (s1 === missing) | (s2 === missing) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    out = Jaro()(s1, s2)
@ -120,7 +120,7 @@ Levenshtein() = Levenshtein(nothing)
 # Return max_dist + 1 if distance higher than max_dist 
 # to differentiate distance equal to max_dist or not, which is important for find fctions.
 function (dist::Levenshtein)(s1, s2)
-    ((s1 === missing) | (s2 === missing)) && return missing
+    (s1 === missing) | (s2 === missing) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    dist.max_dist !== nothing && len2 - len1 > dist.max_dist && return dist.max_dist + 1
@ -174,7 +174,7 @@ DamerauLevenshtein() = DamerauLevenshtein(nothing)
 ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
 # Return max_dist + 1 if distance higher than max_dist
 function (dist::DamerauLevenshtein)(s1, s2)
-    ((s1 === missing) | (s2 === missing)) && return missing
+    (s1 === missing) | (s2 === missing) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    dist.max_dist !== nothing && len2 - len1 > dist.max_dist && return dist.max_dist + 1
@ -241,7 +241,7 @@ region on either side of the longest common subsequence.
 struct RatcliffObershelp <: SemiMetric end

 function (dist::RatcliffObershelp)(s1, s2)
-    ((s1 === missing) | (s2 === missing)) && return missing
+    (s1 === missing) | (s2 === missing) && return missing
    s1, s2 = reorder(s1, s2)
    n_matched = sum(last.(matching_blocks(s1, s2)))
    len1, len2 = length(s1), length(s2)
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -90,12 +90,12 @@ abstract type AbstractQGramMatchCounter end
 abstract type AbstractQGramDistance <: SemiMetric end

 function (dist::AbstractQGramDistance)(s1, s2)
-	((s1 === missing) | (s2 === missing)) && return missing
-	counter = newcounter(dist)
+	(s1 === missing) | (s2 === missing) && return missing  # `|` binds tighter than `&&`, so this still reads (a | b) && return
+	counter = eval_start(dist)  # initial accumulator for this distance type
 	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
-		count!(dist, counter, n1, n2)
+		counter = eval_op(dist, counter, n1, n2)  # eval_op returns a new accumulator, so rebind it
 	end
-	calculate(dist, counter)
+	eval_reduce(dist, counter)  # turn the accumulated counts into the distance
 end


@ -114,16 +114,9 @@ that contains the number of times a q-gram appears for the string s
 struct QGram <: AbstractQGramDistance
 	q::Int
 end
-
-mutable struct SingleCounter <: AbstractQGramMatchCounter
-	shared::Int
-end
-
-newcounter(::QGram) = SingleCounter(0)
-@inline function count!(::QGram, c::SingleCounter, n1::Integer, n2::Integer)
-	c.shared += abs(n1 - n2)
-end
-calculate(::QGram, c::SingleCounter) = c.shared
+eval_start(::QGram) = zero(Int)  # running sum of |n1 - n2|
+@inline eval_op(::QGram, c, n1::Integer, n2::Integer) = abs(n1 - n2) + c
+eval_reduce(::QGram, c) = identity(c)  # the accumulated sum is returned unchanged

 """
 	Cosine(q::Int)
@ -140,21 +133,9 @@ that contains the  number of times a q-gram appears for the string s
 struct Cosine <: AbstractQGramDistance
 	q::Int
 end
-
-mutable struct ThreeCounters <: AbstractQGramMatchCounter
-	left::Int
-	right::Int
-	shared::Int
-end
-
-newcounter(::Cosine) = ThreeCounters(0, 0, 0)
-@inline function count!(::Cosine, c::ThreeCounters, n1::Integer, n2::Integer)
-	c.left += n1^2
-	c.right += n2^2
-	c.shared += n1 * n2
-end
-calculate(::Cosine, c::ThreeCounters) =
-	1.0 - c.shared / (sqrt(c.left) * sqrt(c.right))
+eval_start(::Cosine) = (0, 0, 0)  # (sum of n1^2, sum of n2^2, sum of n1 * n2)
+@inline eval_op(::Cosine, c, n1::Integer, n2::Integer) = (c[1] + n1^2, c[2] + n2^2, c[3] + n1 * n2)
+eval_reduce(::Cosine, c) = 1 - c[3] / sqrt(float(c[1]) * float(c[2]))  # multiply as floats: the Int product can wrap negative and make sqrt throw

 """
 	Jaccard(q::Int)
@ -170,14 +151,9 @@ where ``Q(s, q)``  denotes the set of q-grams of length n for the string s
 struct Jaccard <: AbstractQGramDistance
 	q::Int
 end
-newcounter(::Jaccard) = ThreeCounters(0, 0, 0)
-@inline function count!(::Jaccard, c::ThreeCounters, n1::Integer, n2::Integer)
-	c.left += n1 > 0
-	c.right += n2 > 0
-	c.shared += (n1 > 0) & (n2 > 0)
-end
-calculate(::Jaccard, c::ThreeCounters) =
-	1.0 - c.shared / (c.left + c.right - c.shared)
+eval_start(::Jaccard) = (0, 0, 0)  # (entries with n1 > 0, entries with n2 > 0, entries with both > 0)
+@inline eval_op(::Jaccard, c, n1::Integer, n2::Integer) = c .+ (n1 > 0, n2 > 0, (n1 > 0) & (n2 > 0))
+eval_reduce(::Jaccard, c) = begin a, b, both = c; 1 - both / (a + b - both) end

 """
 	SorensenDice(q::Int)
@ -193,14 +169,9 @@ where ``Q(s, q)``  denotes the set of q-grams of length n for the string s
 struct SorensenDice <: AbstractQGramDistance
 	q::Int
 end
-newcounter(::SorensenDice) = ThreeCounters(0, 0, 0)
-@inline function count!(::SorensenDice, c::ThreeCounters, n1::Integer, n2::Integer)
-	c.left += n1 > 0
-	c.right += n2 > 0
-	c.shared += (n1 > 0) & (n2 > 0)
-end
-calculate(::SorensenDice, c::ThreeCounters) =
-	1.0 - 2.0 * c.shared / (c.left + c.right)
+eval_start(::SorensenDice) = (0, 0, 0)  # (entries with n1 > 0, entries with n2 > 0, entries with both > 0)
+@inline eval_op(::SorensenDice, c, n1::Integer, n2::Integer) = c .+ (n1 > 0, n2 > 0, (n1 > 0) & (n2 > 0))
+eval_reduce(::SorensenDice, c) = begin a, b, both = c; 1 - (2 * both) / (a + b) end

 """
 	Overlap(q::Int)
@ -216,14 +187,9 @@ where ``Q(s, q)``  denotes the set of q-grams of length n for the string s
 struct Overlap <: AbstractQGramDistance
 	q::Int
 end
-newcounter(::Overlap) = ThreeCounters(0, 0, 0)
-@inline function count!(::Overlap, c::ThreeCounters, n1::Integer, n2::Integer)
-	c.left += n1 > 0
-	c.right += n2 > 0
-	c.shared += (n1 > 0) & (n2 > 0)
-end
-calculate(::Overlap, c::ThreeCounters) =
-	1.0 - c.shared / min(c.left, c.right)
+eval_start(::Overlap) = (0, 0, 0)  # (entries with n1 > 0, entries with n2 > 0, entries with both > 0)
+@inline eval_op(::Overlap, c, n1::Integer, n2::Integer) = c .+ (n1 > 0, n2 > 0, (n1 > 0) & (n2 > 0))
+eval_reduce(::Overlap, c) = begin a, b, both = c; 1 - both / min(a, b) end

 """
 	NMD(q::Int)
@ -247,16 +213,9 @@ https://www.sciencedirect.com/science/article/pii/S1047320313001417
 struct NMD <: AbstractQGramDistance
 	q::Int
 end
-
-newcounter(::NMD) = ThreeCounters(0, 0, 0)
-@inline function count!(::NMD, c::ThreeCounters, n1::Integer, n2::Integer)
-	c.left += n1
-	c.right += n2
-	c.shared += max(n1, n2)
-end
-calculate(::NMD, c::ThreeCounters) =
-	(c.shared - min(c.left, c.right)) / max(c.left, c.right)
-
+eval_start(::NMD) = (0, 0, 0)  # (sum of n1, sum of n2, sum of max(n1, n2))
+@inline eval_op(::NMD, c, n1::Integer, n2::Integer) = c .+ (n1, n2, max(n1, n2))
+eval_reduce(::NMD, c) = begin lo, hi = minmax(c[1], c[2]); (c[3] - lo) / hi end

 """
 	MorisitaOverlap(q::Int)
@ -278,23 +237,6 @@ sum of those counts.
 struct MorisitaOverlap <: AbstractQGramDistance
 	q::Int
 end
-
-mutable struct FiveCounters <: AbstractQGramMatchCounter
-	leftsum::Int    # sum(m(s1))
-	rightsum::Int   # sum(m(s2))
-	leftsq::Int     # sum(m(s1).^2)
-	rightsq::Int    # sum(m(s2).^2)
-	shared::Int     # sum(m(s1) .* m(s2))
-end
-
-newcounter(::MorisitaOverlap) = FiveCounters(0, 0, 0, 0, 0)
-@inline function count!(::MorisitaOverlap, c::FiveCounters, n1::Integer, n2::Integer)
-	c.leftsum += n1
-	c.rightsum += n2
-	c.leftsq += n1^2
-	c.rightsq += n2^2
-	c.shared += n1 * n2
-end
-calculate(::MorisitaOverlap, c::FiveCounters) =
-	1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum))
-
+eval_start(::MorisitaOverlap) = (0, 0, 0, 0, 0)  # (sum n1, sum n2, sum n1^2, sum n2^2, sum n1 * n2)
+@inline eval_op(::MorisitaOverlap, c, n1::Integer, n2::Integer) = (c[1] + n1, c[2] + n2, c[3] + n1^2, c[4] + n2^2, c[5] + n1 * n2)
+eval_reduce(::MorisitaOverlap, c) = 1 - 2 * c[5] / (float(c[3]) * c[2] / c[1] + float(c[4]) * c[1] / c[2])  # promote before multiplying: the Int products can overflow on very long inputs
--- a/src/distances/qgram_preprocessed.jl
+++ b/src/distances/qgram_preprocessed.jl
@ -51,23 +51,23 @@ end

 function (dist::AbstractQGramDistance)(qc1::QGramDict, qc2::QGramDict)
    dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramDict must have the same qgram length"))
-    counter = newcounter(dist)
+    counter = eval_start(dist)  # initial accumulator for this distance type
    d1, d2 = qc1.counts, qc2.counts
-    for (k1, c1) in d1
-        index = Base.ht_keyindex2!(d2, k1)
+    for (s1, n1) in d1  # first pass: every key of d1 together with its count
+        index = Base.ht_keyindex2!(d2, s1)  # NOTE(review): unexported Base internal; index > 0 is treated as "key present" below - confirm across Julia versions
 		if index > 0
-			count!(dist, counter, c1, d2.vals[index])
+			counter = eval_op(dist, counter, n1, d2.vals[index])  # key found in both tables
 		else
-			count!(dist, counter, c1, 0)
+			counter = eval_op(dist, counter, n1, 0)  # key only in d1
        end
    end
-    for (k2, c2) in d2
-        index = Base.ht_keyindex2!(d1, k2)
+    for (s2, n2) in d2  # second pass: only keys absent from d1 are counted here
+        index = Base.ht_keyindex2!(d1, s2)  # NOTE(review): same unexported Base internal as in the first pass
 		if index <= 0
-			count!(dist, counter, 0, c2)
+			counter = eval_op(dist, counter, 0, n2)  # key only in d2
        end
    end
-    calculate(dist, counter)
+    eval_reduce(dist, counter)  # turn the accumulated counts into the distance
 end

 """
@ -118,37 +118,37 @@ end
 # specialied by subtypes for best performance.
 function (dist::AbstractQGramDistance)(qc1::QGramSortedVector, qc2::QGramSortedVector)
    dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramSortedVectors must have the same qgram length"))
-    counter = newcounter(dist)
+    counter = eval_start(dist)  # initial accumulator for this distance type
    d1, d2 = qc1.counts, qc2.counts
    i1 = i2 = 1
    while true
    	# length can be zero
        if i2 > length(d2)
 			for i in i1:length(d1)
-				@inbounds count!(dist, counter, d1[i][2], 0)
+				@inbounds counter = eval_op(dist, counter, d1[i][2], 0)  # d2 exhausted: remaining d1 entries pair with 0
            end
            break
        elseif i1 > length(d1)
 			for i in i2:length(d2)
-				@inbounds count!(dist, counter, 0, d2[i][2])
+				@inbounds counter = eval_op(dist, counter, 0, d2[i][2])  # d1 exhausted: remaining d2 entries pair with 0
            end
            break
        end
-        @inbounds k1, n1 = d1[i1]
-        @inbounds k2, n2 = d2[i2]
-        cmpval = Base.cmp(k1, k2)
+        @inbounds s1, n1 = d1[i1]  # entry is destructured as (key, count)
+        @inbounds s2, n2 = d2[i2]  # entry is destructured as (key, count)
+        cmpval = Base.cmp(s1, s2)  # compare the two current keys to decide which index advances
 		if cmpval == -1 # k1 < k2
-			count!(dist, counter, n1, 0)
+			counter = eval_op(dist, counter, n1, 0)  # s1 has no match at the current d2 position
            i1 += 1
-        elseif cmpval == +1 # k2 < k1
-        	count!(dist, counter, 0, n2)
+        elseif cmpval == 1 # s2 < s1 (the keys formerly named k2, k1)
+        	counter = eval_op(dist, counter, 0, n2)  # s2 has no match at the current d1 position
            i2 += 1
 		else
-			count!(dist, counter, n1, n2)
+			counter = eval_op(dist, counter, n1, n2)  # equal keys: both counts contribute
            i1 += 1
            i2 += 1
        end
    end
-    calculate(dist, counter)
+    eval_reduce(dist, counter)  # turn the accumulated counts into the distance
 end

--- a/src/modifiers.jl
+++ b/src/modifiers.jl
@ -20,7 +20,7 @@ struct Partial{S <: SemiMetric} <: SemiMetric
 end

 function (dist::Partial)(s1, s2)
-    ((s1 === missing) | (s2 === missing)) && return missing
+    (s1 === missing) | (s2 === missing) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    out = dist.dist(s1, s2)
@ -33,7 +33,7 @@ function (dist::Partial)(s1, s2)
 end

 function (dist::Partial{RatcliffObershelp})(s1, s2)
-    ((s1 === missing) | (s2 === missing)) && return missing
+    (s1 === missing) | (s2 === missing) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    len1 == len2 && return dist.dist(s1, s2)
@ -79,7 +79,7 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric
 end

 function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
-    ((s1 === missing) | (s2 === missing)) && return missing
+    (s1 === missing) | (s2 === missing) && return missing
    s1 = join(sort!(split(s1)), " ")
    s2 = join(sort!(split(s2)), " ")
    out = dist.dist(s1, s2)
@ -111,7 +111,7 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric
 end

 function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
-    ((s1 === missing) | (s2 === missing)) && return missing
+    (s1 === missing) | (s2 === missing) && return missing
    v1 = unique!(sort!(split(s1)))
    v2 = unique!(sort!(split(s2)))
    v0 = intersect(v1, v2)