Merge pull request #36 from robertfeldt/master

Precounting QGrams
2020-10-24 12:07:32 -07:00 · 2020-10-24 12:07:32 -07:00 · 610a67313a
parent aed1fc2ad8 9d28c36ed5
commit 610a67313a
5 changed files with 309 additions and 61 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -2,7 +2,7 @@ language: julia
 os:
  - linux
 julia:
-  - 1.0
+  - 1.3
  - 1.5
  - nightly
 matrix:
--- a/Project.toml
+++ b/Project.toml
@ -7,11 +7,12 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

 [compat]
 Distances = "0.8.1, 0.9, 0.10"
-julia = "1"
+julia = "1.3"

 [extras]
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

 [targets]
-test = ["Test", "Unicode"]
+test = ["Test", "Unicode", "Random"]
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -30,6 +30,8 @@ Cosine,
 Jaccard,
 SorensenDice,
 Overlap,
+QGramDict,
+QGramSortedVector,
 Winkler,
 Partial,
 TokenSort,
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -29,7 +29,6 @@ Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
 qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q)
 qgrams(s, q::Integer) = QGramIterator(collect(s), q)

-
@doc """
 Return an iterator corresponding to the the q-gram of an iterator. 
 When the iterator is a String, qgrams are SubStrings.
@ -47,7 +46,6 @@ end
 """ 
 qgrams

-
 # For two iterators s1 and s2, that define a length and eltype method,
 # this returns an iterator that,
 # for each element in s1 ∪ s2, returns (numbers of times it appears in s1, numbers of times it appears in s2)
@ -80,9 +78,184 @@ function _count(s1, s2)
 	return values(d)
 end

+# Turn a sequence of qgrams to a count dict for them, i.e. map each
+# qgram to the number of times it has been seen.
+function countdict(qgrams)
+    d = Dict{eltype(qgrams), Int32}()
+    for qg in qgrams
+        index = Base.ht_keyindex2!(d, qg)
+		if index > 0
+			d.age += 1
+			@inbounds d.keys[index] = qg
+			@inbounds d.vals[index] = d.vals[index][1] + 1
+		else
+			@inbounds Base._setindex!(d, 1, qg, -index)
+		end
+    end
+    d
+end
+
+abstract type AbstractQGramCounts{Q,K} end
+q(qc::AbstractQGramCounts{Q,K}) where {Q,K} = Q
+counts(qc::AbstractQGramCounts) = qc.counts
+
+"""
+	QGramDict(s, q::Integer = 2)
+
+Creates a QGramDict that pre-calculates (pre-counts) the qgrams
+of a string or stream. This enables faster calculation of QGram 
+distances.
+
+Note that the qgram length must correspond with the q length used
+in the distance.
+
+## Examples
+```julia
+str1, str2 = "my string", "another string"
+qd1 = QGramDict(str1, 2)
+qd2 = QGramDict(str2, 2)
+evaluate(Overlap(2), qd1, qd2)
+```
+"""
+struct QGramDict{Q,K} <: AbstractQGramCounts{Q,K}
+    counts::Dict{K,Int}
+end
+function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
+    @assert q >= 1
+    qgs = qgrams(s, q)
+    QGramDict{q, eltype(qgs)}(countdict(qgs))
+end
+QGramDict(s, q::Integer = 2) = QGramDict(collect(s), q)
+
+"""
+	QGramSortedVector(s, q::Integer = 2)
+
+Creates a QGramSortedVector that pre-calculates (pre-counts) the 
+qgrams of a string or stream. This enables faster calculation of
+QGram distances.
+
+Since qgrams are sorted in lexicographic order, QGram distances can be
+calculated even faster than when using a QGramDict. The sorting does
+mean that updating the counts after creation is less efficient.
+Still, for most use cases QGramSortedVector is preferred over a
+QGramDict.
+
+Note that the qgram length must correspond with the q length used
+in the distance.
+
+## Examples
+```julia
+str1, str2 = "my string", "another string"
+qs1 = QGramSortedVector(str1, 2)
+qs2 = QGramSortedVector(str2, 2)
+evaluate(Jaccard(2), qs1, qs2)
+```
+"""
+struct QGramSortedVector{Q,K} <: AbstractQGramCounts{Q,K}
+    counts::Vector{Pair{K,Int}}
+end
+function QGramSortedVector(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
+    @assert q >= 1
+    qgs = qgrams(s, q)
+    countpairs = collect(countdict(qgs))
+    sort!(countpairs, by = first)
+    QGramSortedVector{q, eltype(qgs)}(countpairs)
+end
+QGramSortedVector(s, q::Integer = 2) = QGramSortedVector(collect(s), q)
+
+# To implement the distances we will count qgram matches
+# between strings or pre-calculated AbstractQGramCounts objects.
+# The abstract type defines different fallback versions which can be
+# specialized by subtypes for best performance.
+abstract type AbstractQGramMatchCounter end
+@inline countleft!(c::AbstractQGramMatchCounter, qg, n1::Integer) = countleft!(c, n1)
+@inline countright!(c::AbstractQGramMatchCounter, qg, n2::Integer) = countright!(c, n2)
+@inline countboth!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer) =
+	countboth!(c, n1, n2)
+@inline function countboth!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer)
+	countleft!(c, n1)
+	countright!(c, n2)
+	countshared!(c, n1, n2)
+end
+@inline countshared!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer) = countshared!(c, n1, n2)
+
+# Subtypes must implement these methods:
+@inline countleft!(c::AbstractQGramMatchCounter, n1::Integer) =
+	error("countleft! not implemented for $(typeof(c))")
+@inline countright!(c::AbstractQGramMatchCounter, n2::Integer) =
+	error("countright! not implemented for $(typeof(c))")
+
+# Subtypes must either override countboth! from above (so that it does not use countshared!) or implement:
+@inline countshared!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer) =
+	error("countshared! not implemented for $(typeof(c))")
+
+function countmatches!(mc::AbstractQGramMatchCounter, d1::Vector{Pair{K,I}}, d2::Vector{Pair{K,I}}) where {K,I<:Integer}
+    i1 = i2 = 1
+    while i1 <= length(d1) || i2 <= length(d2)
+        if i2 > length(d2)
+			for i in i1:length(d1)
+				@inbounds countleft!(mc, d1[i][1], d1[i][2])
+            end
+            return
+        elseif i1 > length(d1)
+			for i in i2:length(d2)
+				@inbounds countright!(mc, d2[i][1], d2[i][2])
+            end
+            return
+        end
+        @inbounds k1, n1 = d1[i1]
+        @inbounds k2, n2 = d2[i2]
+        cmpval = Base.cmp(k1, k2)
+		if cmpval == -1 # k1 < k2
+			countleft!(mc, k1, n1)
+            i1 += 1
+        elseif cmpval == +1 # k2 < k1
+			countright!(mc, k2, n2)
+            i2 += 1
+		else
+			countboth!(mc, k1, n1, n2)
+            i1 += 1
+            i2 += 1
+        end
+    end
+end
+
+function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K,I}) where {K,I<:Integer}
+    for (k1, c1) in d1
+        index = Base.ht_keyindex2!(d2, k1)
+		if index > 0
+			countboth!(mc, k1, c1, d2.vals[index])
+		else
+			countleft!(mc, k1, c1)
+        end
+    end
+    for (k2, c2) in d2
+        index = Base.ht_keyindex2!(d1, k2)
+		if index <= 0
+			countright!(mc, k2, c2)
+        end
+    end
+end

 abstract type QGramDistance <: SemiMetric end

+function (dist::QGramDistance)(s1, s2)
+	((s1 === missing) | (s2 === missing)) && return missing
+	counter = newcounter(dist)
+	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
+		countboth!(counter, n1, n2)
+	end
+	calculate(dist, counter)
+end
+
+function (dist::QGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
+    @assert dist.q == q(qc1)
+	@assert dist.q == q(qc2)
+	counter = newcounter(dist)
+	countmatches!(counter, counts(qc1), counts(qc2))
+    calculate(dist, counter)
+end
+
 """
 	QGram(q::Int)

@ -99,15 +272,17 @@ struct QGram <: QGramDistance
 	q::Int
 end

-function (dist::QGram)(s1, s2)
-	((s1 === missing) | (s2 === missing)) && return missing
-	n = 0
-	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
-		n += abs(n1 - n2)
-	end
-	n
+mutable struct SingleCounter{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
+	n::T
 end

+newcounter(d::QGram) = SingleCounter{Int, QGram}(0)
+
+@inline countleft!(c::SingleCounter{Int, QGram}, n1::Integer) = c.n += n1 # n1 === abs(n1 - 0)
+@inline countright!(c::SingleCounter{Int, QGram}, n2::Integer) = c.n += n2 # n2 === abs(0 - n2)
+@inline countboth!(c::SingleCounter{Int, QGram}, n1::Integer, n2::Integer) = c.n += abs(n1 - n2)
+
+calculate(dist::QGram, c::SingleCounter{Int, QGram}) = c.n

 """
 	Cosine(q::Int)
@ -125,17 +300,20 @@ struct Cosine <: QGramDistance
 	q::Int
 end

-function (dist::Cosine)(s1, s2)
-	((s1 === missing) | (s2 === missing)) && return missing
-	norm1, norm2, prodnorm = 0, 0, 0
-	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
-		norm1 += n1^2
-		norm2 += n2^2
-		prodnorm += n1 * n2
-	end
-	1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
+mutable struct ThreeCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
+	left::T
+	right::T
+	shared::T
 end

+newcounter(d::Cosine) = ThreeCounters{Int, Cosine}(0, 0, 0)
+
+@inline countleft!(c::ThreeCounters{Int, Cosine}, n1::Integer) = c.left += n1^2
+@inline countright!(c::ThreeCounters{Int, Cosine}, n2::Integer) = c.right += n2^2
+@inline countshared!(c::ThreeCounters{Int, Cosine}, n1::Integer, n2::Integer) = c.shared += n1 * n2
+
+calculate(d::Cosine, c::ThreeCounters{Int, Cosine}) =
+	1.0 - c.shared / (sqrt(c.left) * sqrt(c.right))

 """
 	Jaccard(q::Int)
@ -152,17 +330,8 @@ struct Jaccard <: QGramDistance
 	q::Int
 end

-function (dist::Jaccard)(s1, s2)
-	((s1 === missing) | (s2 === missing)) && return missing
-	ndistinct1, ndistinct2, nintersect = 0, 0, 0
-	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
-		ndistinct1 += n1 > 0
-		ndistinct2 += n2 > 0
-		nintersect += (n1 > 0) & (n2 > 0)
-	end
-	1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
-end
-
+calculate(d::Jaccard, c::ThreeCounters{Int, Jaccard}) =
+	1.0 - c.shared / (c.left + c.right - c.shared)

 """
 	SorensenDice(q::Int)
@ -179,17 +348,8 @@ struct SorensenDice <: QGramDistance
 	q::Int
 end

-function (dist::SorensenDice)(s1, s2)
-	((s1 === missing) | (s2 === missing)) && return missing
-	ndistinct1, ndistinct2, nintersect = 0, 0, 0
-	for (n1, n2) in  _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
-		ndistinct1 += n1 > 0
-		ndistinct2 += n2 > 0
-		nintersect += (n1 > 0) & (n2 > 0)
-	end
-	1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)
-end
-
+calculate(d::SorensenDice, c::ThreeCounters{Int, SorensenDice}) =
+	1.0 - 2.0 * c.shared / (c.left + c.right)

 """
 	Overlap(q::Int)
@ -206,14 +366,15 @@ struct Overlap <: QGramDistance
 	q::Int
 end

-function (dist::Overlap)(s1, s2)
-	((s1 === missing) | (s2 === missing)) && return missing
-	ndistinct1, ndistinct2, nintersect = 0, 0, 0
-	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
-		ndistinct1 += n1 > 0
-		ndistinct2 += n2 > 0
-		nintersect += (n1 > 0) & (n2 > 0)
-	end
-	1.0 - nintersect / min(ndistinct1, ndistinct2)
-end
+const IntersectionDist = Union{Jaccard, SorensenDice, Overlap}
+newcounter(d::IntersectionDist) = ThreeCounters{Int, typeof(d)}(0, 0, 0)

+@inline countleft!(c::ThreeCounters{Int, QD}, n1::Integer) where {QD<:IntersectionDist} =
+	c.left += (n1 > 0)
+@inline countright!(c::ThreeCounters{Int, QD}, n2::Integer) where {QD<:IntersectionDist} =
+	c.right += (n2 > 0)
+@inline countshared!(c::ThreeCounters{Int, QD}, n1::Integer, n2::Integer) where {QD<:IntersectionDist} =
+	c.shared += (n1 > 0) & (n2 > 0)
+
+calculate(d::Overlap, c::ThreeCounters{Int, Overlap}) =
+	1.0 - c.shared / min(c.left, c.right)
--- a/test/distances.jl
+++ b/test/distances.jl
@ -1,5 +1,4 @@
-
-using StringDistances, Unicode, Test
+using StringDistances, Unicode, Test, Random

@testset "Distances" begin

@ -15,9 +14,6 @@ using StringDistances, Unicode, Test
 		@test ismissing(evaluate(Jaro(), "", missing))
 	end

-
-
-
 	@testset "Levenshtein" begin
 		@test evaluate(Levenshtein(), "", "") == 0
 		@test evaluate(Levenshtein(), "abc", "") == 3
@ -70,7 +66,6 @@ using StringDistances, Unicode, Test
 		@test ismissing(evaluate(RatcliffObershelp(), "", missing))
 	end

-
 	@testset "QGram" begin
 		@test evaluate(QGram(1), "abc", "abc") == 0
 		@test evaluate(QGram(1), "", "abc") == 3
@ -85,8 +80,6 @@ using StringDistances, Unicode, Test
 		@inferred evaluate(QGram(1), "", "")
 	end

-
-
 	@testset "Cosine" begin
 		@test isnan(evaluate(Cosine(2), "", "abc"))
 		@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
@ -130,8 +123,99 @@ using StringDistances, Unicode, Test
 		@test ismissing(evaluate(Overlap(1), "", missing))
 	end

+	@testset "QGramDict and QGramSortedVector counts qgrams" begin
+		# To get something we can more easily compare to:
+		stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
+		stringify(p::Pair{V, <:Integer}) where {S<:AbstractString,V<:AbstractVector{S}} = (map(string, first(p)), last(p))
+		sortedcounts(qc) = sort(collect(StringDistances.counts(qc)), by = first)
+		totuples(qc) = map(stringify, sortedcounts(qc))

+		s1, s2   = "arnearne", "arnebeda"

+		qd1, qd2 = QGramDict(s1, 2), QGramDict(s2, 2)
+		@test totuples(qd1) == [("ar", 2), ("ea", 1), ("ne", 2), ("rn", 2)]
+		@test totuples(qd2) == [("ar", 1), ("be", 1), ("da", 1), ("eb", 1), ("ed", 1), ("ne", 1), ("rn", 1)]
+
+		qc1, qc2 = QGramSortedVector(s1, 2), QGramSortedVector(s2, 2)
+		@test totuples(qc1) == [("ar", 2), ("ea", 1), ("ne", 2), ("rn", 2)]
+		@test totuples(qc2) == [("ar", 1), ("be", 1), ("da", 1), ("eb", 1), ("ed", 1), ("ne", 1), ("rn", 1)]
+
+		s3 = "rgówów"
+		qd3a = QGramDict(s3, 2)
+		@test totuples(qd3a) == [("gó", 1), ("rg", 1), ("wó", 1), ("ów", 2)]
+
+		qd3b = QGramDict(graphemes(s3), 2)
+		@test totuples(qd3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)]
+
+		qc3a = QGramSortedVector(s3, 2)
+		@test totuples(qc3a) == [("gó", 1), ("rg", 1), ("wó", 1), ("ów", 2)]
+
+		qd3b = QGramDict(graphemes(s3), 2)
+		@test totuples(qd3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)]
+	end
+
+	function partlyoverlappingstrings(sizerange, chars = [])
+		str1 = if length(chars) < 1
+			randstring(rand(sizerange))
+		else
+			randstring(chars, rand(sizerange))
+		end
+		elems = collect(str1)
+		ci1 = prevind(str1, rand(2:div(length(elems), 2)))
+		ci2 = prevind(str1, rand((ci1+1):(length(elems)-1)))
+		str2 = if length(chars) < 1
+			randstring(ci1-1) * join(elems[ci1:ci2]) * randstring(length(str1)-ci2)
+		else
+			randstring(chars, ci1-1) * join(elems[ci1:ci2]) * randstring(chars, length(str1)-ci2)
+		end
+		return str1, str2
+	end
+
+	@testset "Precalculation on unicode strings" begin
+		Chars = vcat(map(collect, ["δσμΣèìòâôîêûÊÂÛ", 'a':'z', '0':'9'])...)
+		for _ in 1:100
+			str1, str2 = partlyoverlappingstrings(10:100, Chars)
+			qlen = rand(2:5)
+			d = Jaccard(qlen)
+
+			qd1 = QGramDict(str1, qlen)
+			qd2 = QGramDict(str2, qlen)
+			@test evaluate(d, str1, str2) == evaluate(d, qd1, qd2)
+
+			qd1b = QGramDict(graphemes(str1), qlen)
+			qd2b = QGramDict(graphemes(str2), qlen)
+			@test evaluate(d, str1, str2) == evaluate(d, qd1b, qd2b)
+
+			qc1 = QGramSortedVector(str1, qlen)
+			qc2 = QGramSortedVector(str2, qlen)
+			@test evaluate(d, str1, str2) == evaluate(d, qc1, qc2)
+
+			qc1b = QGramSortedVector(graphemes(str1), qlen)
+			qc2b = QGramSortedVector(graphemes(str2), qlen)
+			@test evaluate(d, str1, str2) == evaluate(d, qc1b, qc2b)
+		end
+	end
+
+	@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
+		for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap]
+			for _ in 1:100
+				qlen = rand(2:9)
+				dist = D(qlen)
+				str1, str2 = partlyoverlappingstrings(5:10000)
+
+				# QGramDict gets same result as for standard string
+				qd1 = QGramDict(str1, qlen)
+				qd2 = QGramDict(str2, qlen)
+				expected = evaluate(dist, str1, str2)
+				@test expected == evaluate(dist, qd1, qd2)
+
+				# QGramSortedVector gets same result as for standard string
+				qc1 = QGramSortedVector(str1, qlen)
+				qc2 = QGramSortedVector(str2, qlen)
+				@test expected == evaluate(dist, qc1, qc2)
+			end
+		end
+	end

 	strings = [
 	("martha", "marhta"),