simplify a bit preprocessed qgrams (#50)

2021-08-08 06:58:42 +02:00 · 2021-08-08 06:58:42 +02:00 · e9b224f03f
parent 633a2d85dc
commit e9b224f03f
9 changed files with 281 additions and 297 deletions
--- a/Project.toml
+++ b/Project.toml
@ -1,6 +1,6 @@
 name = "StringDistances"
 uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
-version = "0.10.0"
+version = "0.10.1"

 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
--- a/benchmark/benchmark.jl
+++ b/benchmark/benchmark.jl
@ -14,47 +14,46 @@ end



-@time f(Jaro(), x, y)
+@time f(Jaro(), x, y);
 #0.3s 
-@time f(Levenshtein(), x, y)
+@time f(Levenshtein(), x, y);
 # 0.4s
-@time f(Levenshtein(), x, y, min_score = 0.8)
+@time f(Levenshtein(), x, y, min_score = 0.8);
 # 0.11 
-@time f(DamerauLevenshtein(), x, y)
+@time f(DamerauLevenshtein(), x, y);
 # 0.58s.
-@time f(DamerauLevenshtein(), x, y, min_score = 0.8)
+@time f(DamerauLevenshtein(), x, y, min_score = 0.8);
 # 0.08 (now 0.09)
-@time f(RatcliffObershelp(), x, y)
+@time f(RatcliffObershelp(), x, y);
 # 1.35s




-@time findnearest(x[1], y, Levenshtein())
+@time findnearest(x[1], y, Levenshtein());
 # 0.02
-@time findnearest(x[1], y, DamerauLevenshtein())
+@time findnearest(x[1], y, DamerauLevenshtein());
 # 0.05
-@time findnearest(x[1], y, QGram(2))
+@time findnearest(x[1], y, QGram(2));
 # 0.75



-@time findall(x[1], y, Levenshtein())
+@time findall(x[1], y, Levenshtein());
 # 0.05
-@time findall(x[1], y, DamerauLevenshtein())
+@time findall(x[1], y, DamerauLevenshtein());
 # 0.05
-@time findall(x[1], y, Partial(DamerauLevenshtein()))
+@time findall(x[1], y, Partial(DamerauLevenshtein()));
 # 0.96
-@time findall(x[1], y, QGram(2))
+@time findall(x[1], y, QGram(2));
 # 0.81
-@time findall(x[1], y, TokenSort(DamerauLevenshtein()))
+@time findall(x[1], y, TokenSort(DamerauLevenshtein()));
 # 0.27 (now 0.32)
-@time findall(x[1], y, TokenSet(DamerauLevenshtein()))
+@time findall(x[1], y, TokenSet(DamerauLevenshtein()));
 # 0.55
-@time findall(x[1], y, TokenMax(DamerauLevenshtein()))
+@time findall(x[1], y, TokenMax(DamerauLevenshtein()));
 # 2.25 (now 3.6)
-@time findnearest(x[1], y, DamerauLevenshtein())
-# 0.15
+

 x = map(Random.randstring, rand(5:25,1000))
 y = map(Random.randstring, rand(5:25,1000))
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -6,6 +6,8 @@ import StatsAPI: pairwise, pairwise!
 include("distances/utils.jl")
 include("distances/edit.jl")
 include("distances/qgram.jl")
+include("distances/qgram_preprocessed.jl")
+
 include("modifiers.jl")
 include("normalize.jl")
 include("pairwise.jl")
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -1,6 +1,13 @@
 struct QGramIterator{S <: Union{AbstractString, AbstractVector}}
 	s::S   # Collection
 	q::Int # Length of Qgram
+	function QGramIterator{S}(s, q) where {S <: Union{AbstractString, AbstractVector}}
+		q > 0 || throw(ArgumentError("The qgram length must be higher than zero"))
+		new(s, q)
+	end
+end
+function QGramIterator(s::Union{AbstractString, AbstractVector}, q::Integer)
+	QGramIterator{typeof(s)}(s, q)
 end
 Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)

@ -51,7 +58,7 @@ qgrams
 # for each element in s1 ∪ s2, returns (numbers of times it appears in s1, numbers of times it appears in s2)
 function _count(s1, s2)
 	K = promote_type(eltype(s1), eltype(s2))
-	d = Dict{K, Tuple{Int, Int}}()
+	d = Dict{K, Tuple{Int32, Int32}}()
 	sizehint!(d, length(s1) + length(s2))
 	# I use a faster way to change a dictionary key
 	# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
@ -78,164 +85,7 @@ function _count(s1, s2)
 	return values(d)
 end

-# Turn a sequence of qgrams to a count dict for them, i.e. map each
-# qgram to the number of times it has been seen.
-function countdict(qgrams)
-    d = Dict{eltype(qgrams), Int32}()
-    for qg in qgrams
-        index = Base.ht_keyindex2!(d, qg)
-		if index > 0
-			d.age += 1
-			@inbounds d.keys[index] = qg
-			@inbounds d.vals[index] = d.vals[index][1] + 1
-		else
-			@inbounds Base._setindex!(d, 1, qg, -index)
-		end
-    end
-    d
-end
-
-abstract type AbstractQGramCounts{Q,K} end
-q(qc::AbstractQGramCounts{Q,K}) where {Q,K} = Q
-counts(qc::AbstractQGramCounts) = qc.counts
-Base.length(qc::AbstractQGramCounts{Q}) where Q = length(qc.counts) + Q - 1
-"""
-	QGramDict(s, q::Integer = 2)
-
-Creates a QGramDict that pre-calculates (pre-counts) the qgrams
-of a string or stream. This enables faster calculation of QGram 
-distances.
-
-Note that the qgram length must correspond with the q length used
-in the distance.
-
-## Examples
-```julia
-str1, str2 = "my string", "another string"
-qd1 = QGramDict(str1, 2)
-qd2 = QGramDict(str2, 2)
-evaluate(Overlap(2), qd1, qd2)
-```
-"""
-struct QGramDict{Q,K} <: AbstractQGramCounts{Q,K}
-    counts::Dict{K,Int}
-end
-function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
-    @assert q >= 1
-    qgs = qgrams(s, q)
-    QGramDict{q, eltype(qgs)}(countdict(qgs))
-end
-QGramDict(s, q::Integer = 2) = QGramDict(collect(s), q)
-
-"""
-	QGramSortedVector(s, q::Integer = 2)
-
-Creates a QGramSortedVector that pre-calculates (pre-counts) the 
-qgrams of a string or stream. This enables faster calculation of
-QGram distances.
-
-Since qgrams are sorted in lexicographic order QGram distances can be 
-calculated even faster than when using a QGramDict. However, the 
-sorting means that updating the counts after creation is less 
-efficient. However, for most use cases QGramSortedVector is preferred
-over a QgramDict.
-
-Note that the qgram length must correspond with the q length used
-in the distance.
-
-## Examples
-```julia
-str1, str2 = "my string", "another string"
-qs1 = QGramSortedVector(str1, 2)
-qs2 = QGramSortedVector(str2, 2)
-evaluate(Jaccard(2), qs1, qs2)
-```
-"""
-struct QGramSortedVector{Q,K} <: AbstractQGramCounts{Q,K}
-    counts::Vector{Pair{K,Int}}
-end
-function QGramSortedVector(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
-    @assert q >= 1
-    qgs = qgrams(s, q)
-    countpairs = collect(countdict(qgs))
-    sort!(countpairs, by = first)
-    QGramSortedVector{q, eltype(qgs)}(countpairs)
-end
-QGramSortedVector(s, q::Integer = 2) = QGramSortedVector(collect(s), q)
-
-# To implement the distances we will count qgram matches
-# between strings or pre-calculated AbstractQgramCounts objects.
-# The abstract type defines different fallback versions which can be
-# specialied by subtypes for best performance.
 abstract type AbstractQGramMatchCounter end
-@inline countleft!(c::AbstractQGramMatchCounter, qg, n1::Integer) = countleft!(c, n1)
-@inline countright!(c::AbstractQGramMatchCounter, qg, n2::Integer) = countright!(c, n2)
-@inline countboth!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer) =
-	countboth!(c, n1, n2)
-@inline function countboth!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer)
-	countleft!(c, n1)
-	countright!(c, n2)
-	countshared!(c, n1, n2)
-end
-@inline countshared!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer) = countshared!(c, n1, n2)
-
-# Subtypes must implement these methods:
-@inline countleft!(c::AbstractQGramMatchCounter, n1::Integer) =
-	error("countleft! not implemented for $(typeof(c))")
-@inline countright!(c::AbstractQGramMatchCounter, n2::Integer) =
-	error("countright! not implemented for $(typeof(c))")
-
-# Subtypes either must overwrite countboth! from above (so it not uses countshared!) or implement:
-@inline countshared!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer) =
-	error("countshared! not implemented for $(typeof(c))")
-
-function countmatches!(mc::AbstractQGramMatchCounter, d1::Vector{Pair{K,I}}, d2::Vector{Pair{K,I}}) where {K,I<:Integer}
-    i1 = i2 = 1
-    while i1 <= length(d1) || i2 <= length(d2)
-        if i2 > length(d2)
-			for i in i1:length(d1)
-				@inbounds countleft!(mc, d1[i][1], d1[i][2])
-            end
-            return
-        elseif i1 > length(d1)
-			for i in i2:length(d2)
-				@inbounds countright!(mc, d2[i][1], d2[i][2])
-            end
-            return
-        end
-        @inbounds k1, n1 = d1[i1]
-        @inbounds k2, n2 = d2[i2]
-        cmpval = Base.cmp(k1, k2)
-		if cmpval == -1 # k1 < k2
-			countleft!(mc, k1, n1)
-            i1 += 1
-        elseif cmpval == +1 # k2 < k1
-			countright!(mc, k2, n2)
-            i2 += 1
-		else
-			countboth!(mc, k1, n1, n2)
-            i1 += 1
-            i2 += 1
-        end
-    end
-end
-
-function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K,I}) where {K,I<:Integer}
-    for (k1, c1) in d1
-        index = Base.ht_keyindex2!(d2, k1)
-		if index > 0
-			countboth!(mc, k1, c1, d2.vals[index])
-		else
-			countleft!(mc, k1, c1)
-        end
-    end
-    for (k2, c2) in d2
-        index = Base.ht_keyindex2!(d1, k2)
-		if index <= 0
-			countright!(mc, k2, c2)
-        end
-    end
-end

 abstract type AbstractQGramDistance <: SemiMetric end

@ -243,18 +93,11 @@ function (dist::AbstractQGramDistance)(s1, s2)
 	((s1 === missing) | (s2 === missing)) && return missing
 	counter = newcounter(dist)
 	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
-		countboth!(counter, n1, n2)
+		count!(counter, n1, n2)
 	end
 	calculate(dist, counter)
 end

-function (dist::AbstractQGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
-    @assert dist.q == q(qc1)
-	@assert dist.q == q(qc2)
-	counter = newcounter(dist)
-	countmatches!(counter, counts(qc1), counts(qc2))
-    calculate(dist, counter)
-end

 """
 	QGram(q::Int)
@ -272,17 +115,15 @@ struct QGram <: AbstractQGramDistance
 	q::Int
 end

-mutable struct SingleCounter{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
-	n::T
+mutable struct SingleCounter{QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
+	shared::Int
 end

-newcounter(d::QGram) = SingleCounter{Int, QGram}(0)
-
-@inline countleft!(c::SingleCounter{Int, QGram}, n1::Integer) = c.n += n1 # n1 === abs(n1 - 0)
-@inline countright!(c::SingleCounter{Int, QGram}, n2::Integer) = c.n += n2 # n2 === abs(0 - n2)
-@inline countboth!(c::SingleCounter{Int, QGram}, n1::Integer, n2::Integer) = c.n += abs(n1 - n2)
-
-calculate(dist::QGram, c::SingleCounter{Int, QGram}) = c.n
+newcounter(d::QGram) = SingleCounter{QGram}(0)
+@inline function count!(c::SingleCounter{QGram}, n1::Integer, n2::Integer)
+	c.shared += abs(n1 - n2)
+end
+calculate(dist::QGram, c::SingleCounter{QGram}) = c.shared

 """
 	Cosine(q::Int)
@ -300,19 +141,19 @@ struct Cosine <: AbstractQGramDistance
 	q::Int
 end

-mutable struct ThreeCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
-	left::T
-	right::T
-	shared::T
+mutable struct ThreeCounters{QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
+	left::Int
+	right::Int
+	shared::Int
 end

-newcounter(d::Cosine) = ThreeCounters{Int, Cosine}(0, 0, 0)
-
-@inline countleft!(c::ThreeCounters{Int, Cosine}, n1::Integer) = c.left += n1^2
-@inline countright!(c::ThreeCounters{Int, Cosine}, n2::Integer) = c.right += n2^2
-@inline countshared!(c::ThreeCounters{Int, Cosine}, n1::Integer, n2::Integer) = c.shared += n1 * n2
-
-calculate(d::Cosine, c::ThreeCounters{Int, Cosine}) =
+newcounter(d::Cosine) = ThreeCounters{Cosine}(0, 0, 0)
+@inline function count!(c::ThreeCounters{Cosine}, n1::Integer, n2::Integer)
+	c.left += n1^2
+	c.right += n2^2
+	c.shared += n1 * n2
+end
+calculate(d::Cosine, c::ThreeCounters{Cosine}) =
 	1.0 - c.shared / (sqrt(c.left) * sqrt(c.right))

 """
@ -329,8 +170,13 @@ where ``Q(s, q)``  denotes the set of q-grams of length n for the string s
 struct Jaccard <: AbstractQGramDistance
 	q::Int
 end
-
-calculate(d::Jaccard, c::ThreeCounters{Int, Jaccard}) =
+newcounter(d::Jaccard) = ThreeCounters{Jaccard}(0, 0, 0)
+@inline function count!(c::ThreeCounters{Jaccard}, n1::Integer, n2::Integer)
+	c.left += (n1 > 0)
+	c.right += (n2 > 0)
+	c.shared += (n1 > 0) & (n2 > 0)
+end
+calculate(d::Jaccard, c::ThreeCounters{Jaccard}) =
 	1.0 - c.shared / (c.left + c.right - c.shared)

 """
@ -347,8 +193,13 @@ where ``Q(s, q)``  denotes the set of q-grams of length n for the string s
 struct SorensenDice <: AbstractQGramDistance
 	q::Int
 end
-
-calculate(d::SorensenDice, c::ThreeCounters{Int, SorensenDice}) =
+newcounter(d::SorensenDice) = ThreeCounters{SorensenDice}(0, 0, 0)
+@inline function count!(c::ThreeCounters{SorensenDice}, n1::Integer, n2::Integer)
+	c.left += (n1 > 0)
+	c.right += (n2 > 0)
+	c.shared += (n1 > 0) & (n2 > 0)
+end
+calculate(d::SorensenDice, c::ThreeCounters{SorensenDice}) =
 	1.0 - 2.0 * c.shared / (c.left + c.right)

 """
@ -365,67 +216,15 @@ where ``Q(s, q)``  denotes the set of q-grams of length n for the string s
 struct Overlap <: AbstractQGramDistance
 	q::Int
 end
-
-const IntersectionDist = Union{Jaccard, SorensenDice, Overlap}
-newcounter(d::IntersectionDist) = ThreeCounters{Int, typeof(d)}(0, 0, 0)
-
-@inline countleft!(c::ThreeCounters{Int, QD}, n1::Integer) where {QD<:IntersectionDist} =
+newcounter(d::Overlap) = ThreeCounters{Overlap}(0, 0, 0)
+@inline function count!(c::ThreeCounters{Overlap}, n1::Integer, n2::Integer)
 	c.left += (n1 > 0)
-@inline countright!(c::ThreeCounters{Int, QD}, n2::Integer) where {QD<:IntersectionDist} =
 	c.right += (n2 > 0)
-@inline countshared!(c::ThreeCounters{Int, QD}, n1::Integer, n2::Integer) where {QD<:IntersectionDist} =
 	c.shared += (n1 > 0) & (n2 > 0)
-
-calculate(d::Overlap, c::ThreeCounters{Int, Overlap}) =
+end
+calculate(d::Overlap, c::ThreeCounters{Overlap}) =
 	1.0 - c.shared / min(c.left, c.right)

-"""
-	MorisitaOverlap(q::Int)
-
-Creates a MorisitaOverlap distance, a general, statistical measure of
-dispersion which can also be used on dictionaries such as created
-from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index
-This is more fine-grained than many of the other QGramDistances since
-it is based on the counts per q-gram rather than only which q-grams are
-in the strings.
-
-The distance corresponds to
-
-``(2 * sum(m(s1) .* m(s2)) / (sum(m(s1).^2)*M(s2)/M(s1) + sum(m(s2).^2)*M(s1)/M(s2))``
-
-where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
-sum of those counts.
-"""
-struct MorisitaOverlap <: AbstractQGramDistance
-	q::Int
-end
-
-mutable struct FiveCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
-	leftsum::T    # sum(m(s1))
-	rightsum::T   # sum(m(s2))
-	leftsq::T     # sum(m(s1).^2)
-	rightsq::T    # sum(m(s2).^2)
-	shared::T     # sum(m(s1) .* m(s2))
-end
-
-newcounter(d::MorisitaOverlap) = FiveCounters{Int, MorisitaOverlap}(0, 0, 0, 0, 0)
-
-@inline function countleft!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer)
-	c.leftsum += n1
-	c.leftsq += (n1^2)
-end
-
-@inline function countright!(c::FiveCounters{Int, MorisitaOverlap}, n2::Integer)
-	c.rightsum += n2
-	c.rightsq += (n2^2)
-end
-
-@inline countshared!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer, n2::Integer) =
-	c.shared += (n1 * n2)
-
-calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap}) =
-	1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum))
-
 """
 	NMD(q::Int)
 	NMD(q::Int)
@ -449,23 +248,53 @@ struct NMD <: AbstractQGramDistance
 	q::Int
 end

-newcounter(d::NMD) = ThreeCounters{Int, NMD}(0, 0, 0)
-
-@inline function countleft!(c::ThreeCounters{Int, NMD}, n1::Integer)
-	c.left += n1
-	c.shared += n1 # max(n1, 0) == n1
-end
-
-@inline function countright!(c::ThreeCounters{Int, NMD}, n2::Integer)
-	c.right += n2
-	c.shared += n2 # max(n2, 0) == n2
-end
-
-@inline function countboth!(c::ThreeCounters{Int, NMD}, n1::Integer, n2::Integer)
+newcounter(d::NMD) = ThreeCounters{NMD}(0, 0, 0)
+@inline function count!(c::ThreeCounters{NMD}, n1::Integer, n2::Integer)
 	c.left += n1
 	c.right += n2
 	c.shared += max(n1, n2)
 end
-
-calculate(d::NMD, c::ThreeCounters{Int, NMD}) =
+calculate(d::NMD, c::ThreeCounters{NMD}) =
 	(c.shared - min(c.left, c.right)) / max(c.left, c.right)
+
+
+"""
+	MorisitaOverlap(q::Int)
+
+Creates a MorisitaOverlap distance, a general, statistical measure of
+dispersion which can also be used on dictionaries such as created
+from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index
+This is more fine-grained than many of the other QGramDistances since
+it is based on the counts per q-gram rather than only which q-grams are
+in the strings.
+
+The distance corresponds to
+
+``(2 * sum(m(s1) .* m(s2)) / (sum(m(s1).^2)*M(s2)/M(s1) + sum(m(s2).^2)*M(s1)/M(s2))``
+
+where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
+sum of those counts.
+"""
+struct MorisitaOverlap <: AbstractQGramDistance
+	q::Int
+end
+
+mutable struct FiveCounters{QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
+	leftsum::Int    # sum(m(s1))
+	rightsum::Int   # sum(m(s2))
+	leftsq::Int     # sum(m(s1).^2)
+	rightsq::Int    # sum(m(s2).^2)
+	shared::Int     # sum(m(s1) .* m(s2))
+end
+
+newcounter(d::MorisitaOverlap) = FiveCounters{MorisitaOverlap}(0, 0, 0, 0, 0)
+@inline function count!(c::FiveCounters{MorisitaOverlap}, n1::Integer, n2::Integer)
+	c.leftsum += n1
+	c.rightsum += n2
+	c.leftsq += (n1^2)
+	c.rightsq += (n2^2)
+	c.shared += (n1 * n2)
+end
+calculate(d::MorisitaOverlap, c::FiveCounters{MorisitaOverlap}) =
+	1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum))
+
--- a/src/distances/qgram_preprocessed.jl
+++ b/src/distances/qgram_preprocessed.jl
@ -0,0 +1,154 @@
+# sometimes we already preprocess the strings
+# We now define special methods for these special string types
+"""
+	QGramDict(s, q::Integer = 2)
+
+An iterator with a pre-computed dictionary of its qgrams. This enables faster calculation of QGram 
+distances.
+
+Note that the qgram length must correspond with the q length used
+in the distance.
+
+## Examples
+```julia
+str1, str2 = "my string", "another string"
+qd1 = QGramDict(str1, 2)
+qd2 = QGramDict(str2, 2)
+evaluate(Overlap(2), qd1, qd2)
+```
+"""
+struct QGramDict{S, K}
+    s::S                 # the wrapped sequence
+    q::Int               # qgram length used to build `counts`
+    counts::Dict{K,Int}  # qgram => number of occurrences
+end
+
+# Length and iteration are forwarded to the wrapped sequence.
+function Base.length(qd::QGramDict)
+    return length(qd.s)
+end
+function Base.iterate(qd::QGramDict)
+    return iterate(qd.s)
+end
+function Base.iterate(qd::QGramDict, state)
+    return iterate(qd.s, state)
+end
+
+# Build a `QGramDict` from `s` by counting its qgrams of length `q`.
+function QGramDict(s, q::Integer = 2)
+    # An input that is already a QGramDict with the requested qgram length is reused as is.
+    if s isa QGramDict && s.q == q
+        return s
+    end
+    grams = qgrams(s, q)
+    return QGramDict{typeof(s), eltype(grams)}(s, q, countdict(grams))
+end
+
+# Count how many times each qgram occurs: returns a `Dict` mapping every distinct
+# element of `qgrams` to the number of times it has been seen.
+# Counts are stored as `Int`, the value type of the `counts` field of `QGramDict` and
+# `QGramSortedVector`, so their constructors do not have to convert the result.
+# Only the public `Dict` API is used, so this does not depend on unexported `Base`
+# internals.
+function countdict(qgrams)
+    d = Dict{eltype(qgrams), Int}()
+    for qg in qgrams
+        d[qg] = get(d, qg, 0) + 1
+    end
+    return d
+end
+
+# Evaluate a qgram distance on two `QGramDict`s from their pre-computed counts.
+# `count!` is called once per distinct qgram with (count in qc1, count in qc2); a
+# qgram absent from one side is reported with a count of 0.
+# Throws an `ArgumentError` unless `dist`, `qc1` and `qc2` share the same qgram length.
+function (dist::AbstractQGramDistance)(qc1::QGramDict, qc2::QGramDict)
+    dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramDict must have the same qgram length"))
+    counter = newcounter(dist)
+    d1, d2 = qc1.counts, qc2.counts
+    # Lookups go through the read-only public API (`get`/`haskey`) rather than the
+    # unexported, insertion-oriented `Base.ht_keyindex2!`: evaluating a distance
+    # should never touch the dictionaries of its arguments.
+    for (k1, c1) in d1
+        count!(counter, c1, get(d2, k1, 0))
+    end
+    # qgrams present on both sides were already counted by the loop above
+    for (k2, c2) in d2
+        haskey(d1, k2) || count!(counter, 0, c2)
+    end
+    return calculate(dist, counter)
+end
+
+"""
+	QGramSortedVector(s, q::Integer = 2)
+
+An iterator with a pre-computed sorted vector of its qgrams. This enables faster calculation of QGram 
+distances.
+
+Since qgrams are sorted in lexicographic order, QGram distances can be
+calculated even faster than when using a QGramDict. The sorting means
+that updating the counts after creation is less efficient, but for
+most use cases QGramSortedVector is preferred over a QGramDict.
+
+Note that the qgram length must correspond with the q length used
+in the distance.
+
+## Examples
+```julia
+str1, str2 = "my string", "another string"
+qs1 = QGramSortedVector(str1, 2)
+qs2 = QGramSortedVector(str2, 2)
+evaluate(Jaccard(2), qs1, qs2)
+```
+"""
+struct QGramSortedVector{S, K}
+    s::S                         # the wrapped sequence
+    q::Int                       # qgram length used to build `counts`
+    counts::Vector{Pair{K,Int}}  # (qgram => number of occurrences) pairs
+end
+
+# Length and iteration are forwarded to the wrapped sequence.
+function Base.length(qv::QGramSortedVector)
+    return length(qv.s)
+end
+function Base.iterate(qv::QGramSortedVector)
+    return iterate(qv.s)
+end
+function Base.iterate(qv::QGramSortedVector, state)
+    return iterate(qv.s, state)
+end
+
+# Build a `QGramSortedVector` from `s`: count its qgrams of length `q`, then sort the
+# (qgram => count) pairs by qgram.
+function QGramSortedVector(s, q::Integer = 2)
+    # An input already preprocessed with the requested qgram length is reused as is.
+    if s isa QGramSortedVector && s.q == q
+        return s
+    end
+    grams = qgrams(s, q)
+    sortedpairs = sort!(collect(countdict(grams)); by = first)
+    return QGramSortedVector{typeof(s), eltype(grams)}(s, q, sortedpairs)
+end
+
+
+
+# Evaluate a qgram distance on two `QGramSortedVector`s by walking their
+# (qgram => count) vectors in lockstep, as in the merge step of a merge sort.
+# `count!` is called once per distinct qgram with (count in qc1, count in qc2); a
+# qgram absent from one side is reported with a count of 0.
+# Throws an `ArgumentError` unless `dist`, `qc1` and `qc2` share the same qgram length.
+function (dist::AbstractQGramDistance)(qc1::QGramSortedVector, qc2::QGramSortedVector)
+    dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramSortedVectors must have the same qgram length"))
+    counter = newcounter(dist)
+    v1, v2 = qc1.counts, qc2.counts
+    len1, len2 = length(v1), length(v2)
+    i, j = 1, 1
+    # Advance while both vectors still have entries (either of them may be empty).
+    while i <= len1 && j <= len2
+        @inbounds key1, n1 = v1[i]
+        @inbounds key2, n2 = v2[j]
+        order = Base.cmp(key1, key2)
+        if order == -1 # key1 < key2
+            count!(counter, n1, 0)
+            i += 1
+        elseif order == +1 # key2 < key1
+            count!(counter, 0, n2)
+            j += 1
+        else
+            count!(counter, n1, n2)
+            i += 1
+            j += 1
+        end
+    end
+    # Flush whatever remains on one side; at most one of these two loops runs.
+    for k in i:len1
+        @inbounds count!(counter, v1[k][2], 0)
+    end
+    for k in j:len2
+        @inbounds count!(counter, 0, v2[k][2])
+    end
+    return calculate(dist, counter)
+end
+
+
--- a/src/normalize.jl
+++ b/src/normalize.jl
@ -170,10 +170,10 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
    min_score_atomic = Threads.Atomic{Float64}(min_score)
    scores = [0.0 for _ in 1:Threads.nthreads()]
    is = [0 for _ in 1:Threads.nthreads()]
-    s = _helper(s, dist)
+    s = _helper(dist, s)
    # need collect since @threads requires a length method
-    Threads.@threads for i in collect(eachindex(itr))
-        score = compare(s, _helper(itr[i], dist), dist; min_score = min_score_atomic[])
+    for i in collect(eachindex(itr))
+        score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
        score_old = Threads.atomic_max!(min_score_atomic, score)
        if score >= score_old
            scores[Threads.threadid()] = score
@ -183,12 +183,9 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
    imax = is[argmax(scores)]
    imax == 0 ? (nothing, nothing) : (itr[imax], imax)
 end
-
-function _helper(s, dist::AbstractQGramDistance)
-    s !== missing ? QGramSortedVector(s, dist.q) : s
-end
-_helper(s, dist::StringDistance) = s
-
+_helper(dist::AbstractQGramDistance, ::Missing) = missing
+_helper(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
+_helper(dist::StringDistance, s) = s

 function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
    @warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
@ -218,10 +215,10 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9)
 """
 function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
    out = [Int[] for _ in 1:Threads.nthreads()]
-    s = _helper(s, dist)
+    s = _helper(dist, s)
    # need collect since @threads requires a length method
    Threads.@threads for i in collect(eachindex(itr))
-        score = compare(s, _helper(itr[i], dist), dist; min_score = min_score)
+        score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
        if score >= min_score
            push!(out[Threads.threadid()], i)
        end
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@ -38,7 +38,7 @@ end
    pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)

 Compute distances between all pairs of elements in `xs` and `ys` according to the
-`StringDistance` `dist` and write the result in `R`. `R[i, j]` corrresponds to the distance between `xs[i]` and `ys[j]`.
+`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.

 For AbstractQGramDistances preprocessing will be used either if `preprocess` is set 
 to true or if there are more than 5 elements in `xs`. Set `preprocess` to 
@ -75,11 +75,13 @@ function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abst
    return R
 end

-function _preprocess(xs, dist::AbstractQGramDistance, preprocess)
-    if preprocess === nothing ? length(xs) >= 5 : preprocess 
-        return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
+function _preprocess(xs, dist::StringDistance, preprocess)
+    if preprocess === nothing
+        preprocess = length(xs) >= 5
+    end
+    if (dist isa AbstractQGramDistance) && preprocess
+        return fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
    else
        return xs
    end
 end
-_preprocess(xs, dist::StringDistance, preprocess) = xs
--- a/test/distances.jl
+++ b/test/distances.jl
@ -174,7 +174,7 @@ using StringDistances, Unicode, Test, Random
 		# To get something we can more easily compare to:
 		stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
 		stringify(p::Pair{V, <:Integer}) where {S<:AbstractString,V<:AbstractVector{S}} = (map(string, first(p)), last(p))
-		sortedcounts(qc) = sort(collect(StringDistances.counts(qc)), by = first)
+		sortedcounts(qc) = sort(collect(qc.counts), by = first)
 		totuples(qc) = map(stringify, sortedcounts(qc))

 		s1, s2   = "arnearne", "arnebeda"
--- a/test/modifiers.jl
+++ b/test/modifiers.jl
@ -133,6 +133,7 @@ end
 	@test findnearest("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3)
 	@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
 	@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], QGram(2)) == ("NewYork", 1)
+	@test findnearest("New York", ["Newark", "San Francisco", "NewYork"], QGram(2)) == ("NewYork", 3)

 	# findall
 	@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]