faster calculation of QGramDistances if precounting qgrams either as Dict or sorted array, with tests

pull/36/head
Robert Feldt 2020-10-23 23:33:08 +02:00
parent 99a5e87d36
commit c0bedf89a6
4 changed files with 196 additions and 59 deletions

View File

@ -10,8 +10,9 @@ Distances = "0.8.1, 0.9, 0.10"
julia = "1"
[extras]
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[targets]
test = ["Test", "Unicode"]
test = ["Test", "Unicode", "Random"]

View File

@ -30,6 +30,8 @@ Cosine,
Jaccard,
SorensenDice,
Overlap,
QGramDict,
QGramSortedArray,
Winkler,
Partial,
TokenSort,

View File

@ -29,7 +29,6 @@ Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
# A vector is already indexable, so it is wrapped as-is.
function qgrams(s::AbstractVector, q::Integer)
    return QGramIterator(s, q)
end
# Any other iterable is first materialised with `collect` and then wrapped.
function qgrams(s, q::Integer)
    return QGramIterator(collect(s), q)
end
@doc """
Return an iterator corresponding to the q-grams of an iterator.
When the iterator is a String, qgrams are SubStrings.
@ -47,7 +46,6 @@ end
"""
qgrams
# For two iterators s1 and s2, that define a length and eltype method,
# this returns an iterator that,
# for each element in s1 ∪ s2, returns (number of times it appears in s1, number of times it appears in s2)
@ -80,9 +78,141 @@ function _count(s1, s2)
return values(d)
end
# Turn a sequence of q-grams into a count dictionary, i.e. map each distinct
# q-gram to the number of times it occurs in `qgrams`.
#
# `qgrams` must be iterable and define `eltype`, which becomes the key type.
# Counts are stored as `Int`, the value type used by `QGramDict` and
# `QGramSortedArray`, so no converting copy is needed when wrapping the result.
#
# Only the public `Dict` API is used here: the private hash-table helpers in
# `Base` are not a stable interface, and the counting is a one-off precomputation.
function countdict(qgrams)
    d = Dict{eltype(qgrams), Int}()
    for qg in qgrams
        d[qg] = get(d, qg, 0) + 1
    end
    return d
end
# Supertype of precomputed q-gram count containers. The first type parameter `Q`
# is what `q` reports; `K` is the second type parameter.
abstract type AbstractQGramCounts{Q,K} end

# Read `Q` back from the type parameters.
q(qc::AbstractQGramCounts{Q}) where {Q} = Q

# Accessor for the `counts` field of a count container.
function counts(qc::AbstractQGramCounts)
    return qc.counts
end
# Precomputed q-gram counts held in a dictionary: `counts` maps each q-gram to
# the number of times it was seen. `Q` records the q-gram length that was used
# and `K` the q-gram type.
struct QGramDict{Q,K} <: AbstractQGramCounts{Q,K}
    counts::Dict{K,Int}
end

# Count the q-grams of length `q` (default 2) in `s` and wrap them in a QGramDict.
function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
    @assert q >= 1
    grams = qgrams(s, q)
    tallies = countdict(grams)
    return QGramDict{q, eltype(grams)}(tallies)
end
# Precomputed q-gram counts held as a vector of `qgram => count` pairs sorted by
# q-gram. Intended as a faster alternative to QGramDict, since the q-grams are
# presorted.
struct QGramSortedArray{Q,K} <: AbstractQGramCounts{Q,K}
    counts::Vector{Pair{K,Int}}
end

# Count the q-grams of length `q` (default 2) in `s`, order the pairs by q-gram
# and wrap them in a QGramSortedArray.
function QGramSortedArray(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
    @assert q >= 1
    grams = qgrams(s, q)
    tallies = collect(countdict(grams))
    sort!(tallies; by = first)
    return QGramSortedArray{q, eltype(grams)}(tallies)
end
# Distances are implemented by tallying q-gram matches, either between two
# strings or between two precomputed AbstractQGramCounts objects.
# This abstract type supplies fallback methods that subtypes can specialise
# for best performance.
abstract type AbstractQGramMatchCounter end

# Variants that also receive the q-gram itself. By default the q-gram is
# dropped and the call is forwarded to the count-only method.
@inline function countleft!(c::AbstractQGramMatchCounter, qg, n1::Integer)
    return countleft!(c, n1)
end
@inline function countright!(c::AbstractQGramMatchCounter, qg, n2::Integer)
    return countright!(c, n2)
end
@inline function countboth!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer)
    return countboth!(c, n1, n2)
end
@inline function countshared!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer)
    return countshared!(c, n1, n2)
end

# Default handling of a q-gram counted on both sides: record the left count,
# the right count, and then the shared contribution.
@inline function countboth!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer)
    countleft!(c, n1)
    countright!(c, n2)
    return countshared!(c, n1, n2)
end

# Subtypes must implement these two methods:
@inline function countleft!(c::AbstractQGramMatchCounter, n1::Integer)
    return error("countleft! not implemented for $(typeof(c))")
end
@inline function countright!(c::AbstractQGramMatchCounter, n2::Integer)
    return error("countright! not implemented for $(typeof(c))")
end

# Subtypes must either override countboth! above (so that it does not call
# countshared!) or implement this method:
@inline function countshared!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer)
    return error("countshared! not implemented for $(typeof(c))")
end
# Walk two vectors of `qgram => count` pairs in step and report every q-gram to
# the counter `mc`: q-grams found only in `d1` go to countleft!, those found only
# in `d2` to countright!, and those in both to countboth!.
# The walk advances on the side with the smaller key, so it relies on both
# vectors being ordered by key consistently with `Base.cmp`.
function countmatches!(mc::AbstractQGramMatchCounter, d1::Vector{Pair{K,I}}, d2::Vector{Pair{K,I}}) where {K,I<:Integer}
    len1, len2 = length(d1), length(d2)
    pos1, pos2 = 1, 1
    # Merge phase: runs while both vectors still have unread entries.
    while pos1 <= len1 && pos2 <= len2
        @inbounds key1, cnt1 = d1[pos1]
        @inbounds key2, cnt2 = d2[pos2]
        order = Base.cmp(key1, key2)
        if order == -1 # key1 < key2
            countleft!(mc, key1, cnt1)
            pos1 += 1
        elseif order == +1 # key2 < key1
            countright!(mc, key2, cnt2)
            pos2 += 1
        else
            countboth!(mc, key1, cnt1, cnt2)
            pos1 += 1
            pos2 += 1
        end
    end
    # Drain phase: at most one of the two ranges below is non-empty.
    for i in pos1:len1
        @inbounds countleft!(mc, d1[i][1], d1[i][2])
    end
    for i in pos2:len2
        @inbounds countright!(mc, d2[i][1], d2[i][2])
    end
    return nothing
end
# Report every q-gram of two count dictionaries to the counter `mc`: q-grams
# found only in `d1` go to countleft!, those in both to countboth!, and those
# found only in `d2` to countright!.
#
# Lookups use the read-only public API (`get`/`haskey`). The insertion-oriented
# probe in `Base` may rehash the dictionary it searches, which would mutate
# precomputed counts that are meant to be reused across comparisons.
function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K,I}) where {K,I<:Integer}
    # First pass: everything present in d1, split by whether d2 has it too.
    for (k1, c1) in d1
        c2 = get(d2, k1, nothing)
        if c2 === nothing
            countleft!(mc, k1, c1)
        else
            countboth!(mc, k1, c1, c2)
        end
    end
    # Second pass: only the q-grams of d2 that the first pass did not see.
    for (k2, c2) in d2
        haskey(d1, k2) || countright!(mc, k2, c2)
    end
    return nothing
end
# Supertype of all q-gram based distances.
abstract type QGramDistance <: SemiMetric end

# Evaluate the distance between two sequences: count the q-grams of both,
# feed each (count in s1, count in s2) pair to a fresh counter, and let the
# concrete distance turn the counter into a result.
# Returns `missing` when either argument is `missing`.
function (dist::QGramDistance)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    tally = newcounter(dist)
    paired = _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
    for (n1, n2) in paired
        countboth!(tally, n1, n2)
    end
    return calculate(dist, tally)
end
# Evaluate the distance between two precomputed q-gram count objects of the
# same type. Both must have been built with the q-gram length of `dist`.
function (dist::QGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
    @assert dist.q == q(qc1)
    @assert dist.q == q(qc2)
    tally = newcounter(dist)
    left, right = counts(qc1), counts(qc2)
    countmatches!(tally, left, right)
    return calculate(dist, tally)
end
"""
QGram(q::Int)
@ -99,15 +229,17 @@ struct QGram <: QGramDistance
q::Int
end
function (dist::QGram)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
n = 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
n += abs(n1 - n2)
end
n
# Counter with a single running total `n`. The `QD` parameter is not stored; it
# tags the counter with a distance type so methods can dispatch on it.
mutable struct SingleCounter{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
    n::T
end

# QGram starts from a zeroed integer counter.
function newcounter(d::QGram)
    return SingleCounter{Int, QGram}(0)
end

# A q-gram seen on one side only contributes its whole count,
# since n1 === abs(n1 - 0) and n2 === abs(0 - n2).
@inline function countleft!(c::SingleCounter{Int, QGram}, n1::Integer)
    c.n = c.n + n1
end
@inline function countright!(c::SingleCounter{Int, QGram}, n2::Integer)
    c.n = c.n + n2
end

# A q-gram seen on both sides contributes the absolute difference of its counts.
@inline function countboth!(c::SingleCounter{Int, QGram}, n1::Integer, n2::Integer)
    c.n = c.n + abs(n1 - n2)
end

# The accumulated total is the distance.
function calculate(dist::QGram, c::SingleCounter{Int, QGram})
    return c.n
end
"""
Cosine(q::Int)
@ -125,17 +257,20 @@ struct Cosine <: QGramDistance
q::Int
end
function (dist::Cosine)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
norm1 += n1^2
norm2 += n2^2
prodnorm += n1 * n2
end
1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
# Counter with three running totals: one for the left input, one for the right
# input and one for what they share. The `QD` parameter is not stored; it tags
# the counter with a distance type so methods can dispatch on it.
mutable struct ThreeCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
    left::T
    right::T
    shared::T
end

# Cosine starts from three zeroed integer totals.
function newcounter(d::Cosine)
    return ThreeCounters{Int, Cosine}(0, 0, 0)
end

# Each side accumulates its squared counts; the shared total accumulates the
# product of the two counts.
@inline function countleft!(c::ThreeCounters{Int, Cosine}, n1::Integer)
    c.left = c.left + n1^2
end
@inline function countright!(c::ThreeCounters{Int, Cosine}, n2::Integer)
    c.right = c.right + n2^2
end
@inline function countshared!(c::ThreeCounters{Int, Cosine}, n1::Integer, n2::Integer)
    c.shared = c.shared + n1 * n2
end

# One minus the shared total divided by the product of the square roots of the
# two side totals.
function calculate(d::Cosine, c::ThreeCounters{Int, Cosine})
    denom = sqrt(c.left) * sqrt(c.right)
    return 1.0 - c.shared / denom
end
"""
Jaccard(q::Int)
@ -152,17 +287,8 @@ struct Jaccard <: QGramDistance
q::Int
end
# Jaccard distance computed directly from the paired q-gram counts of s1 and s2.
# Returns `missing` when either argument is `missing`.
function (dist::Jaccard)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    in_first = in_second = in_both = 0
    for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
        seen1 = n1 > 0
        seen2 = n2 > 0
        in_first += seen1
        in_second += seen2
        in_both += seen1 & seen2
    end
    return 1.0 - in_both / (in_first + in_second - in_both)
end
# One minus the shared total over the size of the union (left + right - shared).
function calculate(d::Jaccard, c::ThreeCounters{Int, Jaccard})
    union_size = c.left + c.right - c.shared
    return 1.0 - c.shared / union_size
end
"""
SorensenDice(q::Int)
@ -179,17 +305,8 @@ struct SorensenDice <: QGramDistance
q::Int
end
# Sorensen-Dice distance computed directly from the paired q-gram counts of s1
# and s2. Returns `missing` when either argument is `missing`.
function (dist::SorensenDice)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    in_first = in_second = in_both = 0
    for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
        seen1 = n1 > 0
        seen2 = n2 > 0
        in_first += seen1
        in_second += seen2
        in_both += seen1 & seen2
    end
    return 1.0 - 2.0 * in_both / (in_first + in_second)
end
# One minus twice the shared total over the sum of the two side totals.
function calculate(d::SorensenDice, c::ThreeCounters{Int, SorensenDice})
    total = c.left + c.right
    return 1.0 - 2.0 * c.shared / total
end
"""
Overlap(q::Int)
@ -206,14 +323,15 @@ struct Overlap <: QGramDistance
q::Int
end
# Overlap distance computed directly from the paired q-gram counts of s1 and s2.
# Returns `missing` when either argument is `missing`.
function (dist::Overlap)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    in_first = in_second = in_both = 0
    for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
        seen1 = n1 > 0
        seen2 = n2 > 0
        in_first += seen1
        in_second += seen2
        in_both += seen1 & seen2
    end
    return 1.0 - in_both / min(in_first, in_second)
end
# Jaccard, SorensenDice and Overlap share the same counting rules and differ
# only in how the totals are combined.
const IntersectionDist = Union{Jaccard, SorensenDice, Overlap}

# Fresh zeroed counter tagged with the concrete distance type.
function newcounter(d::IntersectionDist)
    return ThreeCounters{Int, typeof(d)}(0, 0, 0)
end

# These distances count distinct q-grams, so each q-gram adds at most one to a
# total no matter how often it occurs.
@inline function countleft!(c::ThreeCounters{Int, QD}, n1::Integer) where {QD<:IntersectionDist}
    c.left = c.left + (n1 > 0)
end
@inline function countright!(c::ThreeCounters{Int, QD}, n2::Integer) where {QD<:IntersectionDist}
    c.right = c.right + (n2 > 0)
end
@inline function countshared!(c::ThreeCounters{Int, QD}, n1::Integer, n2::Integer) where {QD<:IntersectionDist}
    c.shared = c.shared + ((n1 > 0) & (n2 > 0))
end

# One minus the shared total over the smaller of the two side totals.
function calculate(d::Overlap, c::ThreeCounters{Int, Overlap})
    smaller = min(c.left, c.right)
    return 1.0 - c.shared / smaller
end

View File

@ -1,5 +1,4 @@
using StringDistances, Unicode, Test
using StringDistances, Unicode, Test, Random
@testset "Distances" begin
@ -15,9 +14,6 @@ using StringDistances, Unicode, Test
@test ismissing(evaluate(Jaro(), "", missing))
end
@testset "Levenshtein" begin
@test evaluate(Levenshtein(), "", "") == 0
@test evaluate(Levenshtein(), "abc", "") == 3
@ -70,7 +66,6 @@ using StringDistances, Unicode, Test
@test ismissing(evaluate(RatcliffObershelp(), "", missing))
end
@testset "QGram" begin
@test evaluate(QGram(1), "abc", "abc") == 0
@test evaluate(QGram(1), "", "abc") == 3
@ -85,8 +80,6 @@ using StringDistances, Unicode, Test
@inferred evaluate(QGram(1), "", "")
end
@testset "Cosine" begin
@test isnan(evaluate(Cosine(2), "", "abc"))
@test evaluate(Cosine(2), "abc", "ccc") 1 atol = 1e-4
@ -130,8 +123,31 @@ using StringDistances, Unicode, Test
@test ismissing(evaluate(Overlap(1), "", missing))
end
@testset "Differential testing of String, QGramDict, and QGramSortedArray" begin
    # Fixed seed: the inputs are random, but a failure can be replayed exactly.
    rng = MersenneTwister(20201023)
    for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap]
        for _ in 1:100
            qlen = rand(rng, 2:9)
            dist = D(qlen)

            # 2 random strings of equal length with some overlap
            str1 = randstring(rng, rand(rng, 5:10000))
            ci1 = rand(rng, 2:div(length(str1), 2))
            ci2 = rand(rng, (ci1+1):(length(str1)-1))
            str2 = randstring(rng, ci1-1) * str1[ci1:ci2] * randstring(rng, length(str1)-ci2)

            expected = evaluate(dist, str1, str2)

            # `isequal` rather than `==`: a string shorter than `qlen` has no
            # q-grams, which gives NaN for the ratio-based distances, and
            # NaN == NaN is false even when both paths agree.

            # QGramDict gets same result as for standard string
            qd1 = QGramDict(str1, qlen)
            qd2 = QGramDict(str2, qlen)
            @test isequal(expected, evaluate(dist, qd1, qd2))

            # QGramSortedArray gets same result as for standard string
            qc1 = QGramSortedArray(str1, qlen)
            qc2 = QGramSortedArray(str2, qlen)
            @test isequal(expected, evaluate(dist, qc1, qc2))
        end
    end
end
strings = [
("martha", "marhta"),