faster calculation of QGramDistances if precounting qgrams either as Dict or sorted array, with tests
parent
99a5e87d36
commit
c0bedf89a6
|
@ -10,8 +10,9 @@ Distances = "0.8.1, 0.9, 0.10"
|
|||
julia = "1"
|
||||
|
||||
[extras]
|
||||
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
|
||||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
|
||||
|
||||
[targets]
|
||||
test = ["Test", "Unicode"]
|
||||
test = ["Test", "Unicode", "Random"]
|
||||
|
|
|
@ -30,6 +30,8 @@ Cosine,
|
|||
Jaccard,
|
||||
SorensenDice,
|
||||
Overlap,
|
||||
QGramDict,
|
||||
QGramSortedArray,
|
||||
Winkler,
|
||||
Partial,
|
||||
TokenSort,
|
||||
|
|
|
@ -29,7 +29,6 @@ Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
|
|||
# An indexable vector is wrapped directly, without copying.
function qgrams(s::AbstractVector, q::Integer)
    return QGramIterator(s, q)
end

# Any other iterable is first materialized with `collect` and then wrapped.
function qgrams(s, q::Integer)
    elements = collect(s)
    return QGramIterator(elements, q)
end
|
||||
|
||||
|
||||
@doc """
|
||||
Return an iterator corresponding to the q-grams of an iterator.
|
||||
When the iterator is a String, qgrams are SubStrings.
|
||||
|
@ -47,7 +46,6 @@ end
|
|||
"""
|
||||
qgrams
|
||||
|
||||
|
||||
# For two iterators s1 and s2, that define a length and eltype method,
|
||||
# this returns an iterator that,
|
||||
# for each element in s1 ∪ s2, returns (number of times it appears in s1, number of times it appears in s2)
|
||||
|
@ -80,9 +78,141 @@ function _count(s1, s2)
|
|||
return values(d)
|
||||
end
|
||||
|
||||
# Tally a sequence of qgrams: the returned dictionary maps every distinct
# qgram to the number of times it occurs in `qgrams` (stored as Int32).
function countdict(qgrams)
    tally = Dict{eltype(qgrams), Int32}()
    for gram in qgrams
        # Probe the hash table once. A positive result is the slot of an
        # existing key; otherwise its negation is the slot to insert into.
        slot = Base.ht_keyindex2!(tally, gram)
        if slot <= 0
            # First occurrence: insert the qgram with a count of 1.
            @inbounds Base._setindex!(tally, 1, gram, -slot)
        else
            # Seen before: bump the stored count in place.
            tally.age += 1
            @inbounds tally.keys[slot] = gram
            @inbounds tally.vals[slot] = tally.vals[slot] + 1
        end
    end
    return tally
end
|
||||
|
||||
# Supertype of precomputed qgram counts. `Q` is the qgram length and `K` the
# type of the qgrams used as keys.
abstract type AbstractQGramCounts{Q,K} end

# The qgram length is carried in the first type parameter.
function q(qc::AbstractQGramCounts{Q}) where {Q}
    return Q
end

# Accessor for the stored counts; reads the `counts` field of the object.
function counts(qc::AbstractQGramCounts)
    return qc.counts
end
|
||||
|
||||
"""
    QGramDict(s, q::Integer = 2)

Precomputed counts of the `q`-grams of `s`, stored as a `Dict` that maps each
distinct qgram to its number of occurrences. Two `QGramDict`s built with the
same `q` can be passed to a `QGramDistance` instead of the original sequences.

Throws an `ArgumentError` if `q < 1`.
"""
struct QGramDict{Q,K} <: AbstractQGramCounts{Q,K}
    counts::Dict{K,Int}
end

function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
    # Validate user input with a real exception: `@assert` is a debugging aid
    # and may be disabled, which would silently skip the check.
    q >= 1 || throw(ArgumentError("q must be at least 1, got $q"))
    qgs = qgrams(s, q)
    return QGramDict{q, eltype(qgs)}(countdict(qgs))
end
|
||||
|
||||
"""
    QGramSortedArray(s, q::Integer = 2)

Precomputed counts of the `q`-grams of `s`, stored as a vector of
`qgram => count` pairs sorted by qgram. Intended as a faster alternative to
`QGramDict`, since two sorted vectors can be compared in a single merge pass.

Throws an `ArgumentError` if `q < 1`.
"""
struct QGramSortedArray{Q,K} <: AbstractQGramCounts{Q,K}
    counts::Vector{Pair{K,Int}}
end

function QGramSortedArray(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
    # Validate user input with a real exception: `@assert` is a debugging aid
    # and may be disabled, which would silently skip the check.
    q >= 1 || throw(ArgumentError("q must be at least 1, got $q"))
    qgs = qgrams(s, q)
    countpairs = collect(countdict(qgs))
    # Sorting by qgram is what allows the merge-style comparison.
    sort!(countpairs, by = first)
    return QGramSortedArray{q, eltype(qgs)}(countpairs)
end
|
||||
|
||||
# To implement the distances we will count qgram matches
|
||||
# between strings or pre-calculated AbstractQGramCounts objects.
|
||||
# The abstract type defines different fallback versions which can be
|
||||
# specialized by subtypes for best performance.
|
||||
abstract type AbstractQGramMatchCounter end

# Qgram-aware entry points. By default the qgram `qg` is ignored and only the
# counts are forwarded to the count-only methods.
@inline function countleft!(c::AbstractQGramMatchCounter, qg, n1::Integer)
    return countleft!(c, n1)
end

@inline function countright!(c::AbstractQGramMatchCounter, qg, n2::Integer)
    return countright!(c, n2)
end

@inline function countboth!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer)
    return countboth!(c, n1, n2)
end

@inline function countshared!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer)
    return countshared!(c, n1, n2)
end

# Default handling of a qgram seen on both sides: record it for the left
# side, for the right side, and finally as shared.
@inline function countboth!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer)
    countleft!(c, n1)
    countright!(c, n2)
    return countshared!(c, n1, n2)
end
|
||||
|
||||
# Every counter subtype has to provide these two methods; the fallbacks only
# report the missing implementation.
@inline function countleft!(c::AbstractQGramMatchCounter, n1::Integer)
    return error("countleft! not implemented for $(typeof(c))")
end

@inline function countright!(c::AbstractQGramMatchCounter, n2::Integer)
    return error("countright! not implemented for $(typeof(c))")
end

# A subtype must either override the count-only `countboth!` (so that it no
# longer calls `countshared!`) or implement this method:
@inline function countshared!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer)
    return error("countshared! not implemented for $(typeof(c))")
end
|
||||
|
||||
# Walk two count vectors sorted by qgram in a single merge pass and report
# each qgram to `mc` as left-only, right-only, or present on both sides.
function countmatches!(
    mc::AbstractQGramMatchCounter,
    d1::Vector{Pair{K,I}},
    d2::Vector{Pair{K,I}},
) where {K,I<:Integer}
    len1, len2 = length(d1), length(d2)
    pos1, pos2 = 1, 1

    # Merge phase: both vectors still have entries to compare.
    while pos1 <= len1 && pos2 <= len2
        @inbounds key1, cnt1 = d1[pos1]
        @inbounds key2, cnt2 = d2[pos2]
        order = Base.cmp(key1, key2)
        if order == -1 # key1 < key2
            countleft!(mc, key1, cnt1)
            pos1 += 1
        elseif order == +1 # key2 < key1
            countright!(mc, key2, cnt2)
            pos2 += 1
        else
            countboth!(mc, key1, cnt1, cnt2)
            pos1 += 1
            pos2 += 1
        end
    end

    # Tail phase: at most one of these ranges is non-empty.
    for i in pos1:len1
        @inbounds countleft!(mc, d1[i][1], d1[i][2])
    end
    for i in pos2:len2
        @inbounds countright!(mc, d2[i][1], d2[i][2])
    end
    return nothing
end
|
||||
|
||||
# Compare two qgram count dictionaries and report each qgram to `mc` as
# left-only, right-only, or present on both sides.
#
# Only read-only lookups (`get`, `haskey`) are used. The insertion probe
# `Base.ht_keyindex2!` may update probe bookkeeping or rehash the dictionary
# it queries, which is not acceptable for precomputed counts that are reused
# across many comparisons.
function countmatches!(
    mc::AbstractQGramMatchCounter,
    d1::Dict{K,I},
    d2::Dict{K,I},
) where {K,I<:Integer}
    # First pass: every qgram of d1 is either shared with d2 or left-only.
    for (k1, c1) in d1
        c2 = get(d2, k1, nothing)
        if c2 === nothing
            countleft!(mc, k1, c1)
        else
            countboth!(mc, k1, c1, c2)
        end
    end
    # Second pass: shared qgrams were already reported above, so only the
    # qgrams missing from d1 remain.
    for (k2, c2) in d2
        haskey(d1, k2) || countright!(mc, k2, c2)
    end
    return nothing
end
|
||||
|
||||
abstract type QGramDistance <: SemiMetric end

# Generic evaluation of a qgram distance on two sequences: count the qgrams
# of both inputs, feed every (count in s1, count in s2) pair to a fresh
# counter, and let the distance turn the counter into its final value.
# Returns `missing` if either input is `missing`.
function (dist::QGramDistance)(s1, s2)
    if ismissing(s1) || ismissing(s2)
        return missing
    end
    tally = newcounter(dist)
    countpairs = _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
    for (n1, n2) in countpairs
        countboth!(tally, n1, n2)
    end
    return calculate(dist, tally)
end
|
||||
|
||||
# Evaluate a qgram distance on two precomputed count objects of the same
# type. Both must have been built with the qgram length of `dist`; otherwise
# an `ArgumentError` is thrown.
function (dist::QGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
    # Use real exceptions for input validation: `@assert` is a debugging aid
    # and may be disabled, which would let mismatched counts through.
    dist.q == q(qc1) || throw(ArgumentError(
        "qgram length of first argument ($(q(qc1))) does not match the distance ($(dist.q))"))
    dist.q == q(qc2) || throw(ArgumentError(
        "qgram length of second argument ($(q(qc2))) does not match the distance ($(dist.q))"))
    counter = newcounter(dist)
    countmatches!(counter, counts(qc1), counts(qc2))
    return calculate(dist, counter)
end
|
||||
|
||||
"""
|
||||
QGram(q::Int)
|
||||
|
||||
|
@ -99,15 +229,17 @@ struct QGram <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function (dist::QGram)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
n = 0
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
n += abs(n1 - n2)
|
||||
end
|
||||
n
|
||||
# Counter with a single running total `n`. The second type parameter tags
# the distance the counter belongs to, so methods can dispatch on it.
mutable struct SingleCounter{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
    n::T
end

# QGram starts from a zero total.
function newcounter(d::QGram)
    return SingleCounter{Int, QGram}(0)
end

# A qgram present only on the left contributes n1, i.e. abs(n1 - 0).
@inline function countleft!(c::SingleCounter{Int, QGram}, n1::Integer)
    return c.n += n1
end

# A qgram present only on the right contributes n2, i.e. abs(0 - n2).
@inline function countright!(c::SingleCounter{Int, QGram}, n2::Integer)
    return c.n += n2
end

# A qgram present on both sides contributes the absolute count difference.
@inline function countboth!(c::SingleCounter{Int, QGram}, n1::Integer, n2::Integer)
    return c.n += abs(n1 - n2)
end

# The QGram distance is the accumulated total itself.
function calculate(dist::QGram, c::SingleCounter{Int, QGram})
    return c.n
end
|
||||
|
||||
"""
|
||||
Cosine(q::Int)
|
||||
|
@ -125,17 +257,20 @@ struct Cosine <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function (dist::Cosine)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
norm1, norm2, prodnorm = 0, 0, 0
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
norm1 += n1^2
|
||||
norm2 += n2^2
|
||||
prodnorm += n1 * n2
|
||||
end
|
||||
1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
|
||||
# Counter with three running totals: one for the left input, one for the
# right input and one for what they share. The second type parameter tags
# the distance the counter belongs to, so methods can dispatch on it.
mutable struct ThreeCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
    left::T
    right::T
    shared::T
end

# Cosine starts with all three totals at zero.
function newcounter(d::Cosine)
    return ThreeCounters{Int, Cosine}(0, 0, 0)
end

# `left` and `right` accumulate squared counts, `shared` the count products.
@inline function countleft!(c::ThreeCounters{Int, Cosine}, n1::Integer)
    return c.left += n1^2
end

@inline function countright!(c::ThreeCounters{Int, Cosine}, n2::Integer)
    return c.right += n2^2
end

@inline function countshared!(c::ThreeCounters{Int, Cosine}, n1::Integer, n2::Integer)
    return c.shared += n1 * n2
end

# One minus the accumulated product sum divided by the product of the
# square roots of the two squared-count sums.
function calculate(d::Cosine, c::ThreeCounters{Int, Cosine})
    denominator = sqrt(c.left) * sqrt(c.right)
    return 1.0 - c.shared / denominator
end
|
||||
|
||||
"""
|
||||
Jaccard(q::Int)
|
||||
|
@ -152,17 +287,8 @@ struct Jaccard <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
# Jaccard distance on two sequences: one minus the number of distinct qgrams
# found in both inputs divided by the number found in at least one of them.
# Returns `missing` if either input is `missing`.
function (dist::Jaccard)(s1, s2)
    if ismissing(s1) || ismissing(s2)
        return missing
    end
    in_first, in_second, in_both = 0, 0, 0
    for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
        has1 = n1 > 0
        has2 = n2 > 0
        in_first += has1
        in_second += has2
        in_both += has1 & has2
    end
    union_size = in_first + in_second - in_both
    return 1.0 - in_both / union_size
end

# Same formula, computed from the totals collected by a counter.
function calculate(d::Jaccard, c::ThreeCounters{Int, Jaccard})
    union_size = c.left + c.right - c.shared
    return 1.0 - c.shared / union_size
end
|
||||
|
||||
"""
|
||||
SorensenDice(q::Int)
|
||||
|
@ -179,17 +305,8 @@ struct SorensenDice <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
# Sorensen-Dice distance on two sequences: one minus twice the number of
# distinct qgrams found in both inputs divided by the sum of the numbers of
# distinct qgrams of each input. Returns `missing` if either input is `missing`.
function (dist::SorensenDice)(s1, s2)
    if ismissing(s1) || ismissing(s2)
        return missing
    end
    in_first, in_second, in_both = 0, 0, 0
    for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
        has1 = n1 > 0
        has2 = n2 > 0
        in_first += has1
        in_second += has2
        in_both += has1 & has2
    end
    doubled = 2.0 * in_both
    return 1.0 - doubled / (in_first + in_second)
end

# Same formula, computed from the totals collected by a counter.
function calculate(d::SorensenDice, c::ThreeCounters{Int, SorensenDice})
    doubled = 2.0 * c.shared
    return 1.0 - doubled / (c.left + c.right)
end
|
||||
|
||||
"""
|
||||
Overlap(q::Int)
|
||||
|
@ -206,14 +323,15 @@ struct Overlap <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
# Overlap distance on two sequences: one minus the number of distinct qgrams
# found in both inputs divided by the smaller of the two numbers of distinct
# qgrams. Returns `missing` if either input is `missing`.
function (dist::Overlap)(s1, s2)
    if ismissing(s1) || ismissing(s2)
        return missing
    end
    in_first, in_second, in_both = 0, 0, 0
    for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
        has1 = n1 > 0
        has2 = n2 > 0
        in_first += has1
        in_second += has2
        in_both += has1 & has2
    end
    smaller = min(in_first, in_second)
    return 1.0 - in_both / smaller
end
|
||||
# Distances that only look at which qgrams are present, not how often.
const IntersectionDist = Union{Jaccard, SorensenDice, Overlap}

# Each of these distances gets a counter tagged with its own concrete type.
function newcounter(d::IntersectionDist)
    return ThreeCounters{Int, typeof(d)}(0, 0, 0)
end

# Each call adds one to the respective total when the qgram is present
# (count > 0), so the totals end up as numbers of distinct qgrams.
@inline function countleft!(c::ThreeCounters{Int, QD}, n1::Integer) where {QD<:IntersectionDist}
    return c.left += (n1 > 0)
end

@inline function countright!(c::ThreeCounters{Int, QD}, n2::Integer) where {QD<:IntersectionDist}
    return c.right += (n2 > 0)
end

@inline function countshared!(c::ThreeCounters{Int, QD}, n1::Integer, n2::Integer) where {QD<:IntersectionDist}
    return c.shared += (n1 > 0) & (n2 > 0)
end

# Overlap: one minus the shared total divided by the smaller side total.
function calculate(d::Overlap, c::ThreeCounters{Int, Overlap})
    smaller = min(c.left, c.right)
    return 1.0 - c.shared / smaller
end
|
|
@ -1,5 +1,4 @@
|
|||
|
||||
using StringDistances, Unicode, Test
|
||||
using StringDistances, Unicode, Test, Random
|
||||
|
||||
@testset "Distances" begin
|
||||
|
||||
|
@ -15,9 +14,6 @@ using StringDistances, Unicode, Test
|
|||
@test ismissing(evaluate(Jaro(), "", missing))
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
@testset "Levenshtein" begin
|
||||
@test evaluate(Levenshtein(), "", "") == 0
|
||||
@test evaluate(Levenshtein(), "abc", "") == 3
|
||||
|
@ -70,7 +66,6 @@ using StringDistances, Unicode, Test
|
|||
@test ismissing(evaluate(RatcliffObershelp(), "", missing))
|
||||
end
|
||||
|
||||
|
||||
@testset "QGram" begin
|
||||
@test evaluate(QGram(1), "abc", "abc") == 0
|
||||
@test evaluate(QGram(1), "", "abc") == 3
|
||||
|
@ -85,8 +80,6 @@ using StringDistances, Unicode, Test
|
|||
@inferred evaluate(QGram(1), "", "")
|
||||
end
|
||||
|
||||
|
||||
|
||||
@testset "Cosine" begin
|
||||
@test isnan(evaluate(Cosine(2), "", "abc"))
|
||||
@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
|
||||
|
@ -130,8 +123,31 @@ using StringDistances, Unicode, Test
|
|||
@test ismissing(evaluate(Overlap(1), "", missing))
|
||||
end
|
||||
|
||||
@testset "Differential testing of String, QGramDict, and QGramSortedArray" begin
    # A locally seeded RNG keeps the random inputs, and therefore any
    # failure, reproducible from run to run.
    rng = MersenneTwister(20200707)
    for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap]
        for _ in 1:100
            qlen = rand(rng, 2:9)
            dist = D(qlen)

            # 2 random strings of equal length with some overlap
            str1 = randstring(rng, rand(rng, 5:10000))
            ci1 = rand(rng, 2:div(length(str1), 2))
            ci2 = rand(rng, (ci1+1):(length(str1)-1))
            str2 = randstring(rng, ci1-1) * str1[ci1:ci2] *
                   randstring(rng, length(str1)-ci2)

            expected = evaluate(dist, str1, str2)

            # `isequal` rather than `==`: when a string is shorter than the
            # qgram length there are no qgrams and several distances are NaN,
            # which `==` never considers equal to itself.

            # QGramDict gets same result as for standard string
            qd1 = QGramDict(str1, qlen)
            qd2 = QGramDict(str2, qlen)
            @test isequal(expected, evaluate(dist, qd1, qd2))

            # QGramSortedArray gets same result as for standard string
            qc1 = QGramSortedArray(str1, qlen)
            qc2 = QGramSortedArray(str2, qlen)
            @test isequal(expected, evaluate(dist, qc1, qc2))
        end

        # Deterministic edge case: both strings are shorter than the qgram
        # length, so neither has any qgrams.
        short = D(9)
        expected_short = evaluate(short, "abc", "abd")
        @test isequal(expected_short,
                      evaluate(short, QGramDict("abc", 9), QGramDict("abd", 9)))
        @test isequal(expected_short,
                      evaluate(short, QGramSortedArray("abc", 9), QGramSortedArray("abd", 9)))
    end
end
|
||||
|
||||
strings = [
|
||||
("martha", "marhta"),
|
||||
|
|
Loading…
Reference in New Issue