Merge pull request #36 from robertfeldt/master

Precounting QGrams
pull/38/head
Matthieu Gomez 2020-10-24 12:07:32 -07:00 committed by GitHub
commit 610a67313a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 309 additions and 61 deletions

View File

@ -2,7 +2,7 @@ language: julia
os:
- linux
julia:
- 1.0
- 1.3
- 1.5
- nightly
matrix:

View File

@ -7,11 +7,12 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
[compat]
Distances = "0.8.1, 0.9, 0.10"
julia = "1"
julia = "1.3"
[extras]
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
[targets]
test = ["Test", "Unicode"]
test = ["Test", "Unicode", "Random"]

View File

@ -30,6 +30,8 @@ Cosine,
Jaccard,
SorensenDice,
Overlap,
QGramDict,
QGramSortedVector,
Winkler,
Partial,
TokenSort,

View File

@ -29,7 +29,6 @@ Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
# A vector can be wrapped by the q-gram iterator as it is.
function qgrams(s::AbstractVector, q::Integer)
    return QGramIterator(s, q)
end

# Fallback for arguments without a more specific method: materialize the
# elements with `collect` and iterate over the resulting vector.
function qgrams(s, q::Integer)
    return QGramIterator(collect(s), q)
end
@doc """
Return an iterator over the q-grams of an iterator.
When the iterator is a String, qgrams are SubStrings.
@ -47,7 +46,6 @@ end
"""
qgrams
# For two iterators s1 and s2, that define a length and eltype method,
# this returns an iterator that,
# for each element in s1 ∪ s2, returns (number of times it appears in s1, number of times it appears in s2)
@ -80,9 +78,184 @@ function _count(s1, s2)
return values(d)
end
# Turn a sequence of qgrams into a count dict, i.e. map each distinct qgram
# to the number of times it has been seen.
#
# `qgrams` must be iterable and define `eltype`; that element type becomes the
# key type of the returned dictionary. Counts are stored as `Int32`.
function countdict(qgrams)
    d = Dict{eltype(qgrams), Int32}()
    for qg in qgrams
        # One probe per qgram: a positive index is the slot that already holds
        # `qg`; otherwise the negated index is the slot to insert it at.
        # NOTE(review): `ht_keyindex2!`, `_setindex!` and the `age`/`keys`/`vals`
        # fields are Base internals, not public API.
        index = Base.ht_keyindex2!(d, qg)
        if index > 0
            d.age += 1
            @inbounds d.keys[index] = qg
            # The stored count is a plain integer; it needs no extra indexing.
            @inbounds d.vals[index] = d.vals[index] + 1
        else
            @inbounds Base._setindex!(d, 1, qg, -index)
        end
    end
    return d
end
# Supertype of the containers holding pre-counted qgrams. The first type
# parameter `Q` carries the qgram length, the second one `K` the qgram type.
abstract type AbstractQGramCounts{Q,K} end

# Qgram length, read back from the first type parameter.
function q(qc::AbstractQGramCounts{Q,K}) where {Q,K}
    return Q
end

# Stored counts; this generic accessor relies on a `counts` field.
function counts(qc::AbstractQGramCounts)
    return qc.counts
end
"""
    QGramDict(s, q::Integer = 2)

Pre-count the qgrams of length `q` of the string or iterator `s` and store
them in a dictionary that maps each qgram to its number of occurrences.
Evaluating a qgram distance on two `QGramDict`s reuses these counts instead of
recounting the qgrams.

Note that the qgram length `q` must match the `q` of the distance that is
evaluated. `q` is stored in the type as an `Int`, whatever integer type it was
passed as, so that all `QGramDict`s with equal qgram length and qgram type
share one concrete type.

## Examples
```julia
str1, str2 = "my string", "another string"
qd1 = QGramDict(str1, 2)
qd2 = QGramDict(str2, 2)
evaluate(Overlap(2), qd1, qd2)
```
"""
struct QGramDict{Q,K} <: AbstractQGramCounts{Q,K}
    counts::Dict{K,Int}
end

function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
    @assert q >= 1
    qgs = qgrams(s, q)
    # Normalize to `Int`: `QGramDict{2,K}` and `QGramDict{Int32(2),K}` would
    # otherwise be unrelated types that cannot be evaluated against each other.
    return QGramDict{Int(q), eltype(qgs)}(countdict(qgs))
end

# Any other iterator is collected into a vector first.
QGramDict(s, q::Integer = 2) = QGramDict(collect(s), q)
"""
    QGramSortedVector(s, q::Integer = 2)

Pre-count the qgrams of length `q` of the string or iterator `s` and store
them as a vector of `qgram => count` pairs sorted by qgram. Evaluating a qgram
distance on two `QGramSortedVector`s reuses these counts instead of recounting
the qgrams.

Since the qgrams are sorted in lexicographic order, QGram distances can be
calculated even faster than when using a `QGramDict`. The sorting makes
updating the counts after creation less efficient, but for most use cases
`QGramSortedVector` is preferred over a `QGramDict`.

Note that the qgram length `q` must match the `q` of the distance that is
evaluated. `q` is stored in the type as an `Int`, whatever integer type it was
passed as, so that all `QGramSortedVector`s with equal qgram length and qgram
type share one concrete type.

## Examples
```julia
str1, str2 = "my string", "another string"
qs1 = QGramSortedVector(str1, 2)
qs2 = QGramSortedVector(str2, 2)
evaluate(Jaccard(2), qs1, qs2)
```
"""
struct QGramSortedVector{Q,K} <: AbstractQGramCounts{Q,K}
    counts::Vector{Pair{K,Int}}
end

function QGramSortedVector(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
    @assert q >= 1
    qgs = qgrams(s, q)
    countpairs = collect(countdict(qgs))
    sort!(countpairs, by = first)
    # Normalize to `Int`: `QGramSortedVector{2,K}` and
    # `QGramSortedVector{Int32(2),K}` would otherwise be unrelated types that
    # cannot be evaluated against each other.
    return QGramSortedVector{Int(q), eltype(qgs)}(countpairs)
end

# Any other iterator is collected into a vector first.
QGramSortedVector(s, q::Integer = 2) = QGramSortedVector(collect(s), q)
# Distances are implemented by counting qgram matches, either between two
# strings or between two pre-calculated AbstractQGramCounts objects.
# The abstract type defines fallback methods which subtypes can specialize
# for best performance.
abstract type AbstractQGramMatchCounter end

# By default the qgram itself is ignored and only its count(s) are forwarded.
@inline function countleft!(c::AbstractQGramMatchCounter, qg, n1::Integer)
    return countleft!(c, n1)
end

@inline function countright!(c::AbstractQGramMatchCounter, qg, n2::Integer)
    return countright!(c, n2)
end

@inline function countboth!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer)
    return countboth!(c, n1, n2)
end

# A qgram present on both sides is counted on the left, on the right and as
# shared, in that order.
@inline function countboth!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer)
    countleft!(c, n1)
    countright!(c, n2)
    return countshared!(c, n1, n2)
end

@inline function countshared!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer)
    return countshared!(c, n1, n2)
end
# Every subtype has to provide the following two methods:
@inline function countleft!(c::AbstractQGramMatchCounter, n1::Integer)
    error("countleft! not implemented for $(typeof(c))")
end

@inline function countright!(c::AbstractQGramMatchCounter, n2::Integer)
    error("countright! not implemented for $(typeof(c))")
end

# In addition, a subtype must either override countboth! (so that it does not
# call countshared!) or implement:
@inline function countshared!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer)
    error("countshared! not implemented for $(typeof(c))")
end
# Feed the counter `mc` with the qgram counts of two `qgram => count` vectors
# by walking through both in a single merge pass. A qgram found only in `d1`
# goes to `countleft!`, one found only in `d2` to `countright!`, and one found
# in both to `countboth!`.
# NOTE(review): the merge assumes both vectors are sorted by qgram in the
# order defined by `cmp` — confirm against the callers.
function countmatches!(
    mc::AbstractQGramMatchCounter,
    d1::Vector{Pair{K,I}},
    d2::Vector{Pair{K,I}},
) where {K,I<:Integer}
    len1, len2 = length(d1), length(d2)
    pos1, pos2 = 1, 1
    # Main phase: both vectors still have entries left.
    while pos1 <= len1 && pos2 <= len2
        @inbounds key1, cnt1 = d1[pos1]
        @inbounds key2, cnt2 = d2[pos2]
        order = Base.cmp(key1, key2)
        if order == -1 # key1 < key2
            countleft!(mc, key1, cnt1)
            pos1 += 1
        elseif order == +1 # key2 < key1
            countright!(mc, key2, cnt2)
            pos2 += 1
        else
            countboth!(mc, key1, cnt1, cnt2)
            pos1 += 1
            pos2 += 1
        end
    end
    # Tail phase: at most one of the two ranges below is non-empty.
    for idx in pos1:len1
        @inbounds countleft!(mc, d1[idx][1], d1[idx][2])
    end
    for idx in pos2:len2
        @inbounds countright!(mc, d2[idx][1], d2[idx][2])
    end
    return nothing
end
# Feed the counter `mc` with the qgram counts of two count dictionaries.
# A qgram found only in `d1` goes to `countleft!`, one found only in `d2` to
# `countright!`, and one found in both to `countboth!`.
#
# Lookups use the public, non-mutating `get`/`haskey`. The private
# `Base.ht_keyindex2!` is the probe Base uses for insertion and may modify the
# dictionary's internal storage, which a read-only comparison must not do.
function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K,I}) where {K,I<:Integer}
    # First pass: everything in d1, paired with its count in d2 when present.
    for (k1, c1) in d1
        c2 = get(d2, k1, nothing)
        if c2 === nothing
            countleft!(mc, k1, c1)
        else
            countboth!(mc, k1, c1, c2)
        end
    end
    # Second pass: only the qgrams of d2 that the first pass has not seen.
    for (k2, c2) in d2
        if !haskey(d1, k2)
            countright!(mc, k2, c2)
        end
    end
    return nothing
end
# Supertype of all distances that are computed from qgram counts.
abstract type QGramDistance <: SemiMetric end

# Evaluate the distance on two raw inputs. The qgrams of both inputs are
# counted, every pair of counts is fed to a fresh counter, and the distance
# is derived from that counter. Returns `missing` if either input is `missing`.
function (dist::QGramDistance)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    tally = newcounter(dist)
    for (c1, c2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
        countboth!(tally, c1, c2)
    end
    return calculate(dist, tally)
end
# Evaluate the distance on two pre-counted qgram containers of the same type.
# Both containers must have been built with the qgram length of `dist`.
function (dist::QGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
    @assert dist.q == q(qc1)
    @assert dist.q == q(qc2)
    tally = newcounter(dist)
    left, right = counts(qc1), counts(qc2)
    countmatches!(tally, left, right)
    return calculate(dist, tally)
end
"""
QGram(q::Int)
@ -99,15 +272,17 @@ struct QGram <: QGramDistance
q::Int
end
function (dist::QGram)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
n = 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
n += abs(n1 - n2)
end
n
# Counter with a single accumulator. The second type parameter is not used by
# any field; it only tags the counter with the distance it belongs to so that
# methods can dispatch on it.
mutable struct SingleCounter{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
    n::T
end

function newcounter(d::QGram)
    return SingleCounter{Int, QGram}(0)
end

# QGram accumulates the absolute count difference of every qgram.
@inline function countleft!(c::SingleCounter{Int, QGram}, n1::Integer)
    return c.n = c.n + n1 # n1 === abs(n1 - 0)
end

@inline function countright!(c::SingleCounter{Int, QGram}, n2::Integer)
    return c.n = c.n + n2 # n2 === abs(0 - n2)
end

@inline function countboth!(c::SingleCounter{Int, QGram}, n1::Integer, n2::Integer)
    return c.n = c.n + abs(n1 - n2)
end

function calculate(dist::QGram, c::SingleCounter{Int, QGram})
    return c.n
end
"""
Cosine(q::Int)
@ -125,17 +300,20 @@ struct Cosine <: QGramDistance
q::Int
end
function (dist::Cosine)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
norm1 += n1^2
norm2 += n2^2
prodnorm += n1 * n2
end
1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
# Counter with three accumulators: one per side and one for what both sides
# share. The second type parameter is not used by any field; it only tags the
# counter with the distance it belongs to so that methods can dispatch on it.
mutable struct ThreeCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
    left::T
    right::T
    shared::T
end

function newcounter(d::Cosine)
    return ThreeCounters{Int, Cosine}(0, 0, 0)
end

# Cosine accumulates the squared counts of each side and the product of the
# counts of every qgram.
@inline function countleft!(c::ThreeCounters{Int, Cosine}, n1::Integer)
    return c.left = c.left + n1^2
end

@inline function countright!(c::ThreeCounters{Int, Cosine}, n2::Integer)
    return c.right = c.right + n2^2
end

@inline function countshared!(c::ThreeCounters{Int, Cosine}, n1::Integer, n2::Integer)
    return c.shared = c.shared + n1 * n2
end

function calculate(d::Cosine, c::ThreeCounters{Int, Cosine})
    return 1.0 - c.shared / (sqrt(c.left) * sqrt(c.right))
end
"""
Jaccard(q::Int)
@ -152,17 +330,8 @@ struct Jaccard <: QGramDistance
q::Int
end
# Jaccard distance between the sets of distinct qgrams of `s1` and `s2`:
# one minus the size of the intersection over the size of the union.
# Returns `missing` if either input is `missing`.
function (dist::Jaccard)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    in_first = in_second = in_both = 0
    for (c1, c2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
        has1, has2 = c1 > 0, c2 > 0
        in_first += has1
        in_second += has2
        in_both += has1 & has2
    end
    return 1.0 - in_both / (in_first + in_second - in_both)
end
# Jaccard: one minus the shared count over the size of the union.
function calculate(d::Jaccard, c::ThreeCounters{Int, Jaccard})
    return 1.0 - c.shared / (c.left + c.right - c.shared)
end
"""
SorensenDice(q::Int)
@ -179,17 +348,8 @@ struct SorensenDice <: QGramDistance
q::Int
end
# Sorensen-Dice distance between the sets of distinct qgrams of `s1` and `s2`:
# one minus twice the size of the intersection over the summed set sizes.
# Returns `missing` if either input is `missing`.
function (dist::SorensenDice)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    in_first = in_second = in_both = 0
    for (c1, c2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
        has1, has2 = c1 > 0, c2 > 0
        in_first += has1
        in_second += has2
        in_both += has1 & has2
    end
    return 1.0 - 2.0 * in_both / (in_first + in_second)
end
# Sorensen-Dice: one minus twice the shared count over the summed side counts.
function calculate(d::SorensenDice, c::ThreeCounters{Int, SorensenDice})
    return 1.0 - 2.0 * c.shared / (c.left + c.right)
end
"""
Overlap(q::Int)
@ -206,14 +366,15 @@ struct Overlap <: QGramDistance
q::Int
end
# Overlap distance between the sets of distinct qgrams of `s1` and `s2`:
# one minus the size of the intersection over the size of the smaller set.
# Returns `missing` if either input is `missing`.
function (dist::Overlap)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    in_first = in_second = in_both = 0
    for (c1, c2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
        has1, has2 = c1 > 0, c2 > 0
        in_first += has1
        in_second += has2
        in_both += has1 & has2
    end
    return 1.0 - in_both / min(in_first, in_second)
end
# The distances that only look at which qgrams are present on each side,
# not at how often they occur.
const IntersectionDist = Union{Jaccard, SorensenDice, Overlap}

function newcounter(d::IntersectionDist)
    return ThreeCounters{Int, typeof(d)}(0, 0, 0)
end

# Each qgram contributes at most one to a counter, whatever its count is.
@inline function countleft!(c::ThreeCounters{Int, QD}, n1::Integer) where {QD<:IntersectionDist}
    return c.left = c.left + (n1 > 0)
end

@inline function countright!(c::ThreeCounters{Int, QD}, n2::Integer) where {QD<:IntersectionDist}
    return c.right = c.right + (n2 > 0)
end

@inline function countshared!(c::ThreeCounters{Int, QD}, n1::Integer, n2::Integer) where {QD<:IntersectionDist}
    return c.shared = c.shared + ((n1 > 0) & (n2 > 0))
end
# Overlap: one minus the shared count over the smaller of the two side counts.
function calculate(d::Overlap, c::ThreeCounters{Int, Overlap})
    return 1.0 - c.shared / min(c.left, c.right)
end

View File

@ -1,5 +1,4 @@
using StringDistances, Unicode, Test
using StringDistances, Unicode, Test, Random
@testset "Distances" begin
@ -15,9 +14,6 @@ using StringDistances, Unicode, Test
@test ismissing(evaluate(Jaro(), "", missing))
end
@testset "Levenshtein" begin
@test evaluate(Levenshtein(), "", "") == 0
@test evaluate(Levenshtein(), "abc", "") == 3
@ -70,7 +66,6 @@ using StringDistances, Unicode, Test
@test ismissing(evaluate(RatcliffObershelp(), "", missing))
end
@testset "QGram" begin
@test evaluate(QGram(1), "abc", "abc") == 0
@test evaluate(QGram(1), "", "abc") == 3
@ -85,8 +80,6 @@ using StringDistances, Unicode, Test
@inferred evaluate(QGram(1), "", "")
end
@testset "Cosine" begin
@test isnan(evaluate(Cosine(2), "", "abc"))
@test evaluate(Cosine(2), "abc", "ccc") 1 atol = 1e-4
@ -130,8 +123,99 @@ using StringDistances, Unicode, Test
@test ismissing(evaluate(Overlap(1), "", missing))
end
@testset "QGramDict and QGramSortedVector counts qgrams" begin
    # Turn the counts into plain (qgram, count) tuples, sorted by qgram,
    # which are easier to compare against literals:
    stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
    stringify(p::Pair{V, <:Integer}) where {S<:AbstractString,V<:AbstractVector{S}} = (map(string, first(p)), last(p))
    sortedcounts(qc) = sort(collect(StringDistances.counts(qc)), by = first)
    totuples(qc) = map(stringify, sortedcounts(qc))

    s1, s2 = "arnearne", "arnebeda"
    qd1, qd2 = QGramDict(s1, 2), QGramDict(s2, 2)
    @test totuples(qd1) == [("ar", 2), ("ea", 1), ("ne", 2), ("rn", 2)]
    @test totuples(qd2) == [("ar", 1), ("be", 1), ("da", 1), ("eb", 1), ("ed", 1), ("ne", 1), ("rn", 1)]
    qc1, qc2 = QGramSortedVector(s1, 2), QGramSortedVector(s2, 2)
    @test totuples(qc1) == [("ar", 2), ("ea", 1), ("ne", 2), ("rn", 2)]
    @test totuples(qc2) == [("ar", 1), ("be", 1), ("da", 1), ("eb", 1), ("ed", 1), ("ne", 1), ("rn", 1)]

    # Non-ASCII input: the 2-grams of "rgówów" are rg, gó, ów, wó, ów.
    s3 = "rgówów"
    qd3a = QGramDict(s3, 2)
    @test totuples(qd3a) == [("gó", 1), ("rg", 1), ("wó", 1), ("ów", 2)]
    qd3b = QGramDict(graphemes(s3), 2)
    @test totuples(qd3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)]
    qc3a = QGramSortedVector(s3, 2)
    @test totuples(qc3a) == [("gó", 1), ("rg", 1), ("wó", 1), ("ów", 2)]
    # The grapheme variant must be checked for the sorted vector as well,
    # not a second time for the dict.
    qc3b = QGramSortedVector(graphemes(s3), 2)
    @test totuples(qc3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)]
end
# Generate two random strings that have a stretch of characters in common.
# The length of the first string is drawn from `sizerange`. The second one is
# a random prefix, a slice copied from the first string and a random suffix.
# If `chars` is non-empty, all random characters are drawn from it.
# NOTE(review): the index ranges below are only non-empty for strings of at
# least four characters — confirm that every `sizerange` in use respects that.
function partlyoverlappingstrings(sizerange, chars = [])
    # Random text of `n` characters from the requested alphabet.
    randomtext(n) = length(chars) < 1 ? randstring(n) : randstring(chars, n)

    str1 = randomtext(rand(sizerange))
    elems = collect(str1)
    nelems = length(elems)
    # Boundaries of the slice of `str1` that is copied into `str2`.
    ci1 = prevind(str1, rand(2:div(nelems, 2)))
    ci2 = prevind(str1, rand((ci1 + 1):(nelems - 1)))
    prefix = randomtext(ci1 - 1)
    shared = join(elems[ci1:ci2])
    suffix = randomtext(length(str1) - ci2)
    str2 = prefix * shared * suffix
    return str1, str2
end
@testset "Precalculation on unicode strings" begin
    # Alphabet mixing multi-byte characters with ASCII letters and digits.
    alphabet = vcat(map(collect, ["δσμΣèìòâôîêûÊÂÛ", 'a':'z', '0':'9'])...)
    for _ in 1:100
        str1, str2 = partlyoverlappingstrings(10:100, alphabet)
        qlen = rand(2:5)
        d = Jaccard(qlen)

        # Every pre-counted representation, built from the plain string and
        # from its graphemes, must agree with the distance on the raw strings.
        for precount in (QGramDict, QGramSortedVector), prepare in (identity, graphemes)
            pre1 = precount(prepare(str1), qlen)
            pre2 = precount(prepare(str2), qlen)
            @test evaluate(d, str1, str2) == evaluate(d, pre1, pre2)
        end
    end
end
@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
    for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap]
        for _ in 1:100
            qlen = rand(2:9)
            dist = D(qlen)
            str1, str2 = partlyoverlappingstrings(5:10000)

            # The result on the raw strings is the reference value ...
            expected = evaluate(dist, str1, str2)

            # ... that both pre-counted representations have to reproduce.
            for precount in (QGramDict, QGramSortedVector)
                pre1 = precount(str1, qlen)
                pre2 = precount(str2, qlen)
                @test expected == evaluate(dist, pre1, pre2)
            end
        end
    end
end
strings = [
("martha", "marhta"),