diff --git a/.travis.yml b/.travis.yml index 45353e4..e9c590b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ language: julia os: - linux julia: - - 1.0 + - 1.3 - 1.5 - nightly matrix: diff --git a/Project.toml b/Project.toml index a39b4b3..1a5d77b 100644 --- a/Project.toml +++ b/Project.toml @@ -7,11 +7,12 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" [compat] Distances = "0.8.1, 0.9, 0.10" -julia = "1" +julia = "1.3" [extras] +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [targets] -test = ["Test", "Unicode"] +test = ["Test", "Unicode", "Random"] diff --git a/src/StringDistances.jl b/src/StringDistances.jl index f135480..dc4a215 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -30,6 +30,8 @@ Cosine, Jaccard, SorensenDice, Overlap, +QGramDict, +QGramSortedVector, Winkler, Partial, TokenSort, diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index 448d101..a420877 100755 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -29,7 +29,6 @@ Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram)) qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q) qgrams(s, q::Integer) = QGramIterator(collect(s), q) - @doc """ Return an iterator corresponding to the the q-gram of an iterator. When the iterator is a String, qgrams are SubStrings. @@ -47,7 +46,6 @@ end """ qgrams - # For two iterators s1 and s2, that define a length and eltype method, # this returns an iterator that, # for each element in s1 ∪ s2, returns (numbers of times it appears in s1, numbers of times it appears in s2) @@ -80,9 +78,184 @@ function _count(s1, s2) return values(d) end +# Turn a sequence of qgrams to a count dict for them, i.e. map each +# qgram to the number of times it has been seen. 
+function countdict(qgrams) + d = Dict{eltype(qgrams), Int}() + for qg in qgrams + index = Base.ht_keyindex2!(d, qg) + if index > 0 + d.age += 1 + @inbounds d.keys[index] = qg + @inbounds d.vals[index] = d.vals[index] + 1 + else + @inbounds Base._setindex!(d, 1, qg, -index) + end + end + d +end + +abstract type AbstractQGramCounts{Q,K} end +q(qc::AbstractQGramCounts{Q,K}) where {Q,K} = Q +counts(qc::AbstractQGramCounts) = qc.counts + +""" + QGramDict(s, q::Integer = 2) + +Creates a QGramDict that pre-calculates (pre-counts) the qgrams +of a string or stream. This enables faster calculation of QGram +distances. + +Note that the qgram length must correspond with the q length used +in the distance. + +## Examples +```julia +str1, str2 = "my string", "another string" +qd1 = QGramDict(str1, 2) +qd2 = QGramDict(str2, 2) +evaluate(Overlap(2), qd1, qd2) +``` +""" +struct QGramDict{Q,K} <: AbstractQGramCounts{Q,K} + counts::Dict{K,Int} +end +function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2) + @assert q >= 1 + qgs = qgrams(s, q) + QGramDict{q, eltype(qgs)}(countdict(qgs)) +end +QGramDict(s, q::Integer = 2) = QGramDict(collect(s), q) + +""" + QGramSortedVector(s, q::Integer = 2) + +Creates a QGramSortedVector that pre-calculates (pre-counts) the +qgrams of a string or stream. This enables faster calculation of +QGram distances. + +Since qgrams are sorted in lexicographic order QGram distances can be +calculated even faster than when using a QGramDict. However, the +sorting means that updating the counts after creation is less +efficient. However, for most use cases QGramSortedVector is preferred +over a QGramDict. + +Note that the qgram length must correspond with the q length used +in the distance. 
+ +## Examples +```julia +str1, str2 = "my string", "another string" +qs1 = QGramSortedVector(str1, 2) +qs2 = QGramSortedVector(str2, 2) +evaluate(Jaccard(2), qs1, qs2) +``` +""" +struct QGramSortedVector{Q,K} <: AbstractQGramCounts{Q,K} + counts::Vector{Pair{K,Int}} +end +function QGramSortedVector(s::Union{AbstractString, AbstractVector}, q::Integer = 2) + @assert q >= 1 + qgs = qgrams(s, q) + countpairs = collect(countdict(qgs)) + sort!(countpairs, by = first) + QGramSortedVector{q, eltype(qgs)}(countpairs) +end +QGramSortedVector(s, q::Integer = 2) = QGramSortedVector(collect(s), q) + +# To implement the distances we will count qgram matches +# between strings or pre-calculated AbstractQGramCounts objects. +# The abstract type defines different fallback versions which can be +# specialized by subtypes for best performance. +abstract type AbstractQGramMatchCounter end +@inline countleft!(c::AbstractQGramMatchCounter, qg, n1::Integer) = countleft!(c, n1) +@inline countright!(c::AbstractQGramMatchCounter, qg, n2::Integer) = countright!(c, n2) +@inline countboth!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer) = + countboth!(c, n1, n2) +@inline function countboth!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer) + countleft!(c, n1) + countright!(c, n2) + countshared!(c, n1, n2) +end +@inline countshared!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer) = countshared!(c, n1, n2) + +# Subtypes must implement these methods: +@inline countleft!(c::AbstractQGramMatchCounter, n1::Integer) = + error("countleft! not implemented for $(typeof(c))") +@inline countright!(c::AbstractQGramMatchCounter, n2::Integer) = + error("countright! not implemented for $(typeof(c))") + +# Subtypes must either override countboth! from above (so that it does not use countshared!) or implement: +@inline countshared!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer) = + error("countshared! 
not implemented for $(typeof(c))") + +function countmatches!(mc::AbstractQGramMatchCounter, d1::Vector{Pair{K,I}}, d2::Vector{Pair{K,I}}) where {K,I<:Integer} + i1 = i2 = 1 + while i1 <= length(d1) || i2 <= length(d2) + if i2 > length(d2) + for i in i1:length(d1) + @inbounds countleft!(mc, d1[i][1], d1[i][2]) + end + return + elseif i1 > length(d1) + for i in i2:length(d2) + @inbounds countright!(mc, d2[i][1], d2[i][2]) + end + return + end + @inbounds k1, n1 = d1[i1] + @inbounds k2, n2 = d2[i2] + cmpval = Base.cmp(k1, k2) + if cmpval == -1 # k1 < k2 + countleft!(mc, k1, n1) + i1 += 1 + elseif cmpval == +1 # k2 < k1 + countright!(mc, k2, n2) + i2 += 1 + else + countboth!(mc, k1, n1, n2) + i1 += 1 + i2 += 1 + end + end +end + +function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K,I}) where {K,I<:Integer} + for (k1, c1) in d1 + n2 = get(d2, k1, nothing) + if n2 !== nothing + countboth!(mc, k1, c1, n2) + else + countleft!(mc, k1, c1) + end + end + for (k2, c2) in d2 + # read-only lookup: Base.ht_keyindex2! is the insertion probe and may rehash d1 + if !haskey(d1, k2) + countright!(mc, k2, c2) + end + end +end abstract type QGramDistance <: SemiMetric end +function (dist::QGramDistance)(s1, s2) + ((s1 === missing) | (s2 === missing)) && return missing + counter = newcounter(dist) + for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) + countboth!(counter, n1, n2) + end + calculate(dist, counter) +end + +function (dist::QGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts} + @assert dist.q == q(qc1) + @assert dist.q == q(qc2) + counter = newcounter(dist) + countmatches!(counter, counts(qc1), counts(qc2)) + calculate(dist, counter) +end + """ QGram(q::Int) @@ -99,15 +272,17 @@ struct QGram <: QGramDistance q::Int end -function (dist::QGram)(s1, s2) - ((s1 === missing) | (s2 === missing)) && return missing - n = 0 - for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) - n += abs(n1 - n2) - end - n +mutable struct SingleCounter{T, QD<:QGramDistance} 
<: AbstractQGramMatchCounter + n::T end +newcounter(d::QGram) = SingleCounter{Int, QGram}(0) + +@inline countleft!(c::SingleCounter{Int, QGram}, n1::Integer) = c.n += n1 # n1 === abs(n1 - 0) +@inline countright!(c::SingleCounter{Int, QGram}, n2::Integer) = c.n += n2 # n2 === abs(0 - n2) +@inline countboth!(c::SingleCounter{Int, QGram}, n1::Integer, n2::Integer) = c.n += abs(n1 - n2) + +calculate(dist::QGram, c::SingleCounter{Int, QGram}) = c.n """ Cosine(q::Int) @@ -125,17 +300,20 @@ struct Cosine <: QGramDistance q::Int end -function (dist::Cosine)(s1, s2) - ((s1 === missing) | (s2 === missing)) && return missing - norm1, norm2, prodnorm = 0, 0, 0 - for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) - norm1 += n1^2 - norm2 += n2^2 - prodnorm += n1 * n2 - end - 1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2)) +mutable struct ThreeCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter + left::T + right::T + shared::T end +newcounter(d::Cosine) = ThreeCounters{Int, Cosine}(0, 0, 0) + +@inline countleft!(c::ThreeCounters{Int, Cosine}, n1::Integer) = c.left += n1^2 +@inline countright!(c::ThreeCounters{Int, Cosine}, n2::Integer) = c.right += n2^2 +@inline countshared!(c::ThreeCounters{Int, Cosine}, n1::Integer, n2::Integer) = c.shared += n1 * n2 + +calculate(d::Cosine, c::ThreeCounters{Int, Cosine}) = + 1.0 - c.shared / (sqrt(c.left) * sqrt(c.right)) """ Jaccard(q::Int) @@ -152,17 +330,8 @@ struct Jaccard <: QGramDistance q::Int end -function (dist::Jaccard)(s1, s2) - ((s1 === missing) | (s2 === missing)) && return missing - ndistinct1, ndistinct2, nintersect = 0, 0, 0 - for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) - ndistinct1 += n1 > 0 - ndistinct2 += n2 > 0 - nintersect += (n1 > 0) & (n2 > 0) - end - 1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect) -end - +calculate(d::Jaccard, c::ThreeCounters{Int, Jaccard}) = + 1.0 - c.shared / (c.left + c.right - c.shared) """ SorensenDice(q::Int) @@ -179,17 +348,8 @@ struct SorensenDice <: 
QGramDistance q::Int end -function (dist::SorensenDice)(s1, s2) - ((s1 === missing) | (s2 === missing)) && return missing - ndistinct1, ndistinct2, nintersect = 0, 0, 0 - for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) - ndistinct1 += n1 > 0 - ndistinct2 += n2 > 0 - nintersect += (n1 > 0) & (n2 > 0) - end - 1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2) -end - +calculate(d::SorensenDice, c::ThreeCounters{Int, SorensenDice}) = + 1.0 - 2.0 * c.shared / (c.left + c.right) """ Overlap(q::Int) @@ -206,14 +366,15 @@ struct Overlap <: QGramDistance q::Int end -function (dist::Overlap)(s1, s2) - ((s1 === missing) | (s2 === missing)) && return missing - ndistinct1, ndistinct2, nintersect = 0, 0, 0 - for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) - ndistinct1 += n1 > 0 - ndistinct2 += n2 > 0 - nintersect += (n1 > 0) & (n2 > 0) - end - 1.0 - nintersect / min(ndistinct1, ndistinct2) -end +const IntersectionDist = Union{Jaccard, SorensenDice, Overlap} +newcounter(d::IntersectionDist) = ThreeCounters{Int, typeof(d)}(0, 0, 0) +@inline countleft!(c::ThreeCounters{Int, QD}, n1::Integer) where {QD<:IntersectionDist} = + c.left += (n1 > 0) +@inline countright!(c::ThreeCounters{Int, QD}, n2::Integer) where {QD<:IntersectionDist} = + c.right += (n2 > 0) +@inline countshared!(c::ThreeCounters{Int, QD}, n1::Integer, n2::Integer) where {QD<:IntersectionDist} = + c.shared += (n1 > 0) & (n2 > 0) + +calculate(d::Overlap, c::ThreeCounters{Int, Overlap}) = + 1.0 - c.shared / min(c.left, c.right) \ No newline at end of file diff --git a/test/distances.jl b/test/distances.jl index e688b48..42bafae 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -1,5 +1,4 @@ - -using StringDistances, Unicode, Test +using StringDistances, Unicode, Test, Random @testset "Distances" begin @@ -15,9 +14,6 @@ using StringDistances, Unicode, Test @test ismissing(evaluate(Jaro(), "", missing)) end - - - @testset "Levenshtein" begin @test evaluate(Levenshtein(), "", "") == 0 
@test evaluate(Levenshtein(), "abc", "") == 3 @@ -70,7 +66,6 @@ using StringDistances, Unicode, Test @test ismissing(evaluate(RatcliffObershelp(), "", missing)) end - @testset "QGram" begin @test evaluate(QGram(1), "abc", "abc") == 0 @test evaluate(QGram(1), "", "abc") == 3 @@ -85,8 +80,6 @@ using StringDistances, Unicode, Test @inferred evaluate(QGram(1), "", "") end - - @testset "Cosine" begin @test isnan(evaluate(Cosine(2), "", "abc")) @test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4 @@ -130,8 +123,99 @@ using StringDistances, Unicode, Test @test ismissing(evaluate(Overlap(1), "", missing)) end + @testset "QGramDict and QGramSortedVector counts qgrams" begin + # To get something we can more easily compare to: + stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p)) + stringify(p::Pair{V, <:Integer}) where {S<:AbstractString,V<:AbstractVector{S}} = (map(string, first(p)), last(p)) + sortedcounts(qc) = sort(collect(StringDistances.counts(qc)), by = first) + totuples(qc) = map(stringify, sortedcounts(qc)) + s1, s2 = "arnearne", "arnebeda" + qd1, qd2 = QGramDict(s1, 2), QGramDict(s2, 2) + @test totuples(qd1) == [("ar", 2), ("ea", 1), ("ne", 2), ("rn", 2)] + @test totuples(qd2) == [("ar", 1), ("be", 1), ("da", 1), ("eb", 1), ("ed", 1), ("ne", 1), ("rn", 1)] + + qc1, qc2 = QGramSortedVector(s1, 2), QGramSortedVector(s2, 2) + @test totuples(qc1) == [("ar", 2), ("ea", 1), ("ne", 2), ("rn", 2)] + @test totuples(qc2) == [("ar", 1), ("be", 1), ("da", 1), ("eb", 1), ("ed", 1), ("ne", 1), ("rn", 1)] + + s3 = "rgówów" + qd3a = QGramDict(s3, 2) + @test totuples(qd3a) == [("gó", 1), ("rg", 1), ("wó", 1), ("ów", 2)] + + qd3b = QGramDict(graphemes(s3), 2) + @test totuples(qd3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)] + + qc3a = QGramSortedVector(s3, 2) + @test totuples(qc3a) == [("gó", 1), ("rg", 1), ("wó", 1), ("ów", 2)] + + qc3b = QGramSortedVector(graphemes(s3), 2) + @test totuples(qc3b) == [(["g", "ó"], 1), (["r", "g"], 
1), (["w", "ó"], 1), (["ó", "w"], 2)] + end + + function partlyoverlappingstrings(sizerange, chars = []) + str1 = if length(chars) < 1 + randstring(rand(sizerange)) + else + randstring(chars, rand(sizerange)) + end + elems = collect(str1) + ci1 = prevind(str1, rand(2:div(length(elems), 2))) + ci2 = prevind(str1, rand((ci1+1):(length(elems)-1))) + str2 = if length(chars) < 1 + randstring(ci1-1) * join(elems[ci1:ci2]) * randstring(length(str1)-ci2) + else + randstring(chars, ci1-1) * join(elems[ci1:ci2]) * randstring(chars, length(str1)-ci2) + end + return str1, str2 + end + + @testset "Precalculation on unicode strings" begin + Chars = vcat(map(collect, ["δσμΣèìòâôîêûÊÂÛ", 'a':'z', '0':'9'])...) + for _ in 1:100 + str1, str2 = partlyoverlappingstrings(10:100, Chars) + qlen = rand(2:5) + d = Jaccard(qlen) + + qd1 = QGramDict(str1, qlen) + qd2 = QGramDict(str2, qlen) + @test evaluate(d, str1, str2) == evaluate(d, qd1, qd2) + + qd1b = QGramDict(graphemes(str1), qlen) + qd2b = QGramDict(graphemes(str2), qlen) + @test evaluate(d, str1, str2) == evaluate(d, qd1b, qd2b) + + qc1 = QGramSortedVector(str1, qlen) + qc2 = QGramSortedVector(str2, qlen) + @test evaluate(d, str1, str2) == evaluate(d, qc1, qc2) + + qc1b = QGramSortedVector(graphemes(str1), qlen) + qc2b = QGramSortedVector(graphemes(str2), qlen) + @test evaluate(d, str1, str2) == evaluate(d, qc1b, qc2b) + end + end + + @testset "Differential testing of String, QGramDict, and QGramSortedVector" begin + for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap] + for _ in 1:100 + qlen = rand(2:9) + dist = D(qlen) + str1, str2 = partlyoverlappingstrings(5:10000) + + # QGramDict gets same result as for standard string + qd1 = QGramDict(str1, qlen) + qd2 = QGramDict(str2, qlen) + expected = evaluate(dist, str1, str2) + @test expected == evaluate(dist, qd1, qd2) + + # QGramSortedVector gets same result as for standard string + qc1 = QGramSortedVector(str1, qlen) + qc2 = QGramSortedVector(str2, qlen) + @test expected == 
evaluate(dist, qc1, qc2) + end + end + end strings = [ ("martha", "marhta"),