380 lines
11 KiB
Julia
Executable File
380 lines
11 KiB
Julia
Executable File
"""
    QGramIterator(s, q::Int)

Lazy iterator over the q-grams of `s`, i.e. every window of `q` consecutive elements.

For an `AbstractString` each element is a `SubString` spanning `q` characters; for an
`AbstractVector` each element is a `view` of `q` consecutive entries.
Use [`qgrams`](@ref) to build one from any `Integer` length.
"""
struct QGramIterator{S <: Union{AbstractString, AbstractVector}}
    s::S    # underlying collection
    q::Int  # length of each q-gram
end

# Number of q-grams; zero when the collection holds fewer than `q` elements.
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)

# q-grams of AbstractString
# The state is `(istart, iend)`: the string indices of the first and of the last
# character of the current q-gram. Both are advanced by one character per step, so
# multi-byte characters are handled through `nextind` rather than index arithmetic.
function Base.iterate(qgram::QGramIterator{<: AbstractString},
                      state = (1, nextind(qgram.s, 0, qgram.q)))
    istart, iend = state
    # Once the last character of the window is past the end, there is no q-gram left.
    iend > ncodeunits(qgram.s) && return nothing
    element = SubString(qgram.s, istart, iend)
    nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
    return element, nextstate
end

# Defined on the iterator *type* (the iteration-interface convention), so that both
# `eltype(it)` and `eltype(typeof(it))` give the right answer.
# Taking a SubString of a SubString yields a SubString of the parent string type.
Base.eltype(::Type{QGramIterator{SubString{S}}}) where {S <: AbstractString} = SubString{S}
Base.eltype(::Type{QGramIterator{S}}) where {S <: AbstractString} = SubString{S}

qgrams(s::AbstractString, q::Integer) = QGramIterator(s, Int(q))

# q-grams of AbstractVector
# Alternatively, one could use `partition` from IterTools, but it creates a vector
# for each iteration, so it does not seem to be worth it.
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
    state + qgram.q - 1 > lastindex(qgram.s) && return nothing
    return view(qgram.s, state:(state + qgram.q - 1)), state + 1
end

# The element type is the type of a view indexed by a `UnitRange`. It is obtained from
# an *empty* view so that it is also available when the iterator yields no q-gram
# (taking `first(qgram)` would throw in that case).
function Base.eltype(qgram::QGramIterator{<: AbstractVector})
    start = firstindex(qgram.s)
    return typeof(view(qgram.s, start:(start - 1)))
end

# `Int(q)` is required: the default constructor of a parametric struct does not
# convert its arguments, so any other `Integer` type would raise a `MethodError`.
qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, Int(q))
# Generic iterables are materialized first so that they can be indexed.
qgrams(s, q::Integer) = QGramIterator(collect(s), Int(q))
|
||
|
||
@doc """
|
||
Return an iterator corresponding to the q-gram of an iterator.
|
||
When the iterator is a String, qgrams are SubStrings.
|
||
|
||
### Arguments
|
||
* `s` iterator
|
||
* `q::Integer`: length of q-gram
|
||
|
||
## Examples
|
||
```julia
|
||
for x in qgrams("hello", 2)
|
||
println(x)
|
||
end
|
||
```
|
||
"""
|
||
qgrams
|
||
|
||
# For two iterators s1 and s2 that define `length` and `eltype`, return an iterator
# that yields, for each distinct element of s1 ∪ s2, the tuple
# (number of times it appears in s1, number of times it appears in s2).
# The order in which the tuples are produced is the (unspecified) order of a `Dict`.
function _count(s1, s2)
    K = promote_type(eltype(s1), eltype(s2))
    d = Dict{K, Tuple{Int, Int}}()
    sizehint!(d, length(s1) + length(s2))
    # Only the public `Dict` API is used here (rather than `Base.ht_keyindex2!`,
    # `Base._setindex!` and the `Dict` fields), so that the function does not depend
    # on the internal layout of `Dict`.
    for x1 in s1
        n1, n2 = get(d, x1, (0, 0))
        d[x1] = (n1 + 1, n2)
    end
    for x2 in s2
        n1, n2 = get(d, x2, (0, 0))
        d[x2] = (n1, n2 + 1)
    end
    return values(d)
end
|
||
|
||
# Turn a sequence of qgrams into a count dict for them, i.e. map each
# qgram to the number of times it has been seen.
# The counts are stored as `Int`, which is the value type used by the `counts` fields
# of `QGramDict` and `QGramSortedVector`, so no converting copy is needed afterwards.
function countdict(qgrams)
    d = Dict{eltype(qgrams), Int}()
    for qg in qgrams
        # Public `Dict` API only; a missing key starts from a count of zero.
        d[qg] = get(d, qg, 0) + 1
    end
    return d
end
|
||
|
||
# Supertype of the pre-computed q-gram count containers.
# `Q` is the q-gram length (a value used as type parameter) and `K` the q-gram type.
abstract type AbstractQGramCounts{Q,K} end

# q-gram length of a count container, read from its first type parameter.
function q(::AbstractQGramCounts{Q}) where {Q}
    return Q
end

# Stored counts; subtypes are expected to have a `counts` field.
function counts(qc::AbstractQGramCounts)
    return qc.counts
end
|
||
|
||
"""
    QGramDict(s, q::Integer = 2)

Creates a QGramDict that pre-calculates (pre-counts) the qgrams
of a string or stream. This enables faster calculation of QGram
distances.

Note that the qgram length must correspond with the q length used
in the distance.

`q` may be any `Integer`; it is converted to `Int` before being used as the type
parameter, so that objects built with the same length always have the same type.

## Examples
```julia
str1, str2 = "my string", "another string"
qd1 = QGramDict(str1, 2)
qd2 = QGramDict(str2, 2)
evaluate(Overlap(2), qd1, qd2)
```
"""
struct QGramDict{Q,K} <: AbstractQGramCounts{Q,K}
    counts::Dict{K,Int}  # qgram => number of occurrences
end

function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
    @assert q >= 1
    # Normalize the length: `Int32(2)` and `2` would otherwise give two distinct
    # (and mutually incompatible) types, and a `BigInt` is not a valid type parameter.
    qlen = Int(q)
    qgs = qgrams(s, qlen)
    return QGramDict{qlen, eltype(qgs)}(countdict(qgs))
end

# Any other iterable is collected into a vector first.
QGramDict(s, q::Integer = 2) = QGramDict(collect(s), q)
|
||
|
||
"""
    QGramSortedVector(s, q::Integer = 2)

Creates a QGramSortedVector that pre-calculates (pre-counts) the
qgrams of a string or stream. This enables faster calculation of
QGram distances.

Since qgrams are sorted in lexicographic order, QGram distances can be
calculated even faster than when using a QGramDict. The sorting means
that updating the counts after creation is less efficient; still, for
most use cases QGramSortedVector is preferred over a QGramDict.

Note that the qgram length must correspond with the q length used
in the distance.

`q` may be any `Integer`; it is converted to `Int` before being used as the type
parameter, so that objects built with the same length always have the same type.

## Examples
```julia
str1, str2 = "my string", "another string"
qs1 = QGramSortedVector(str1, 2)
qs2 = QGramSortedVector(str2, 2)
evaluate(Jaccard(2), qs1, qs2)
```
"""
struct QGramSortedVector{Q,K} <: AbstractQGramCounts{Q,K}
    counts::Vector{Pair{K,Int}}  # (qgram => number of occurrences), sorted by qgram
end

function QGramSortedVector(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
    @assert q >= 1
    # Normalize the length: `Int32(2)` and `2` would otherwise give two distinct
    # (and mutually incompatible) types, and a `BigInt` is not a valid type parameter.
    qlen = Int(q)
    qgs = qgrams(s, qlen)
    countpairs = collect(countdict(qgs))
    sort!(countpairs, by = first)
    return QGramSortedVector{qlen, eltype(qgs)}(countpairs)
end

# Any other iterable is collected into a vector first.
QGramSortedVector(s, q::Integer = 2) = QGramSortedVector(collect(s), q)
|
||
|
||
# To implement the distances we count qgram matches between strings or
# pre-calculated AbstractQGramCounts objects.
# The abstract type defines fallback versions of the counting methods, which can be
# specialized by subtypes for best performance.
abstract type AbstractQGramMatchCounter end

# Methods that receive the qgram itself: by default the qgram is ignored and the
# call is forwarded to the method that only takes the count(s).
@inline function countleft!(c::AbstractQGramMatchCounter, qg, n1::Integer)
    return countleft!(c, n1)
end

@inline function countright!(c::AbstractQGramMatchCounter, qg, n2::Integer)
    return countright!(c, n2)
end

@inline function countboth!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer)
    return countboth!(c, n1, n2)
end

@inline function countshared!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer)
    return countshared!(c, n1, n2)
end

# A qgram present on both sides is, by default, counted on the left, on the right,
# and as shared (in that order).
@inline function countboth!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer)
    countleft!(c, n1)
    countright!(c, n2)
    return countshared!(c, n1, n2)
end

# Subtypes must implement these methods:
@inline function countleft!(c::AbstractQGramMatchCounter, n1::Integer)
    return error("countleft! not implemented for $(typeof(c))")
end

@inline function countright!(c::AbstractQGramMatchCounter, n2::Integer)
    return error("countright! not implemented for $(typeof(c))")
end

# Subtypes must either override countboth! from above (so that it does not use
# countshared!) or implement:
@inline function countshared!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer)
    return error("countshared! not implemented for $(typeof(c))")
end
|
||
|
||
# Count the matches between two vectors of (qgram => count) pairs.
# Both vectors are walked in parallel like in a merge, which presumably relies on
# them being sorted by qgram (as done by QGramSortedVector) — verify against callers.
# A qgram found only in `d1` is reported with countleft!, only in `d2` with
# countright!, and in both with countboth!.
function countmatches!(mc::AbstractQGramMatchCounter,
                       d1::Vector{Pair{K,I}},
                       d2::Vector{Pair{K,I}}) where {K,I<:Integer}
    len1, len2 = length(d1), length(d2)
    i1 = i2 = 1
    # Merge phase: runs while both vectors still have entries.
    while i1 <= len1 && i2 <= len2
        @inbounds k1, n1 = d1[i1]
        @inbounds k2, n2 = d2[i2]
        cmpval = Base.cmp(k1, k2)
        if cmpval == -1 # k1 < k2
            countleft!(mc, k1, n1)
            i1 += 1
        elseif cmpval == +1 # k2 < k1
            countright!(mc, k2, n2)
            i2 += 1
        else
            countboth!(mc, k1, n1, n2)
            i1 += 1
            i2 += 1
        end
    end
    # Drain phase: at most one of the two vectors has entries left.
    for i in i1:len1
        @inbounds countleft!(mc, d1[i][1], d1[i][2])
    end
    for i in i2:len2
        @inbounds countright!(mc, d2[i][1], d2[i][2])
    end
    return nothing
end
|
||
|
||
# Count the matches between two (qgram => count) dictionaries.
# A qgram found only in `d1` is reported with countleft!, only in `d2` with
# countright!, and in both with countboth!.
# Lookups go through the read-only public API (`get` / `haskey`), so the argument
# dictionaries are never modified and no `Dict` internals are relied upon.
function countmatches!(mc::AbstractQGramMatchCounter,
                       d1::Dict{K,I},
                       d2::Dict{K,I}) where {K,I<:Integer}
    for (k1, c1) in d1
        # The values are integers, so `nothing` safely marks a missing key.
        c2 = get(d2, k1, nothing)
        if c2 === nothing
            countleft!(mc, k1, c1)
        else
            countboth!(mc, k1, c1, c2)
        end
    end
    # Shared qgrams were handled above; only those exclusive to `d2` remain.
    for (k2, c2) in d2
        haskey(d1, k2) || countright!(mc, k2, c2)
    end
    return nothing
end
|
||
|
||
# Supertype of all q-gram based distances. The methods below read the q-gram length
# from a field named `q` and use `newcounter(dist)` and `calculate(dist, counter)`.
abstract type QGramDistance <: SemiMetric end

# Distance between two raw sequences: the q-grams of both are counted together, every
# (count in s1, count in s2) pair is fed to the counter, and the counter is turned
# into the final value. Returns `missing` if either argument is `missing`.
function (dist::QGramDistance)(s1, s2)
    if ismissing(s1) || ismissing(s2)
        return missing
    end
    counter = newcounter(dist)
    paircounts = _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
    for (n1, n2) in paircounts
        countboth!(counter, n1, n2)
    end
    return calculate(dist, counter)
end

# Distance between two pre-computed count objects of the same type. Their q-gram
# length must be the one of the distance.
function (dist::QGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
    @assert dist.q == q(qc1)
    @assert dist.q == q(qc2)
    counter = newcounter(dist)
    countmatches!(counter, counts(qc1), counts(qc2))
    return calculate(dist, counter)
end
|
||
|
||
"""
    QGram(q::Int)

Creates a QGram distance.

The distance corresponds to

``||v(s1, q) - v(s2, q)||``

where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s.
The norm is the sum of the absolute differences of the counts.
"""
struct QGram <: QGramDistance
    q::Int
end

# Counter made of a single accumulator; `QD` records the distance it is used for.
mutable struct SingleCounter{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
    n::T
end

function newcounter(d::QGram)
    return SingleCounter{Int, QGram}(0)
end

# A q-gram present on one side only contributes its full count:
# n1 === abs(n1 - 0) and n2 === abs(0 - n2).
@inline function countleft!(c::SingleCounter{Int, QGram}, n1::Integer)
    return (c.n += n1)
end

@inline function countright!(c::SingleCounter{Int, QGram}, n2::Integer)
    return (c.n += n2)
end

# A q-gram present on both sides contributes the absolute difference of its counts.
@inline function countboth!(c::SingleCounter{Int, QGram}, n1::Integer, n2::Integer)
    return (c.n += abs(n1 - n2))
end

function calculate(dist::QGram, c::SingleCounter{Int, QGram})
    return c.n
end
|
||
|
||
"""
    Cosine(q::Int)

Creates a Cosine distance.

The distance corresponds to

``1 - v(s1, q).v(s2, q) / (||v(s1, q)|| * ||v(s2, q)||)``

where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s
"""
struct Cosine <: QGramDistance
    q::Int
end

# Counter made of three accumulators (left, right, shared); `QD` records the
# distance it is used for, which selects the counting methods.
mutable struct ThreeCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
    left::T
    right::T
    shared::T
end

function newcounter(d::Cosine)
    return ThreeCounters{Int, Cosine}(0, 0, 0)
end

# `left` and `right` accumulate the squared counts of each side, `shared`
# accumulates the products of the counts.
@inline function countleft!(c::ThreeCounters{Int, Cosine}, n1::Integer)
    return (c.left += n1^2)
end

@inline function countright!(c::ThreeCounters{Int, Cosine}, n2::Integer)
    return (c.right += n2^2)
end

@inline function countshared!(c::ThreeCounters{Int, Cosine}, n1::Integer, n2::Integer)
    return (c.shared += n1 * n2)
end

function calculate(d::Cosine, c::ThreeCounters{Int, Cosine})
    norm1 = sqrt(c.left)
    norm2 = sqrt(c.right)
    return 1.0 - c.shared / (norm1 * norm2)
end
|
||
|
||
"""
    Jaccard(q::Int)

Creates a Jaccard distance.

The distance corresponds to

``1 - |Q(s1, q) ∩ Q(s2, q)| / |Q(s1, q) ∪ Q(s2, q)|``

where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
"""
struct Jaccard <: QGramDistance
    q::Int
end

# The size of the union is obtained as |left| + |right| - |shared|.
function calculate(d::Jaccard, c::ThreeCounters{Int, Jaccard})
    nunion = c.left + c.right - c.shared
    return 1.0 - c.shared / nunion
end
|
||
|
||
"""
    SorensenDice(q::Int)

Creates a SorensenDice distance.

The distance corresponds to

``1 - 2 * |Q(s1, q) ∩ Q(s2, q)| / (|Q(s1, q)| + |Q(s2, q)|)``

where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
"""
struct SorensenDice <: QGramDistance
    q::Int
end

function calculate(d::SorensenDice, c::ThreeCounters{Int, SorensenDice})
    total = c.left + c.right
    return 1.0 - (2.0 * c.shared) / total
end
|
||
|
||
"""
    Overlap(q::Int)

Creates an Overlap distance.

The distance corresponds to

``1 - |Q(s1, q) ∩ Q(s2, q)| / min(|Q(s1, q)|, |Q(s2, q)|)``

where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
"""
struct Overlap <: QGramDistance
    q::Int
end

# Distances that share the same counting methods below.
const IntersectionDist = Union{Jaccard, SorensenDice, Overlap}

function newcounter(d::IntersectionDist)
    return ThreeCounters{Int, typeof(d)}(0, 0, 0)
end

# For these distances every q-gram with a positive count adds one (not its count)
# to the corresponding accumulator.
@inline function countleft!(c::ThreeCounters{Int, QD},
                            n1::Integer) where {QD<:IntersectionDist}
    return (c.left += (n1 > 0))
end

@inline function countright!(c::ThreeCounters{Int, QD},
                             n2::Integer) where {QD<:IntersectionDist}
    return (c.right += (n2 > 0))
end

@inline function countshared!(c::ThreeCounters{Int, QD},
                              n1::Integer, n2::Integer) where {QD<:IntersectionDist}
    return (c.shared += (n1 > 0) & (n2 > 0))
end

function calculate(d::Overlap, c::ThreeCounters{Int, Overlap})
    smallest = min(c.left, c.right)
    return 1.0 - c.shared / smallest
end