StringDistances.jl/src/qgram.jl

217 lines
5.4 KiB
Julia
Executable File
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

struct QGramIterator{S <: Union{AbstractString, AbstractVector}}
s::S # Collection
q::Int # Length of Qgram
end
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)
# q-grams of AbstractString
function Base.iterate(qgram::QGramIterator{<: AbstractString},
state = (1, nextind(qgram.s, 0, qgram.q)))
istart, iend = state
iend > ncodeunits(qgram.s) && return nothing
element = SubString(qgram.s, istart, iend)
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
element, nextstate
end
Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
#q-grams of AbstractVector
# Alternatively, I could also use partition in IterTools but it creates a vector for each iteration
# so it does not seem to be worth it.
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
state + qgram.q - 1 > lastindex(qgram.s) && return nothing
view(qgram.s, state:(state + qgram.q - 1)), state + 1
end
Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
"""
Return an iterator on the q-gram of a string
### Arguments
* `s` iterator
* `q::Integer`: length of q-gram
## Examples
```julia
for x in qgrams("hello", 2)
println(x)
end
```
"""
qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s, q)
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
# For two iterators x1 and x2, that define a length and eltype method,
# this returns a dictionary which, for each element in x1 or x2,
# returns a tuple with the numbers of times it appears in x1 and x2
function count_map(s1, s2)
K = promote_type(eltype(s1), eltype(s2))
d = Dict{K, Tuple{Int, Int}}()
sizehint!(d, length(s1) + length(s2))
# I use a faster way to change a dictionary key
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
for x1 in s1
index = Base.ht_keyindex2!(d, x1)
if index > 0
d.age += 1
@inbounds d.keys[index] = x1
@inbounds d.vals[index] = (d.vals[index][1] + 1, 0)
else
@inbounds Base._setindex!(d, (1, 0), x1, -index)
end
end
for x2 in s2
index = Base.ht_keyindex2!(d, x2)
if index > 0
d.age += 1
@inbounds d.keys[index] = x2
@inbounds d.vals[index] = (d.vals[index][1], d.vals[index][2] + 1)
else
@inbounds Base._setindex!(d, (0, 1), x2, -index)
end
end
return d
end
abstract type QGramDistance <: SemiMetric end
"""
QGram(q::Int)
Creates a QGram metric.
The distance corresponds to
``||v(s1, q) - v(s2, q)||``
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s
"""
struct QGram <: QGramDistance
q::Int
end
function (dist::QGram)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
n = 0
for (n1, n2) in itr
n += abs(n1 - n2)
end
n
end
"""
Cosine(q::Int)
Creates a Cosine metric.
The distance corresponds to
`` 1 - v(s1, q).v(s2, q) / ||v(s1, q)|| * ||v(s2, q)||``
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s
"""
struct Cosine <: QGramDistance
q::Int
end
function (dist::Cosine)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in itr
norm1 += n1^2
norm2 += n2^2
prodnorm += n1 * n2
end
1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
end
"""
Jaccard(q::Int)
Creates a Jaccard metric.
The distance corresponds to
``1 - |Q(s1, q) ∩ Q(s2, q)| / |Q(s1, q) Q(s2, q))|``
where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
"""
struct Jaccard <: QGramDistance
q::Int
end
function (dist::Jaccard)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
end
1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
end
"""
SorensenDice(q::Int)
Creates a SorensenDice metric
The distance corresponds to
``1 - 2 * |Q(s1, q) ∩ Q(s2, q)| / (|Q(s1, q)| + |Q(s2, q))|)``
where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
"""
struct SorensenDice <: QGramDistance
q::Int
end
function (dist::SorensenDice)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
end
1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)
end
"""
Overlap(q::Int)
Creates a Overlap metric
The distance corresponds to
``1 - |Q(s1, q) ∩ Q(s2, q)| / min(|Q(s1, q)|, |Q(s2, q)|)``
where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
"""
struct Overlap <: QGramDistance
q::Int
end
function (dist::Overlap)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
end
1.0 - nintersect / min(ndistinct1, ndistinct2)
end