StringDistances.jl/src/qgram.jl

214 lines
5.4 KiB
Julia
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

struct QGramIterator{S <: Union{AbstractString, AbstractVector}}
s::S # Collection
q::Int # Length of Qgram
end
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)
# q-grams of AbstractString
function Base.iterate(qgram::QGramIterator{<: AbstractString},
state = (1, nextind(qgram.s, 0, qgram.q)))
istart, iend = state
iend > ncodeunits(qgram.s) && return nothing
element = SubString(qgram.s, istart, iend)
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
element, nextstate
end
Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)
#q-grams of AbstractVector
# Alternatively, I could also use partition in IterTools but it creates a vector for each iteration
# so it does not seem to be worth it.
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
state + qgram.q - 1 > lastindex(qgram.s) && return nothing
view(qgram.s, state:(state + qgram.q - 1)), state + 1
end
Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q)
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
@doc """
Return an iterator on the q-gram of a string
### Arguments
* `s` iterator
* `q::Integer`: length of q-gram
## Examples
```julia
for x in qgrams("hello", 2)
println(x)
end
```
"""
qgrams
# For two iterators s1 and s2, that define a length and eltype method,
# this returns an iterator that,
# for each element in s1 s2, returns (numbers of times it appears in s1, numbers of times it appears in s2)
function _count(s1, s2)
K = promote_type(eltype(s1), eltype(s2))
d = Dict{K, Tuple{Int, Int}}()
sizehint!(d, length(s1) + length(s2))
# I use a faster way to change a dictionary key
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
for x1 in s1
index = Base.ht_keyindex2!(d, x1)
if index > 0
d.age += 1
@inbounds d.keys[index] = x1
@inbounds d.vals[index] = (d.vals[index][1] + 1, 0)
else
@inbounds Base._setindex!(d, (1, 0), x1, -index)
end
end
for x2 in s2
index = Base.ht_keyindex2!(d, x2)
if index > 0
d.age += 1
@inbounds d.keys[index] = x2
@inbounds d.vals[index] = (d.vals[index][1], d.vals[index][2] + 1)
else
@inbounds Base._setindex!(d, (0, 1), x2, -index)
end
end
return values(d)
end
abstract type QGramDistance <: SemiMetric end
"""
QGram(q::Int)
Creates a QGram distance.
The distance corresponds to
``||v(s1, q) - v(s2, q)||``
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s
"""
struct QGram <: QGramDistance
q::Int
end
function (dist::QGram)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
n = 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
n += abs(n1 - n2)
end
n
end
"""
Cosine(q::Int)
Creates a Cosine distance.
The distance corresponds to
`` 1 - v(s1, q).v(s2, q) / ||v(s1, q)|| * ||v(s2, q)||``
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s
"""
struct Cosine <: QGramDistance
q::Int
end
function (dist::Cosine)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
norm1 += n1^2
norm2 += n2^2
prodnorm += n1 * n2
end
1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
end
"""
Jaccard(q::Int)
Creates a Jaccard distance.
The distance corresponds to
``1 - |Q(s1, q) ∩ Q(s2, q)| / |Q(s1, q) Q(s2, q))|``
where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
"""
struct Jaccard <: QGramDistance
q::Int
end
function (dist::Jaccard)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
end
1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
end
"""
SorensenDice(q::Int)
Creates a SorensenDice distance.
The distance corresponds to
``1 - 2 * |Q(s1, q) ∩ Q(s2, q)| / (|Q(s1, q)| + |Q(s2, q))|)``
where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
"""
struct SorensenDice <: QGramDistance
q::Int
end
function (dist::SorensenDice)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
end
1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)
end
"""
Overlap(q::Int)
Creates a Overlap distance.
The distance corresponds to
``1 - |Q(s1, q) ∩ Q(s2, q)| / min(|Q(s1, q)|, |Q(s2, q)|)``
where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
"""
struct Overlap <: QGramDistance
q::Int
end
function (dist::Overlap)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
end
1.0 - nintersect / min(ndistinct1, ndistinct2)
end