2020-02-08 18:03:02 +01:00
|
|
|
|
"""
    QGramIterator(s, q)

Iterator over the q-grams of the collection `s`, i.e. over every window of `q`
consecutive elements of `s`.

# Fields
* `s`: the underlying collection, an `AbstractString` or an `AbstractVector`
* `q::Int`: length of each q-gram
"""
struct QGramIterator{S <: Union{AbstractString, AbstractVector}}
    s::S    # Collection
    q::Int  # Length of Qgram
end
|
2020-02-08 17:38:06 +01:00
|
|
|
|
# Number of q-grams produced by the iterator: one per window of `q` elements that fits
# in the collection, and zero when the collection is shorter than `q`.
function Base.length(qgram::QGramIterator)
    nwindows = length(qgram.s) - qgram.q + 1
    return max(nwindows, 0)
end
|
2015-11-06 20:43:04 +01:00
|
|
|
|
|
2020-02-08 17:38:06 +01:00
|
|
|
|
# q-grams of AbstractString
# The state is the pair (index of the first character, index of the last character) of
# the current q-gram. Both are string indices advanced with `nextind`, so each q-gram
# always spans `q` characters whatever their encoded width.
function Base.iterate(qgram::QGramIterator{<: AbstractString},
                      state = (1, nextind(qgram.s, 0, qgram.q)))
    istart, iend = state
    # Stop as soon as the last character of the window lies past the end of the string
    iend > ncodeunits(qgram.s) && return nothing
    element = SubString(qgram.s, istart, iend)
    # Slide the window by one character
    nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
    return element, nextstate
end
|
2019-08-18 16:40:04 +02:00
|
|
|
|
# Element type of the string q-gram iterator. The methods are defined on the iterator
# *type*, as the iteration interface expects, so that generic code calling
# `eltype(typeof(itr))` gets the precise type; `eltype(itr)` on an instance keeps working
# through Base's `eltype(x) = eltype(typeof(x))` fallback.
# Taking a `SubString` of a `SubString{S}` yields a `SubString{S}` again, hence the
# dedicated (more specific) first method.
Base.eltype(::Type{QGramIterator{SubString{S}}}) where {S} = SubString{S}
Base.eltype(::Type{QGramIterator{S}}) where {S <: AbstractString} = SubString{S}
|
|
|
|
|
|
|
|
|
|
|
2020-02-08 17:49:53 +01:00
|
|
|
|
# q-grams of AbstractVector
# `partition` from IterTools would be an alternative, but it creates a new vector at each
# iteration, so it does not seem to be worth it; views into the original vector are
# returned instead.
# The state is the index of the first element of the current q-gram.
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
    stop = state + qgram.q - 1
    # Stop as soon as the window no longer fits inside the vector
    stop > lastindex(qgram.s) && return nothing
    return view(qgram.s, state:stop), state + 1
end
|
|
|
|
|
# Element type of the vector q-gram iterator: the type of the views returned by `iterate`.
# It is obtained from an empty view (an empty range is always in bounds) rather than from
# `first(qgram)`, which throws when the iterator yields no element, i.e. when the vector
# is shorter than `q`.
function Base.eltype(qgram::QGramIterator{<: AbstractVector})
    i = firstindex(qgram.s)
    return typeof(view(qgram.s, i:(i - 1)))
end
|
2019-08-17 19:13:54 +02:00
|
|
|
|
|
2020-02-09 19:37:37 +01:00
|
|
|
|
|
2019-08-17 18:15:14 +02:00
|
|
|
"""
    qgrams(s, q::Integer)

Return an iterator over the q-grams of `s`.

### Arguments
* `s`: an `AbstractString`, an `AbstractVector`, or any other iterable (which is first
  materialized with `collect`)
* `q::Integer`: length of q-gram

## Examples
```julia
for x in qgrams("hello", 2)
    println(x)
end
```
"""
qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s, q)
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
|
2019-08-17 17:56:54 +02:00
|
|
|
|
|
2019-08-18 18:52:37 +02:00
|
|
|
|
|
2019-12-18 16:17:08 +01:00
|
|
|
|
# For two iterators s1 and s2, that define a length and eltype method,
# this returns a dictionary which, for each element in s1 or s2,
# returns a tuple with the numbers of times it appears in s1 and s2.
#
# Only the public `Dict` API (`get` / `setindex!`) is used: the unexported helpers
# `Base.ht_keyindex2!` / `Base._setindex!` and the `age`, `keys` and `vals` fields are
# internals of `Dict` and are not guaranteed to stay stable across Julia releases.
# The price is a second hash lookup per element.
function count_map(s1, s2)
    K = promote_type(eltype(s1), eltype(s2))
    d = Dict{K, Tuple{Int, Int}}()
    sizehint!(d, length(s1) + length(s2))
    # First pass: occurrences in s1 go to the first slot of the tuple
    for x1 in s1
        n1, n2 = get(d, x1, (0, 0))
        d[x1] = (n1 + 1, n2)
    end
    # Second pass: occurrences in s2 go to the second slot of the tuple
    for x2 in s2
        n1, n2 = get(d, x2, (0, 0))
        d[x2] = (n1, n2 + 1)
    end
    return d
end
|
|
|
|
|
|
2020-02-12 15:41:46 +01:00
|
|
|
|
|
|
|
|
|
# Common supertype of the q-gram based distances of this file (`QGram`, `Cosine`,
# `Jaccard`, `SorensenDice`, `Overlap`), each of which stores the q-gram length in a
# field `q::Int`. `SemiMetric` is declared outside of this file.
abstract type QGramDistance <: SemiMetric end
|
|
|
|
|
|
2019-08-18 01:45:31 +02:00
|
|
|
"""
    QGram(q::Int)

Creates a QGram metric.

The distance corresponds to

``||v(s1, q) - v(s2, q)||_1``

where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s.
In other words, it is the sum over all q-grams of the absolute difference between
their number of occurrences in s1 and in s2.
"""
struct QGram <: QGramDistance
    q::Int
end
|
2015-10-24 14:59:44 +02:00
|
|
|
|
|
2020-02-12 15:41:46 +01:00
|
|
|
|
# Evaluate the QGram distance between `s1` and `s2`: the sum, over every distinct q-gram,
# of the absolute difference between its counts in `s1` and in `s2`.
# Returns `missing` if either argument is `missing`.
function (dist::QGram)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    # Each value is a tuple (count in s1, count in s2) for one distinct q-gram
    counts = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
    n = 0
    for (n1, n2) in counts
        n += abs(n1 - n2)
    end
    return n
end
|
|
|
|
|
|
2019-08-18 01:45:31 +02:00
|
|
|
"""
    Cosine(q::Int)

Creates a Cosine metric.

The distance corresponds to

``1 - v(s1, q).v(s2, q) / (||v(s1, q)|| * ||v(s2, q)||)``

where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s
"""
struct Cosine <: QGramDistance
    q::Int
end
|
2015-10-23 16:12:51 +02:00
|
|
|
|
|
2020-02-13 15:44:27 +01:00
|
|
|
|
# Evaluate the Cosine distance between `s1` and `s2`: one minus the cosine of the angle
# between their q-gram count vectors.
# Returns `missing` if either argument is `missing`. The result is `NaN` when one of the
# inputs has no q-gram, since the denominator is then zero.
function (dist::Cosine)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    # Each value is a tuple (count in s1, count in s2) for one distinct q-gram
    counts = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
    # Squared norms of the two count vectors and their dot product
    norm1, norm2, prodnorm = 0, 0, 0
    for (n1, n2) in counts
        norm1 += n1^2
        norm2 += n2^2
        prodnorm += n1 * n2
    end
    return 1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
end
|
|
|
|
|
|
2019-08-18 01:45:31 +02:00
|
|
|
"""
    Jaccard(q::Int)

Creates a Jaccard metric.

The distance corresponds to

``1 - |Q(s1, q) ∩ Q(s2, q)| / |Q(s1, q) ∪ Q(s2, q)|``

where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
"""
struct Jaccard <: QGramDistance
    q::Int
end
|
2015-10-23 16:12:51 +02:00
|
|
|
|
|
2020-02-13 15:44:27 +01:00
|
|
|
|
# Evaluate the Jaccard distance between `s1` and `s2`: one minus the ratio between the
# number of distinct q-grams present in both inputs and the number of distinct q-grams
# present in at least one of them.
# Returns `missing` if either argument is `missing`. The result is `NaN` when neither
# input has any q-gram (0 / 0).
function (dist::Jaccard)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    # Each value is a tuple (count in s1, count in s2) for one distinct q-gram
    counts = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
    ndistinct1, ndistinct2, nintersect = 0, 0, 0
    for (n1, n2) in counts
        ndistinct1 += n1 > 0
        ndistinct2 += n2 > 0
        nintersect += (n1 > 0) & (n2 > 0)
    end
    # |A ∪ B| = |A| + |B| - |A ∩ B|
    return 1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
end
|
|
|
|
|
|
2019-08-18 01:45:31 +02:00
|
|
|
"""
    SorensenDice(q::Int)

Creates a SorensenDice metric

The distance corresponds to

``1 - 2 * |Q(s1, q) ∩ Q(s2, q)| / (|Q(s1, q)| + |Q(s2, q)|)``

where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
"""
struct SorensenDice <: QGramDistance
    q::Int
end
|
2015-11-05 16:51:32 +01:00
|
|
|
|
|
2020-02-13 15:44:27 +01:00
|
|
|
|
# Evaluate the Sorensen-Dice distance between `s1` and `s2`: one minus twice the number
# of distinct q-grams present in both inputs, divided by the sum of the numbers of
# distinct q-grams of each input.
# Returns `missing` if either argument is `missing`. The result is `NaN` when neither
# input has any q-gram (0 / 0).
function (dist::SorensenDice)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    # Each value is a tuple (count in s1, count in s2) for one distinct q-gram
    counts = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
    ndistinct1, ndistinct2, nintersect = 0, 0, 0
    for (n1, n2) in counts
        ndistinct1 += n1 > 0
        ndistinct2 += n2 > 0
        nintersect += (n1 > 0) & (n2 > 0)
    end
    return 1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)
end
|
|
|
|
|
|
2019-08-18 01:45:31 +02:00
|
|
|
"""
    Overlap(q::Int)

Creates an Overlap metric

The distance corresponds to

``1 - |Q(s1, q) ∩ Q(s2, q)| / min(|Q(s1, q)|, |Q(s2, q)|)``

where ``Q(s, q)`` denotes the set of q-grams of length q for the string s
"""
struct Overlap <: QGramDistance
    q::Int
end
|
2015-11-05 16:51:32 +01:00
|
|
|
|
|
2020-02-13 15:44:27 +01:00
|
|
|
|
# Evaluate the Overlap distance between `s1` and `s2`: one minus the number of distinct
# q-grams present in both inputs, divided by the smaller of the numbers of distinct
# q-grams of each input.
# Returns `missing` if either argument is `missing`. The result is `NaN` when one of the
# inputs has no q-gram (0 / 0).
function (dist::Overlap)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    # Each value is a tuple (count in s1, count in s2) for one distinct q-gram
    counts = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
    ndistinct1, ndistinct2, nintersect = 0, 0, 0
    for (n1, n2) in counts
        ndistinct1 += n1 > 0
        ndistinct2 += n2 > 0
        nintersect += (n1 > 0) & (n2 > 0)
    end
    return 1.0 - nintersect / min(ndistinct1, ndistinct2)
end
|