|
|
|
@ -4,73 +4,22 @@
|
|
|
|
|
##
|
|
|
|
|
##############################################################################
|
|
|
|
|
|
|
|
|
|
struct QGramIterator{S <: AbstractString, T <: Integer}
|
|
|
|
|
struct QGramIterator{S <: AbstractString, N}
|
|
|
|
|
s::S # grapheme
|
|
|
|
|
l::Int # length of string
|
|
|
|
|
q::T # length of q-grams
|
|
|
|
|
end
|
|
|
|
|
# Return the q-gram length `N`, which is carried as the second type parameter of the iterator.
param(::QGramIterator{<:Any, N}) where {N} = N
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
function Base.iterate(qgram::QGramIterator,
|
|
|
|
|
state = (1, qgram.l < qgram.q ? lastindex(qgram.s) + 1 : nextind(qgram.s, 0, qgram.q)))
|
|
|
|
|
function Base.iterate(qgram::QGramIterator{S, N},
|
|
|
|
|
state = (1, qgram.l < N ? lastindex(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N}
|
|
|
|
|
istart, iend = state
|
|
|
|
|
iend > ncodeunits(qgram.s) && return nothing
|
|
|
|
|
element = SubString(qgram.s, istart, iend)
|
|
|
|
|
element = qgram.s[istart:iend]
|
|
|
|
|
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
|
|
|
|
element, nextstate
|
|
|
|
|
end
|
|
|
|
|
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
|
|
|
|
|
Base.eltype(qgram::QGramIterator{S}) where {S <: SubString} = S
|
|
|
|
|
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
|
|
|
|
|
|
|
|
|
##############################################################################
|
|
|
|
|
##
|
|
|
|
|
## CountIterator that uses Binary Search
|
|
|
|
|
##
|
|
|
|
|
## For each element in union{v1, v2}, this iterator outputs the number of times it appears in v1 and the number of times it appears in v2
|
|
|
|
|
## v1 and v2 must be sorted vectors
|
|
|
|
|
##
|
|
|
|
|
##############################################################################
|
|
|
|
|
struct CountIteratorBinary{T1, T2}
|
|
|
|
|
v1::Vector{T1}
|
|
|
|
|
v2::Vector{T2}
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
function Base.collect(qgram::QGramIterator)
|
|
|
|
|
x = Array{eltype(qgram)}(undef, length(qgram))
|
|
|
|
|
i = 0
|
|
|
|
|
for q in qgram
|
|
|
|
|
i += 1
|
|
|
|
|
@inbounds x[i] = q
|
|
|
|
|
end
|
|
|
|
|
x
|
|
|
|
|
end
|
|
|
|
|
Base.sort(qgram::QGramIterator) = sort!(collect(qgram))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
function CountIteratorBinary(s1::QGramIterator, s2::QGramIterator)
|
|
|
|
|
CountIteratorBinary(sort(s1), sort(s2))
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
function Base.iterate(s::CountIteratorBinary, state = (1, 1))
|
|
|
|
|
state1, state2 = state
|
|
|
|
|
iter1 = state2 > length(s.v2)
|
|
|
|
|
iter2 = state1 > length(s.v1)
|
|
|
|
|
iter2 && iter1 && return nothing
|
|
|
|
|
if iter1
|
|
|
|
|
x1 = s.v1[state1]
|
|
|
|
|
elseif iter2
|
|
|
|
|
x2 = s.v2[state2]
|
|
|
|
|
else
|
|
|
|
|
x1 = s.v1[state1]
|
|
|
|
|
x2 = s.v2[state2]
|
|
|
|
|
iter1 = x1 <= x2
|
|
|
|
|
iter2 = x2 <= x1
|
|
|
|
|
end
|
|
|
|
|
nextstate1 = iter1 ? searchsortedlast(s.v1, x1, state1, length(s.v1), Base.Forward) + 1 : state1
|
|
|
|
|
nextstate2 = iter2 ? searchsortedlast(s.v2, x2, state2, length(s.v2), Base.Forward) + 1 : state2
|
|
|
|
|
((nextstate1 - state1, nextstate2 - state2), (nextstate1, nextstate2))
|
|
|
|
|
end
|
|
|
|
|
# Element type of the iterator: each q-gram has the same type `S` as the wrapped string.
# Defined on the *type* (as the iteration interface expects) so that both
# `eltype(typeof(qgram))` and, through Base's fallback, `eltype(qgram)` return `S`.
Base.eltype(::Type{<:QGramIterator{S}}) where {S} = S
|
|
|
|
|
# Number of q-grams produced: `l - N + 1` when the string holds at least `N` characters,
# and zero otherwise (`l` is the stored length of the string).
Base.length(qgram::QGramIterator{<:Any, N}) where {N} = max(0, qgram.l - N + 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
##############################################################################
|
|
|
|
@ -85,22 +34,37 @@ struct CountIteratorDictionary{T}
|
|
|
|
|
d::T
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
function CountIteratorDictionary(s1::QGramIterator, s2::QGramIterator)
|
|
|
|
|
d = Dict{eltype(s1), Tuple{Int, Int}}()
|
|
|
|
|
for ch1 in s1
|
|
|
|
|
if haskey(d, ch1)
|
|
|
|
|
t = d[ch1]
|
|
|
|
|
d[ch1] = (t[1] + 1, 0)
|
|
|
|
|
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
|
|
|
|
|
function CountIteratorDictionary(s1::QGramIterator{S1, N}, s2::QGramIterator{S2, N}) where {S1, S2, N}
|
|
|
|
|
K = eltype(s1)
|
|
|
|
|
d = Dict{K, NTuple{2, UInt8}}()
|
|
|
|
|
sizehint!(d, length(s1))
|
|
|
|
|
for ch10 in s1
|
|
|
|
|
ch1 = convert(K, ch10)
|
|
|
|
|
if !isequal(ch1, ch10)
|
|
|
|
|
throw(ArgumentError("$(limitrepr(ch10)) is not a valid key for type $K"))
|
|
|
|
|
end
|
|
|
|
|
index = Base.ht_keyindex2!(d, ch1)
|
|
|
|
|
if index > 0
|
|
|
|
|
d.age += 1
|
|
|
|
|
@inbounds d.keys[index] = ch1
|
|
|
|
|
@inbounds d.vals[index] = (d.vals[index][1] + UInt8(1), UInt8(0))
|
|
|
|
|
else
|
|
|
|
|
d[ch1] = (1, 0)
|
|
|
|
|
Base._setindex!(d, (UInt8(1), UInt8(0)), ch1, -index)
|
|
|
|
|
end
|
|
|
|
|
end
|
|
|
|
|
for ch2 in s2
|
|
|
|
|
if haskey(d, ch2)
|
|
|
|
|
t = d[ch2]
|
|
|
|
|
d[ch2] = (t[1], t[2] + 1)
|
|
|
|
|
for ch20 in s2
|
|
|
|
|
ch2 = convert(K, ch20)
|
|
|
|
|
if !isequal(ch2, ch20)
|
|
|
|
|
throw(ArgumentError("$(limitrepr(ch20)) is not a valid key for type $K"))
|
|
|
|
|
end
|
|
|
|
|
index = Base.ht_keyindex2!(d, ch2)
|
|
|
|
|
if index > 0
|
|
|
|
|
d.age += 1
|
|
|
|
|
@inbounds d.keys[index] = ch2
|
|
|
|
|
@inbounds d.vals[index] = (d.vals[index][1], d.vals[index][2] + UInt8(1))
|
|
|
|
|
else
|
|
|
|
|
d[ch2] = (0, 1)
|
|
|
|
|
Base._setindex!(d, (UInt8(0), UInt8(1)), ch2, -index)
|
|
|
|
|
end
|
|
|
|
|
end
|
|
|
|
|
return values(d)
|
|
|
|
@ -113,12 +77,14 @@ end
|
|
|
|
|
## Distance on strings is computed by set distance on qgram sets
|
|
|
|
|
##
|
|
|
|
|
##############################################################################
|
|
|
|
|
abstract type AbstractQGram <: SemiMetric end
|
|
|
|
|
abstract type AbstractQGram{N} <: SemiMetric end
|
|
|
|
|
# Return the q-gram length `N` encoded in the type parameter of a q-gram distance.
param(::AbstractQGram{N}) where {N} = N
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
|
|
|
|
|
evaluate(dist,
|
|
|
|
|
CountIteratorBinary(QGramIterator(s1, length(s1), dist.q),
|
|
|
|
|
QGramIterator(s2, length(s2), dist.q)))
|
|
|
|
|
CountIteratorDictionary(QGramIterator{typeof(s1), param(dist)}(s1, length(s1)),
|
|
|
|
|
QGramIterator{typeof(s2), param(dist)}(s2, length(s2))))
|
|
|
|
|
end
|
|
|
|
|
|
|
|
|
|
##############################################################################
|
|
|
|
@ -130,15 +96,14 @@ end
|
|
|
|
|
##
|
|
|
|
|
##############################################################################
|
|
|
|
|
|
|
|
|
|
struct QGram{T <: Integer} <: AbstractQGram
|
|
|
|
|
q::T
|
|
|
|
|
end
|
|
|
|
|
QGram() = QGram(2)
|
|
|
|
|
struct QGram{N} <: AbstractQGram{N} end
|
|
|
|
|
|
|
|
|
|
# Construct a q-gram distance whose q-gram length `x` is lifted into the type parameter.
# `x` defaults to 2, matching the other q-gram distances (Cosine, Jaccard, SorensenDice,
# Overlap), so the zero-argument form `QGram()` is available as well.
QGram(x::Integer = 2) = QGram{x}()
|
|
|
|
|
|
|
|
|
|
function evaluate(dist::QGram, countiterator)
|
|
|
|
|
n = 0
|
|
|
|
|
for (n1, n2) in countiterator
|
|
|
|
|
n += abs(n1 - n2)
|
|
|
|
|
n += abs(Int(n1) - Int(n2))
|
|
|
|
|
end
|
|
|
|
|
n
|
|
|
|
|
end
|
|
|
|
@ -150,10 +115,9 @@ end
|
|
|
|
|
## 1 - v(s1, p).v(s2, p) / (||v(s1, p)|| * ||v(s2, p)||)
|
|
|
|
|
##############################################################################
|
|
|
|
|
|
|
|
|
|
struct Cosine{T <: Integer} <: AbstractQGram
|
|
|
|
|
q::T
|
|
|
|
|
end
|
|
|
|
|
Cosine() = Cosine(2)
|
|
|
|
|
struct Cosine{N} <: AbstractQGram{N} end
|
|
|
|
|
|
|
|
|
|
Cosine(n::Integer = 2) = Cosine{n}()
|
|
|
|
|
|
|
|
|
|
function evaluate(dist::Cosine, countiterator)
|
|
|
|
|
norm1, norm2, prodnorm = 0, 0, 0
|
|
|
|
@ -174,10 +138,9 @@ end
|
|
|
|
|
##
|
|
|
|
|
##############################################################################
|
|
|
|
|
|
|
|
|
|
struct Jaccard{T <: Integer} <: AbstractQGram
|
|
|
|
|
q::T
|
|
|
|
|
end
|
|
|
|
|
Jaccard() = Jaccard(2)
|
|
|
|
|
struct Jaccard{N} <: AbstractQGram{N} end
|
|
|
|
|
|
|
|
|
|
Jaccard(n::Integer = 2) = Jaccard{n}()
|
|
|
|
|
|
|
|
|
|
function evaluate(dist::Jaccard, countiterator)
|
|
|
|
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
|
|
|
@ -196,10 +159,9 @@ end
|
|
|
|
|
## 1 - 2 * |intersect(Q(s1, q), Q(s2, q))| / (|Q(s1, q)| + |Q(s2, q)|)
|
|
|
|
|
##############################################################################
|
|
|
|
|
|
|
|
|
|
struct SorensenDice{T <: Integer} <: AbstractQGram
|
|
|
|
|
q::T
|
|
|
|
|
end
|
|
|
|
|
SorensenDice() = SorensenDice(2)
|
|
|
|
|
struct SorensenDice{N} <: AbstractQGram{N} end
|
|
|
|
|
|
|
|
|
|
SorensenDice(n::Integer = 2) = SorensenDice{n}()
|
|
|
|
|
|
|
|
|
|
function evaluate(dist::SorensenDice, countiterator)
|
|
|
|
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
|
|
|
@ -218,10 +180,9 @@ end
|
|
|
|
|
## 1 - |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q)|)
|
|
|
|
|
##############################################################################
|
|
|
|
|
|
|
|
|
|
struct Overlap{T <: Integer} <: AbstractQGram
|
|
|
|
|
q::T
|
|
|
|
|
end
|
|
|
|
|
Overlap() = Overlap(2)
|
|
|
|
|
struct Overlap{N} <: AbstractQGram{N} end
|
|
|
|
|
|
|
|
|
|
Overlap(n::Integer = 2) = Overlap{n}()
|
|
|
|
|
|
|
|
|
|
function evaluate(dist::Overlap, countiterator)
|
|
|
|
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
|
|
|
|