221 lines
6.0 KiB
Julia
221 lines
6.0 KiB
Julia
##############################################################################
|
|
##
|
|
## Define QGram Distance type
|
|
##
|
|
##############################################################################
|
|
# Common supertype of the q-gram based distances defined in this file
# (QGram, Cosine and Jaccard all declare `<: AbstractQGram`).
abstract AbstractQGram <: SemiMetric
|
|
|
|
##############################################################################
|
|
##
|
|
## Define a type that iterates through q-grams of a string
|
|
##
|
|
##############################################################################
|
|
|
|
# Iterator over the q-grams of a string.
#
# Fields:
#   s -- the string being scanned
#   l -- length of the string (the iterator's `length` is computed from this
#        field rather than from `s`)
#   q -- length of the q-grams
type QGramIterator{S <: AbstractString, T <: Integer}
    s::S
    l::Int
    q::T
end
|
|
# Initial iteration state: a pair of indices into `qgram.s`, namely
# (index where the current q-gram starts, index where it ends).
#
# The stored length `qgram.l` is used instead of recomputing `length(qgram.s)`;
# this avoids a redundant character count and keeps `start` consistent with
# `Base.length(::QGramIterator)`, which also relies on `qgram.l`.
function Base.start(qgram::QGramIterator)
    s = qgram.s
    # When the string is shorter than q there is no q-gram at all: put the end
    # index past the end of the string so that `done` is true immediately.
    # Otherwise the first q-gram ends at the index of the q-th character.
    iend = qgram.l < qgram.q ? endof(s) + 1 : chr2ind(s, qgram.q)
    return (1, iend)
end
|
|
# Return the q-gram delimited by `state` together with the following state.
# `state` is the (start index, end index) pair of the current q-gram; both
# indices are advanced by one character with `nextind`.
function Base.next(qgram::QGramIterator, state)
    first_index, last_index = state
    s = qgram.s
    gram = SubString(s, first_index, last_index)
    return gram, (nextind(s, first_index), nextind(s, last_index))
end
|
|
# Iteration is finished once the end index of the current q-gram has run past
# the end of the underlying string.
function Base.done(qgram::QGramIterator, state)
    _, iend = state
    return done(qgram.s, iend)
end
|
|
# Element type of the iterator: q-grams are returned as substrings of `s`.
# The method is defined on the iterator *type* so that generic code querying
# `eltype(typeof(it))` gets the right answer too; the instance method is kept
# so that existing `eltype(it)` calls behave exactly as before.
Base.eltype{S <: AbstractString, T <: Integer}(::Type{QGramIterator{S, T}}) = SubString{S}
Base.eltype(qgram::QGramIterator) = eltype(typeof(qgram))
|
|
# Number of q-grams produced: l - q + 1, or 0 when that would be negative
# (string shorter than q).
function Base.length(qgram::QGramIterator)
    ngrams = qgram.l - qgram.q + 1
    return max(ngrams, 0)
end
|
|
# Materialize all q-grams into a preallocated vector.
function Base.collect(qgram::QGramIterator)
    out = Array(eltype(qgram), length(qgram))
    # `out` was sized with length(qgram); bounds checks are skipped on that basis.
    for (i, gram) in enumerate(qgram)
        @inbounds out[i] = gram
    end
    return out
end
|
|
# Return the q-grams as a sorted vector (collected first, then sorted in place
# with QuickSort).
function Base.sort(qgram::QGramIterator)
    grams = collect(qgram)
    return sort!(grams, alg = QuickSort)
end
|
|
|
|
##############################################################################
|
|
##
|
|
## Define a Tree
|
|
##
|
|
##############################################################################
|
|
|
|
# Binary search tree with keys of type K and a pair of counters of type V
# attached to every key.
abstract Tree{K, V}

# Empty subtree (used for missing children and as the initial tree).
type EmptyTree{K, V} <: Tree{K, V}
end

# Tree node.
#   key   -- the key stored at this node
#   data  -- pair of counters associated with `key`
#   left  -- subtree holding the keys smaller than `key`
#   right -- subtree holding the remaining keys
type TreeNode{K, V} <: Tree{K, V}
    key::K
    data::Tuple{V, V}
    left::Tree{K, V}
    right::Tree{K, V}
end
|
|
|
|
# add1!(t, k): record one more occurrence of key `k` in the FIRST counter.
# Returns the (possibly new) root of the subtree, so callers must rebind:
# `t = add1!(t, k)`.

# Inserting into an empty subtree creates a fresh node with counters (1, 0);
# the empty tree itself is reused for both children.
function add1!{K, V}(t::EmptyTree{K, V}, k)
    return TreeNode{K, V}(k, (one(V), zero(V)), t, t)
end

# Inserting into a node: bump the first counter on a key match, otherwise
# descend left (k < key) or right. No rebalancing is performed.
function add1!{K, V}(t::TreeNode{K, V}, k)
    if t.key == k
        count1, count2 = t.data
        t.data = (count1 + one(V), count2)
        return t
    end
    if k < t.key
        t.left = add1!(t.left, k)
    else
        t.right = add1!(t.right, k)
    end
    return t
end
|
|
|
|
# add2!(t, k): record one more occurrence of key `k` in the SECOND counter.
# Returns the (possibly new) root of the subtree, so callers must rebind:
# `t = add2!(t, k)`.

# Inserting into an empty subtree creates a fresh node with counters (0, 1);
# the empty tree itself is reused for both children.
function add2!{K, V}(t::EmptyTree{K, V}, k)
    return TreeNode{K, V}(k, (zero(V), one(V)), t, t)
end

# Inserting into a node: bump the second counter on a key match, otherwise
# descend left (k < key) or right. No rebalancing is performed.
function add2!{K, V}(t::TreeNode{K, V}, k)
    if t.key == k
        count1, count2 = t.data
        t.data = (count1, count2 + one(V))
        return t
    end
    if k < t.key
        t.left = add2!(t.left, k)
    else
        t.right = add2!(t.right, k)
    end
    return t
end
|
|
|
|
# Build the tree of q-gram counts for a pair of strings.
#
# Every q-gram of `s1` is counted in the first counter of its node and every
# q-gram of `s2` in the second one. The q-gram length is read from `dist.q`;
# `len1` / `len2` are forwarded to the iterators as the string lengths.
# Both strings must have the same type S.
function Tree{S}(dist, s1::S, s2::S, len1::Integer, len2::Integer)
    t = EmptyTree{SubString{S}, UInt}()
    for gram in QGramIterator(s1, len1, dist.q)
        t = add1!(t, gram)
    end
    for gram in QGramIterator(s2, len2, dist.q)
        t = add2!(t, gram)
    end
    return t
end
|
|
|
|
|
|
##############################################################################
|
|
##
|
|
## q-gram
|
|
## Define v(s) as the vector, indexed by q-grams, whose entry v(s)[p] is the number of times the q-gram p occurs in s
|
|
## For instance, with q = 2, v("leila")["il"] = 1
|
|
## The q-gram distance is ∑_p |v(s1)[p] - v(s2)[p]|
|
|
##
|
|
##############################################################################
|
|
|
|
# q-gram distance.
#   q -- length of the q-grams
type QGram{T <: Integer} <: AbstractQGram
    q::T
end

# Default: q-grams of length 2.
QGram() = QGram(2)
|
|
|
|
# Mutable accumulator used while walking the q-gram tree.
#   n -- running total of the q-gram distance
type QGramAccumulator
    n::Int
end
|
|
|
|
# q-gram distance between `s1` and `s2`: the sum over all q-grams of the
# absolute difference between their occurrence counts in the two strings.
#
# `len1` / `len2` are forwarded to the tree builder as the string lengths.
# NOTE(review): only `s2` is tested for emptiness; this presumably relies on
# the caller passing the longer string as `s2` -- confirm against the caller.
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString,
                  len1::Integer, len2::Integer)
    if isempty(s2)
        return 0
    end
    acc = QGramAccumulator(0)
    evalans(dist, Tree(dist, s1, s2, len1, len2), acc)
    return acc.n
end
|
|
# Tree walk for the q-gram distance: adds |n1 - n2| to `acc.n` for every node.

# Nothing to add for an empty subtree.
function evalans(dist::QGram, t::EmptyTree, acc::QGramAccumulator)
    return nothing
end

function evalans{K, V}(dist::QGram, t::TreeNode{K, V}, acc::QGramAccumulator)
    count1, count2 = t.data
    # Subtract the smaller count from the larger one instead of calling abs,
    # so the difference never goes below zero.
    if count1 > count2
        acc.n += count1 - count2
    else
        acc.n += count2 - count1
    end
    evalans(dist, t.left, acc)
    return evalans(dist, t.right, acc)
end
|
|
|
|
##############################################################################
|
|
##
|
|
## cosine
|
|
##
|
|
## 1 - v(s1, p).v(s2, p) / (||v(s1, p)|| * ||v(s2, p)||)
|
|
##############################################################################
|
|
|
|
# Cosine distance between q-gram count vectors.
#   q -- length of the q-grams
type Cosine{T <: Integer} <: AbstractQGram
    q::T
end

# Default: q-grams of length 2.
Cosine() = Cosine(2)
|
|
|
|
# Mutable accumulator used while walking the q-gram tree.
#   norm1    -- sum of squared counts for the first string
#   norm2    -- sum of squared counts for the second string
#   prodnorm -- sum of the products of the two counts
type CosineAccumulator
    norm1::Int
    norm2::Int
    prodnorm::Int
end
|
|
# Cosine distance between the q-gram count vectors of `s1` and `s2`:
#     1 - <v1, v2> / (||v1|| * ||v2||)
#
# `len1` / `len2` are forwarded to the tree builder as the string lengths.
# Returns a Float64. When neither string yields a q-gram (zero denominator)
# the result is 0.0 if the strings are equal and 1.0 otherwise.
# NOTE(review): only `s2` is tested for emptiness; this presumably relies on
# the caller passing the longer string as `s2` -- confirm against the caller.
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString,
                  len1::Integer, len2::Integer)
    isempty(s2) && return 0.0
    acc = CosineAccumulator(0, 0, 0)
    evalans(dist, Tree(dist, s1, s2, len1, len2), acc)
    # Take a single square root of the product rather than multiplying two
    # square roots: sqrt(2) * sqrt(2) is 2.0000000000000004, which used to give
    # identical strings a tiny non-zero distance. The product is formed in
    # floating point so it cannot overflow Int for long strings.
    denominator = sqrt(float(acc.norm1) * float(acc.norm2))
    if denominator != 0
        # Rounding may push the ratio marginally above 1; a distance must not
        # be negative, so clamp at zero.
        return max(0.0, 1.0 - acc.prodnorm / denominator)
    end
    return s1 == s2 ? 0.0 : 1.0
end
|
|
|
|
# Tree walk for the cosine distance: accumulates the squared counts of each
# string and the product of the two counts over every node.

# Nothing to add for an empty subtree.
function evalans(dist::Cosine, t::EmptyTree, acc::CosineAccumulator)
    return nothing
end

function evalans{K, V}(dist::Cosine, t::TreeNode{K, V}, acc::CosineAccumulator)
    count1, count2 = t.data
    acc.norm1 += count1^2
    acc.norm2 += count2^2
    acc.prodnorm += count1 * count2
    evalans(dist, t.left, acc)
    return evalans(dist, t.right, acc)
end
|
|
# Convenience wrapper: cosine distance between `s1` and `s2` using q-grams of
# length `q` (2 by default).
function cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2)
    return evaluate(Cosine(q), s1, s2)
end
|
|
|
|
##############################################################################
|
|
##
|
|
## Jaccard
|
|
##
|
|
## Denote Q(s, q) the set of tuple of length q in s
|
|
## 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))|
|
|
##
|
|
## If neither string contains a q-gram (both are shorter than q), return 0.0 when s1 == s2 and 1.0 otherwise
|
|
##
|
|
##############################################################################
|
|
# Jaccard distance between sets of q-grams.
#   q -- length of the q-grams
type Jaccard{T <: Integer} <: AbstractQGram
    q::T
end

# Default: q-grams of length 2.
Jaccard() = Jaccard(2)
|
|
|
|
# Mutable accumulator used while walking the q-gram tree.
#   ndistinct1 -- number of distinct q-grams occurring in the first string
#   ndistinct2 -- number of distinct q-grams occurring in the second string
#   nintersect -- number of distinct q-grams occurring in both strings
type JaccardAccumulator
    ndistinct1::Int
    ndistinct2::Int
    nintersect::Int
end
|
|
|
|
# Jaccard distance between the sets of q-grams of `s1` and `s2`:
#     1 - |intersection| / |union|
#
# `len1` / `len2` are forwarded to the tree builder as the string lengths.
# Returns a Float64. When neither string yields a q-gram (empty union) the
# result is 0.0 if the strings are equal and 1.0 otherwise.
# NOTE(review): only `s2` is tested for emptiness; this presumably relies on
# the caller passing the longer string as `s2` -- confirm against the caller.
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString,
                  len1::Integer, len2::Integer)
    if isempty(s2)
        return 0.0
    end
    acc = JaccardAccumulator(0, 0, 0)
    evalans(dist, Tree(dist, s1, s2, len1, len2), acc)
    # |A ∪ B| = |A| + |B| - |A ∩ B|
    nunion = acc.ndistinct1 + acc.ndistinct2 - acc.nintersect
    if nunion != 0
        return 1.0 - acc.nintersect / nunion
    elseif s1 == s2
        return 0.0
    else
        return 1.0
    end
end
|
|
|
|
|
|
# Tree walk for the Jaccard distance: every node is one distinct q-gram; count
# it for each string in which it occurs and for the intersection when it
# occurs in both.

# Nothing to add for an empty subtree.
function evalans(dist::Jaccard, t::EmptyTree, acc::JaccardAccumulator)
    return nothing
end

function evalans{K, V}(dist::Jaccard, t::TreeNode{K, V}, acc::JaccardAccumulator)
    count1, count2 = t.data
    in_first = count1 > zero(V)
    in_second = count2 > zero(V)
    # Booleans are added as 0 / 1.
    acc.ndistinct1 += in_first
    acc.ndistinct2 += in_second
    acc.nintersect += (in_first & in_second)
    evalans(dist, t.left, acc)
    return evalans(dist, t.right, acc)
end
|
|
|
|
# Convenience wrapper: Jaccard distance between `s1` and `s2` using q-grams of
# length `q` (2 by default).
function jaccard(s1::AbstractString, s2::AbstractString; q::Integer = 2)
    return evaluate(Jaccard(q), s1, s2)
end
|
|
|
|
|