Tree implementation
parent
e1d69ccfb0
commit
a680311200
|
@ -14,6 +14,7 @@ Hamming, hamming,
|
||||||
Levenshtein, levenshtein,
|
Levenshtein, levenshtein,
|
||||||
DamerauLevenshtein, damerau_levenshtein,
|
DamerauLevenshtein, damerau_levenshtein,
|
||||||
Jaro, jaro,
|
Jaro, jaro,
|
||||||
|
QGramIterator,
|
||||||
QGram, qgram,
|
QGram, qgram,
|
||||||
Cosine, cosine,
|
Cosine, cosine,
|
||||||
Jaccard, jaccard,
|
Jaccard, jaccard,
|
||||||
|
|
169
src/qgram.jl
169
src/qgram.jl
|
@ -45,40 +45,61 @@ Base.sort(qgram::QGramIterator) = sort!(collect(qgram), alg = QuickSort)
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## Define a type that iterates through a pair of sorted vector
|
## Define a Tree
|
||||||
## For each element in either v1 or v2, output number of times it appears in v1 and the number of times it appears in v2
|
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
type PairSortedIterator{T1 <: AbstractVector, T2 <: AbstractVector}
|
abstract Tree{K, V}
|
||||||
v1::T1
|
type EmptyTree{K,V} <: Tree{K, V}
|
||||||
v2::T2
|
|
||||||
end
|
end
|
||||||
Base.start(s::PairSortedIterator) = (1, 1)
|
|
||||||
|
|
||||||
function Base.next(s::PairSortedIterator, state)
|
type TreeNode{K,V} <: Tree{K, V}
|
||||||
state1, state2 = state
|
key:: K
|
||||||
iter1 = done(s.v2, state2)
|
data::Tuple{V, V}
|
||||||
iter2 = done(s.v1, state1)
|
left:: Tree{K,V}
|
||||||
if iter1
|
right::Tree{K,V}
|
||||||
@inbounds x1 = s.v1[state1]
|
end
|
||||||
elseif iter2
|
|
||||||
@inbounds x2 = s.v2[state2]
|
add1!{K,V}(t::EmptyTree{K,V}, k) = TreeNode{K,V}(k, (one(V), zero(V)), t, t)
|
||||||
else
|
function add1!{K, V}(t::TreeNode{K, V}, k)
|
||||||
@inbounds x1 = s.v1[state1]
|
if t.key == k
|
||||||
@inbounds x2 = s.v2[state2]
|
a, b = t.data
|
||||||
iter1 = x1 <= x2
|
t.data = (a + one(V), b)
|
||||||
iter2 = x2 <= x1
|
elseif k < t.key
|
||||||
|
t.left = add1!(t.left, k)
|
||||||
|
else
|
||||||
|
t.right = add1!(t.right, k)
|
||||||
|
end
|
||||||
|
return t
|
||||||
|
end
|
||||||
|
|
||||||
|
add2!{K,V}(t::EmptyTree{K,V}, k) = TreeNode{K,V}(k, (zero(V), one(V)), t, t)
|
||||||
|
function add2!{K, V}(t::TreeNode{K, V}, k)
|
||||||
|
if t.key == k
|
||||||
|
a, b = t.data
|
||||||
|
t.data = (a, b + one(V))
|
||||||
|
elseif k < t.key
|
||||||
|
t.left = add2!(t.left, k)
|
||||||
|
else
|
||||||
|
t.right = add2!(t.right, k)
|
||||||
|
end
|
||||||
|
return t
|
||||||
|
end
|
||||||
|
|
||||||
|
function Tree{S}(dist, s1::S, s2::S, len1::Integer, len2::Integer)
|
||||||
|
qgram1 = QGramIterator(s1, len1, dist.q)
|
||||||
|
qgram2 = QGramIterator(s2, len2, dist.q)
|
||||||
|
t = EmptyTree{SubString{S}, UInt}()
|
||||||
|
for x in qgram1
|
||||||
|
t = add1!(t, x)
|
||||||
end
|
end
|
||||||
nextstate1 = iter1 ? searchsortedlast(s.v1, x1, state1, length(s.v1), Base.Forward) + 1 : state1
|
for x in qgram2
|
||||||
nextstate2 = iter2 ? searchsortedlast(s.v2, x2, state2, length(s.v2), Base.Forward) + 1 : state2
|
t = add2!(t, x)
|
||||||
return ((nextstate1 - state1, nextstate2 - state2), (nextstate1, nextstate2))
|
end
|
||||||
|
return t
|
||||||
end
|
end
|
||||||
|
|
||||||
function Base.done(s::PairSortedIterator, state)
|
|
||||||
state1, state2 = state
|
|
||||||
done(s.v2, state2) && done(s.v1, state1)
|
|
||||||
end
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## q-gram
|
## q-gram
|
||||||
|
@ -93,19 +114,23 @@ type QGram{T <: Integer} <: AbstractQGram
|
||||||
end
|
end
|
||||||
QGram() = QGram(2)
|
QGram() = QGram(2)
|
||||||
|
|
||||||
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
type QGramAccumulator
|
||||||
isempty(s2) && return 0
|
n::Int
|
||||||
sort1 = sort(QGramIterator(s1, len1, dist.q))
|
|
||||||
sort2 = sort(QGramIterator(s2, len2, dist.q))
|
|
||||||
n = 0
|
|
||||||
for (n1, n2) in PairSortedIterator(sort1, sort2)
|
|
||||||
n += abs(n1 - n2)
|
|
||||||
end
|
|
||||||
return n
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function qgram(s1::AbstractString, s2::AbstractString; q::Integer = 2)
|
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||||
evaluate(QGram(q), s1::AbstractString, s2::AbstractString)
|
isempty(s2) && return 0
|
||||||
|
t = Tree(dist, s1, s2, len1, len2)
|
||||||
|
acc = QGramAccumulator(0)
|
||||||
|
evalans(dist, t, acc)
|
||||||
|
return acc.n
|
||||||
|
end
|
||||||
|
evalans(dist::QGram, t::EmptyTree, acc::QGramAccumulator) = nothing
|
||||||
|
function evalans{K, V}(dist::QGram, t::TreeNode{K, V}, acc::QGramAccumulator)
|
||||||
|
n1, n2 = t.data
|
||||||
|
acc.n += n1 > n2 ? n1 - n2 : n2 - n1
|
||||||
|
evalans(dist, t.left, acc)
|
||||||
|
evalans(dist, t.right, acc)
|
||||||
end
|
end
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
@ -120,20 +145,29 @@ type Cosine{T <: Integer} <: AbstractQGram
|
||||||
end
|
end
|
||||||
Cosine() = Cosine(2)
|
Cosine() = Cosine(2)
|
||||||
|
|
||||||
|
type CosineAccumulator
|
||||||
|
norm1::Int
|
||||||
|
norm2::Int
|
||||||
|
prodnorm::Int
|
||||||
|
end
|
||||||
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||||
isempty(s2) && return 0
|
isempty(s2) && return 0.0
|
||||||
sort1 = sort(QGramIterator(s1, len1, dist.q))
|
t = Tree(dist, s1, s2, len1, len2)
|
||||||
sort2 = sort(QGramIterator(s2, len2, dist.q))
|
acc = CosineAccumulator(0, 0, 0)
|
||||||
norm1, norm2, prodnorm = 0, 0, 0
|
evalans(dist, t, acc)
|
||||||
for (n1, n2) in PairSortedIterator(sort1, sort2)
|
denominator = sqrt(acc.norm1) * sqrt(acc.norm2)
|
||||||
norm1 += n1^2
|
return denominator != 0 ? 1.0 - acc.prodnorm / denominator : s1 == s2 ? 0.0 : 1.0
|
||||||
norm2 += n2^2
|
|
||||||
prodnorm += n1 * n2
|
|
||||||
end
|
|
||||||
denominator = sqrt(norm1) * sqrt(norm2)
|
|
||||||
return denominator != 0 ? 1.0 - prodnorm / denominator : s1 == s2 ? 0.0 : 1.0
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
evalans(dist::Cosine, t::EmptyTree, acc::CosineAccumulator) = nothing
|
||||||
|
function evalans{K, V}(dist::Cosine, t::TreeNode{K, V}, acc::CosineAccumulator)
|
||||||
|
n1, n2 = t.data
|
||||||
|
acc.norm1 += n1^2
|
||||||
|
acc.norm2 += n2^2
|
||||||
|
acc.prodnorm += n1 * n2
|
||||||
|
evalans(dist, t.left, acc)
|
||||||
|
evalans(dist, t.right, acc)
|
||||||
|
end
|
||||||
function cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2)
|
function cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2)
|
||||||
evaluate(Cosine(q), s1::AbstractString, s2::AbstractString)
|
evaluate(Cosine(q), s1::AbstractString, s2::AbstractString)
|
||||||
end
|
end
|
||||||
|
@ -148,26 +182,39 @@ end
|
||||||
## return 1.0 if smaller than qgram
|
## return 1.0 if smaller than qgram
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
type Jaccard{T <: Integer} <: AbstractQGram
|
type Jaccard{T <: Integer} <: AbstractQGram
|
||||||
q::T
|
q::T
|
||||||
end
|
end
|
||||||
Jaccard() = Jaccard(2)
|
Jaccard() = Jaccard(2)
|
||||||
|
|
||||||
|
type JaccardAccumulator
|
||||||
|
ndistinct1::Int
|
||||||
|
ndistinct2::Int
|
||||||
|
nintersect::Int
|
||||||
|
end
|
||||||
|
|
||||||
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||||
isempty(s2) && return 0
|
isempty(s2) && return 0.0
|
||||||
sort1 = sort(QGramIterator(s1, len1, dist.q))
|
t = Tree(dist, s1, s2, len1, len2)
|
||||||
sort2 = sort(QGramIterator(s2, len2, dist.q))
|
acc = JaccardAccumulator(0, 0, 0)
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
evalans(dist, t, acc)
|
||||||
for (n1, n2) in PairSortedIterator(sort1, sort2)
|
denominator = acc.ndistinct1 + acc.ndistinct2 - acc.nintersect
|
||||||
ndistinct1 += n1 > 0
|
return denominator != 0 ? 1.0 - acc.nintersect / denominator : s1 == s2 ? 0.0 : 1.0
|
||||||
ndistinct2 += n2 > 0
|
end
|
||||||
nintersect += (n1 > 0) & (n2 > 0)
|
|
||||||
end
|
|
||||||
denominator = ndistinct1 + ndistinct2 - nintersect
|
evalans(dist::Jaccard, t::EmptyTree, acc::JaccardAccumulator) = nothing
|
||||||
return denominator != 0 ? 1.0 - nintersect / denominator : s1 == s2 ? 0.0 : 1.0
|
function evalans{K, V}(dist::Jaccard, t::TreeNode{K, V}, acc::JaccardAccumulator)
|
||||||
|
n1, n2 = t.data
|
||||||
|
acc.ndistinct1 += (n1 > zero(V))
|
||||||
|
acc.ndistinct2 += (n2 > zero(V))
|
||||||
|
acc.nintersect += ((n1 > zero(V)) & (n2 > zero(V)))
|
||||||
|
evalans(dist, t.left, acc)
|
||||||
|
evalans(dist, t.right, acc)
|
||||||
end
|
end
|
||||||
|
|
||||||
function jaccard(s1::AbstractString, s2::AbstractString; q::Integer = 2)
|
function jaccard(s1::AbstractString, s2::AbstractString; q::Integer = 2)
|
||||||
evaluate(Jaccard(q), s1::AbstractString, s2::AbstractString)
|
evaluate(Jaccard(q), s1::AbstractString, s2::AbstractString)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue