add unicode support
parent
0ea1828288
commit
f7cb7f82dc
26
README.md
26
README.md
|
@ -4,25 +4,25 @@
|
||||||
|
|
||||||
# StringDistances
|
# StringDistances
|
||||||
|
|
||||||
Edit Distances
|
ASCII
|
||||||
|
|
||||||
- [x] Hamming Distance
|
- [x] Hamming Distance
|
||||||
- [x] Jaro Distance
|
- [x] Jaro Distance and Jaro-Winkler Distance
|
||||||
- [x] Jaro-Winkler Distance
|
|
||||||
- [x] Levenshtein Distance
|
- [x] Levenshtein Distance
|
||||||
- [x] Damerau-Levenshtein Distance
|
- [x] Damerau-Levenshtein Distance
|
||||||
|
- [x] Qgram Distance
|
||||||
|
- [x] Cosine Distance
|
||||||
|
- [x] Jaccard Distance
|
||||||
|
|
||||||
Q-gram Distances
|
AbstractString
|
||||||
|
|
||||||
- [x] qgram
|
- [x] Hamming Distance
|
||||||
- [x] cosine
|
- [ ] Jaro Distance and Jaro-Winkler Distance
|
||||||
- [x] jaccard
|
- [x] Levenshtein Distance
|
||||||
|
- [x] Damerau-Levenshtein Distance
|
||||||
Type supports
|
- [x] Qgram Distance
|
||||||
|
- [x] Cosine Distance
|
||||||
- [x] ASCIIString
|
- [x] Jaccard Distance
|
||||||
- [x] UTF8String
|
|
||||||
- [ ] Unicode
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,17 +5,21 @@
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
function evaluate{T}(dist::Hamming, s1::T, s2::T)
|
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
|
||||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
len1, len2 = length(s1), length(s2)
|
||||||
|
len1 > len2 && return evaluate(dist, s2, s1)
|
||||||
count = 0
|
count = 0
|
||||||
@inbounds for i in 1:length(s1)
|
|
||||||
count += s1[i] != s2[i]
|
state2 = start(s2)
|
||||||
|
for ch1 in s1
|
||||||
|
ch2, state2 = next(s2, state2)
|
||||||
|
count += ch1 != ch2
|
||||||
end
|
end
|
||||||
count += length(s2) - length(s1)
|
count += len2 - len1
|
||||||
return count
|
return count
|
||||||
end
|
end
|
||||||
|
|
||||||
hamming{T}(s1::T, s2::T) = evaluate(Hamming(), s1, s2)
|
hamming(s1::AbstractString, s2::AbstractString) = evaluate(Hamming(), s1, s2)
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
|
@ -24,111 +28,113 @@ hamming{T}(s1::T, s2::T) = evaluate(Hamming(), s1, s2)
|
||||||
## Source DamerauLevenshtein: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
## Source DamerauLevenshtein: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
# prefix common to both strings can be ignored
|
||||||
function common_suffix{T}(s1::T, s2::T)
|
function common_prefix(s1::AbstractString, s2::AbstractString)
|
||||||
len1 = length(s1)
|
start1 = start(s1)
|
||||||
len2 = length(s2)
|
start2 = start(s2)
|
||||||
while ((len1 > 0) && (s1[len1] == s2[len2]))
|
while !done(s1, start1)
|
||||||
len1 -= 1
|
ch1, nextstart1 = next(s1, start1)
|
||||||
len2 -= 1
|
ch2, nextstart2 = next(s2, start2)
|
||||||
|
ch1 != ch2 && break
|
||||||
|
start1, start2 = nextstart1, nextstart2
|
||||||
end
|
end
|
||||||
return len1, len2
|
return start1, start2
|
||||||
end
|
end
|
||||||
|
|
||||||
function common_prefix{T}(s1::T, s2::T, len1::Int, len2::Int)
|
|
||||||
start = 0
|
|
||||||
len1 == 0 && return len1, len2, start
|
|
||||||
if (s1[start + 1] == s2[start + 1])
|
|
||||||
while ((start < len1) && (s1[start + 1] == s2[start + 1]))
|
|
||||||
start += 1
|
|
||||||
end
|
|
||||||
len1 -= start
|
|
||||||
len2 -= start
|
|
||||||
len1 == 0 && return len1, len2, start
|
|
||||||
end
|
|
||||||
return len1, len2, start
|
|
||||||
end
|
|
||||||
|
|
||||||
type Levenshtein end
|
type Levenshtein end
|
||||||
|
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
|
||||||
|
len1, len2 = length(s1), length(s2)
|
||||||
|
|
||||||
function evaluate{T}(dist::Levenshtein, s1::T, s2::T)
|
len1 > len2 && return evaluate(dist, s2, s1)
|
||||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
len2 == 0 && return 0
|
||||||
length(s2) == 0 && return 0
|
|
||||||
|
|
||||||
# common
|
# common
|
||||||
len1, len2 = common_suffix(s1, s2)
|
start1, start2 = common_prefix(s1, s2)
|
||||||
len1, len2, start = common_prefix(s1, s2, len1, len2)
|
done(s1, start1) && return len2
|
||||||
len1 == 0 && return len2
|
|
||||||
|
|
||||||
dist = Array(Int, len2)
|
# distance initialized to first row of matrix
|
||||||
|
# => distance between "" and s2[1:i]
|
||||||
|
v0 = Array(Int, len2)
|
||||||
@inbounds for i2 in 1:len2
|
@inbounds for i2 in 1:len2
|
||||||
dist[i2] = i2
|
v0[i2] = i2
|
||||||
end
|
end
|
||||||
current = 0
|
current = zero(0)
|
||||||
for i1 in 1:len1
|
state1 = start1
|
||||||
ch1 = s1[start + i1]
|
i1 = 0
|
||||||
left = current = i1 - 1
|
while !done(s1, state1)
|
||||||
for i2 in 1:len2
|
i1 += 1
|
||||||
above = current
|
ch1, state1 = next(s1, state1)
|
||||||
current = left
|
left = (i1 - 1)
|
||||||
left = dist[i2]
|
current = (i1 - 1)
|
||||||
if ch1 != s2[start + i2]
|
state2 = start2
|
||||||
current += 1
|
i2 = 0
|
||||||
insDel = above + 1
|
while !done(s2, state2)
|
||||||
if insDel < current
|
i2 += 1
|
||||||
current = insDel
|
ch2, state2 = next(s2, state2)
|
||||||
|
# update
|
||||||
|
above, current, left = current, left, v0[i2]
|
||||||
|
if ch1 != ch2
|
||||||
|
# substitution
|
||||||
|
current = min(current + 1,
|
||||||
|
above + 1,
|
||||||
|
left + 1)
|
||||||
end
|
end
|
||||||
insDel = left + 1
|
v0[i2] = current
|
||||||
if insDel < current
|
|
||||||
current = insDel
|
|
||||||
end
|
|
||||||
end
|
|
||||||
dist[i2] = current
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
return current
|
return current
|
||||||
end
|
end
|
||||||
|
function levenshtein(s1::AbstractString, s2::AbstractString)
|
||||||
levenshtein{T}(s1::T, s2::T) = evaluate(Levenshtein(), s1, s2)
|
evaluate(Levenshtein(), s1, s2)
|
||||||
|
end
|
||||||
|
|
||||||
type DamerauLevenshtein end
|
type DamerauLevenshtein end
|
||||||
|
|
||||||
function evaluate{T}(dist::DamerauLevenshtein, s1::T, s2::T)
|
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
|
||||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
||||||
length(s2) == 0 && return 0
|
length(s2) == 0 && return 0
|
||||||
|
|
||||||
# common
|
# common
|
||||||
len1, len2 = common_suffix(s1, s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1, len2, start = common_prefix(s1, s2, len1, len2)
|
start1, start2 = common_prefix(s1, s2)
|
||||||
len1 == 0 && return len2
|
done(s1, start1) && return len2
|
||||||
|
|
||||||
dist = Array(Int, length(s2))
|
v0 = Array(Int, length(s2))
|
||||||
@inbounds for i2 in 1:len2
|
@inbounds for i2 in 1:len2
|
||||||
dist[i2] = i2
|
v0[i2] = i2
|
||||||
end
|
end
|
||||||
dist2 = Array(Int, length(s2))
|
v2 = Array(Int, length(s2))
|
||||||
|
|
||||||
ch1 = s1[1]
|
ch1, = next(s1, start1)
|
||||||
current = 0
|
current = 0
|
||||||
for i1 in 1:len1
|
state1 = start1
|
||||||
|
i1 = 0
|
||||||
|
while !done(s1, state1)
|
||||||
|
i1 += 1
|
||||||
prevch1 = ch1
|
prevch1 = ch1
|
||||||
ch1 = s1[start + i1]
|
ch1, state1 = next(s1, state1)
|
||||||
ch2 = s2[start + 1]
|
ch2, = next(s2, start2)
|
||||||
left = i1 - 1
|
left = (i1 - 1)
|
||||||
current = i1
|
current = i1
|
||||||
nextTransCost = 0
|
nextTransCost = 0
|
||||||
for i2 in 1:len2
|
state2 = start2
|
||||||
|
i2 = 0
|
||||||
|
while !done(s2, state2)
|
||||||
|
i2 += 1
|
||||||
|
prevch2 = ch2
|
||||||
|
ch2, state2 = next(s2, state2)
|
||||||
above = current
|
above = current
|
||||||
thisTransCost = nextTransCost
|
thisTransCost = nextTransCost
|
||||||
nextTransCost = dist2[i2]
|
nextTransCost = v2[i2]
|
||||||
dist2[i2] = current = left
|
# cost of diagonal (substitution)
|
||||||
left = dist[i2]
|
v2[i2] = current = left
|
||||||
prevch2 = ch2
|
# left now equals current cost (which will be diagonal at next iteration)
|
||||||
ch2 = s2[start + i2]
|
left = v0[i2]
|
||||||
if ch1 != ch2
|
if ch1 != ch2
|
||||||
|
# insertion
|
||||||
if left < current
|
if left < current
|
||||||
current = left
|
current = left
|
||||||
end
|
end
|
||||||
|
# deletion
|
||||||
if above < current
|
if above < current
|
||||||
current = above
|
current = above
|
||||||
end
|
end
|
||||||
|
@ -140,13 +146,13 @@ function evaluate{T}(dist::DamerauLevenshtein, s1::T, s2::T)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
dist[i2] = current
|
v0[i2] = current
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
return current
|
return current
|
||||||
end
|
end
|
||||||
|
|
||||||
damerau_levenshtein{T}(s1::T, s2::T) = evaluate(DamerauLevenshtein(), s1, s2)
|
damerau_levenshtein(s1::AbstractString, s2::AbstractString) = evaluate(DamerauLevenshtein(), s1, s2)
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
|
@ -161,7 +167,7 @@ type JaroWinkler{T1 <: Number, T2 <: Number, T3 <: Integer}
|
||||||
end
|
end
|
||||||
JaroWinkler() = JaroWinkler(0.1, 0.7, 5)
|
JaroWinkler() = JaroWinkler(0.1, 0.7, 5)
|
||||||
|
|
||||||
function evaluate{T}(dist::JaroWinkler, s1::T, s2::T)
|
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
||||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
||||||
length(s2) == 0 && return 1.0
|
length(s2) == 0 && return 1.0
|
||||||
|
|
||||||
|
@ -209,11 +215,24 @@ function evaluate{T}(dist::JaroWinkler, s1::T, s2::T)
|
||||||
return 1 - score
|
return 1 - score
|
||||||
end
|
end
|
||||||
|
|
||||||
function jaro_winkler{T}(s1::T, s2::T;
|
function jaro_winkler(s1::AbstractString, s2::AbstractString;
|
||||||
scaling_factor = 0.1, boosting_threshold = 0.7, long_threshold = 5)
|
scaling_factor = 0.1, boosting_threshold = 0.7, long_threshold = 5)
|
||||||
evaluate(JaroWinkler(scaling_factor, boosting_threshold, long_threshold), s1, s2)
|
evaluate(JaroWinkler(scaling_factor, boosting_threshold, long_threshold), s1, s2)
|
||||||
end
|
end
|
||||||
|
|
||||||
jaro{T}(s1::T, s2::T) = evaluate(JaroWinkler(0.0, 0.0, 0), s1, s2)
|
jaro(s1::AbstractString, s2::AbstractString) = evaluate(JaroWinkler(0.0, 0.0, 0), s1, s2)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,28 @@
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## Define v(s) a vector on the space of q-uple which contains number of times it appears in s
|
## Gram Iterator iterates through q-grams of a string
|
||||||
## For instance v("leila")["il"] =1
|
##
|
||||||
## cosine is 1 - v(s1, p).v(s2, p) / ||v(s1, p)|| * ||v(s2, p)||
|
|
||||||
## q-gram is ∑ |v(s1, p) - v(s2, p)|
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
|
## Iterator over the q-grams of a string.
##   s : the string whose q-grams are enumerated
##   q : number of characters in each q-gram
type QGramIterator{S <: AbstractString, T <: Integer}
    s::S
    q::T
end
|
||||||
|
## Initial iteration state for a QGramIterator: a tuple `(istart, iend)` of
## indices into `qgram.s`.
##   - `istart` is always 1.
##   - `iend` is 1 for an empty string, the index just past the last character
##     when the string holds fewer than `q` characters, and otherwise the index
##     of the `q`-th character.
## `chr2ind` converts a character position into a string index, so the state is
## valid for strings whose characters span several code units.
function Base.start(qgram::QGramIterator)
    s = qgram.s
    len = length(s)
    if len == 0
        return (1, 1)
    elseif len < qgram.q
        # `nextind` needs the string as its first argument; the index returned
        # here lies past the last character of `s`.
        return (1, nextind(s, chr2ind(s, len)))
    else
        return (1, chr2ind(s, qgram.q))
    end
end
|
||||||
|
## Produce the q-gram delimited by `state` and the state of the following one.
## `state` is a pair of indices into `qgram.s`; the q-gram is the substring
## between them, converted to the string type `S`. Both indices are then moved
## forward with `nextind`.
function Base.next{S, T}(qgram::QGramIterator{S, T}, state)
    first_index, last_index = state
    gram = convert(S, SubString(qgram.s, first_index, last_index))
    advanced = (nextind(qgram.s, first_index), nextind(qgram.s, last_index))
    return gram, advanced
end
|
||||||
|
## Iteration is finished once the second index of `state` is exhausted for the
## underlying string, as reported by `done(qgram.s, index)`.
function Base.done(qgram::QGramIterator, state)
    return done(qgram.s, state[2])
end
|
||||||
|
## Elements produced by a QGramIterator have the type `S` of the wrapped string.
## The method is defined on the iterator type as well as on instances, so that
## generic code querying `eltype(typeof(itr))` gets the same answer.
Base.eltype{S, T}(::Type{QGramIterator{S, T}}) = S
Base.eltype{S, T}(::QGramIterator{S, T}) = S
|
||||||
|
## Number of q-grams: `length(s) - q + 1`, or 0 when the string holds fewer
## than `q` characters.
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## A Bag is like Set that it allows duplicated values
|
## A Bag is like Set that it allows duplicated values
|
||||||
|
@ -34,14 +50,23 @@ end
|
||||||
|
|
||||||
Base.length(bag::Bag) = convert(Int, sum(values(bag.dict)))
|
Base.length(bag::Bag) = convert(Int, sum(values(bag.dict)))
|
||||||
|
|
||||||
function Bag(s::AbstractString, q::Integer)
|
function Bag(s)
|
||||||
bag = Bag{typeof(s), UInt}()
|
bag = Bag{eltype(s), UInt}()
|
||||||
@inbounds for i in 1:(length(s) - q + 1)
|
for x in s
|
||||||
push!(bag, s[i:(i + q - 1)])
|
push!(bag, x)
|
||||||
end
|
end
|
||||||
return bag
|
return bag
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
##
|
||||||
|
## Define v(s) a vector on the space of q-uple which contains number of times it appears in s
|
||||||
|
## For instance v("leila")["il"] =1
|
||||||
|
## cosine is 1 - v(s1, p).v(s2, p) / (||v(s1, p)|| * ||v(s2, p)||)
|
||||||
|
## q-gram is ∑ |v(s1, p) - v(s2, p)|
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## q-gram
|
## q-gram
|
||||||
|
@ -58,11 +83,10 @@ function evaluate{T}(dist::QGram, s1::T, s2::T)
|
||||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
||||||
length(s2) == 0 && return 0
|
length(s2) == 0 && return 0
|
||||||
n2 = length(s2) - dist.q + 1
|
n2 = length(s2) - dist.q + 1
|
||||||
bag = Bag(s2, dist.q)
|
bag = Bag(QGramIterator(s2, dist.q))
|
||||||
count = 0
|
count = 0
|
||||||
n1 = length(s1) - dist.q + 1
|
n1 = length(s1) - dist.q + 1
|
||||||
for i1 in 1:n1
|
for ch in QGramIterator(s1, dist.q)
|
||||||
@inbounds ch = s1[i1:(i1 + dist.q - 1)]
|
|
||||||
delete!(bag, ch)
|
delete!(bag, ch)
|
||||||
end
|
end
|
||||||
# number non matched in s1 : n1 - (n2 - length(bag))
|
# number non matched in s1 : n1 - (n2 - length(bag))
|
||||||
|
@ -87,9 +111,8 @@ function evaluate{T}(dist::Cosine, s1::T, s2::T)
|
||||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
||||||
length(s2) == 0 && return 0.0
|
length(s2) == 0 && return 0.0
|
||||||
|
|
||||||
bag2 = Bag(s2, dist.q)
|
bag2 = Bag(QGramIterator(s2, dist.q))
|
||||||
bag1 = Bag(s1, dist.q)
|
bag1 = Bag(QGramIterator(s1, dist.q))
|
||||||
|
|
||||||
count = 0
|
count = 0
|
||||||
for (k, v1) in bag1.dict
|
for (k, v1) in bag1.dict
|
||||||
count += v1 * get(bag2.dict, k, 0)
|
count += v1 * get(bag2.dict, k, 0)
|
||||||
|
@ -119,17 +142,8 @@ function evaluate{T}(dist::Jaccard, s1::T, s2::T)
|
||||||
length(s2) == 0 && return 0.0
|
length(s2) == 0 && return 0.0
|
||||||
|
|
||||||
|
|
||||||
set2 = Set{T}()
|
set2 = Set(QGramIterator(s2, dist.q))
|
||||||
n2 = length(s2) - dist.q + 1
|
set1 = Set(QGramIterator(s1, dist.q))
|
||||||
@inbounds for i2 in 1:n2
|
|
||||||
push!(set2, s2[i2:(i2 + dist.q - 1)])
|
|
||||||
end
|
|
||||||
|
|
||||||
set1 = Set{T}()
|
|
||||||
n1 = length(s1) - dist.q + 1
|
|
||||||
@inbounds for i1 in 1:n1
|
|
||||||
push!(set1, s1[i1:(i1 + dist.q - 1)])
|
|
||||||
end
|
|
||||||
|
|
||||||
n_intersect = 0
|
n_intersect = 0
|
||||||
for x in set1
|
for x in set1
|
||||||
|
|
|
@ -0,0 +1,144 @@
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
##
|
||||||
|
## Define v(s) a vector on the space of q-uple which contains number of times it appears in s
|
||||||
|
## For instance v("leila")["il"] =1
|
||||||
|
## cosine is 1 - v(s1, p).v(s2, p) / (||v(s1, p)|| * ||v(s2, p)||)
|
||||||
|
## q-gram is ∑ |v(s1, p) - v(s2, p)|
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
##
|
||||||
|
## A Bag is like a Set, except that it allows duplicated values
|
||||||
|
## It is implemented as a dictionary mapping each element => its number of occurrences
|
||||||
|
##
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
## Multiset container.
##   dict : maps each stored element (type `Tv`) to a counter (type `Ti`)
## The only constructor takes no argument and starts from an empty dictionary.
type Bag{Tv, Ti <: Integer}
    dict::Dict{Tv, Ti}
    Bag() = new(Dict{Tv, Ti}())
end
|
||||||
|
|
||||||
|
## Add one occurrence of `x` to `bag` (an absent element counts as zero
## occurrences) and return the bag.
function Base.push!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
    occurrences = get(bag.dict, x, zero(Ti))
    bag.dict[x] = occurrences + one(Ti)
    return bag
end
|
||||||
|
|
||||||
|
## Remove one occurrence of `x` from `bag` when its counter is positive;
## otherwise leave the bag untouched. The key stays in the dictionary even when
## its counter reaches zero. Returns `x`.
function Base.delete!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
    remaining = get(bag.dict, x, zero(Ti))
    remaining > zero(Ti) && (bag.dict[x] = remaining - one(Ti))
    return x
end
|
||||||
|
|
||||||
|
## Total number of stored occurrences (sum of all counters), as an `Int`.
function Base.length(bag::Bag)
    total = sum(values(bag.dict))
    return convert(Int, total)
end
|
||||||
|
|
||||||
|
## Build the bag of all q-grams (substrings of `q` consecutive characters) of `s`.
##   s : input string
##   q : number of characters per q-gram, at least 1
## Returns a `Bag{typeof(s), UInt}`; it is empty when `s` has fewer than `q`
## characters. Throws an `ArgumentError` when `q < 1`.
##
## The window is tracked with string indices moved by `nextind` rather than with
## character positions, so strings whose characters span several code units
## (e.g. UTF-8) are sliced on character boundaries.
function Bag(s::AbstractString, q::Integer)
    q >= 1 || throw(ArgumentError("q must be at least 1"))
    bag = Bag{typeof(s), UInt}()
    length(s) < q && return bag
    istart = start(s)          # index of the first character of the q-gram
    iend = chr2ind(s, q)       # index of the last character of the q-gram
    while !done(s, iend)
        push!(bag, s[istart:iend])
        istart = nextind(s, istart)
        iend = nextind(s, iend)
    end
    return bag
end
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
##
|
||||||
|
## q-gram
|
||||||
|
##
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
## Q-gram distance.
##   q : number of characters per q-gram
type QGram{T <: Integer}
    q::T
end

## Without argument, q-grams of 2 characters are used.
function QGram()
    return QGram(2)
end
|
||||||
|
|
||||||
|
|
||||||
|
## Q-gram distance between `s1` and `s2`: the number of q-grams (counted with
## multiplicity) that appear in one string but are not matched in the other.
##   dist : carries the q-gram size `dist.q` (at least 1)
##   s1, s2 : strings of the same type
## Returns an integer; 0 when both strings are empty. Throws an `ArgumentError`
## when `dist.q < 1` and the longer string is non-empty.
function evaluate{T}(dist::QGram, s1::T, s2::T)
    len1, len2 = length(s1), length(s2)
    # work with s1 as the shorter string
    len1 > len2 && return evaluate(dist, s2, s1)
    len2 == 0 && return 0
    q = dist.q
    q >= 1 || throw(ArgumentError("q must be at least 1"))

    # A string shorter than q has no q-gram: clamp the counts at zero so that
    # they cannot go negative and corrupt the result.
    n1 = max(len1 - q + 1, 0)
    n2 = max(len2 - q + 1, 0)

    bag = Bag(s2, q)
    if n1 > 0
        # Slide a window of q characters over s1 using string indices, so that
        # characters spanning several code units are handled correctly.
        istart = start(s1)
        iend = chr2ind(s1, q)
        for i1 in 1:n1
            delete!(bag, s1[istart:iend])
            istart = nextind(s1, istart)
            iend = nextind(s1, iend)
        end
    end
    # number non matched in s1 : n1 - (n2 - length(bag))
    # number non matched in s2 : length(bag)
    return n1 - n2 + 2 * length(bag)
end
|
||||||
|
|
||||||
|
## Convenience wrapper: q-gram distance between `s1` and `s2` with q-grams of
## `q` characters (2 by default).
function qgram{T}(s1::T, s2::T; q = 2)
    return evaluate(QGram(q), s1, s2)
end
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
##
|
||||||
|
## cosine
|
||||||
|
##
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
## Cosine distance on q-gram count vectors.
##   q : number of characters per q-gram
type Cosine{T <: Integer}
    q::T
end

## Without argument, q-grams of 2 characters are used.
function Cosine()
    return Cosine(2)
end
|
||||||
|
|
||||||
|
## Cosine distance between the q-gram count vectors of `s1` and `s2`:
## one minus their dot product divided by the product of their norms.
## Returns 0.0 when both strings are empty and 1.0 when the product of the
## norms is zero.
function evaluate{T}(dist::Cosine, s1::T, s2::T)
    # work with s1 as the shorter string
    if length(s1) > length(s2)
        return evaluate(dist, s2, s1)
    end
    if length(s2) == 0
        return 0.0
    end

    bag2 = Bag(s2, dist.q)
    bag1 = Bag(s1, dist.q)

    # dot product of the two count vectors
    overlap = 0
    for (gram, n1) in bag1.dict
        overlap += n1 * get(bag2.dict, gram, 0)
    end

    norm1 = sqrt(sumabs2(values(bag1.dict)))
    norm2 = sqrt(sumabs2(values(bag2.dict)))
    denominator = norm1 * norm2
    return denominator == 0 ? 1.0 : 1.0 - overlap / denominator
end
|
||||||
|
|
||||||
|
## Convenience wrapper: cosine distance between `s1` and `s2` with q-grams of
## `q` characters (2 by default).
function cosine{T}(s1::T, s2::T; q = 2)
    return evaluate(Cosine(q), s1, s2)
end
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
##
|
||||||
|
## Jaccard
|
||||||
|
##
|
||||||
|
## Denote Q(s, q) the set of tuple of length q in s
|
||||||
|
## jaccard(s1, s2, q) = 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))|
|
||||||
|
##
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
## Jaccard distance on sets of q-grams.
##   q : number of characters per q-gram
type Jaccard{T <: Integer}
    q::T
end

## Without argument, q-grams of 2 characters are used.
function Jaccard()
    return Jaccard(2)
end
|
||||||
|
|
||||||
|
## Set of the distinct q-grams (substrings of `q` consecutive characters) of `s`.
## The window is moved with string indices (`nextind`), so characters spanning
## several code units are sliced on character boundaries. Empty when `s` has
## fewer than `q` characters. Expects `q >= 1`.
function _qgram_set{T}(s::T, q::Integer)
    grams = Set{T}()
    length(s) < q && return grams
    istart = start(s)          # index of the first character of the q-gram
    iend = chr2ind(s, q)       # index of the last character of the q-gram
    while !done(s, iend)
        push!(grams, s[istart:iend])
        istart = nextind(s, istart)
        iend = nextind(s, iend)
    end
    return grams
end

## Jaccard distance between `s1` and `s2`:
## 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))|
## where Q(s, q) is the set of q-grams of `s`.
## Returns 0.0 when both strings are empty, and 1.0 when neither string has any
## q-gram (the ratio would otherwise be 0/0 = NaN; 1.0 matches what the Cosine
## distance returns in the same situation). Throws an `ArgumentError` when
## `dist.q < 1` and the longer string is non-empty.
function evaluate{T}(dist::Jaccard, s1::T, s2::T)
    # work with s1 as the shorter string
    length(s1) > length(s2) && return evaluate(dist, s2, s1)
    length(s2) == 0 && return 0.0
    dist.q >= 1 || throw(ArgumentError("q must be at least 1"))

    set2 = _qgram_set(s2, dist.q)
    set1 = _qgram_set(s1, dist.q)

    n_intersect = 0
    for x in set1
        if x in set2
            n_intersect += 1
        end
    end

    n_union = length(set1) + length(set2) - n_intersect
    n_union == 0 && return 1.0
    return 1.0 - n_intersect / n_union
end
|
||||||
|
|
||||||
|
## Convenience wrapper: Jaccard distance between `s1` and `s2` with q-grams of
## `q` characters (2 by default).
function jaccard{T}(s1::T, s2::T; q = 2)
    return evaluate(Jaccard(q), s1, s2)
end
|
|
@ -20,6 +20,13 @@ using StringDistances, Base.Test
|
||||||
@test levenshtein("Saturday", "Sunday") == 3
|
@test levenshtein("Saturday", "Sunday") == 3
|
||||||
|
|
||||||
|
|
||||||
|
@test 4 == evaluate(Levenshtein(), "Hi, my name is", "my name is")
|
||||||
|
@test 21 == evaluate(Levenshtein(), "%^@!^@#^@#!! Snoooooooop", "Dro!p it!!!! like it's hot")
|
||||||
|
@test 7 == evaluate(Levenshtein(), "Alborgów", "amoniak")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@test damerau_levenshtein("", "") == 0
|
@test damerau_levenshtein("", "") == 0
|
||||||
@test damerau_levenshtein("abc", "") == 3
|
@test damerau_levenshtein("abc", "") == 3
|
||||||
@test damerau_levenshtein("bc", "abc") == 1
|
@test damerau_levenshtein("bc", "abc") == 1
|
||||||
|
|
Loading…
Reference in New Issue