all encodings

pull/1/head
matthieugomez 2015-10-24 13:29:15 -04:00
parent cb5235cea9
commit 04831e0962
3 changed files with 53 additions and 65 deletions

View File

@ -14,17 +14,7 @@ ASCII
- [x] Cosine Distance
- [x] Jaccard Distance
UTF-8 and Unicode
- [x] Hamming Distance
- [ ] Jaro Distance and Jaro-Winkler Distance
- [x] Levenshtein Distance
- [x] Damerau-Levenshtein Distance
- [x] Qgram Distance
- [x] Cosine Distance
- [x] Jaccard Distance
Works with ASCII, UTF-8 and Unicode
Examples
```julia

View File

@ -168,20 +168,31 @@ end
# Zero-argument constructor supplying the default Jaro-Winkler settings.
# NOTE(review): the three positional values presumably map to
# scaling_factor, boosting_threshold and long_threshold (the fields read by
# `evaluate`) — confirm against the type definition.
function JaroWinkler()
    return JaroWinkler(0.1, 0.7, 5)
end
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
length(s1) > length(s2) && return evaluate(dist, s2, s1)
length(s2) == 0 && return 1.0
len1, len2 = length(s1), length(s2)
len1 > len2 && return evaluate(dist, s2, s1)
len2 == 0 && return 1.0
maxdist = max(0, div(length(s2), 2) - 1)
maxdist = max(0, div(len2, 2) - 1)
m = 0 # matching characters
t = 0 # half number of transpositions
flag = fill(false, length(s2))
flag = fill(false, len2)
prevpos = 0
@inbounds for i1 in 1:length(s1)
ch = s1[i1]
i2low = max(1, i1 - maxdist)
i2high = min(length(s2), i1 + maxdist)
for i2 in i2low:i2high
if ch == s2[i2] && !flag[i2]
i1 = 0
startstate2 = start(s2)
starti2 = 0
for ch1 in s1
i1 += 1
if starti2 < i1 - maxdist - 1
startstate2 = nextind(s2, startstate2)
starti2 += 1
end
i2 = starti2
state2 = startstate2
while !done(s2, state2) && i2 < i1 + maxdist
ch2, state2 = next(s2, state2)
i2 += 1
if ch1 == ch2 && !flag[i2]
m += 1
# if match is before the index of previous match
if i2 < prevpos
@ -194,13 +205,18 @@ function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
end
end
m == 0.0 && return 1.0
score = (m / length(s1) + m / length(s2) + (m - t) / m) / 3.0
score = (m / len1 + m / len2 + (m - t) / m) / 3.0
# common prefix adjustment
if (dist.scaling_factor > 0 && score >= dist.boosting_threshold) || (length(s1) >= dist.long_threshold)
if (dist.scaling_factor > 0 && score >= dist.boosting_threshold) || (len1 >= dist.long_threshold)
l = 0
last = min(4, length(s1))
while l < last && s1[l+1] == s2[l+1]
last = min(4, len1)
state1 = start(s1)
state2 = start(s2)
while l < last
ch1, state1 = next(s1, state1)
ch2, state2 = next(s2, state2)
ch1 != ch2 && break
l += 1
end
# common prefix adjustment
@ -208,8 +224,8 @@ function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
score += l * (1 - score) * dist.scaling_factor
end
# longer string adjustment
if (length(s1) >= dist.long_threshold) && (m - l >= 2) && ((m - l) >= (length(s1) - l) / 2)
score += (1 - score) * (m - (l + 1)) / (length(s1) + length(s2) - (2 * (l - 1)))
if (len1 >= dist.long_threshold) && (m - l >= 2) && ((m - l) >= (len1 - l) / 2)
score += (1 - score) * (m - (l + 1)) / (len1 + len2 - (2 * (l - 1)))
end
end
return 1 - score
@ -223,16 +239,3 @@ end
# Convenience wrapper: evaluate a JaroWinkler distance whose three settings
# are all zero on the two strings.
# NOTE(review): the visible guards in `evaluate` test
# `len1 >= dist.long_threshold`; with a threshold of 0 that test always holds,
# so the long-string adjustment presumably stays active here — confirm this is
# the intended "plain Jaro" behaviour.
function jaro(s1::AbstractString, s2::AbstractString)
    settings = JaroWinkler(0.0, 0.0, 0)
    return evaluate(settings, s1, s2)
end

View File

@ -21,7 +21,7 @@ function Base.done(qgram::QGramIterator, state)
done(qgram.s, idend)
end
# Element type reported for a QGramIterator: its first type parameter `S`.
function Base.eltype{S, T}(::QGramIterator{S, T})
    return S
end
# Number of q-grams in the iterator: one per window of `qgram.q` characters
# over `qgram.s`. The length of the string must be taken before subtracting
# `q` (an integer cannot be subtracted from the string itself), and the count
# is clamped at zero because a string shorter than `q` has no complete window.
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)
##############################################################################
##
@ -79,22 +79,21 @@ end
# Zero-argument constructor: a QGram distance built with the value 2
# (read by `evaluate` as `dist.q`).
function QGram()
    return QGram(2)
end
# Q-gram distance between `s1` and `s2`: the number of q-grams of either
# string that are left unmatched once the q-grams of one string have been
# removed from the bag of q-grams of the other.
#
# The arguments are swapped so that `s1` is never the longer string; two empty
# strings are at distance 0. Returns an integer.
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString)
    length(s1) > length(s2) && return evaluate(dist, s2, s1)
    length(s2) == 0 && return 0
    q1 = QGramIterator(s1, dist.q)
    q2 = QGramIterator(s2, dist.q)
    # A string shorter than `dist.q` has no q-gram; clamp so that a negative
    # length cannot leak into the result as a negative distance.
    n1 = max(length(q1), 0)
    n2 = max(length(q2), 0)
    # Start from every q-gram of s2 and take out the q-grams of s1.
    # NOTE(review): assumes `delete!` on a Bag removes a single occurrence and
    # ignores absent keys — confirm against the Bag definition.
    bag = Bag(q2)
    for ch in q1
        delete!(bag, ch)
    end
    # number non matched in s1 : n1 - (n2 - length(bag))
    # number non matched in s2 : length(bag)
    return n1 - n2 + 2 * length(bag)
end
# Convenience wrapper: q-gram distance between two strings, using q-grams of
# length `q` (default 2). The arguments are already constrained by the
# signature, so they are forwarded to `evaluate` without further assertions.
function qgram(s1::AbstractString, s2::AbstractString; q = 2)
    return evaluate(QGram(q), s1, s2)
end
##############################################################################
##
@ -107,28 +106,27 @@ type Cosine{T <: Integer}
end
# Zero-argument constructor: a Cosine distance built with the value 2
# (read by `evaluate` as `dist.q`).
function Cosine()
    return Cosine(2)
end
# Cosine distance between the q-gram count vectors of `s1` and `s2`:
# 1 - <v1, v2> / (|v1| * |v2|), where each vector holds the counts stored in
# the `dict` of the corresponding Bag.
#
# The arguments are swapped so that `s1` is never the longer string. Two empty
# strings are at distance 0.0; if either count vector has zero norm the
# distance is 1.0. Always returns a Float64.
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
    length(s1) > length(s2) && return evaluate(dist, s2, s1)
    length(s2) == 0 && return 0.0
    bag2 = Bag(QGramIterator(s2, dist.q))
    bag1 = Bag(QGramIterator(s1, dist.q))
    # Dot product of the two count vectors; keys missing from bag2 count as 0.
    numerator = 0
    for (k, v1) in bag1.dict
        numerator += v1 * get(bag2.dict, k, 0)
    end
    # Product of the Euclidean norms of the two count vectors.
    denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict)))
    # A zero norm means one side has no q-gram at all: avoid dividing by zero.
    denominator == 0 && return 1.0
    return 1.0 - numerator / denominator
end
# Convenience wrapper: cosine distance between two strings, using q-grams of
# length `q` (default 2). The arguments are already constrained by the
# signature, so they are forwarded to `evaluate` without further assertions.
function cosine(s1::AbstractString, s2::AbstractString; q = 2)
    return evaluate(Cosine(q), s1, s2)
end
##############################################################################
##
## Jaccard
##
## Let Q(s, q) denote the set of tuples of length q in s
## jaccard(s1, s2, q) = 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))|
## jaccard(s1::AbstractString, s2::AbstractString, q) = 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))|
##
##############################################################################
@ -137,22 +135,19 @@ type Jaccard{T <: Integer}
end
# Zero-argument constructor: a Jaccard distance built with the value 2
# (read by `evaluate` as `dist.q`).
function Jaccard()
    return Jaccard(2)
end
# Jaccard distance between the sets of q-grams of `s1` and `s2`:
# 1 - |intersection| / |union|.
#
# The arguments are swapped so that `s1` is never the longer string. Two empty
# strings are at distance 0.0. When neither string yields any q-gram the union
# is empty; the distance is then 1.0 (the same convention as the
# zero-denominator case of the Cosine distance) instead of the NaN that 0 / 0
# would produce. Always returns a Float64.
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
    length(s1) > length(s2) && return evaluate(dist, s2, s1)
    length(s2) == 0 && return 0.0
    set2 = Set(QGramIterator(s2, dist.q))
    set1 = Set(QGramIterator(s1, dist.q))
    # Size of the intersection: members of set1 that also belong to set2.
    numerator = count(x -> x in set2, set1)
    # Size of the union, by inclusion-exclusion.
    denominator = length(set1) + length(set2) - numerator
    denominator == 0 && return 1.0
    return 1.0 - numerator / denominator
end
# Convenience wrapper: Jaccard distance between two strings, using q-grams of
# length `q` (default 2). The arguments are already constrained by the
# signature, so they are forwarded to `evaluate` without further assertions.
function jaccard(s1::AbstractString, s2::AbstractString; q = 2)
    return evaluate(Jaccard(q), s1, s2)
end