all encodings
parent
cb5235cea9
commit
04831e0962
12
README.md
12
README.md
|
@ -14,17 +14,7 @@ ASCII
|
|||
- [x] Cosine Distance
|
||||
- [x] Jaccard Distance
|
||||
|
||||
UTF-8 and Unicode
|
||||
|
||||
- [x] Hamming Distance
|
||||
- [ ] Jaro Distance and Jaro-Winkler Distance
|
||||
- [x] Levenshtein Distance
|
||||
- [x] Damerau-Levenshtein Distance
|
||||
- [x] Qgram Distance
|
||||
- [x] Cosine Distance
|
||||
- [x] Jaccard Distance
|
||||
|
||||
|
||||
Works with ASCII, UTF-8 and Unicode
|
||||
|
||||
Examples
|
||||
```julia
|
||||
|
|
|
@ -168,20 +168,31 @@ end
|
|||
JaroWinkler() = JaroWinkler(0.1, 0.7, 5)
|
||||
|
||||
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
||||
length(s2) == 0 && return 1.0
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 > len2 && return evaluate(dist, s2, s1)
|
||||
len2 == 0 && return 1.0
|
||||
|
||||
maxdist = max(0, div(length(s2), 2) - 1)
|
||||
maxdist = max(0, div(len2, 2) - 1)
|
||||
m = 0 # matching characters
|
||||
t = 0 # half number of transpositions
|
||||
flag = fill(false, length(s2))
|
||||
flag = fill(false, len2)
|
||||
prevpos = 0
|
||||
@inbounds for i1 in 1:length(s1)
|
||||
ch = s1[i1]
|
||||
i2low = max(1, i1 - maxdist)
|
||||
i2high = min(length(s2), i1 + maxdist)
|
||||
for i2 in i2low:i2high
|
||||
if ch == s2[i2] && !flag[i2]
|
||||
|
||||
i1 = 0
|
||||
startstate2 = start(s2)
|
||||
starti2 = 0
|
||||
for ch1 in s1
|
||||
i1 += 1
|
||||
if starti2 < i1 - maxdist - 1
|
||||
startstate2 = nextind(s2, startstate2)
|
||||
starti2 += 1
|
||||
end
|
||||
i2 = starti2
|
||||
state2 = startstate2
|
||||
while !done(s2, state2) && i2 < i1 + maxdist
|
||||
ch2, state2 = next(s2, state2)
|
||||
i2 += 1
|
||||
if ch1 == ch2 && !flag[i2]
|
||||
m += 1
|
||||
# if the match is before the index of the previous match
|
||||
if i2 < prevpos
|
||||
|
@ -194,13 +205,18 @@ function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
|||
end
|
||||
end
|
||||
m == 0.0 && return 1.0
|
||||
score = (m / length(s1) + m / length(s2) + (m - t) / m) / 3.0
|
||||
score = (m / len1 + m / len2 + (m - t) / m) / 3.0
|
||||
|
||||
# common prefix adjustment
|
||||
if (dist.scaling_factor > 0 && score >= dist.boosting_threshold) || (length(s1) >= dist.long_threshold)
|
||||
if (dist.scaling_factor > 0 && score >= dist.boosting_threshold) || (len1 >= dist.long_threshold)
|
||||
l = 0
|
||||
last = min(4, length(s1))
|
||||
while l < last && s1[l+1] == s2[l+1]
|
||||
last = min(4, len1)
|
||||
state1 = start(s1)
|
||||
state2 = start(s2)
|
||||
while l < last
|
||||
ch1, state1 = next(s1, state1)
|
||||
ch2, state2 = next(s2, state2)
|
||||
ch1 != ch2 && break
|
||||
l += 1
|
||||
end
|
||||
# common prefix adjustment
|
||||
|
@ -208,8 +224,8 @@ function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
|||
score += l * (1 - score) * dist.scaling_factor
|
||||
end
|
||||
# longer string adjustment
|
||||
if (length(s1) >= dist.long_threshold) && (m - l >= 2) && ((m - l) >= (length(s1) - l) / 2)
|
||||
score += (1 - score) * (m - (l + 1)) / (length(s1) + length(s2) - (2 * (l - 1)))
|
||||
if (len1 >= dist.long_threshold) && (m - l >= 2) && ((m - l) >= (len1 - l) / 2)
|
||||
score += (1 - score) * (m - (l + 1)) / (len1 + len2 - (2 * (l - 1)))
|
||||
end
|
||||
end
|
||||
return 1 - score
|
||||
|
@ -223,16 +239,3 @@ end
|
|||
jaro(s1::AbstractString, s2::AbstractString) = evaluate(JaroWinkler(0.0, 0.0, 0), s1, s2)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@ function Base.done(qgram::QGramIterator, state)
|
|||
done(qgram.s, idend)
|
||||
end
|
||||
Base.eltype{S, T}(::QGramIterator{S, T}) = S
|
||||
Base.length(qgram::QGramIterator) = length(qgram.s - qgram.q + 1)
|
||||
Base.length(qgram::QGramIterator) = length(qgram.s) - qgram.q + 1
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
|
@ -79,22 +79,21 @@ end
|
|||
QGram() = QGram(2)
|
||||
|
||||
|
||||
function evaluate{T}(dist::QGram, s1::T, s2::T)
|
||||
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString)
|
||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
||||
length(s2) == 0 && return 0
|
||||
n2 = length(s2) - dist.q + 1
|
||||
bag = Bag(QGramIterator(s2, dist.q))
|
||||
count = 0
|
||||
n1 = length(s1) - dist.q + 1
|
||||
for ch in QGramIterator(s1, dist.q)
|
||||
q1 = QGramIterator(s1, dist.q)
|
||||
q2 = QGramIterator(s2, dist.q)
|
||||
bag = Bag(q2)
|
||||
for ch in q1
|
||||
delete!(bag, ch)
|
||||
end
|
||||
# number of unmatched q-grams in s1: n1 - (n2 - length(bag))
|
||||
# number of unmatched q-grams in s2: length(bag)
|
||||
return n1 - n2 + 2 * length(bag)
|
||||
return length(q1) - length(q2) + 2 * length(bag)
|
||||
end
|
||||
|
||||
qgram{T}(s1::T, s2::T; q = 2) = evaluate(QGram(q), s1, s2)
|
||||
qgram(s1::AbstractString, s2::AbstractString; q = 2) = evaluate(QGram(q), s1::AbstractString, s2::AbstractString)
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
|
@ -107,28 +106,27 @@ type Cosine{T <: Integer}
|
|||
end
|
||||
Cosine() = Cosine(2)
|
||||
|
||||
function evaluate{T}(dist::Cosine, s1::T, s2::T)
|
||||
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
|
||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
||||
length(s2) == 0 && return 0.0
|
||||
|
||||
bag2 = Bag(QGramIterator(s2, dist.q))
|
||||
bag1 = Bag(QGramIterator(s1, dist.q))
|
||||
count = 0
|
||||
numerator = 0
|
||||
for (k, v1) in bag1.dict
|
||||
count += v1 * get(bag2.dict, k, 0)
|
||||
numerator += v1 * get(bag2.dict, k, 0)
|
||||
end
|
||||
denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict)))
|
||||
denominator == 0 ? 1.0 : 1.0 - count / denominator
|
||||
denominator == 0 ? 1.0 : 1.0 - numerator / denominator
|
||||
end
|
||||
|
||||
cosine{T}(s1::T, s2::T; q = 2) = evaluate(Cosine(q), s1, s2)
|
||||
cosine(s1::AbstractString, s2::AbstractString; q = 2) = evaluate(Cosine(q), s1::AbstractString, s2::AbstractString)
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Jaccard
|
||||
##
|
||||
## Denote Q(s, q) the set of tuples of length q in s
|
||||
## jaccard(s1, s2, q) = 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))|
|
||||
## jaccard(s1::AbstractString, s2::AbstractString, q) = 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))|
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
|
@ -137,22 +135,19 @@ type Jaccard{T <: Integer}
|
|||
end
|
||||
Jaccard() = Jaccard(2)
|
||||
|
||||
function evaluate{T}(dist::Jaccard, s1::T, s2::T)
|
||||
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
|
||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
||||
length(s2) == 0 && return 0.0
|
||||
|
||||
|
||||
set2 = Set(QGramIterator(s2, dist.q))
|
||||
set1 = Set(QGramIterator(s1, dist.q))
|
||||
|
||||
n_intersect = 0
|
||||
numerator = 0
|
||||
for x in set1
|
||||
if x in set2
|
||||
n_intersect += 1
|
||||
numerator += 1
|
||||
end
|
||||
end
|
||||
|
||||
return 1.0 - n_intersect / (length(set1) + length(set2) - n_intersect)
|
||||
denominator = length(set1) + length(set2) - numerator
|
||||
return 1.0 - numerator / denominator
|
||||
end
|
||||
|
||||
jaccard{T}(s1::T, s2::T; q = 2) = evaluate(Jaccard(q), s1, s2)
|
||||
jaccard(s1::AbstractString, s2::AbstractString; q = 2) = evaluate(Jaccard(q), s1::AbstractString, s2::AbstractString)
|
Loading…
Reference in New Issue