2015-10-25 16:23:46 +01:00
|
|
|
##############################################################################
|
|
|
|
##
|
|
|
|
## Find the common prefix of two strings (compares at most lim characters; a negative lim means no limit)
|
|
|
|
##############################################################################
|
|
|
|
|
2017-08-05 20:45:19 +02:00
|
|
|
# Count the leading characters that `s1` and `s2` have in common, comparing at
# most `lim` characters (any negative `lim` disables the limit).
#
# Returns `(n, next1, next2)`:
#   * `n`      - number of matching leading characters that were counted,
#   * `next1`  - result of `iterate` on `s1` positioned at the first character
#                that was not counted (`nothing` if `s1` is exhausted),
#   * `next2`  - same for `s2`.
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
    next1, next2 = iterate(s1), iterate(s2)
    matched = 0
    while true
        # stop as soon as either string runs out ...
        (next1 === nothing || next2 === nothing) && break
        # ... or the (non-negative) limit has been reached
        (lim >= 0 && matched >= lim) && break
        c1, st1 = next1
        c2, st2 = next2
        # first mismatch ends the common prefix; the iterators are left
        # pointing at the mismatching characters
        c1 == c2 || break
        next1 = iterate(s1, st1)
        next2 = iterate(s2, st2)
        matched += 1
    end
    return matched, next1, next2
end
|
|
|
|
|
2015-10-23 16:12:51 +02:00
|
|
|
##############################################################################
|
|
|
|
##
|
|
|
|
## Hamming
|
|
|
|
##
|
|
|
|
##############################################################################
|
|
|
|
|
2017-08-05 20:45:19 +02:00
|
|
|
# Hamming distance between `s1` and `s2`.
#
# Characters are compared pairwise over the common length (`zip` stops at the
# shorter string); every position present in only one of the strings counts as
# one additional mismatch. Returns an integer.
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
    # number of differing characters over the overlapping part
    mismatches = count(c1 != c2 for (c1, c2) in zip(s1, s2))
    # each unmatched trailing character adds one to the distance
    overhang = abs(length(s2) - length(s1))
    return mismatches + overhang
end
|
|
|
|
|
|
|
|
##############################################################################
|
|
|
|
##
|
2015-11-02 18:54:47 +01:00
|
|
|
## Levenshtein
|
|
|
|
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
2015-10-23 16:12:51 +02:00
|
|
|
##
|
|
|
|
##############################################################################
|
2015-10-25 16:23:46 +01:00
|
|
|
|
2015-10-24 21:18:35 +02:00
|
|
|
|
2017-05-12 23:41:56 +02:00
|
|
|
# Field-less tag type, declared as a subtype of `SemiMetric`; used to select the
# Levenshtein method of `evaluate`.
struct Levenshtein <: SemiMetric end
|
2015-10-23 16:12:51 +02:00
|
|
|
|
2017-08-05 20:45:19 +02:00
|
|
|
# Levenshtein distance between `s1` and `s2`, computed with a single rolling
# row (`v0`) instead of a full matrix. Returns an integer.
#
# NOTE(review): `reorder` is defined elsewhere. The early return below assumes
# that, after reordering, `s1` is the string that can run out first (presumably
# the shorter one) and that `len2` is the length of `s2` - confirm.
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
    s2, len2, s1, len1 = reorder(s1, s2)
    # prefix common to both strings can be ignored
    k, x1, x2start = common_prefix(s1, s2)
    # `s1` is entirely a prefix of `s2`: the distance is the number of
    # remaining characters of `s2`. Identity comparison (`===`) is the
    # idiomatic test against `nothing`, matching the loops below.
    x1 === nothing && return len2 - k
    # distance initialized to first row of matrix
    # => distance between "" and s2[1:i]
    v0 = collect(1:(len2 - k))
    current = 0
    i1 = 1
    while x1 !== nothing
        ch1, state1 = x1
        left = (i1 - 1)
        current = (i1 - 1)
        i2 = 1
        # restart the scan of `s2` just after the common prefix
        x2 = x2start
        while x2 !== nothing
            ch2, state2 = x2
            # Rotate the rolling values. After this line:
            #   above   = value computed for the previous column of this row
            #   current = previous row's value one column back (diagonal)
            #   left    = previous row's value at this column (still in v0)
            above, current, left = current, left, v0[i2]
            if ch1 != ch2
                # characters differ: cheapest of the three neighbours, plus one
                current = min(current + 1,
                              above + 1,
                              left + 1)
            end
            # overwrite the previous row's entry with this row's value
            v0[i2] = current
            x2 = iterate(s2, state2)
            i2 += 1
        end
        x1 = iterate(s1, state1)
        i1 += 1
    end
    # last value computed = bottom-right cell of the (implicit) matrix
    return current
end
|
|
|
|
|
2015-11-02 18:54:47 +01:00
|
|
|
##############################################################################
|
|
|
|
##
|
|
|
|
## Damerau Levenshtein
|
|
|
|
## Source: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
|
|
|
##
|
|
|
|
##############################################################################
|
|
|
|
|
2017-05-12 23:41:56 +02:00
|
|
|
# Field-less tag type, declared as a subtype of `SemiMetric`; used to select the
# Damerau-Levenshtein method of `evaluate`.
struct DamerauLevenshtein <: SemiMetric end
|
2015-10-23 16:12:51 +02:00
|
|
|
|
2017-08-05 20:45:19 +02:00
|
|
|
# Damerau-Levenshtein distance between `s1` and `s2`: like the Levenshtein
# computation, with an extra check for two adjacent characters being swapped.
# Uses two rolling rows (`v0`, `v2`) instead of a full matrix. Returns an integer.
#
# NOTE(review): `reorder` is defined elsewhere. The early return below assumes
# that, after reordering, `s1` is the string that can run out first (presumably
# the shorter one) and that `len2` is the length of `s2` - confirm.
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
    s2, len2, s1, len1 = reorder(s1, s2)
    # prefix common to both strings can be ignored
    k, x1, x2start = common_prefix(s1, s2)
    # `s1` is entirely a prefix of `s2`: the distance is the number of
    # remaining characters of `s2`. Identity comparison (`===`) is the
    # idiomatic test against `nothing`, matching the loops below.
    x1 === nothing && return len2 - k
    v0 = collect(1:(len2 - k))
    # `v2` is left uninitialised on purpose: values read from it during the
    # first row (i1 == 1) only feed `thisTransCost`, which is consulted only
    # when i1 != 1, and every entry is overwritten during that first row.
    v2 = similar(v0)
    i1 = 1
    current = i1
    prevch1, = x1
    while x1 !== nothing
        ch1, state1 = x1
        left = (i1 - 1)
        current = i1
        nextTransCost = 0
        prevch2, = x2start
        # restart the scan of `s2` just after the common prefix
        x2 = x2start
        i2 = 1
        while x2 !== nothing
            ch2, state2 = x2
            # value computed for the previous column of this row
            above = current
            # v2[i2] still holds what was stored one row earlier; it becomes
            # the transposition base for the *next* column
            thisTransCost = nextTransCost
            nextTransCost = v2[i2]
            # cost of diagonal (substitution)
            v2[i2] = current = left
            # left now equals current cost (which will be diagonal at next iteration)
            left = v0[i2]
            if ch1 != ch2
                # insertion
                if left < current
                    current = left
                end
                # deletion
                if above < current
                    current = above
                end
                current += 1
                # transposition: the current pair is the previous pair swapped
                if i1 != 1 && i2 != 1 && ch1 == prevch2 && prevch1 == ch2
                    thisTransCost += 1
                    if thisTransCost < current
                        current = thisTransCost
                    end
                end
            end
            # overwrite the previous row's entry with this row's value
            v0[i2] = current
            x2 = iterate(s2, state2)
            i2 += 1
            prevch2 = ch2
        end
        x1 = iterate(s1, state1)
        i1 += 1
        prevch1 = ch1
    end
    # last value computed = bottom-right cell of the (implicit) matrix
    return current
end
|
|
|
|
|
|
|
|
##############################################################################
|
|
|
|
##
|
2015-11-04 18:40:30 +01:00
|
|
|
## Jaro
|
2017-08-05 20:45:19 +02:00
|
|
|
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
2015-10-23 16:12:51 +02:00
|
|
|
##############################################################################
|
|
|
|
|
2017-05-12 23:41:56 +02:00
|
|
|
# Field-less tag type, declared as a subtype of `SemiMetric`; used to select the
# Jaro method of `evaluate`.
struct Jaro <: SemiMetric end
|
2015-10-23 16:12:51 +02:00
|
|
|
|
2017-08-05 20:45:19 +02:00
|
|
|
# Jaro distance between `s1` and `s2`, returned as a Float64 (`1.0 - score`).
#
# Two passes:
#   1. walk `s1`, and for each character look for an equal, not-yet-used
#      character of `s2` inside a sliding window around the same position;
#   2. walk `s2`, and compare its matched characters, in order, with the
#      matched characters of `s1` to count out-of-order matches.
#
# NOTE(review): `reorder` is defined elsewhere; presumably it makes `s2` the
# longer string, so `len2 == 0` would mean both strings are empty - confirm.
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
    s2, len2, s1, len1 = reorder(s1, s2)
    # With zero matches the formula below is undefined; an empty `s2` is
    # special-cased to distance 0.0 before reaching the zero-match return of 1.0.
    len2 == 0 && return 0.0
    # how far from the current position a match may be found
    window = max(0, div(len2, 2) - 1)
    # taken[j] is true once the j-th character of s2 has been matched
    taken = fill(false, len2)
    # idx1 is the iteration state that yields the character currently examined in s1
    idx1 = firstindex(s1)
    # matched_at[n] = iteration state of the n-th matched character of s1
    matched_at = idx1 .* ones(Int, len1)
    nmatch = 0
    pos1 = 1
    lo2 = 1                 # position in s2 where the search window starts
    it1 = iterate(s1)
    it2lo = iterate(s2)     # iterator positioned at lo2
    while it1 !== nothing
        c1, next1 = it1
        # advance the window start by one when it has fallen out of range
        if lo2 <= pos1 - window - 1
            lo2 += 1
            it2lo = iterate(s2, last(it2lo))
        end
        pos2 = lo2
        it2 = it2lo
        # scan the window for the first unused character equal to c1
        while it2 !== nothing && pos2 <= pos1 + window
            c2, next2 = it2
            if c1 == c2 && !taken[pos2]
                nmatch += 1
                taken[pos2] = true
                matched_at[nmatch] = idx1
                break
            end
            it2 = iterate(s2, next2)
            pos2 += 1
        end
        it1 = iterate(s1, next1)
        pos1 += 1
        idx1 = next1
    end
    # no character matched at all: maximal distance
    nmatch == 0 && return 1.0
    # count transpositions: matched characters that differ when both
    # strings' matches are read in order (each swap is counted twice)
    mismatched = 0
    k1 = 0
    for (k2, c2) in enumerate(s2)
        if taken[k2]
            k1 += 1
            mismatched += c2 != first(iterate(s1, matched_at[k1]))
        end
    end
    similarity = (nmatch / len1 + nmatch / len2 + (nmatch - mismatched / 2) / nmatch) / 3.0
    return 1.0 - similarity
end
|