StringDistances.jl/src/distances/edit.jl

222 lines
6.9 KiB
Julia
Raw Normal View History

2015-10-25 16:23:46 +01:00
2015-10-23 16:12:51 +02:00
##############################################################################
##
## Hamming
##
##############################################################################
2017-08-05 20:45:19 +02:00
"""
    evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)

Return the number of positions at which `s1` and `s2` hold different characters.
When the strings differ in length, every character of the longer one that has no
counterpart in the shorter one is counted as a mismatch as well.
"""
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
    # unmatched trailing characters of the longer string
    mismatches = abs(length(s2) - length(s1))
    # `zip` stops at the end of the shorter string
    for (c1, c2) in zip(s1, s2)
        if c1 != c2
            mismatches += 1
        end
    end
    return mismatches
end
##############################################################################
##
## Levenshtein
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
##
##############################################################################
2015-10-25 16:23:46 +01:00
2017-05-12 23:41:56 +02:00
# Field-less type used to select the Levenshtein method of `evaluate`.
struct Levenshtein <: SemiMetric end
2015-10-23 16:12:51 +02:00
2017-08-05 20:45:19 +02:00
"""
    evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)

Return the Levenshtein distance between `s1` and `s2`, i.e. the minimum number of
single-character insertions, deletions and substitutions needed to turn one string
into the other.

Only one row of the dynamic-programming matrix (`v0`) is kept in memory, and the
prefix shared by both strings is skipped before the matrix is filled.
"""
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
    # NOTE(review): `reorder` is defined elsewhere; the code below relies on it
    # making `s2` the longer string (its length sizes `v0`) -- confirm.
    s2, len2, s1, _ = reorder(s1, s2)
    # prefix common to both strings can be ignored
    # `k` is the number of skipped characters; `x1` / `x2start` are used below as the
    # `iterate` results at the first position after that prefix
    k, x1, x2start = common_prefix(s1, s2)
    # nothing left to iterate in s1: the remaining len2 - k characters are the distance
    x1 === nothing && return len2 - k
    # distance initialized to first row of matrix
    # => distance between "" and s2[1:i]
    v0 = collect(1:(len2 - k))
    # declared before the loop so that it is still visible at the final `return`
    current = 0
    i1 = 1
    while x1 !== nothing
        ch1, state1 = x1
        left = i1 - 1
        current = i1 - 1
        i2 = 1
        x2 = x2start
        while x2 !== nothing
            ch2, state2 = x2
            # shift the window: `above` takes the cost just computed in this row,
            # `current` takes the diagonal cost, `left` takes the previous row's cost
            above, current, left = current, left, v0[i2]
            if ch1 != ch2
                # cheapest of the three candidate costs, plus one edit
                current = min(current + 1, above + 1, left + 1)
            end
            v0[i2] = current
            x2 = iterate(s2, state2)
            i2 += 1
        end
        x1 = iterate(s1, state1)
        i1 += 1
    end
    return current
end
2015-11-02 18:54:47 +01:00
##############################################################################
##
## Damerau Levenshtein
## Source: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
##
##############################################################################
2017-05-12 23:41:56 +02:00
# Field-less type used to select the Damerau-Levenshtein method of `evaluate`.
struct DamerauLevenshtein <: SemiMetric end
2015-10-23 16:12:51 +02:00
2017-08-05 20:45:19 +02:00
"""
    evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)

Return the Damerau-Levenshtein distance between `s1` and `s2`: like the Levenshtein
distance (insertions, deletions, substitutions), but a swap of two adjacent
characters also counts as a single edit.

Two work vectors are used: `v0` holds the costs of the previous row, `v2` holds the
diagonal costs recorded during the previous outer iteration, which serve as the base
cost of a transposition.
"""
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
    # NOTE(review): `reorder` is defined elsewhere; the code below relies on it
    # making `s2` the longer string (its length sizes `v0`) -- confirm.
    s2, len2, s1, _ = reorder(s1, s2)
    # prefix common to both strings can be ignored
    k, x1, x2start = common_prefix(s1, s2)
    # nothing left to iterate in s1: the remaining len2 - k characters are the distance
    x1 === nothing && return len2 - k
    v0 = collect(1:(len2 - k))
    # zero-filled rather than `similar(v0)`: the first outer iteration reads v2[i2]
    # before writing it (the value read is never used, but it should not be garbage)
    v2 = zeros(Int, length(v0))
    i1 = 1
    # declared before the loop so that it is still visible at the final `return`
    current = i1
    prevch1, = x1
    while x1 !== nothing
        ch1, state1 = x1
        left = (i1 - 1)
        current = i1
        nextTransCost = 0
        prevch2, = x2start
        x2 = x2start
        i2 = 1
        while x2 !== nothing
            ch2, state2 = x2
            above = current
            thisTransCost = nextTransCost
            nextTransCost = v2[i2]
            # cost of diagonal (substitution)
            v2[i2] = current = left
            # left now equals current cost (which will be diagonal at next iteration)
            left = v0[i2]
            if ch1 != ch2
                # insertion
                if left < current
                    current = left
                end
                # deletion
                if above < current
                    current = above
                end
                current += 1
                # transposition: only possible from the second character of each
                # string onwards, when the last two characters are swapped
                if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)
                    thisTransCost += 1
                    if thisTransCost < current
                        current = thisTransCost
                    end
                end
            end
            v0[i2] = current
            x2 = iterate(s2, state2)
            i2 += 1
            prevch2 = ch2
        end
        x1 = iterate(s1, state1)
        i1 += 1
        prevch1 = ch1
    end
    return current
end
##############################################################################
##
## Jaro
## Source: http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
##
##############################################################################
2017-05-12 23:41:56 +02:00
# Field-less type used to select the Jaro method of `evaluate`.
struct Jaro <: SemiMetric end
2015-10-23 16:12:51 +02:00
2017-08-05 20:45:19 +02:00
"""
    evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)

Return the Jaro distance between `s1` and `s2` as a `Float64`:

    1 - (m / len1 + m / len2 + (m - t / 2) / m) / 3

where `m` is the number of matching characters and `t` the number of matched
characters that appear in a different order in the two strings. Two characters
match when they are equal and their positions differ by at most
`max(0, div(len2, 2) - 1)`.

Returns `0.0` when both strings are empty and `1.0` when no character matches.
"""
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
    # NOTE(review): `reorder` is defined elsewhere; the code below relies on it
    # making `s2` the longer string (len1 <= len2) -- confirm.
    s2, len2, s1, len1 = reorder(s1, s2)
    # Two empty strings have m = 0 and would fall into the `m == 0` branch below,
    # which returns 1.0; return 0.0 for them here instead.
    len2 == 0 && return 0.0
    maxdist = max(0, div(len2, 2) - 1)
    # flag[i] is true once the i-th character of s2 has been matched
    flag = fill(false, len2)
    # matched characters of s1, in the order they occur in s1
    ch1_match = Vector{eltype(s1)}()
    sizehint!(ch1_match, len1)
    # m counts the number of matching characters
    m = 0
    i1 = 1
    i2 = 1
    x1 = iterate(s1)
    x2 = iterate(s2)
    while x1 !== nothing
        ch1, state1 = x1
        # keep (i2, x2) at the left edge of the search window [i1 - maxdist, i1 + maxdist]
        if i2 <= i1 - maxdist - 1
            ch2, state2 = x2
            i2 += 1
            x2 = iterate(s2, state2)
        end
        i2curr = i2
        x2curr = x2
        while x2curr !== nothing
            (i2curr > i1 + maxdist) && break
            ch2, state2 = x2curr
            if (ch1 == ch2) & !flag[i2curr]
                m += 1
                flag[i2curr] = true
                push!(ch1_match, ch1)
                break
            end
            x2curr = iterate(s2, state2)
            i2curr += 1
        end
        x1 = iterate(s1, state1)
        i1 += 1
    end
    m == 0 && return 1.0
    # t counts the number of transpositions: walk the matched characters of s2 in
    # order and compare each with the matched character of s1 of the same rank
    t = 0
    i1 = 0
    i2 = 0
    for ch2 in s2
        i2 += 1
        if flag[i2]
            i1 += 1
            t += ch2 != ch1_match[i1]
        end
    end
    return 1.0 - (m / len1 + m / len2 + (m - t / 2) / m) / 3.0
end
##############################################################################
##
## Ratcliff/Obershelp
##
##############################################################################
# Field-less type used to select the Ratcliff/Obershelp method of `evaluate`.
struct RatcliffObershelp <: PreMetric end
"""
    evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)

Return the Ratcliff/Obershelp distance between `s1` and `s2` as a `Float64`:
one minus twice the total length of the blocks returned by `matching_blocks`,
divided by the combined length of both strings.

Returns `0.0` when both strings are empty.
"""
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
    total = length(s1) + length(s2)
    # two empty strings: avoid 0 / 0, which would yield NaN
    total == 0 && return 0.0
    # the last component of each block is its length; `init` covers the case where
    # no block was found
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    return 1.0 - 2 * n_matched / total
end
"""
    matching_blocks(s1::AbstractString, s2::AbstractString)

Return the set of `(position in s1, position in s2, length)` tuples collected by
`matching_blocks!` for the whole of `s1` and `s2`.
"""
function matching_blocks(s1::AbstractString, s2::AbstractString)
    blocks = Set{Tuple{Int, Int, Int}}()
    # both strings are searched in full, so the offsets start at 1
    return matching_blocks!(blocks, s1, s2, length(s1), length(s2), 1, 1)
end
"""
    matching_blocks!(x, s1, s2, len1, len2, start1, start2)

Add to `x` the block reported by `longest_common_substring(s1, s2, len1, len2)`,
then recurse on the parts of `s1` and `s2` located before and after that block.
Return `x`.

`len1` and `len2` are the lengths (in characters) of `s1` and `s2`. `start1` and
`start2` are the positions of `s1` and `s2` within the strings of the outermost
call; they are used to shift the recorded positions.
"""
function matching_blocks!(
    x::Set{Tuple{Int, Int, Int}},
    s1::AbstractString,
    s2::AbstractString,
    len1::Integer,
    len2::Integer,
    start1::Integer,
    start2::Integer,
)
    # presumably (position in s1, position in s2, length), counted in characters;
    # `longest_common_substring` is defined elsewhere -- verify
    pos1, pos2, n = longest_common_substring(s1, s2, len1, len2)
    # no common substring: nothing to record, nothing to recurse into
    n > 0 || return x

    # record the block, shifted to positions in the outermost strings
    push!(x, (pos1 + start1 - 1, pos2 + start2 - 1, n))

    # part located before the block (`nextind(s, 0, c)` is the index of character c)
    head1 = SubString(s1, 1, nextind(s1, 0, pos1 - 1))
    head2 = SubString(s2, 1, nextind(s2, 0, pos2 - 1))
    matching_blocks!(x, head1, head2, pos1 - 1, pos2 - 1, start1, start2)

    # part located after the block
    next1 = pos1 + n
    next2 = pos2 + n
    tail1 = SubString(s1, nextind(s1, 0, next1), lastindex(s1))
    tail2 = SubString(s2, nextind(s2, 0, next2), lastindex(s2))
    matching_blocks!(
        x,
        tail1,
        tail2,
        len1 - next1 + 1,
        len2 - next2 + 1,
        start1 + next1 - 1,
        start2 + next2 - 1,
    )
    return x
end