StringDistances.jl/src/edit.jl

211 lines
6.0 KiB
Julia
Raw Normal View History

2015-10-25 16:23:46 +01:00
##############################################################################
##
## Find common prefixes (up to lim. -1 means Inf)
## Assumes length(s1) <= length(s2)
##############################################################################
# Count how many leading characters `s1` and `s2` share, examining at most
# `lim` characters (a negative `lim` disables the limit).
#
# Returns `(n, state1, state2)`: the number of shared leading characters and
# the iteration states of `s1` and `s2` just past that shared prefix.
# Only `s1` is checked for exhaustion, so `s2` must be at least as long as `s1`.
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
    pos1 = start(s1)
    pos2 = start(s2)
    matched = 0
    while true
        # stop once s1 is exhausted
        done(s1, pos1) && break
        # stop once a non-negative limit has been reached
        (lim >= 0 && matched >= lim) && break
        c1, after1 = next(s1, pos1)
        c2, after2 = next(s2, pos2)
        # first mismatch ends the prefix; states stay before the mismatch
        c1 == c2 || break
        matched += 1
        pos1 = after1
        pos2 = after2
    end
    return matched, pos1, pos2
end
2015-10-23 16:12:51 +02:00
##############################################################################
##
## Hamming
##
##############################################################################
2015-10-25 16:23:46 +01:00
# Hamming distance: number of positions at which `s1` and `s2` differ, plus
# the length difference `len2 - len1`.
# `len1` and `len2` are the character counts of `s1` and `s2`.
# NOTE(review): one character of `s2` is consumed for every character of `s1`,
# so this presumably relies on the caller guaranteeing len1 <= len2 -- confirm.
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
    mismatches = 0
    pos1 = start(s1)
    pos2 = start(s2)
    while !done(s1, pos1)
        c1, pos1 = next(s1, pos1)
        c2, pos2 = next(s2, pos2)
        if c1 != c2
            mismatches += 1
        end
    end
    # characters of s2 beyond the length of s1 each count as one difference
    return mismatches + (len2 - len1)
end
2015-10-24 18:45:24 +02:00
# Convenience wrapper: Hamming distance between `s1` and `s2`, delegating to
# the three-argument `evaluate` method (defined outside this section).
function hamming(s1::AbstractString, s2::AbstractString)
    return evaluate(Hamming(), s1, s2)
end
2015-10-23 16:12:51 +02:00
##############################################################################
##
## Levenshtein and Damerau Levenshtein
## Source Levenshtein: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
## Source DamerauLevenshtein: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
##
##############################################################################
2015-10-25 16:23:46 +01:00
2015-10-24 21:18:35 +02:00
2015-10-25 22:26:17 +01:00
# Field-less marker type used to dispatch `evaluate` to the Levenshtein
# distance implementation.
type Levenshtein <: SemiMetric
end
2015-10-25 16:23:46 +01:00
# Levenshtein distance between `s1` and `s2`, using a single row of the
# dynamic-programming matrix.
# `len1` and `len2` are the character counts of `s1` and `s2`.
# Relies on `common_prefix`, which assumes length(s1) <= length(s2).
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
    if len2 == 0
        return 0
    end

    # a prefix common to both strings does not contribute to the distance
    k, start1, start2 = common_prefix(s1, s2)
    if done(s1, start1)
        # s1 is entirely a prefix of s2: only the remaining characters of s2 count
        return len2 - k
    end

    # row[j] starts as the first matrix row: the distance between "" and the
    # first j characters of s2 (after the common prefix)
    ncols = len2 - k
    row = Int[j for j in 1:ncols]

    cost = 0
    pos1 = start1
    r = 0
    while !done(s1, pos1)
        r += 1
        c1, pos1 = next(s1, pos1)
        # diag: value diagonally before the cell about to be computed
        diag = r - 1
        # cost: value of the previously computed cell in this row. The seed
        # r - 1 does not change the minimum below, because the diagonal
        # candidate for the first column is already r - 1.
        cost = r - 1
        pos2 = start2
        c = 0
        while !done(s2, pos2)
            c += 1
            c2, pos2 = next(s2, pos2)
            beside = cost     # neighbouring cell computed just before in this row
            up = row[c]       # same column, previous row
            cost = diag       # equal characters: carry the diagonal value over
            if c1 != c2
                # cheapest of substitution, and the two insert/delete moves
                cost = min(diag + 1, beside + 1, up + 1)
            end
            diag = up
            row[c] = cost
        end
    end
    return cost
end
2015-10-24 18:45:24 +02:00
# Convenience wrapper: Levenshtein distance between `s1` and `s2`, delegating
# to the three-argument `evaluate` method (defined outside this section).
levenshtein(s1::AbstractString, s2::AbstractString) = evaluate(Levenshtein(), s1, s2)
2015-10-23 16:12:51 +02:00
2015-10-25 22:26:17 +01:00
# Field-less marker type used to dispatch `evaluate` to the
# Damerau-Levenshtein distance implementation.
type DamerauLevenshtein <: SemiMetric
end
2015-10-23 16:12:51 +02:00
2015-10-25 16:23:46 +01:00
# Damerau-Levenshtein distance between `s1` and `s2`: like Levenshtein, but a
# swap of two adjacent characters can also be charged as a single edit.
# Works on two rows of the dynamic-programming matrix: the previous row and
# the row before that (needed for the transposition cost).
# `len1` and `len2` are the character counts of `s1` and `s2`.
# Relies on `common_prefix`, which assumes length(s1) <= length(s2).
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
    if len2 == 0
        return 0
    end

    # a prefix common to both strings does not contribute to the distance
    k, start1, start2 = common_prefix(s1, s2)
    if done(s1, start1)
        return len2 - k
    end

    ncols = len2 - k
    # prevrow[j] starts as the distance between "" and the first j characters
    # of s2 (after the common prefix)
    prevrow = Int[j for j in 1:ncols]
    # twoback is deliberately left uninitialised: during the first pass its
    # contents are read but only ever used when r != 1, by which time every
    # entry has been written.
    twoback = Array(Int, ncols)

    c1, = next(s1, start1)
    cost = 0
    pos1 = start1
    r = 0
    while !done(s1, pos1)
        r += 1
        lastc1 = c1
        c1, pos1 = next(s1, pos1)
        c2, = next(s2, start2)
        diag = r - 1
        cost = r
        pendingtrans = 0
        pos2 = start2
        c = 0
        while !done(s2, pos2)
            c += 1
            lastc2 = c2
            c2, pos2 = next(s2, pos2)

            beside = cost               # cell computed just before in this row
            transbase = pendingtrans    # base cost for a transposition ending here
            pendingtrans = twoback[c]
            twoback[c] = diag           # remember the diagonal for later rows
            up = prevrow[c]             # same column, previous row
            cost = diag                 # equal characters: carry the diagonal over

            if c1 != c2
                # cheapest of substitution, insertion and deletion
                cost = min(diag, up, beside) + 1
                # adjacent transposition: the current pair is the reverse of
                # the previous pair
                if r != 1 && c != 1 && c1 == lastc2 && lastc1 == c2
                    cost = min(cost, transbase + 1)
                end
            end

            diag = up
            prevrow[c] = cost
        end
    end
    return cost
end
2015-10-24 18:45:24 +02:00
# Convenience wrapper: Damerau-Levenshtein distance between `s1` and `s2`,
# delegating to the three-argument `evaluate` method (defined outside this
# section).
function damerau_levenshtein(s1::AbstractString, s2::AbstractString)
    return evaluate(DamerauLevenshtein(), s1, s2)
end
2015-10-23 16:12:51 +02:00
##############################################################################
##
## JaroWinkler
##
##############################################################################
2015-10-25 22:26:17 +01:00
# Field-less marker type used to dispatch `evaluate` to the Jaro distance
# implementation.
type Jaro <: SemiMetric
end
2015-10-23 16:12:51 +02:00
2015-10-25 16:23:46 +01:00
# Jaro distance between `s1` and `s2`, returned as `1 - similarity` (a Float64;
# 0.0 means all characters match in order, 1.0 means no characters match).
# `len1` and `len2` are the character counts of `s1` and `s2`.
# Returns 1.0 when `len2 == 0` or when no characters match.
# NOTE(review): the matching window is derived from `len2` only, so this
# presumably expects the caller to pass the longer string as `s2` -- confirm.
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
    len2 == 0 && return 1.0

    # two characters can match only if their positions differ by at most maxdist
    maxdist = max(0, div(len2, 2) - 1)
    m = 0                       # number of matching characters
    flag = fill(false, len2)    # flag[i2] is true once position i2 of s2 is matched
    matched1 = Char[]           # matched characters of s1, in s1 order

    i1 = 0
    startstate2 = start(s2)     # iteration state of s2 at the window start
    starti2 = 0                 # number of s2 characters before the window
    for ch1 in s1
        i1 += 1
        # slide the window start forward (by at most one character per step)
        if starti2 < i1 - maxdist - 1
            startstate2 = nextind(s2, startstate2)
            starti2 += 1
        end
        i2 = starti2
        state2 = startstate2
        while !done(s2, state2) && i2 < i1 + maxdist
            ch2, state2 = next(s2, state2)
            i2 += 1
            if ch1 == ch2 && !flag[i2]
                m += 1
                flag[i2] = true
                push!(matched1, ch1)
                break
            end
        end
    end
    m == 0 && return 1.0

    # Transpositions, per the Jaro definition: walk the matched characters of
    # s2 in s2 order, compare them with the matched characters of s1 in s1
    # order, and halve the number of positions where the two sequences differ.
    outoforder = 0
    i2 = 0
    nmatched = 0
    for ch2 in s2
        i2 += 1
        if flag[i2]
            nmatched += 1
            if matched1[nmatched] != ch2
                outoforder += 1
            end
        end
    end
    t = div(outoforder, 2)

    score = (m / len1 + m / len2 + (m - t) / m) / 3.0
    return 1 - score
end
2015-10-25 16:23:46 +01:00
# Convenience wrapper: Jaro distance between `s1` and `s2`, delegating to the
# three-argument `evaluate` method (defined outside this section).
function jaro(s1::AbstractString, s2::AbstractString)
    return evaluate(Jaro(), s1, s2)
end
2015-10-23 16:12:51 +02:00
2015-10-24 18:45:24 +02:00