Update edit.jl

pull/22/head
matthieugomez 2020-02-02 11:47:31 -05:00
parent 3b9493f8a9
commit 276d022df6
1 changed files with 13 additions and 11 deletions

View File

@ -13,6 +13,7 @@ where ``m`` is the number of matching characters and
"""
struct Jaro <: StringDistance
    # Field-less type: `evaluate` dispatches on it to run the Jaro computation.
end
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
s1, s2 = reorder(s1, s2)
@ -78,7 +79,7 @@ Creates the Levenshtein metric
The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions,
substitutions of a single character) required to change one string into the other.
"""
struct Levenshtein <: StringDistance
    # Field-less type: `evaluate` dispatches on it to run the Levenshtein computation.
    # Declared exactly once; the identical second declaration was redundant.
end
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
# Return max_dist + 1 if the distance is higher than max_dist
@ -93,7 +94,7 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max
x1 == nothing && return len2 - k
# distance initialized to first row of matrix
# => distance between "" and s2[1:i]
v0 = collect(1:(len2 - k))
v = collect(1:(len2-k))
current = 0
i1 = 1
while x1 !== nothing
@ -106,12 +107,12 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max
while x2 !== nothing
ch2, state2 = x2
# update
above, current, left = current, left, v0[i2]
above, current, left = current, left, v[i2]
if ch1 != ch2
current = min(current + 1, above + 1, left + 1)
end
min_dist = min(min_dist, left)
v0[i2] = current
v[i2] = current
x2 = iterate(s2, state2)
i2 += 1
end
@ -124,6 +125,7 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max
end
"""
DamerauLevenshtein()
@ -143,8 +145,8 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
# prefix common to both strings can be ignored
k, x1, x2start = common_prefix(s1, s2)
(x1 == nothing) && return len2 - k
v0 = collect(1:(len2 - k))
v2 = similar(v0)
v = collect(1:(len2-k))
w = similar(v)
if max_dist !== nothing
offset = 1 + max_dist - (len2 - len1)
i2_start = 1
@ -170,11 +172,11 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
if max_dist == nothing || (i2_start <= i2 <= i2_end)
above = current
thisTransCost = nextTransCost
nextTransCost = v2[i2]
nextTransCost = w[i2]
# cost of diagonal (substitution)
v2[i2] = current = left
w[i2] = current = left
# left now equals current cost (which will be diagonal at next iteration)
left = v0[i2]
left = v[i2]
if ch1 != ch2
# insertion
if left < current
@ -192,13 +194,13 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
end
end
end
v0[i2] = current
v[i2] = current
end
x2 = iterate(s2, state2)
i2 += 1
prevch2 = ch2
end
max_dist !== nothing && v0[i1 + len2 - len1] > max_dist && return max_dist + 1
max_dist !== nothing && v[i1 + len2 - len1] > max_dist && return max_dist + 1
x1 = iterate(s1, state1)
i1 += 1
prevch1 = ch1