compathelper/new_version/2020-10-08-17-05-17-769-1797568811
matthieugomez 2020-07-13 10:40:30 -07:00
parent 6b5f858158
commit 4df4bad6af
2 changed files with 26 additions and 11 deletions

View File

@ -124,24 +124,26 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist::Union{Integer, Nothing} =
v = collect(1:(len2-k))
w = similar(v)
if max_dist !== nothing
i2_start = k + 1
i2_end = max_dist
i2_start = 1
i2_end = max_dist + 1
end
prevch1, prevch2 = first(s1), first(s2)
current = 0
for (i1, ch1) in enumerate(s1)
i1 <= k && continue
left = current = i1 - k - 1
left = i1 - k - 1
current = left + 1
nextTransCost = 0
if max_dist !== nothing
i2_start += (i1 > 1 + max_dist - (len2 - len1)) ? 1 : 0
i2_end += (i2_end < len2) ? 1 : 0
i2_start += (i1 - k - 1 > max_dist - (len2 - len1)) ? 1 : 0
i2_end += (i2_end <= len2) ? 1 : 0
end
for (i2, ch2) in enumerate(s2)
i2 <= k && continue
# No need to look outside the window bounded by the lower-right diagonal minus max_dist cells
# (the lower-right diagonal is i1 - (len2 - len1)) and the upper-left diagonal plus max_dist cells (the upper-left diagonal is i1).
if (max_dist !== nothing) && ((i2 < i2_start) | (i2 > i2_end))
if i2 <= k
prevch2 = ch2
elseif (max_dist !== nothing) && ((i2 - k < i2_start) | (i2 - k >= i2_end))
# No need to look outside the window bounded by the lower-right diagonal minus max_dist cells
# (the lower-right diagonal is i1 - (len2 - len1)) and the upper-left diagonal plus max_dist cells (the upper-left diagonal is i1).
prevch2 = ch2
else
above, current, left = current, left, v[i2 - k]
@ -150,7 +152,7 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist::Union{Integer, Nothing} =
if ch1 != ch2
current = min(left, current, above) + 1
# never happens at i2 = k + 1 because then the two previous characters were equal
if (i1 > 1 + k) & (i2 > 1 + k) && (ch1 == prevch2) && (prevch1 == ch2)
if (i1 - k > 1) & (i2 - k > 1) && (ch1 == prevch2) && (prevch1 == ch2)
thisTransCost += 1
current = min(current, thisTransCost)
end

View File

@ -44,6 +44,8 @@ using StringDistances, Unicode, Test
# Expected distances on plain strings
@test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
# Third positional argument is max_dist; here it equals the expected distance
@test DamerauLevenshtein()("abcdef", "abcxyf", 2) == 2
# Non-string input: vectors of integers
@test evaluate(DamerauLevenshtein(), [1, 2, 3], [1,2, 4]) == 1
# Comparing grapheme iterators must give the same distance as comparing the strings directly
@test evaluate(DamerauLevenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(DamerauLevenshtein(), "alborgów", "amoniak")
@test DamerauLevenshtein()("bc", "abc") == 1
@ -161,7 +163,7 @@ using StringDistances, Unicode, Test
# Test with R package StringDist
for x in solutions
t, solution = x
for i in 1:length(solution)
for i in eachindex(solution)
if isnan(evaluate(t, strings[i]...))
@test isnan(solution[i])
else
@ -174,8 +176,19 @@ using StringDistances, Unicode, Test
# RatcliffObershelp: (1 - distance) * 100, rounded to an integer, must match the
# reference value solution[i] within the given absolute tolerance.
for i in eachindex(strings)
    # `≈` is required here: `atol` is only accepted by @test for an approximate comparison.
    @test round(Int, (1 - evaluate(RatcliffObershelp(), strings[i]...)) * 100) ≈ solution[i] atol = 1e-4
end
# max_dist: passing the exact distance as the cutoff must return that same distance,
# for both Levenshtein and DamerauLevenshtein, on every pair in `strings`.
for idx in eachindex(strings)
    pair = strings[idx]
    for Metric in (Levenshtein, DamerauLevenshtein)
        exact = Metric()(pair...)
        @test Metric()(pair..., exact) == exact
    end
end
end
# d is computed without a max_dist argument; calling again with max_dist = d must return d.
d = DamerauLevenshtein()("abcdef", "abcxyf")
@test DamerauLevenshtein()("abcdef", "abcxyf", d) == d
#= R test