From 49b1f3b43975159e37f833372a7c8efe1be5a194 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Tue, 18 Feb 2020 08:18:45 -0500 Subject: [PATCH] slower but simpler iteration --- .gitignore | 1 + src/edit.jl | 148 +++++++++++++++++++++------------------------- src/utils.jl | 15 +++-- test/distances.jl | 2 +- 4 files changed, 75 insertions(+), 91 deletions(-) diff --git a/.gitignore b/.gitignore index 02ab656..d8fa608 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ benchmark/ PC25 Manifest.toml +draft diff --git a/src/edit.jl b/src/edit.jl index a59d26f..befa30c 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -27,34 +27,20 @@ function (dist::Jaro)(s1, s2) ch1_match = Vector{eltype(s1)}(undef, len1) # m counts number matching characters m = 0 - i1 = 1 - i2 = 1 - x1 = iterate(s1) - x2 = iterate(s2) - while x1 !== nothing - ch1, state1 = x1 - if i2 <= i1 - maxdist - 1 - ch2, state2 = x2 + i1 = 0 + for ch1 in s1 + i1 += 1 + i2 = 0 + for ch2 in s2 i2 += 1 - x2 = iterate(s2, state2) - end - i2curr = i2 - x2curr = x2 - while x2curr !== nothing - i2curr > i1 + maxdist && break - ch2, state2 = x2curr - if (ch1 == ch2) && !flag[i2curr] + i2 > i1 + maxdist && break + if (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2] m += 1 - flag[i2curr] = true + flag[i2] = true ch1_match[m] = ch1 break end - x2curr = iterate(s2, state2) - i2curr += 1 end - x1 = iterate(s1, state1) - i1 += 1 - prevstate1 = state1 end m == 0 && return 1.0 # t counts number of transpositions @@ -91,35 +77,35 @@ function (dist::Levenshtein)(s1, s2, max_dist = nothing) len1, len2 = length(s1), length(s2) max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 # prefix common to both strings can be ignored - k, x1, x2start = common_prefix(s1, s2) - x1 === nothing && return len2 - k + k = common_prefix(s1, s2) + (k == length(s1)) && return len2 - k # distance initialized to first row of matrix # => distance between "" and s2[1:i} v = collect(1:(len2-k)) current = 0 - i1 = 1 - while x1 !== nothing - ch1, state1 = x1 - left = i1 - 1 - current = i1 - 1 - min_dist = i1 - 2 - i2 = 1 - x2 = x2start - while x2 !== nothing - ch2, state2 = x2 + i1 = 0 + left = 0 + current = 0 + min_dist = 0 + for ch1 in s1 + i1 += 1 + i1 <= k && continue + left = i1 - k - 1 + current = i1 - k - 1 + min_dist = i1 - k - 2 + i2 = 0 + for ch2 in s2 + i2 += 1 + i2 <= k && continue # update - above, current, left = current, left, v[i2] + above, current, left = current, left, v[i2 - k] if ch1 != ch2 current = min(current + 1, above + 1, left + 1) end min_dist = min(min_dist, left) - v[i2] = current - x2 = iterate(s2, state2) - i2 += 1 + v[i2 - k] = current end max_dist !== nothing && min_dist > max_dist && return max_dist + 1 - x1 = iterate(s1, state1) - i1 += 1 end max_dist !== nothing && current > max_dist && return max_dist + 1 return current @@ -144,8 +130,8 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing) len1, len2 = length(s1), length(s2) max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 # prefix common to both strings can be ignored - k, x1, x2start = common_prefix(s1, s2) - x1 === nothing && return len2 - k + k = common_prefix(s1, s2) + (k == length(s1)) && return len2 - k v = collect(1:(len2-k)) w = similar(v) if max_dist !== nothing @@ -153,57 +139,55 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing) i2_start = 1 i2_end = max_dist end - i1 = 1 + i1 = 0 current = i1 - prevch1, = x1 - while x1 !== nothing - ch1, state1 = x1 - left = i1 - 1 - current = i1 + prevch1 = first(s1) + prevch2 = first(s2) + for ch1 in s1 + i1 += 1 + i1 <= k && continue + left = i1 - k - 1 + current = i1 - k nextTransCost = 0 - prevch2, = x2start if max_dist !== nothing i2_start += (i1 > offset) ? 1 : 0 i2_end = min(i2_end + 1, len2) end - x2 = x2start - i2 = 1 - while x2 !== nothing - ch2, state2 = x2 - if max_dist === nothing || (i2_start <= i2 <= i2_end) - above = current - thisTransCost = nextTransCost - nextTransCost = w[i2] - # cost of diagonal (substitution) - w[i2] = current = left - # left now equals current cost (which will be diagonal at next iteration) - left = v[i2] - if ch1 != ch2 - # insertion - if left < current - current = left - end - # deletion - if above < current - current = above - end - current += 1 - if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2) - thisTransCost += 1 - if thisTransCost < current - current = thisTransCost - end + i2 = 0 + for ch2 in s2 + i2 += 1 + if (i2 <= k) || ((max_dist !== nothing) && !(i2_start <= i2 <= i2_end)) + prevch2 = ch2 + continue + end + above = current + thisTransCost = nextTransCost + nextTransCost = w[i2 - k] + # cost of diagonal (substitution) + w[i2 - k] = current = left + # left now equals current cost (which will be diagonal at next iteration) + left = v[i2 - k] + if ch1 != ch2 + # insertion + if left < current + current = left + end + # deletion + if above < current + current = above + end + current += 1 + if (i1 > 1 + k) & (i2 > 1 + k) & (ch1 == prevch2) & (prevch1 == ch2) + thisTransCost += 1 + if thisTransCost < current + current = thisTransCost end end - v[i2] = current end - x2 = iterate(s2, state2) - i2 += 1 + v[i2 - k] = current prevch2 = ch2 end - max_dist !== nothing && v[i1 + len2 - len1] > max_dist && return max_dist + 1 - x1 = iterate(s1, state1) - i1 += 1 + max_dist !== nothing && v[i1 - k + len2 - len1] > max_dist && return max_dist + 1 prevch1 = ch1 end max_dist !== nothing && current > max_dist && return max_dist + 1 diff --git a/src/utils.jl b/src/utils.jl index 54dd656..fa8791d 100755 --- a/src/utils.jl +++ b/src/utils.jl @@ -43,18 +43,12 @@ function reorder(s1, s2) end function common_prefix(s1, s2) - x1 = iterate(s1) - x2 = iterate(s2) l = 0 - while (x1 !== nothing) & (x2 !== nothing) - ch1, state1 = x1 - ch2, state2 = x2 + for (ch1, ch2) in zip(s1, s2) ch1 != ch2 && break l += 1 - x1 = iterate(s1, state1) - x2 = iterate(s2, state2) end - return l, x1, x2 + return l end @@ -69,7 +63,12 @@ end function _drop(s, n::Integer) Base.Iterators.drop(s, n) end + function _drop(s::AbstractString, n::Integer) + SubString(s, nextind(s, 0, n + 1), lastindex(s)) +end + +function _drop(s::StringWithLength, n::Integer) StringWithLength(SubString(s, nextind(s, 0, n + 1), lastindex(s)), length(s) - n) end diff --git a/test/distances.jl b/test/distances.jl index a1cfd5f..ebf9326 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -27,7 +27,7 @@ using StringDistances, Unicode, Test @test evaluate(Levenshtein(), "saturday", "sunday") == 3 @test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4 @test evaluate(Levenshtein(), "alborgów", "amoniak") == 6 - @test evaluate(Levenshtein(), [1, 2, 3], [1,2, 4]) == 1 + @test evaluate(Levenshtein(), [1, 2, 3], [1, 2, 4]) == 1 @test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak") @test Levenshtein()("", "abc") == 3 @test result_type(Levenshtein(), "hello", "world") == Int