slower but simpler iteration

2020-02-18 08:18:45 -05:00 · 2020-02-18 08:18:45 -05:00 · 49b1f3b439
parent d80071590b
commit 49b1f3b439
4 changed files with 75 additions and 91 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
 benchmark/
 PC25
 Manifest.toml
 draft
--- a/src/edit.jl
+++ b/src/edit.jl
@ -27,34 +27,20 @@ function (dist::Jaro)(s1, s2)
    ch1_match = Vector{eltype(s1)}(undef, len1)
    #  m counts the number of matching characters
    m = 0 
-    i1 = 1
+    i1 = 0
-    i2 = 1
+    for ch1 in s1
-    x1 = iterate(s1)
+        i1 += 1
-    x2 = iterate(s2)
+        i2 = 0
-    while x1 !== nothing
+        for ch2 in s2
        ch1, state1 = x1
        if i2 <= i1 - maxdist - 1
            ch2, state2 = x2
            i2 += 1
-            x2 = iterate(s2, state2)
+            i2 > i1 + maxdist && break
-        end 
+            if (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2] 
        i2curr = i2
        x2curr = x2
        while x2curr !== nothing
            i2curr > i1 + maxdist && break
            ch2, state2 = x2curr
            if (ch1 == ch2) && !flag[i2curr] 
                m += 1
-                flag[i2curr] = true
+                flag[i2] = true
                ch1_match[m] = ch1
                break
            end
            x2curr = iterate(s2, state2) 
            i2curr += 1
        end
        x1 = iterate(s1, state1)
        i1 += 1
        prevstate1 = state1
    end
    m == 0 && return 1.0
    # t counts number of transpositions
@ -91,35 +77,35 @@ function (dist::Levenshtein)(s1, s2, max_dist = nothing)
    len1, len2 = length(s1), length(s2)
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
    # prefix common to both strings can be ignored
-    k, x1, x2start = common_prefix(s1, s2)
+    k = common_prefix(s1, s2)
-    x1 === nothing && return len2 - k
+    (k == length(s1)) && return len2 - k
    # distance initialized to first row of matrix
    # => distance between "" and s2[1:i]
    v = collect(1:(len2-k))
    current = 0
-    i1 = 1
+    i1 = 0
-    while x1 !== nothing
+    left = 0
-        ch1, state1 = x1
+    current = 0
-        left = i1 - 1
+    min_dist = 0
-        current = i1 - 1
+    for ch1 in s1
-        min_dist = i1 - 2 
+        i1 += 1
-        i2 = 1
+        i1 <= k && continue
-        x2 = x2start
+        left = i1 - k - 1
-        while x2 !== nothing
+        current = i1 - k - 1
-            ch2, state2 = x2
+        min_dist = i1 - k - 2 
        i2 = 0
        for ch2 in s2
            i2 += 1
            i2 <= k && continue
            #  update
-            above, current, left = current, left, v[i2]
+            above, current, left = current, left, v[i2 - k]
            if ch1 != ch2
                current = min(current + 1, above + 1, left + 1)
            end
            min_dist = min(min_dist, left)
-            v[i2] = current
+            v[i2 - k] = current
            x2 = iterate(s2, state2)
            i2 += 1
        end
        max_dist !== nothing && min_dist > max_dist && return max_dist + 1
        x1 = iterate(s1, state1)
        i1 += 1
    end
    max_dist !== nothing && current > max_dist && return max_dist + 1 
    return current
@ -144,8 +130,8 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
    len1, len2 = length(s1), length(s2)
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
    # prefix common to both strings can be ignored
-    k, x1, x2start = common_prefix(s1, s2)
+    k = common_prefix(s1, s2)
-    x1 === nothing && return len2 - k
+    (k == length(s1)) && return len2 - k
    v = collect(1:(len2-k))
    w = similar(v)
    if max_dist !== nothing
@ -153,57 +139,55 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
        i2_start = 1
        i2_end = max_dist
    end
-    i1 = 1
+    i1 = 0
    current = i1
-    prevch1, = x1
+    prevch1 = first(s1)
-    while x1 !== nothing
+    prevch2 = first(s2)
-        ch1, state1 = x1
+    for ch1 in s1
-        left = i1 - 1
+        i1 += 1
-        current = i1 
+        i1 <= k && continue
        left = i1 - k - 1
        current = i1 - k
        nextTransCost = 0
        prevch2, = x2start
        if max_dist !== nothing
            i2_start += (i1 > offset) ? 1 : 0
            i2_end = min(i2_end + 1, len2)
        end
-        x2 = x2start
+        i2 = 0
-        i2 = 1
+        for ch2 in s2
-        while x2 !== nothing
+            i2 += 1
-            ch2, state2 = x2
+            if (i2 <= k) || ((max_dist !== nothing) && !(i2_start <= i2 <= i2_end))
-            if max_dist === nothing || (i2_start <= i2 <= i2_end)
+                prevch2 = ch2
-                above = current
+                continue
-                thisTransCost = nextTransCost
+            end
-                nextTransCost = w[i2]
+            above = current
-                # cost of diagonal (substitution)
+            thisTransCost = nextTransCost
-                w[i2] = current = left
+            nextTransCost = w[i2 - k]
-                # left now equals current cost (which will be diagonal at next iteration)
+            # cost of diagonal (substitution)
-                left = v[i2]
+            w[i2 - k] = current = left
-                if ch1 != ch2
+            # left now equals current cost (which will be diagonal at next iteration)
-                    # insertion
+            left = v[i2 - k]
-                    if left < current
+            if ch1 != ch2
-                        current = left
+                # insertion
-                    end
+                if left < current
-                    # deletion
+                    current = left
-                    if above < current
+                end
-                        current = above
+                # deletion
-                    end
+                if above < current
-                    current += 1
+                    current = above
-                    if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)
+                end
-                        thisTransCost += 1
+                current += 1
-                        if thisTransCost < current
+                if (i1 > 1 + k) & (i2 > 1 + k) & (ch1 == prevch2) & (prevch1 == ch2)
-                            current = thisTransCost
+                    thisTransCost += 1
-                        end
+                    if thisTransCost < current
                        current = thisTransCost
                    end
                end
                v[i2] = current
            end
-            x2 = iterate(s2, state2)
+            v[i2 - k] = current
            i2 += 1
            prevch2 = ch2
        end
-        max_dist !== nothing && v[i1 + len2 - len1] > max_dist && return max_dist + 1
+        max_dist !== nothing && v[i1 - k + len2 - len1] > max_dist && return max_dist + 1
        x1 = iterate(s1, state1)
        i1 += 1
        prevch1 = ch1
    end
    max_dist !== nothing && current > max_dist && return max_dist + 1
--- a/src/utils.jl
+++ b/src/utils.jl
@ -43,18 +43,12 @@ function reorder(s1, s2)
 end
 """
     common_prefix(s1, s2)

 Return the number of leading elements that `s1` and `s2` have in common.

 Both arguments are traversed in lockstep with `zip`, so the count never exceeds
 the length of the shorter one. Only the count is returned; no iterator state is
 handed back to the caller.
 """
 function common_prefix(s1, s2)
    l = 0
    for (ch1, ch2) in zip(s1, s2)
        # stop at the first position where the two inputs disagree
        ch1 != ch2 && break
        l += 1
    end
    return l
 end
@ -69,7 +63,12 @@ end
 # Generic fallback: lazily skip the first `n` elements of any iterable `s`.
 _drop(s, n::Integer) = Base.Iterators.drop(s, n)
 # String method: return a `SubString` of `s` that starts after its first `n`
 # characters (character count, not byte offset).
 function _drop(s::AbstractString, n::Integer)
    # index of the (n + 1)-th character, i.e. the first character that is kept
    start = nextind(s, 0, n + 1)
    stop = lastindex(s)
    return SubString(s, start, stop)
 end
 # `StringWithLength` method: take the substring after the first `n` characters and
 # re-wrap it, passing `length(s) - n` as the second constructor argument.
 # NOTE(review): presumably that argument is the stored length of the wrapped
 # string; confirm against the `StringWithLength` definition.
 function _drop(s::StringWithLength, n::Integer)
    start = nextind(s, 0, n + 1)
    tail = SubString(s, start, lastindex(s))
    return StringWithLength(tail, length(s) - n)
 end
--- a/test/distances.jl
+++ b/test/distances.jl
@ -27,7 +27,7 @@ using StringDistances, Unicode, Test
 		@test evaluate(Levenshtein(), "saturday", "sunday") == 3
 		@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
 		@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
-		@test evaluate(Levenshtein(), [1, 2, 3], [1,2, 4]) == 1
+		@test evaluate(Levenshtein(), [1, 2, 3], [1, 2, 4]) == 1
 		@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak")
 		@test Levenshtein()("", "abc") == 3
 		@test result_type(Levenshtein(), "hello", "world") == Int