simplify RatcliffObershelp + add Partial{Ratcliff} test

2021-09-10 14:38:41 -04:00 · 2021-09-10 14:38:41 -04:00 · f75a10852d
parent 3e5ce898f7
commit f75a10852d
4 changed files with 72 additions and 58 deletions
--- a/benchmark/benchmark.jl
+++ b/benchmark/benchmark.jl
@ -1,5 +1,5 @@

-using StringDistances, Random
+using Revise, StringDistances, Random
 Random.seed!(2)
 x = map(Random.randstring, rand(5:25,500_000))
 y = map(Random.randstring, rand(5:25,500_000))
@ -13,6 +13,7 @@ function g(dist, x, y)
 end


+@time f(Hamming(), x, y);

@time f(Jaro(), x, y);
 #0.3s 
@ -25,7 +26,7 @@ end
@time f(DamerauLevenshtein(), x, y, min_score = 0.8);
 # 0.08
@time f(RatcliffObershelp(), x, y);
-# 1.35s
+# 0.8s



--- a/src/distances/edit.jl
+++ b/src/distances/edit.jl
@ -42,26 +42,28 @@ function (dist::Jaro)(s1, s2)
    (s1 === missing) | (s2 === missing) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
-    # Note: if both are empty, formula in Wikipedia gives 1, but it makes more sense to set it to s1 == s2
-    len2 == 0 && return Float64(s1 == s2)
+    # If both iterators are empty, the formula in Wikipedia gives 1, but it makes more sense to return s1 == s2
+    len2 > 0 || return Float64(s1 == s2)
    d = max(0, div(len2, 2) - 1)
    flag = fill(false, len2)
    ch1_match = Vector{eltype(s1)}()
    for (i1, ch1) in enumerate(s1)
        for (i2, ch2) in enumerate(s2)
            # for each character in s1, greedy search of matching character in s2 within a distance d
-            if (i2 >= i1 - d) && (i2 <= i1 + d) && (ch1 == ch2) && !flag[i2] 
+            i2 >= i1 - d || continue
+            i2 <= i1 + d || break
+            if ch1 == ch2 && !flag[i2] 
                flag[i2] = true
                push!(ch1_match, ch1)
                break
            end
        end
    end
-    #  m counts number matching characters
-    m = length(ch1_match)
-    if m == 0
+    if isempty(ch1_match)
        return 1.0
    else
+        # m counts the number of matching characters
+        m = length(ch1_match)
        # t/2 counts number transpositions
        t = 0
        i1 = 0
@ -134,11 +136,11 @@ function (dist::Levenshtein)(s1, s2)
    v = collect(1:(len2-k))
    current = 0
    for (i1, ch1) in enumerate(s1)
-        i1 <= k && continue
+        i1 > k || continue
        left = current = i1 - k - 1
        dist.max_dist !== nothing && (value_lb = left - 1)
        for (i2, ch2) in enumerate(s2)
-            i2 <= k && continue
+            i2 > k || continue
            @inbounds above, current, left = current, left, v[i2 - k]
            if ch1 != ch2
                current = min(current, above, left) + 1
@ -192,7 +194,7 @@ function (dist::DamerauLevenshtein)(s1, s2)
    prevch1, prevch2 = first(s1), first(s2)
    current = 0
    for (i1, ch1) in enumerate(s1)
-        i1 <= k && (prevch1 = ch1 ; continue)
+        i1 > k || (prevch1 = ch1 ; continue)
        left = i1 - k - 1
        current = left + 1
        nextTransCost = 0
@ -201,17 +203,17 @@ function (dist::DamerauLevenshtein)(s1, s2)
            i2_end += i2_end < len2
        end
        for (i2, ch2) in enumerate(s2)
-            i2 <= k && (prevch2 = ch2 ; continue)
+            i2 > k || (prevch2 = ch2 ; continue)
            # no need to look beyond window of lower right diagonal - max distance cells 
            #lower right diag is i1 - (len2 - len1)) and the upper left diagonal + dist.max_dist cells (upper left is i1)
-            (dist.max_dist !== nothing) && !(i2_start <= i2 - k - 1 < i2_end) && (prevch2 = ch2 ; continue)
+            dist.max_dist !== nothing && !(i2_start <= i2 - k - 1 < i2_end) && (prevch2 = ch2 ; continue)
            @inbounds above, current, left = current, left, v[i2 - k]
            @inbounds w[i2 - k], nextTransCost, thisTransCost = current, w[i2 - k], nextTransCost
            # left now equals current cost (which will be diagonal at next iteration)
            if ch1 != ch2
                current = min(left, current, above) + 1
                # never happens at i2 = k + 1 because then the two previous characters were equal
-                if (i1 - k - 1 > 0) && (i2 - k - 1 > 0) && (ch1 == prevch2) && (prevch1 == ch2)
+                if i1 - k - 1 > 0 && i2 - k - 1 > 0 && ch1 == prevch2 && prevch1 == ch2
                    thisTransCost += 1
                    current = min(current, thisTransCost)
                end
@ -240,58 +242,52 @@ struct RatcliffObershelp <: SemiMetric end

 function (dist::RatcliffObershelp)(s1, s2)
    (s1 === missing) | (s2 === missing) && return missing
-    s1, s2 = reorder(s1, s2)
-    n_matched = sum(last.(matching_blocks(s1, s2)))
    len1, len2 = length(s1), length(s2)
-    len1 + len2 == 0 ? 0.0 : 1 - 2 *  n_matched / (len1 + len2)
+    n_matched = length_matching_blocks(s1, s2, 1, 1, len1, len2)
+    len1 + len2 == 0 ? 0.0 : 1 - 2 * n_matched / (len1 + len2)
 end

-function matching_blocks(s1, s2)
-    matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, 1, 1)
-end
-
-function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)
-    n1, n2, len = longest_common_pattern(s1, s2)
+function length_matching_blocks(s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer)
+    end1 >= start1 || return 0
+    end2 >= start2 || return 0
+    j1, j2, len = longest_common_pattern(s1, s2, start1, start2, end1, end2)
    # exit if there is no common substring
-    len == 0 && return x
-    # add the info of the common to the existing set
-    push!(x, (n1 + start1 - 1, n2 + start2 - 1, len))
-    # add the longest common substring that happens before
-    matching_blocks!(x, _take(s1, n1 - 1), _take(s2, n2 - 1), start1, start2)
-    # add the longest common substring that happens after
-    matching_blocks!(x, _drop(s1, n1 + len - 1), _drop(s2, n2 + len - 1), 
-                    start1 + n1 + len - 1, start2 + n2 + len - 1)
-    return x
+    len == 0 && return 0
+    # matched length before the common substring; kept in its own variable so `len` stays unchanged
+    before = length_matching_blocks(s1, s2, start1, start2, j1 - 1, j2 - 1)
+    # matched length after the common substring, which ends at j1 + len - 1 in s1 and j2 + len - 1 in s2
+    after = length_matching_blocks(s1, s2, j1 + len, j2 + len, end1, end2)
+    return len + before + after
 end


-_take(s, n::Integer) = Base.Iterators.take(s, n)
-_take(s::StringWithLength, n::Integer) = StringWithLength(SubString(s, firstindex(s), nextind(s, 0, n)), n)
-
-_drop(s, n::Integer) = Base.Iterators.drop(s, n)
-_drop(s::StringWithLength, n::Integer) = StringWithLength(SubString(s, nextind(s, 0, n + 1), lastindex(s)), length(s) - n)
-
-
-function longest_common_pattern(s1, s2)
-    if length(s1) > length(s2)
-        start2, start1, len = longest_common_pattern(s2, s1)
+function longest_common_pattern(s1, s2, start1, start2, end1, end2)
+    if end1 - start1 > end2 - start2
+        j2, j1, len = longest_common_pattern(s2, s1, start2, start1, end2, end1)
    else
-        start1, start2, len = 0, 0, 0
-        p = zeros(Int, length(s2))
+        j1, j2, len = 0, 0, 0
+        # p[i2-start2+1] stores the starting index (in s2) of the longest
+        # common pattern ending at i2 with the previous character of s1 as last matching character
+        p = zeros(Int, end2 - start2 + 1)
        for (i1, ch1) in enumerate(s1)
-            oldp = 0
+            i1 >= start1 || continue
+            i1 <= end1 || break
+            oldj2 = 0
            for (i2, ch2) in enumerate(s2)
-                newp = 0
-                if ch1 == ch2
-                    newp = oldp > 0 ? oldp : i2
-                    currentlength = i2 - newp + 1
-                    if currentlength > len
-                        start1, start2, len = i1 - currentlength + 1, newp, currentlength
+                i2 >= start2 || continue
+                i2 <= end2 || break
+                if ch1 != ch2
+                    newj2 = 0
+                else
+                    newj2 = oldj2 > 0 ? oldj2 : i2
+                    newlen = i2 - newj2 + 1
+                    if newlen > len
+                        j1, j2, len = i1 - newlen + 1, newj2, newlen
                    end
                end
-                p[i2], oldp = newp, p[i2]
+                p[i2 - start2 + 1], oldj2 = newj2, p[i2 - start2 + 1]
            end
        end
    end
-    return start1, start2, len
+    return j1, j2, len
 end
--- a/src/modifiers.jl
+++ b/src/modifiers.jl
@ -34,11 +34,10 @@ end

 function (dist::Partial{RatcliffObershelp})(s1, s2)
    (s1 === missing) | (s2 === missing) && return missing
-    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    len1 == len2 && return dist.dist(s1, s2)
    out = 1.0
-    for r in matching_blocks(s1, s2)
+    for r in matching_blocks(s1, s2, 1, 1, len1, len2)
        # Make sure the substring of s2 has length len1
        s2_start = r[2] - r[1] + 1
        s2_end = s2_start + len1 - 1
@ -49,13 +48,30 @@ function (dist::Partial{RatcliffObershelp})(s1, s2)
            s2_start += len2 - s2_end
            s2_end += len2 - s2_end
        end
-        curr = dist.dist(s1, _slice(s2, s2_start - 1, s2_end))
+        n_matched = length_matching_blocks(s1, s2, 1, s2_start, len1, s2_end)
+        curr = 1 - 2 * n_matched / (len1 + s2_end - s2_start + 1)
        out = min(out, curr)
    end
    return out
 end
-_slice(s, n1::Integer, n2::Integer) = Base.Iterators.take(Base.Iterators.drop(s, n1), n2 - n1)
-_slice(s::StringWithLength, n1::Integer, n2::Integer) = SubString(s, nextind(s, 0, n1 + 1),  nextind(s, 0, n2))
+
+function matching_blocks(s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer)
+    matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, start1, start2, end1, end2)
+end
+
+function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer)
+    j1, j2, len = longest_common_pattern(s1, s2, start1, start2, end1, end2)
+    # exit if there is no common substring
+    len == 0 && return x
+    # add the common substring (start in s1, start in s2, length) to the existing set
+    push!(x, (j1, j2, len))
+    # add the matching blocks found before the common substring
+    matching_blocks!(x, s1, s2, start1, start2, j1 - 1, j2 - 1)
+    # add the matching blocks found after the common substring
+    matching_blocks!(x, s1, s2, j1 + len, j2 + len, end1, end2)
+    return x
+end
+
 """
   TokenSort(dist)

--- a/test/modifiers.jl
+++ b/test/modifiers.jl
@ -7,6 +7,7 @@ using StringDistances, Unicode, Test
 	@test Partial(QGram(2))("martha", missing) === missing
 	@test Partial(Levenshtein())("martha", "marhta") == 2
 	@test Partial(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5
+	@test Partial(RatcliffObershelp())("martha", "marhtaXXX") ≈ 0.16666666 atol = 1e-5
 	@test Partial(RatcliffObershelp())("martha", missing) === missing

 	# TokenSort