simplify RatcliffObershelp

2019-08-17 12:57:35 -04:00 · 2019-08-17 12:57:35 -04:00 · efcace4f03
parent 402d24997f
commit efcace4f03
5 changed files with 74 additions and 68 deletions
--- a/README.md
+++ b/README.md
@ -24,6 +24,7 @@ compare("martha", "marhta", Hamming())
 - [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()`
 - [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
 - [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
+- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`


 #### Q-Grams Distances
@ -34,11 +35,6 @@ Q-gram distances compare the set of all substrings of length `q` in each string.
 - [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q)`
 - [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q)`

-#### Others
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
-
-
-
 ## Distance Modifiers
 The package includes distance "modifiers", that can be applied to any distance.

--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -16,12 +16,12 @@ Hamming,
 Levenshtein,
 DamerauLevenshtein,
 Jaro,
+RatcliffObershelp,
 QGram,
 Cosine,
 Jaccard,
 SorensenDice,
 Overlap,
-RatcliffObershelp,
 Winkler,
 Partial,
 TokenSort,
@ -37,7 +37,6 @@ qgram_iterator
 include("utils.jl")
 include("distances/edit.jl")
 include("distances/qgram.jl")
-include("distances/RatcliffObershelp.jl")
 include("compare.jl")

 end
--- a/src/distances/RatcliffObershelp.jl
+++ b/src/distances/RatcliffObershelp.jl
@ -1,60 +0,0 @@
-# Return start of commn substring in s1, start of common substring in s2, and length of substring
-# Indexes refer to character number, not index (differ for Unicode strings)
-function longest_common_substring(s1::AbstractString, s2::AbstractString)
-    if length(s1) > length(s2)
-        start2, start1, len = longest_common_substring(s2, s1)
-    else
-        start1, start2, len = 0, 0, 0
-        p = zeros(Int, length(s2))
-        i1 = 0
-        for ch1 in s1
-            i1 += 1
-            i2 = 0
-            oldp = 0
-            for ch2 in s2
-                i2 += 1
-                newp = 0
-                if ch1 == ch2
-                    newp = oldp > 0 ? oldp : i2
-                    currentlength = (i2 - newp + 1)
-                    if currentlength > len
-                        start1, start2, len = i1 - currentlength + 1, newp, currentlength
-                    end
-                end
-                p[i2], oldp = newp, p[i2]
-            end
-        end
-    end
-    return start1, start2, len
-end
-
-function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, start1::Integer, start2::Integer)
-    a = longest_common_substring(s1, s2)
-    if a[3] > 0
-        push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
-        s1before = SubString(s1, 1, nextind(s1, 0, a[1] - 1))
-        s2before = SubString(s2, 1, nextind(s2, 0, a[2] - 1))
-        matching_blocks!(x, s1before, s2before, start1, start2)
-        if ((a[1] + a[3]) <= lastindex(s1)) & ((a[2] + a[3]) <= lastindex(s2))
-            s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
-            s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))
-            matching_blocks!(x, s1after, s2after, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
-        end
-    end
-end
-
-function matching_blocks(s1::AbstractString, s2::AbstractString)
-    x = Set{Tuple{Int, Int, Int}}()
-    matching_blocks!(x, s1, s2, 1, 1)
-    return x
-end
-
-struct RatcliffObershelp <: PreMetric end
-
-function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
-    n_matched = 0
-    for x in matching_blocks(s1, s2)
-        n_matched += x[3]
-    end
-    1.0 - 2 * n_matched / (length(s1) + length(s2))
-end
--- a/src/distances/edit.jl
+++ b/src/distances/edit.jl
@ -182,4 +182,43 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
        end
    end
    return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
-end
+end
+
+##############################################################################
+##
+## Ratcliff/Obershelp
+##
+##############################################################################
+
+struct RatcliffObershelp <: PreMetric end
+
+function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
+    n_matched = sum(last.(matching_blocks(s1, s2)))   
+    1.0 - 2 *  n_matched / (length(s1) + length(s2))
+end
+
+function matching_blocks(s1::AbstractString, s2::AbstractString)
+    matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1)
+end
+
+function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer, start1::Integer, start2::Integer)
+    a = longest_common_substring(s1, s2, len1 , len2)
+    # if there is a common substring
+    if a[3] > 0
+        # add the common substring (start in s1, start in s2, length) to the set, shifting both starts by start1 and start2
+        push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
+        # recurse on the parts of s1 and s2 that come before the common substring
+        s1before = SubString(s1, 1, nextind(s1, 0, a[1] - 1))
+        s2before = SubString(s2, 1, nextind(s2, 0, a[2] - 1))
+        matching_blocks!(x, s1before, s2before, a[1] - 1, a[2] - 1, start1, start2)
+        # recurse on the parts of s1 and s2 that come after the common substring
+        s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
+        s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))
+        matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1, len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
+    end
+    return x
+end
+
+
+
+
--- a/src/utils.jl
+++ b/src/utils.jl
@ -23,3 +23,35 @@ function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1
    end
    return l, x1, x2
 end
+
+
+
+# Return the start of the common substring in s1, the start of the common substring in s2, and the length of the substring
+# Starts and length are counted in characters, not string indices
+function longest_common_substring(s1::AbstractString, s2::AbstractString, len1::Int, len2::Int)
+    if len1 > len2
+        start2, start1, len = longest_common_substring(s2, s1, len2, len1)
+    else
+        start1, start2, len = 0, 0, 0
+        p = zeros(Int, len2)
+        i1 = 0
+        for ch1 in s1
+            i1 += 1
+            i2 = 0
+            oldp = 0
+            for ch2 in s2
+                i2 += 1
+                newp = 0
+                if ch1 == ch2
+                    newp = oldp > 0 ? oldp : i2
+                    currentlength = (i2 - newp + 1)
+                    if currentlength > len
+                        start1, start2, len = i1 - currentlength + 1, newp, currentlength
+                    end
+                end
+                p[i2], oldp = newp, p[i2]
+            end
+        end
+    end
+    return start1, start2, len
+end