From 29c2b6aeca83a6028f6cf2423d1d6dc779dc17f3 Mon Sep 17 00:00:00 2001
From: matthieugomez <gomez.matthieu@gmail.com>
Date: Tue, 11 Feb 2020 07:39:15 -0500
Subject: [PATCH] allow any iterator in Jaro + add tests

---
 src/edit.jl       | 20 +++++--------
 src/qgram.jl      |  1 -
 test/distances.jl | 74 +++++++++++++++++++++++++----------------------
 3 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/src/edit.jl b/src/edit.jl
index 3a6d76a..557337c 100755
--- a/src/edit.jl
+++ b/src/edit.jl
@@ -20,13 +20,12 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
     (ismissing(s1) | ismissing(s2)) && return missing
     s1, s2 = reorder(s1, s2)
     len1, len2 = length(s1), length(s2)
-    # if both are empty, m = 0 so should be 1.0 according to wikipedia. 
+    # If both are empty, the formula in Wikipedia gives a similarity of 0 (distance 1)
     # Add this line so that not the case
     len2 == 0 && return 0.0
     maxdist = max(0, div(len2, 2) - 1)
     flag = fill(false, len2)
-    prevstate1 = firstindex(s1)
-    i1_match = fill(prevstate1, len1)
+    ch1_match = Vector{eltype(s1)}(undef, len1)
     #  m counts number matching characters
     m = 0 
     i1 = 1
@@ -48,7 +47,7 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
             if (ch1 == ch2) && !flag[i2curr] 
                 m += 1
                 flag[i2curr] = true
-                i1_match[m] = prevstate1
+                ch1_match[m] = ch1
                 break
             end
             x2curr = iterate(s2, state2) 
@@ -56,7 +55,6 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
         end
         x1 = iterate(s1, state1)
         i1 += 1
-        prevstate1 = state1
     end
     m == 0 && return 1.0
     # t counts number of transpositions
@@ -67,7 +65,7 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
         i2 += 1
         if flag[i2]
             i1 += 1
-            t += ch2 != iterate(s1, i1_match[i1])[1]
+            t += ch2 != ch1_match[i1]
         end
     end
     return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
@@ -87,7 +85,6 @@ struct Levenshtein <: Metric end
 # Return max_dist +1 if distance higher than max_dist
 # This makes it possible to differentiate distance equalt to max_dist vs strictly higher
 # This is important for find_all
-## accepts any iterator, including AbstractString
 function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
     (ismissing(s1) | ismissing(s2)) && return missing
     s1, s2 = reorder(s1, s2)
@@ -95,7 +92,7 @@ function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
     max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
     # prefix common to both strings can be ignored
     k, x1, x2start = common_prefix(s1, s2)
-    x1 == nothing && return len2 - k
+    x1 === nothing && return len2 - k
     # distance initialized to first row of matrix
     # => distance between "" and s2[1:i}
     v = collect(1:(len2-k))
@@ -142,7 +139,6 @@ required to change one string into the other.
 struct DamerauLevenshtein <: SemiMetric end
 
 ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
-## accepts any iterator, including AbstractString
 function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
     (ismissing(s1) | ismissing(s2)) && return missing
     s1, s2 = reorder(s1, s2)
@@ -150,7 +146,7 @@ function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
     max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
     # prefix common to both strings can be ignored
     k, x1, x2start = common_prefix(s1, s2)
-    (x1 == nothing) && return len2 - k
+    x1 === nothing && return len2 - k
     v = collect(1:(len2-k))
     w = similar(v)
     if max_dist !== nothing
@@ -163,7 +159,7 @@ function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
     prevch1, = x1
     while x1 !== nothing
         ch1, state1 = x1
-        left = (i1 - 1) 
+        left = i1 - 1
         current = i1 
         nextTransCost = 0
         prevch2, = x2start
@@ -175,7 +171,7 @@ function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
         i2 = 1
         while x2 !== nothing
             ch2, state2 = x2
-            if max_dist == nothing || (i2_start <= i2 <= i2_end)
+            if max_dist === nothing || (i2_start <= i2 <= i2_end)
                 above = current
                 thisTransCost = nextTransCost
                 nextTransCost = w[i2]
diff --git a/src/qgram.jl b/src/qgram.jl
index b1265e4..46e6f07 100755
--- a/src/qgram.jl
+++ b/src/qgram.jl
@@ -100,7 +100,6 @@ function evaluate(dist::QGram, s1, s2)
 	(ismissing(s1) | ismissing(s2)) && return missing
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	n = 0
-	itr = 
 	for (n1, n2) in itr
 		n += abs(n1 - n2)
 	end
diff --git a/test/distances.jl b/test/distances.jl
index 18dd4e2..d1a4a51 100644
--- a/test/distances.jl
+++ b/test/distances.jl
@@ -3,6 +3,17 @@ using StringDistances, Unicode, Test
 
 @testset "Distances" begin
 
+	@testset "Jaro" begin
+		@test evaluate(Jaro(), "martha", "marhta") ≈  0.05555555555555547
+		@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
+		@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
+		@test evaluate(Jaro(), [1, 2, 3], [1,2, 4]) ≈ 0.2222222222222222
+		@test evaluate(Jaro(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Jaro(), "alborgów", "amoniak")
+		@test result_type(Jaro(), "hello", "world") == typeof(float(1))
+		@inferred Float64 evaluate(Jaro(), "", "")
+		@test ismissing(evaluate(Jaro(), "", missing))
+	end
+
 	@testset "Levenshtein" begin
 		@test evaluate(Levenshtein(), "", "") == 0
 		@test evaluate(Levenshtein(), "abc", "") == 3
@@ -12,10 +23,11 @@ using StringDistances, Unicode, Test
 		@test evaluate(Levenshtein(), "saturday", "sunday") == 3
 		@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
 		@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
+		@test evaluate(Levenshtein(), [1, 2, 3], [1,2, 4]) == 1
 		@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak")
 		@test result_type(Levenshtein(), "hello", "world") == Int
+		@inferred Int evaluate(Levenshtein(), "", "")
 		@test ismissing(evaluate(Levenshtein(), "", missing))
-		@inferred evaluate(Levenshtein(), "", "")
 	end
 
 	@testset "DamerauLevenshtein" begin
@@ -27,12 +39,29 @@ using StringDistances, Unicode, Test
 		@test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
 		@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
 		@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
+		@test evaluate(DamerauLevenshtein(), [1, 2, 3], [1,2, 4]) == 1
 		@test evaluate(DamerauLevenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(DamerauLevenshtein(), "alborgów", "amoniak")
 		@test result_type(DamerauLevenshtein(), "hello", "world") == Int
+		@inferred Int evaluate(DamerauLevenshtein(), "", "")
 		@test ismissing(evaluate(DamerauLevenshtein(), "", missing))
-		@inferred evaluate(DamerauLevenshtein(), "", "")
 	end
 
+	@testset "RatcliffObershelp" begin
+		@test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
+		@test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
+		@test evaluate(RatcliffObershelp(), "pennsylvania",  "pencilvaneya") ≈ 1 - 0.6666666666666
+		@test evaluate(RatcliffObershelp(), "",  "pencilvaneya") ≈ 1.0
+		@test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 -  0.962962962963
+		@test evaluate(RatcliffObershelp(), "Yankees",  "New York Yankees") ≈ 0.3913043478260869
+		@test evaluate(RatcliffObershelp(), "New York Mets",  "New York Yankees") ≈ 0.24137931034482762
+		@test evaluate(RatcliffObershelp(), [1, 2, 3], [1,2, 4]) ≈ 1/3
+		@test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak")
+		@test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
+		@inferred Float64 evaluate(RatcliffObershelp(), "", "")
+		@test ismissing(evaluate(RatcliffObershelp(), "", missing))
+	end
+
+
 	@testset "QGram" begin
 		@test evaluate(QGram(1), "abc", "abc") == 0
 		@test evaluate(QGram(1), "", "abc") == 3
@@ -43,27 +72,29 @@ using StringDistances, Unicode, Test
 		@test evaluate(QGram(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(QGram(2), "alborgów", "amoniak")
 		@test result_type(QGram(1), "hello", "world") == Int
 		@test ismissing(evaluate(QGram(1), "", missing))
-		@inferred evaluate(QGram(1), "", "")
+		@inferred Int evaluate(QGram(1), "", "")
 	end
 
 	@testset "Cosine" begin
 		@test isnan(evaluate(Cosine(2), "", "abc"))
 		@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
 		@test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
+		@test evaluate(Cosine(2), [1, 2, 3], [1, 2, 4]) ≈ 0.5
 		@test evaluate(Cosine(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Cosine(2), "alborgów", "amoniak")
 		@test result_type(Cosine(2), "hello", "world") == typeof(float(1))
+		@inferred Float64 evaluate(Cosine(2), "", "")
 		@test ismissing(evaluate(Cosine(2), "", missing))
-		@inferred evaluate(Cosine(2), "", "")
 	end
 
 	@testset "Jaccard" begin
 		@test evaluate(Jaccard(1), "", "abc") ≈ 1.0
-		@test evaluate(Jaccard(1), "abc", "ccc") ≈ .666666 atol = 1e-4
+		@test evaluate(Jaccard(1), "abc", "ccc") ≈ 2/3 atol = 1e-4
 		@test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
+		@test evaluate(Jaccard(2), [1, 2, 3], [1, 2, 4]) ≈ 2/3 atol = 1e-4
 		@test evaluate(Jaccard(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Jaccard(2), "alborgów", "amoniak")
 		@test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
+		@inferred Float64 evaluate(Jaccard(1), "", "")
 		@test ismissing(evaluate(Jaccard(1), "", missing))
-		@inferred evaluate(Jaccard(1), "", "")
 	end
 
 	@testset "SorensenDice" begin
@@ -71,38 +102,20 @@ using StringDistances, Unicode, Test
 		@test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
 		@test evaluate(SorensenDice(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(SorensenDice(2), "alborgów", "amoniak")
 		@test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
+		@inferred Float64 evaluate(SorensenDice(1), "", "")
 		@test ismissing(evaluate(SorensenDice(1), "", missing))
-		@inferred evaluate(SorensenDice(1), "", "")
 	end
 
 	@testset "Overlap" begin
 		@test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
 		@test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4
 		@test result_type(Overlap(1), "hello", "world") == typeof(float(1))
+		@inferred Float64 evaluate(Overlap(1), "", "")
 		@test ismissing(evaluate(Overlap(1), "", missing))
-		@inferred evaluate(Overlap(1), "", "")
 	end
 
-	@testset "RatcliffObershelp" begin
-		@test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
-		@test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
-		@test evaluate(RatcliffObershelp(), "pennsylvania",  "pencilvaneya") ≈ 1 - 0.6666666666666
-		@test evaluate(RatcliffObershelp(), "",  "pencilvaneya") ≈ 1.0
-		@test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 -  0.962962962963
-		@test evaluate(RatcliffObershelp(), "Yankees",  "New York Yankees") ≈ 0.3913043478260869
-		@test evaluate(RatcliffObershelp(), "New York Mets",  "New York Yankees") ≈ 0.24137931034482762
-		@test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak")
-		@test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
-		@test ismissing(evaluate(RatcliffObershelp(), "", missing))
-		@inferred evaluate(RatcliffObershelp(), "", "")
-	end
 
-	@testset "Jaro" begin
-		@test evaluate(Jaro(), "martha", "marhta") ≈  0.05555555555555547
-		@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
-		@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
-		@test result_type(Jaro(), "hello", "world") == typeof(float(1))
-	end
+
 
 	strings = [
 	("martha", "marhta"),
@@ -150,13 +163,6 @@ using StringDistances, Unicode, Test
 end
 
 
-# allow any iterator 
-evaluate(Jaro(), [1,2,3], [1,2,10])
-evaluate(Levenshtein(), [1,2,3], [1,2,10])
-evaluate(DamerauLevenshtein(), [1,2,3], [1,2,10])
-evaluate(QGram(2), [1,2,3], [1,2,10])
-evaluate(Overlap(2), [1,2,3], [1,2,10])
-
 
 #= R test
 library(stringdist)