From 29c2b6aeca83a6028f6cf2423d1d6dc779dc17f3 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Tue, 11 Feb 2020 07:39:15 -0500 Subject: [PATCH] allow any iterator in Jaro + add tests --- src/edit.jl | 20 +++++-------- src/qgram.jl | 1 - test/distances.jl | 74 +++++++++++++++++++++++++---------------------- 3 files changed, 48 insertions(+), 47 deletions(-) diff --git a/src/edit.jl b/src/edit.jl index 3a6d76a..557337c 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -20,13 +20,12 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing) (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) - # if both are empty, m = 0 so should be 1.0 according to wikipedia. + # If both are empty, the formula in Wikipedia gives 0 # Add this line so that not the case len2 == 0 && return 0.0 maxdist = max(0, div(len2, 2) - 1) flag = fill(false, len2) - prevstate1 = firstindex(s1) - i1_match = fill(prevstate1, len1) + ch1_match = Vector{eltype(s1)}(undef, len1) # m counts number matching characters m = 0 i1 = 1 @@ -48,7 +47,7 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing) if (ch1 == ch2) && !flag[i2curr] m += 1 flag[i2curr] = true - i1_match[m] = prevstate1 + ch1_match[m] = ch1 break end x2curr = iterate(s2, state2) @@ -56,7 +55,6 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing) end x1 = iterate(s1, state1) i1 += 1 - prevstate1 = state1 end m == 0 && return 1.0 # t counts number of transpositions @@ -67,7 +65,7 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing) i2 += 1 if flag[i2] i1 += 1 - t += ch2 != iterate(s1, i1_match[i1])[1] + t += ch2 != ch1_match[i1] end end return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0 @@ -87,7 +85,6 @@ struct Levenshtein <: Metric end # Return max_dist +1 if distance higher than max_dist # This makes it possible to differentiate distance equalt to max_dist vs strictly higher # This is important for find_all -## accepts any iterator, including 
AbstractString function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing) (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) @@ -95,7 +92,7 @@ function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing) max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 # prefix common to both strings can be ignored k, x1, x2start = common_prefix(s1, s2) - x1 == nothing && return len2 - k + x1 === nothing && return len2 - k # distance initialized to first row of matrix # => distance between "" and s2[1:i} v = collect(1:(len2-k)) @@ -142,7 +139,6 @@ required to change one string into the other. struct DamerauLevenshtein <: SemiMetric end ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html -## accepts any iterator, including AbstractString function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing) (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) @@ -150,7 +146,7 @@ function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing) max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 # prefix common to both strings can be ignored k, x1, x2start = common_prefix(s1, s2) - (x1 == nothing) && return len2 - k + x1 === nothing && return len2 - k v = collect(1:(len2-k)) w = similar(v) if max_dist !== nothing @@ -163,7 +159,7 @@ function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing) prevch1, = x1 while x1 !== nothing ch1, state1 = x1 - left = (i1 - 1) + left = i1 - 1 current = i1 nextTransCost = 0 prevch2, = x2start @@ -175,7 +171,7 @@ function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing) i2 = 1 while x2 !== nothing ch2, state2 = x2 - if max_dist == nothing || (i2_start <= i2 <= i2_end) + if max_dist === nothing || (i2_start <= i2 <= i2_end) above = current thisTransCost = nextTransCost nextTransCost = w[i2] diff --git a/src/qgram.jl b/src/qgram.jl index b1265e4..46e6f07 100755 --- a/src/qgram.jl +++ b/src/qgram.jl @@ -100,7 
+100,6 @@ function evaluate(dist::QGram, s1, s2) (ismissing(s1) | ismissing(s2)) && return missing itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) n = 0 - itr = for (n1, n2) in itr n += abs(n1 - n2) end diff --git a/test/distances.jl b/test/distances.jl index 18dd4e2..d1a4a51 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -3,6 +3,17 @@ using StringDistances, Unicode, Test @testset "Distances" begin + @testset "Jaro" begin + @test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547 + @test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777 + @test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777 + @test evaluate(Jaro(), [1, 2, 3], [1,2, 4]) ≈ 0.2222222222222222 + @test evaluate(Jaro(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Jaro(), "alborgów", "amoniak") + @test result_type(Jaro(), "hello", "world") == typeof(float(1)) + @inferred Float64 evaluate(Jaro(), "", "") + @test ismissing(evaluate(Jaro(), "", missing)) + end + @testset "Levenshtein" begin @test evaluate(Levenshtein(), "", "") == 0 @test evaluate(Levenshtein(), "abc", "") == 3 @@ -12,10 +23,11 @@ using StringDistances, Unicode, Test @test evaluate(Levenshtein(), "saturday", "sunday") == 3 @test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4 @test evaluate(Levenshtein(), "alborgów", "amoniak") == 6 + @test evaluate(Levenshtein(), [1, 2, 3], [1,2, 4]) == 1 @test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak") @test result_type(Levenshtein(), "hello", "world") == Int + @inferred Int evaluate(Levenshtein(), "", "") @test ismissing(evaluate(Levenshtein(), "", missing)) - @inferred evaluate(Levenshtein(), "", "") end @testset "DamerauLevenshtein" begin @@ -27,12 +39,29 @@ using StringDistances, Unicode, Test @test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17 @test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2 
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2 + @test evaluate(DamerauLevenshtein(), [1, 2, 3], [1,2, 4]) == 1 @test evaluate(DamerauLevenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(DamerauLevenshtein(), "alborgów", "amoniak") @test result_type(DamerauLevenshtein(), "hello", "world") == Int + @inferred Int evaluate(DamerauLevenshtein(), "", "") @test ismissing(evaluate(DamerauLevenshtein(), "", missing)) - @inferred evaluate(DamerauLevenshtein(), "", "") end + @testset "RatcliffObershelp" begin + @test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154 + @test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579 + @test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666 + @test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0 + @test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963 + @test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869 + @test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762 + @test evaluate(RatcliffObershelp(), [1, 2, 3], [1,2, 4]) ≈ 1/3 + @test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak") + @test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1)) + @inferred Float64 evaluate(RatcliffObershelp(), "", "") + @test ismissing(evaluate(RatcliffObershelp(), "", missing)) + end + + @testset "QGram" begin @test evaluate(QGram(1), "abc", "abc") == 0 @test evaluate(QGram(1), "", "abc") == 3 @@ -43,27 +72,29 @@ using StringDistances, Unicode, Test @test evaluate(QGram(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(QGram(2), "alborgów", "amoniak") @test result_type(QGram(1), "hello", "world") == Int @test ismissing(evaluate(QGram(1), "", missing)) - @inferred evaluate(QGram(1), "", "") + @inferred Int 
evaluate(QGram(1), "", "") end @testset "Cosine" begin @test isnan(evaluate(Cosine(2), "", "abc")) @test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4 @test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4 + @test evaluate(Cosine(2), [1, 2, 3], [1, 2, 4]) ≈ 0.5 @test evaluate(Cosine(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Cosine(2), "alborgów", "amoniak") @test result_type(Cosine(2), "hello", "world") == typeof(float(1)) + @inferred Float64 evaluate(Cosine(2), "", "") @test ismissing(evaluate(Cosine(2), "", missing)) - @inferred evaluate(Cosine(2), "", "") end @testset "Jaccard" begin @test evaluate(Jaccard(1), "", "abc") ≈ 1.0 - @test evaluate(Jaccard(1), "abc", "ccc") ≈ .666666 atol = 1e-4 + @test evaluate(Jaccard(1), "abc", "ccc") ≈ 2/3 atol = 1e-4 @test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4 + @test evaluate(Jaccard(2), [1, 2, 3], [1, 2, 4]) ≈ 2/3 atol = 1e-4 @test evaluate(Jaccard(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Jaccard(2), "alborgów", "amoniak") @test result_type(Jaccard(1), "hello", "world") == typeof(float(1)) + @inferred Float64 evaluate(Jaccard(1), "", "") @test ismissing(evaluate(Jaccard(1), "", missing)) - @inferred evaluate(Jaccard(1), "", "") end @testset "SorensenDice" begin @@ -71,38 +102,20 @@ using StringDistances, Unicode, Test @test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4 @test evaluate(SorensenDice(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(SorensenDice(2), "alborgów", "amoniak") @test result_type(SorensenDice(1), "hello", "world") == typeof(float(1)) + @inferred Float64 evaluate(SorensenDice(1), "", "") @test ismissing(evaluate(SorensenDice(1), "", missing)) - @inferred evaluate(SorensenDice(1), "", "") end @testset "Overlap" begin @test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4 @test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4 @test result_type(Overlap(1), "hello", "world") == 
typeof(float(1)) + @inferred Float64 evaluate(Overlap(1), "", "") @test ismissing(evaluate(Overlap(1), "", missing)) - @inferred evaluate(Overlap(1), "", "") end - @testset "RatcliffObershelp" begin - @test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154 - @test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579 - @test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666 - @test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0 - @test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963 - @test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869 - @test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762 - @test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak") - @test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1)) - @test ismissing(evaluate(RatcliffObershelp(), "", missing)) - @inferred evaluate(RatcliffObershelp(), "", "") - end - @testset "Jaro" begin - @test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547 - @test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777 - @test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777 - @test result_type(Jaro(), "hello", "world") == typeof(float(1)) - end + strings = [ ("martha", "marhta"), @@ -150,13 +163,6 @@ using StringDistances, Unicode, Test end -# allow any iterator -evaluate(Jaro(), [1,2,3], [1,2,10]) -evaluate(Levenshtein(), [1,2,3], [1,2,10]) -evaluate(DamerauLevenshtein(), [1,2,3], [1,2,10]) -evaluate(QGram(2), [1,2,3], [1,2,10]) -evaluate(Overlap(2), [1,2,3], [1,2,10]) - #= R test library(stringdist)