allow any iterator in Jaro + add tests

compathelper/new_version/2020-05-20-12-03-08-092-188304956
matthieugomez 2020-02-11 07:39:15 -05:00
parent 067a7c58d2
commit 29c2b6aeca
3 changed files with 48 additions and 47 deletions

View File

@ -20,13 +20,12 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
# if both are empty, m = 0 so should be 1.0 according to wikipedia.
# If both are empty, the formula in Wikipedia gives 0
# Add this line so that this is not the case
len2 == 0 && return 0.0
maxdist = max(0, div(len2, 2) - 1)
flag = fill(false, len2)
prevstate1 = firstindex(s1)
i1_match = fill(prevstate1, len1)
ch1_match = Vector{eltype(s1)}(undef, len1)
# m counts number matching characters
m = 0
i1 = 1
@ -48,7 +47,7 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
if (ch1 == ch2) && !flag[i2curr]
m += 1
flag[i2curr] = true
i1_match[m] = prevstate1
ch1_match[m] = ch1
break
end
x2curr = iterate(s2, state2)
@ -56,7 +55,6 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
end
x1 = iterate(s1, state1)
i1 += 1
prevstate1 = state1
end
m == 0 && return 1.0
# t counts number of transpositions
@ -67,7 +65,7 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
i2 += 1
if flag[i2]
i1 += 1
t += ch2 != iterate(s1, i1_match[i1])[1]
t += ch2 != ch1_match[i1]
end
end
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
@ -87,7 +85,6 @@ struct Levenshtein <: Metric end
# Return max_dist +1 if distance higher than max_dist
# This makes it possible to differentiate a distance equal to max_dist vs strictly higher
# This is important for find_all
## accepts any iterator, including AbstractString
function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
@ -95,7 +92,7 @@ function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = common_prefix(s1, s2)
x1 == nothing && return len2 - k
x1 === nothing && return len2 - k
# distance initialized to first row of matrix
# => distance between "" and s2[1:i]
v = collect(1:(len2-k))
@ -142,7 +139,6 @@ required to change one string into the other.
struct DamerauLevenshtein <: SemiMetric end
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
## accepts any iterator, including AbstractString
function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
@ -150,7 +146,7 @@ function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = common_prefix(s1, s2)
(x1 == nothing) && return len2 - k
x1 === nothing && return len2 - k
v = collect(1:(len2-k))
w = similar(v)
if max_dist !== nothing
@ -163,7 +159,7 @@ function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
prevch1, = x1
while x1 !== nothing
ch1, state1 = x1
left = (i1 - 1)
left = i1 - 1
current = i1
nextTransCost = 0
prevch2, = x2start
@ -175,7 +171,7 @@ function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
i2 = 1
while x2 !== nothing
ch2, state2 = x2
if max_dist == nothing || (i2_start <= i2 <= i2_end)
if max_dist === nothing || (i2_start <= i2 <= i2_end)
above = current
thisTransCost = nextTransCost
nextTransCost = w[i2]

View File

@ -100,7 +100,6 @@ function evaluate(dist::QGram, s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
n = 0
itr =
for (n1, n2) in itr
n += abs(n1 - n2)
end

View File

@ -3,6 +3,17 @@ using StringDistances, Unicode, Test
@testset "Distances" begin
@testset "Jaro" begin
    # Floating-point results are compared with ≈ (isapprox), not ==.
    @test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
    @test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
    @test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
    # Non-string iterators (here integer vectors) are accepted as inputs.
    @test evaluate(Jaro(), [1, 2, 3], [1, 2, 4]) ≈ 0.2222222222222222
    # Iterating over graphemes must give the same result as the plain strings.
    @test evaluate(Jaro(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Jaro(), "alborgów", "amoniak")
    @test result_type(Jaro(), "hello", "world") == typeof(float(1))
    @inferred Float64 evaluate(Jaro(), "", "")
    @test ismissing(evaluate(Jaro(), "", missing))
end
@testset "Levenshtein" begin
@test evaluate(Levenshtein(), "", "") == 0
@test evaluate(Levenshtein(), "abc", "") == 3
@ -12,10 +23,11 @@ using StringDistances, Unicode, Test
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
@test evaluate(Levenshtein(), [1, 2, 3], [1,2, 4]) == 1
@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak")
@test result_type(Levenshtein(), "hello", "world") == Int
@inferred Int evaluate(Levenshtein(), "", "")
@test ismissing(evaluate(Levenshtein(), "", missing))
@inferred evaluate(Levenshtein(), "", "")
end
@testset "DamerauLevenshtein" begin
@ -27,12 +39,29 @@ using StringDistances, Unicode, Test
@test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
@test evaluate(DamerauLevenshtein(), [1, 2, 3], [1,2, 4]) == 1
@test evaluate(DamerauLevenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(DamerauLevenshtein(), "alborgów", "amoniak")
@test result_type(DamerauLevenshtein(), "hello", "world") == Int
@inferred Int evaluate(DamerauLevenshtein(), "", "")
@test ismissing(evaluate(DamerauLevenshtein(), "", missing))
@inferred evaluate(DamerauLevenshtein(), "", "")
end
@testset "RatcliffObershelp" begin
    # Expected values are written as 1 - similarity; floats are compared with ≈.
    @test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
    @test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
    @test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
    @test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
    @test evaluate(RatcliffObershelp(), "NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
    @test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
    @test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
    # Non-string iterators (here integer vectors) are accepted as inputs.
    @test evaluate(RatcliffObershelp(), [1, 2, 3], [1, 2, 4]) ≈ 1/3
    # Iterating over graphemes must give the same result as the plain strings.
    @test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak")
    @test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
    @inferred Float64 evaluate(RatcliffObershelp(), "", "")
    @test ismissing(evaluate(RatcliffObershelp(), "", missing))
end
@testset "QGram" begin
@test evaluate(QGram(1), "abc", "abc") == 0
@test evaluate(QGram(1), "", "abc") == 3
@ -43,27 +72,29 @@ using StringDistances, Unicode, Test
@test evaluate(QGram(2), graphemes("alborgów"), graphemes("amoniak")) evaluate(QGram(2), "alborgów", "amoniak")
@test result_type(QGram(1), "hello", "world") == Int
@test ismissing(evaluate(QGram(1), "", missing))
@inferred evaluate(QGram(1), "", "")
@inferred Int evaluate(QGram(1), "", "")
end
@testset "Cosine" begin
    @test isnan(evaluate(Cosine(2), "", "abc"))
    # Floating-point results are compared with ≈; `atol` is passed through to isapprox.
    @test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
    @test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
    # Non-string iterators (here integer vectors) are accepted as inputs.
    @test evaluate(Cosine(2), [1, 2, 3], [1, 2, 4]) ≈ 0.5
    # Iterating over graphemes must give the same result as the plain strings.
    @test evaluate(Cosine(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Cosine(2), "alborgów", "amoniak")
    @test result_type(Cosine(2), "hello", "world") == typeof(float(1))
    @inferred Float64 evaluate(Cosine(2), "", "")
    @test ismissing(evaluate(Cosine(2), "", missing))
    @inferred evaluate(Cosine(2), "", "")
end
@testset "Jaccard" begin
    # Floating-point results are compared with ≈; `atol` is passed through to isapprox.
    @test evaluate(Jaccard(1), "", "abc") ≈ 1.0
    @test evaluate(Jaccard(1), "abc", "ccc") ≈ 0.666666 atol = 1e-4
    @test evaluate(Jaccard(1), "abc", "ccc") ≈ 2/3 atol = 1e-4
    @test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
    # Non-string iterators (here integer vectors) are accepted as inputs.
    @test evaluate(Jaccard(2), [1, 2, 3], [1, 2, 4]) ≈ 2/3 atol = 1e-4
    # Iterating over graphemes must give the same result as the plain strings.
    @test evaluate(Jaccard(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Jaccard(2), "alborgów", "amoniak")
    @test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
    @inferred Float64 evaluate(Jaccard(1), "", "")
    @test ismissing(evaluate(Jaccard(1), "", missing))
    @inferred evaluate(Jaccard(1), "", "")
end
@testset "SorensenDice" begin
@ -71,38 +102,20 @@ using StringDistances, Unicode, Test
@test evaluate(SorensenDice(2), "night", "nacht") 0.75 atol = 1e-4
@test evaluate(SorensenDice(2), graphemes("alborgów"), graphemes("amoniak")) evaluate(SorensenDice(2), "alborgów", "amoniak")
@test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(SorensenDice(1), "", "")
@test ismissing(evaluate(SorensenDice(1), "", missing))
@inferred evaluate(SorensenDice(1), "", "")
end
@testset "Overlap" begin
    # Floating-point results are compared with ≈; `atol` is passed through to isapprox.
    @test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
    @test evaluate(Overlap(1), "context", "contact") ≈ 0.2 atol = 1e-4
    @test result_type(Overlap(1), "hello", "world") == typeof(float(1))
    @inferred Float64 evaluate(Overlap(1), "", "")
    @test ismissing(evaluate(Overlap(1), "", missing))
    @inferred evaluate(Overlap(1), "", "")
end
@testset "RatcliffObershelp" begin
    # Expected values are written as 1 - similarity; floats are compared with ≈.
    @test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
    @test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
    @test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
    @test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
    @test evaluate(RatcliffObershelp(), "NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
    @test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
    @test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
    # Iterating over graphemes must give the same result as the plain strings.
    @test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak")
    @test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
    @test ismissing(evaluate(RatcliffObershelp(), "", missing))
    @inferred evaluate(RatcliffObershelp(), "", "")
end
@testset "Jaro" begin
    # Floating-point results are compared with ≈ (isapprox), not ==.
    @test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
    @test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
    @test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
    @test result_type(Jaro(), "hello", "world") == typeof(float(1))
end
strings = [
("martha", "marhta"),
@ -150,13 +163,6 @@ using StringDistances, Unicode, Test
end
# allow any iterator
evaluate(Jaro(), [1,2,3], [1,2,10])
evaluate(Levenshtein(), [1,2,3], [1,2,10])
evaluate(DamerauLevenshtein(), [1,2,3], [1,2,10])
evaluate(QGram(2), [1,2,3], [1,2,10])
evaluate(Overlap(2), [1,2,3], [1,2,10])
#= R test
library(stringdist)