allow any iterator in Jaro + add tests
parent
067a7c58d2
commit
29c2b6aeca
20
src/edit.jl
20
src/edit.jl
|
@ -20,13 +20,12 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
|
|||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
# if both are empty, m = 0 so should be 1.0 according to wikipedia.
|
||||
# If both are empty, the formula in Wikipedia gives 0
|
||||
# Add this line so that this is not the case
|
||||
len2 == 0 && return 0.0
|
||||
maxdist = max(0, div(len2, 2) - 1)
|
||||
flag = fill(false, len2)
|
||||
prevstate1 = firstindex(s1)
|
||||
i1_match = fill(prevstate1, len1)
|
||||
ch1_match = Vector{eltype(s1)}(undef, len1)
|
||||
# m counts number matching characters
|
||||
m = 0
|
||||
i1 = 1
|
||||
|
@ -48,7 +47,7 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
|
|||
if (ch1 == ch2) && !flag[i2curr]
|
||||
m += 1
|
||||
flag[i2curr] = true
|
||||
i1_match[m] = prevstate1
|
||||
ch1_match[m] = ch1
|
||||
break
|
||||
end
|
||||
x2curr = iterate(s2, state2)
|
||||
|
@ -56,7 +55,6 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
|
|||
end
|
||||
x1 = iterate(s1, state1)
|
||||
i1 += 1
|
||||
prevstate1 = state1
|
||||
end
|
||||
m == 0 && return 1.0
|
||||
# t counts number of transpositions
|
||||
|
@ -67,7 +65,7 @@ function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
|
|||
i2 += 1
|
||||
if flag[i2]
|
||||
i1 += 1
|
||||
t += ch2 != iterate(s1, i1_match[i1])[1]
|
||||
t += ch2 != ch1_match[i1]
|
||||
end
|
||||
end
|
||||
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
|
||||
|
@ -87,7 +85,6 @@ struct Levenshtein <: Metric end
|
|||
# Return max_dist +1 if distance higher than max_dist
|
||||
# This makes it possible to differentiate a distance equal to max_dist from one strictly higher
|
||||
# This is important for find_all
|
||||
## accepts any iterator, including AbstractString
|
||||
function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
|
@ -95,7 +92,7 @@ function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
|
|||
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
||||
# prefix common to both strings can be ignored
|
||||
k, x1, x2start = common_prefix(s1, s2)
|
||||
x1 == nothing && return len2 - k
|
||||
x1 === nothing && return len2 - k
|
||||
# distance initialized to first row of matrix
|
||||
# => distance between "" and s2[1:i]
|
||||
v = collect(1:(len2-k))
|
||||
|
@ -142,7 +139,6 @@ required to change one string into the other.
|
|||
struct DamerauLevenshtein <: SemiMetric end
|
||||
|
||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
## accepts any iterator, including AbstractString
|
||||
function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
|
@ -150,7 +146,7 @@ function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
|
|||
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
||||
# prefix common to both strings can be ignored
|
||||
k, x1, x2start = common_prefix(s1, s2)
|
||||
(x1 == nothing) && return len2 - k
|
||||
x1 === nothing && return len2 - k
|
||||
v = collect(1:(len2-k))
|
||||
w = similar(v)
|
||||
if max_dist !== nothing
|
||||
|
@ -163,7 +159,7 @@ function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
|
|||
prevch1, = x1
|
||||
while x1 !== nothing
|
||||
ch1, state1 = x1
|
||||
left = (i1 - 1)
|
||||
left = i1 - 1
|
||||
current = i1
|
||||
nextTransCost = 0
|
||||
prevch2, = x2start
|
||||
|
@ -175,7 +171,7 @@ function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
|
|||
i2 = 1
|
||||
while x2 !== nothing
|
||||
ch2, state2 = x2
|
||||
if max_dist == nothing || (i2_start <= i2 <= i2_end)
|
||||
if max_dist === nothing || (i2_start <= i2 <= i2_end)
|
||||
above = current
|
||||
thisTransCost = nextTransCost
|
||||
nextTransCost = w[i2]
|
||||
|
|
|
@ -100,7 +100,6 @@ function evaluate(dist::QGram, s1, s2)
|
|||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
n = 0
|
||||
itr =
|
||||
for (n1, n2) in itr
|
||||
n += abs(n1 - n2)
|
||||
end
|
||||
|
|
|
@ -3,6 +3,17 @@ using StringDistances, Unicode, Test
|
|||
|
||||
@testset "Distances" begin
|
||||
|
||||
@testset "Jaro" begin
|
||||
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
||||
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
|
||||
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
|
||||
@test evaluate(Jaro(), [1, 2, 3], [1,2, 4]) ≈ 0.2222222222222222
|
||||
@test evaluate(Jaro(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Jaro(), "alborgów", "amoniak")
|
||||
@test result_type(Jaro(), "hello", "world") == typeof(float(1))
|
||||
@inferred Float64 evaluate(Jaro(), "", "")
|
||||
@test ismissing(evaluate(Jaro(), "", missing))
|
||||
end
|
||||
|
||||
@testset "Levenshtein" begin
|
||||
@test evaluate(Levenshtein(), "", "") == 0
|
||||
@test evaluate(Levenshtein(), "abc", "") == 3
|
||||
|
@ -12,10 +23,11 @@ using StringDistances, Unicode, Test
|
|||
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
|
||||
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
|
||||
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
|
||||
@test evaluate(Levenshtein(), [1, 2, 3], [1,2, 4]) == 1
|
||||
@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak")
|
||||
@test result_type(Levenshtein(), "hello", "world") == Int
|
||||
@inferred Int evaluate(Levenshtein(), "", "")
|
||||
@test ismissing(evaluate(Levenshtein(), "", missing))
|
||||
@inferred evaluate(Levenshtein(), "", "")
|
||||
end
|
||||
|
||||
@testset "DamerauLevenshtein" begin
|
||||
|
@ -27,12 +39,29 @@ using StringDistances, Unicode, Test
|
|||
@test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
|
||||
@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
|
||||
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
|
||||
@test evaluate(DamerauLevenshtein(), [1, 2, 3], [1,2, 4]) == 1
|
||||
@test evaluate(DamerauLevenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(DamerauLevenshtein(), "alborgów", "amoniak")
|
||||
@test result_type(DamerauLevenshtein(), "hello", "world") == Int
|
||||
@inferred Int evaluate(DamerauLevenshtein(), "", "")
|
||||
@test ismissing(evaluate(DamerauLevenshtein(), "", missing))
|
||||
@inferred evaluate(DamerauLevenshtein(), "", "")
|
||||
end
|
||||
|
||||
@testset "RatcliffObershelp" begin
|
||||
@test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
|
||||
@test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
|
||||
@test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
|
||||
@test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
|
||||
@test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
|
||||
@test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
|
||||
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
|
||||
@test evaluate(RatcliffObershelp(), [1, 2, 3], [1,2, 4]) ≈ 1/3
|
||||
@test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak")
|
||||
@test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
|
||||
@inferred Float64 evaluate(RatcliffObershelp(), "", "")
|
||||
@test ismissing(evaluate(RatcliffObershelp(), "", missing))
|
||||
end
|
||||
|
||||
|
||||
@testset "QGram" begin
|
||||
@test evaluate(QGram(1), "abc", "abc") == 0
|
||||
@test evaluate(QGram(1), "", "abc") == 3
|
||||
|
@ -43,27 +72,29 @@ using StringDistances, Unicode, Test
|
|||
@test evaluate(QGram(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(QGram(2), "alborgów", "amoniak")
|
||||
@test result_type(QGram(1), "hello", "world") == Int
|
||||
@test ismissing(evaluate(QGram(1), "", missing))
|
||||
@inferred evaluate(QGram(1), "", "")
|
||||
@inferred Int evaluate(QGram(1), "", "")
|
||||
end
|
||||
|
||||
@testset "Cosine" begin
|
||||
@test isnan(evaluate(Cosine(2), "", "abc"))
|
||||
@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
|
||||
@test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
|
||||
@test evaluate(Cosine(2), [1, 2, 3], [1, 2, 4]) ≈ 0.5
|
||||
@test evaluate(Cosine(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Cosine(2), "alborgów", "amoniak")
|
||||
@test result_type(Cosine(2), "hello", "world") == typeof(float(1))
|
||||
@inferred Float64 evaluate(Cosine(2), "", "")
|
||||
@test ismissing(evaluate(Cosine(2), "", missing))
|
||||
@inferred evaluate(Cosine(2), "", "")
|
||||
end
|
||||
|
||||
@testset "Jaccard" begin
|
||||
@test evaluate(Jaccard(1), "", "abc") ≈ 1.0
|
||||
@test evaluate(Jaccard(1), "abc", "ccc") ≈ .666666 atol = 1e-4
|
||||
@test evaluate(Jaccard(1), "abc", "ccc") ≈ 2/3 atol = 1e-4
|
||||
@test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
|
||||
@test evaluate(Jaccard(2), [1, 2, 3], [1, 2, 4]) ≈ 2/3 atol = 1e-4
|
||||
@test evaluate(Jaccard(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Jaccard(2), "alborgów", "amoniak")
|
||||
@test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
|
||||
@inferred Float64 evaluate(Jaccard(1), "", "")
|
||||
@test ismissing(evaluate(Jaccard(1), "", missing))
|
||||
@inferred evaluate(Jaccard(1), "", "")
|
||||
end
|
||||
|
||||
@testset "SorensenDice" begin
|
||||
|
@ -71,38 +102,20 @@ using StringDistances, Unicode, Test
|
|||
@test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
|
||||
@test evaluate(SorensenDice(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(SorensenDice(2), "alborgów", "amoniak")
|
||||
@test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
|
||||
@inferred Float64 evaluate(SorensenDice(1), "", "")
|
||||
@test ismissing(evaluate(SorensenDice(1), "", missing))
|
||||
@inferred evaluate(SorensenDice(1), "", "")
|
||||
end
|
||||
|
||||
@testset "Overlap" begin
|
||||
@test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4
|
||||
@test result_type(Overlap(1), "hello", "world") == typeof(float(1))
|
||||
@inferred Float64 evaluate(Overlap(1), "", "")
|
||||
@test ismissing(evaluate(Overlap(1), "", missing))
|
||||
@inferred evaluate(Overlap(1), "", "")
|
||||
end
|
||||
|
||||
@testset "RatcliffObershelp" begin
|
||||
@test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
|
||||
@test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
|
||||
@test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
|
||||
@test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
|
||||
@test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
|
||||
@test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
|
||||
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
|
||||
@test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak")
|
||||
@test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
|
||||
@test ismissing(evaluate(RatcliffObershelp(), "", missing))
|
||||
@inferred evaluate(RatcliffObershelp(), "", "")
|
||||
end
|
||||
|
||||
@testset "Jaro" begin
|
||||
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
||||
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
|
||||
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
|
||||
@test result_type(Jaro(), "hello", "world") == typeof(float(1))
|
||||
end
|
||||
|
||||
|
||||
strings = [
|
||||
("martha", "marhta"),
|
||||
|
@ -150,13 +163,6 @@ using StringDistances, Unicode, Test
|
|||
end
|
||||
|
||||
|
||||
# allow any iterator
|
||||
evaluate(Jaro(), [1,2,3], [1,2,10])
|
||||
evaluate(Levenshtein(), [1,2,3], [1,2,10])
|
||||
evaluate(DamerauLevenshtein(), [1,2,3], [1,2,10])
|
||||
evaluate(QGram(2), [1,2,3], [1,2,10])
|
||||
evaluate(Overlap(2), [1,2,3], [1,2,10])
|
||||
|
||||
|
||||
#= R test
|
||||
library(stringdist)
|
||||
|
|
Loading…
Reference in New Issue