using StringDistances, Unicode, Test

# Test suite for the distance types exported by StringDistances.
#
# Each per-distance testset exercises the same set of entry points where they apply:
#   * `evaluate(dist, a, b)` on strings, integer vectors and `graphemes` iterators,
#   * the callable form `dist(a, b)`,
#   * `result_type(dist, a, b)`,
#   * `@inferred` on the empty-string case (throws if the return type is not inferred),
#   * `missing` as an argument, which is expected to yield `missing`.
@testset "Distances" begin
    @testset "Jaro" begin
        @test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
        @test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
        @test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
        @test evaluate(Jaro(), [1, 2, 3], [1, 2, 4]) ≈ 0.2222222222222222
        @test evaluate(Jaro(), graphemes("alborgów"), graphemes("amoniak")) ==
            evaluate(Jaro(), "alborgów", "amoniak")
        @test Jaro()(" vs an", "es an ") ≈ 0.2777777777777777
        @test result_type(Jaro(), "hello", "world") == typeof(float(1))
        @inferred evaluate(Jaro(), "", "")
        @test ismissing(evaluate(Jaro(), "", missing))
    end

    @testset "Levenshtein" begin
        @test evaluate(Levenshtein(), "", "") == 0
        @test evaluate(Levenshtein(), "abc", "") == 3
        @test evaluate(Levenshtein(), "", "abc") == 3
        @test evaluate(Levenshtein(), "bc", "abc") == 1
        @test evaluate(Levenshtein(), "kitten", "sitting") == 3
        @test evaluate(Levenshtein(), "saturday", "sunday") == 3
        @test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
        @test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
        @test evaluate(Levenshtein(), [1, 2, 3], [1, 2, 4]) == 1
        @test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) ==
            evaluate(Levenshtein(), "alborgów", "amoniak")
        @test Levenshtein()("", "abc") == 3
        @test result_type(Levenshtein(), "hello", "world") == Int
        @inferred evaluate(Levenshtein(), "", "")
        @test ismissing(evaluate(Levenshtein(), "", missing))
    end

    @testset "DamerauLevenshtein" begin
        @test evaluate(DamerauLevenshtein(), "", "") == 0
        @test evaluate(DamerauLevenshtein(), "abc", "") == 3
        @test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
        @test evaluate(DamerauLevenshtein(), "fuor", "four") == 1
        @test evaluate(DamerauLevenshtein(), "abcd", "acb") == 2
        @test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
        @test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
        @test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
        @test evaluate(DamerauLevenshtein(), [1, 2, 3], [1, 2, 4]) == 1
        @test evaluate(DamerauLevenshtein(), graphemes("alborgów"), graphemes("amoniak")) ==
            evaluate(DamerauLevenshtein(), "alborgów", "amoniak")
        @test DamerauLevenshtein()("bc", "abc") == 1
        @test result_type(DamerauLevenshtein(), "hello", "world") == Int
        @inferred evaluate(DamerauLevenshtein(), "", "")
        @test ismissing(evaluate(DamerauLevenshtein(), "", missing))
    end

    @testset "RatcliffObershelp" begin
        # Several expectations are written as `1 - similarity`.
        @test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
        @test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
        @test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
        @test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
        @test evaluate(RatcliffObershelp(), "NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
        @test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
        @test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
        @test evaluate(RatcliffObershelp(), [1, 2, 3], [1, 2, 4]) ≈ 1/3
        @test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) ==
            evaluate(RatcliffObershelp(), "alborgów", "amoniak")
        @test RatcliffObershelp()("pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
        @test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
        @inferred evaluate(RatcliffObershelp(), "", "")
        @test ismissing(evaluate(RatcliffObershelp(), "", missing))
    end

    @testset "QGram" begin
        @test evaluate(QGram(1), "abc", "abc") == 0
        @test evaluate(QGram(1), "", "abc") == 3
        @test evaluate(QGram(1), "abc", "cba") == 0
        @test evaluate(QGram(1), "abc", "ccc") == 4
        # Non-ASCII input, both as plain strings and as `SubString`s.
        @test evaluate(QGram(4), "aü☃", "aüaüafs") == 4
        @test evaluate(QGram(2), SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2
        @test evaluate(QGram(2), graphemes("alborgów"), graphemes("amoniak")) ≈
            evaluate(QGram(2), "alborgów", "amoniak")
        @test QGram(1)("abc", "cba") == 0
        @test result_type(QGram(1), "hello", "world") == Int
        @test ismissing(evaluate(QGram(1), "", missing))
        @inferred evaluate(QGram(1), "", "")
    end

    @testset "Cosine" begin
        @test isnan(evaluate(Cosine(2), "", "abc"))
        @test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
        @test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
        @test evaluate(Cosine(2), [1, 2, 3], [1, 2, 4]) ≈ 0.5
        @test evaluate(Cosine(2), graphemes("alborgów"), graphemes("amoniak")) ≈
            evaluate(Cosine(2), "alborgów", "amoniak")
        @test Cosine(2)("leia", "leela") ≈ 0.7113249 atol = 1e-4
        @test result_type(Cosine(2), "hello", "world") == typeof(float(1))
        @inferred evaluate(Cosine(2), "", "")
        @test ismissing(evaluate(Cosine(2), "", missing))
    end

    @testset "Jaccard" begin
        @test evaluate(Jaccard(1), "", "abc") ≈ 1.0
        @test evaluate(Jaccard(1), "abc", "ccc") ≈ 2/3 atol = 1e-4
        @test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
        @test evaluate(Jaccard(2), [1, 2, 3], [1, 2, 4]) ≈ 2/3 atol = 1e-4
        @test evaluate(Jaccard(2), graphemes("alborgów"), graphemes("amoniak")) ≈
            evaluate(Jaccard(2), "alborgów", "amoniak")
        @test Jaccard(2)("leia", "leela") ≈ 0.83333 atol = 1e-4
        @test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
        @inferred evaluate(Jaccard(1), "", "")
        @test ismissing(evaluate(Jaccard(1), "", missing))
    end

    @testset "SorensenDice" begin
        @test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4
        @test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
        @test evaluate(SorensenDice(2), graphemes("alborgów"), graphemes("amoniak")) ≈
            evaluate(SorensenDice(2), "alborgów", "amoniak")
        @test SorensenDice(2)("night", "nacht") ≈ 0.75 atol = 1e-4
        @test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
        @inferred evaluate(SorensenDice(1), "", "")
        @test ismissing(evaluate(SorensenDice(1), "", missing))
    end

    @testset "Overlap" begin
        @test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
        @test evaluate(Overlap(1), "context", "contact") ≈ 0.2 atol = 1e-4
        @test Overlap(1)("context", "contact") ≈ 0.2 atol = 1e-4
        @test result_type(Overlap(1), "hello", "world") == typeof(float(1))
        @inferred evaluate(Overlap(1), "", "")
        @test ismissing(evaluate(Overlap(1), "", missing))
    end

    # String pairs shared by the two table-driven comparisons below.
    strings = [
        ("martha", "marhta"),
        ("dwayne", "duane"),
        ("dixon", "dicksonx"),
        ("william", "williams"),
        ("", "foo"),
        ("a", "a"),
        ("abc", "xyz"),
        ("abc", "ccc"),
        ("kitten", "sitting"),
        ("saturday", "sunday"),
        ("hi, my name is", "my name is"),
        ("alborgów", "amoniak"),
        ("cape sand recycling ", "edith ann graham"),
        ("jellyifhs", "jellyfish"),
        ("ifhs", "fish"),
        ("leia", "leela"),
    ]

    # One `(distance, expected)` entry per distance; `expected[i]` belongs to `strings[i]`.
    # NOTE: every expectation is a row matrix and must stay on a single line, because a
    # newline inside `[a b c]` starts a new matrix row.
    solutions = (
        (Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
        (DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
        (Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.2805556 0.2285714 0.48809524 0.3916667 0.07407407 0.16666667 0.21666667]),
        (QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
        (QGram(2), [6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
        (Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]),
        (Jaccard(2), [0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
        (Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]),
    )

    # Compare against reference values from the R package `stringdist`
    # (a partial generating script is kept in the block comment at the end of this file).
    for (dist, expected) in solutions
        for i in eachindex(expected)
            actual = evaluate(dist, strings[i]...)
            if isnan(actual)
                # NaN never compares equal/approx-equal, so it is checked explicitly.
                @test isnan(expected[i])
            else
                @test actual ≈ expected[i] atol = 1e-4
            end
        end
    end

    # RatcliffObershelp: expected similarity percentages, i.e. round((1 - distance) * 100).
    # Presumably produced by the fuzzywuzzy script at the end of this file — confirm
    # before regenerating.
    ratios = [83, 73, 62, 93, 0, 100, 0, 33, 62, 71, 83, 27, 33, 78, 50, 67]
    for i in eachindex(strings)
        # Both sides are `Int`, so exact equality is the right comparison.
        @test round(Int, (1 - evaluate(RatcliffObershelp(), strings[i]...)) * 100) == ratios[i]
    end
end

#= R test
library(stringdist)
strings = matrix(data = c(
"martha", "marhta",
"dwayne", "duane",
"dixon", "dicksonx",
"william", "williams",
"", "foo",
"a", "a",
"abc", "xyz",
"abc", "ccc",
"kitten", "sitting",
"saturday", "sunday",
"hi, my name is", "my name is",
"alborgów", "amoniak",
"cape sand recycling ", "edith ann graham",
"jellyifhs", "jellyfish",
"ifhs", "fish",
"leia", "leela"),
nrow = 2
)
stringdist(strings[1,], strings[2,], method = "jw", p = 0)
stringdist(strings[1,], strings[2,], method = "jw", p = 0.1)
stringdist(strings[1,], strings[2,], method = "qgram", q = 1)
=#

#= Fuzzywuzzy
(if python-Levenshtein is not installed, fuzzywuzzy uses RatcliffObershelp)
from fuzzywuzzy import fuzz
strings = [
("martha", "marhta"),
("dwayne", "duane"),
("dixon", "dicksonx"),
("william", "williams"),
("", "foo"),
("a", "a"),
("abc", "xyz"),
("abc", "ccc"),
("kitten", "sitting"),
("saturday", "sunday"),
("hi, my name is", "my name is"),
("alborgów", "amoniak"),
("cape sand recycling ", "edith ann graham"),
("jellyifhs", "jellyfish"),
("ifhs", "fish"),
("leia", "leela"),
]
for x in strings:
    print(fuzz.ratio(x[0], x[1]))
=#