# StringDistances.jl/test/modifiers.jl
using StringDistances, Unicode, Test
# Distance modifiers: Partial, TokenSort, TokenSet, TokenMax.
# Modifier-wrapped distances called directly on a pair of strings.
# Each modifier is checked with QGram(2), Levenshtein and RatcliffObershelp,
# and with a `missing` argument, for which the result must be `missing`.
@testset "Modifiers" begin
    # Partial
    @test Partial(QGram(2))("martha", "marhta") == 6
    @test Partial(QGram(2))("martha", missing) === missing
    @test Partial(Levenshtein())("martha", "marhta") == 2
    @test Partial(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5
    @test Partial(RatcliffObershelp())("martha", missing) === missing

    # TokenSort
    @test TokenSort(QGram(2))("martha", "marhta") == 6
    @test TokenSort(QGram(2))("martha", missing) === missing
    @test TokenSort(Levenshtein())("martha", "marhta") == 2
    @test TokenSort(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5

    # TokenSet
    @test TokenSet(QGram(2))("martha", "marhta") == 6
    @test TokenSet(QGram(2))("martha", missing) === missing
    @test TokenSet(Levenshtein())("martha", "marhta") == 2
    @test TokenSet(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5

    # TokenMax: unlike the modifiers above, the expected QGram and Levenshtein
    # values here are fractional (0.6 and 1/3), so they are compared with `≈`.
    @test TokenMax(QGram(2))("martha", "marhta") ≈ 0.6
    @test TokenMax(QGram(2))("martha", missing) === missing
    @test TokenMax(Levenshtein())("martha", "marhta") ≈ 1/3
    @test TokenMax(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5
end
# compare scores across distances and modifiers.
# Expected values of `compare` for each distance and modifier.
# In the cases below, identical inputs are expected to score 1.0 and
# comparisons against an empty string are expected to score 0.0.
@testset "Compare" begin
    # Long inputs shared by several RatcliffObershelp cases below.
    zh_a = "为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞"
    zh_b = "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。"
    repeated = "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow"

    # QGram and the other q-gram based distances
    @test compare("", "abc", QGram(1)) ≈ 0.0 atol = 1e-4
    @test compare("abc", "cba", QGram(1)) ≈ 1.0 atol = 1e-4
    @test compare("abc", "ccc", QGram(1)) ≈ 1/3 atol = 1e-4
    # Smoke call on a non-ASCII input: only checks that nothing is thrown.
    compare("aüa", "aua", TokenMax(QGram(2)))
    @test compare("", "abc", Jaccard(2)) ≈ 0.0 atol = 1e-4
    @test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4
    @test compare("aa", "aa ", Partial(Jaccard(2))) ≈ 1.0
    @test compare("martha", "martha", Cosine(2)) ≈ 1.0 atol = 1e-4
    @test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol = 1e-4
    @test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol = 1e-4

    # Hamming and Jaro
    @test compare("aüa", "aua", Hamming()) ≈ 2/3
    @test compare("aüa", "aua", Jaro()) ≈ 0.77777777 atol = 1e-4
    @test compare("New York Yankees", "", Partial(Jaro())) ≈ 0.0

    # JaroWinkler
    @test compare("martha", "marhta", JaroWinkler()) ≈ 0.9611 atol = 1e-4
    @test compare("dwayne", "duane", JaroWinkler()) ≈ 0.84 atol = 1e-4
    @test compare("dixon", "dicksonx", JaroWinkler()) ≈ 0.81333 atol = 1e-4
    @test compare("william", "williams", JaroWinkler()) ≈ 0.975 atol = 1e-4
    @test compare("", "foo", JaroWinkler()) ≈ 0.0 atol = 1e-4
    @test compare("a", "a", JaroWinkler()) ≈ 1.0 atol = 1e-4
    @test compare("abc", "xyz", JaroWinkler()) ≈ 0.0 atol = 1e-4

    # Levenshtein and DamerauLevenshtein
    # The two bare calls are smoke calls: they only check that nothing is thrown.
    compare("aüa", "aua", Levenshtein())
    @test compare("ok", missing, Levenshtein()) === missing
    compare("aüa", "aua", DamerauLevenshtein())
    @test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
    @test compare("ab", "de", Partial(DamerauLevenshtein())) == 0

    # RatcliffObershelp
    @test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0
    @test round(Int, 100 * compare(zh_a, zh_b, RatcliffObershelp())) == 5
    # Smoke call on a non-ASCII input: only checks that nothing is thrown.
    compare("aüa", "aua", TokenMax(RatcliffObershelp()))
    @test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
    @test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0
    # Disabled case, kept for reference:
    #@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444
    @test compare("HSINCHUANG", "SINJHUAN", Partial(RatcliffObershelp())) ≈ 0.875
    @test compare("HSINCHUANG", "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8
    @test compare("HSINCHUANG", "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8
    @test compare("HSINCHUANG", "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888
    @test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0
    # Grapheme iterators must give the same score as the plain strings.
    @test compare(graphemes("New York Mets vs Atlanta Braves"), graphemes("Atlanta Braves vs New York Mets"), Partial(RatcliffObershelp())) ≈ compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", Partial(RatcliffObershelp()))
    @test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094
    @test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0
    @test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0
    @test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0
    @test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333

    # Expected values below are given as percentages, with a tolerance of 1e-2.
    @test compare(zh_a, zh_b, RatcliffObershelp()) ≈ 5 / 100 atol = 1e-2
    @test compare(zh_a, zh_b, Partial(RatcliffObershelp())) ≈ 7 / 100 atol = 1e-2
    @test compare("mariners", "mariner are playing tomorrow", TokenMax(RatcliffObershelp())) ≈ 79 / 100 atol = 1e-2
    @test compare("mariners", repeated, Partial(RatcliffObershelp())) ≈ 88 / 100 atol = 1e-2
    @test compare("mariners", repeated, TokenSort(RatcliffObershelp())) ≈ 11 / 100 atol = 1e-2
    @test compare("mariners", "are mariner playing tomorrow", RatcliffObershelp()) ≈ 39 / 100 atol = 1e-2
    @test compare("mariners", "are mariner playing tomorrow", Partial(RatcliffObershelp())) ≈ 88 / 100 atol = 1e-2
    @test compare("mariners", repeated, TokenSet(RatcliffObershelp())) ≈ 39 / 100 atol = 1e-2
    @test compare("mariners", repeated, TokenSet(Partial(RatcliffObershelp()))) ≈ 88 / 100 atol = 1e-2

    # Not exactly the same, because TokenMax uses the max of the rounded TokenSet etc.
    @test compare("mariners", repeated, TokenMax(RatcliffObershelp())) ≈ 52 / 100 atol = 1e-2

    # min_score: when the plain score is below the threshold, `compare` is
    # expected to return 0.0; otherwise it must return the plain score.
    strings = [
        ("martha", "marhta"),
        ("dwayne", "duane") ,
        ("dixon", "dicksonx"),
        ("william", "williams"),
        ("", "foo"),
        ("a", "a"),
        ("abc", "xyz"),
        ("abc", "ccc"),
        ("kitten", "sitting"),
        ("saturday", "sunday"),
        ("hi, my name is", "my name is"),
        ("alborgów", "amoniak"),
        ("cape sand recycling ", "edith ann graham"),
        ( "jellyifhs", "jellyfish"),
        ("ifhs", "fish"),
        ("leia", "leela"),
    ]
    for dist in (Levenshtein, DamerauLevenshtein), (s1, s2) in strings
        plain = compare(s1, s2, dist())
        expected = plain < 1 / 3 ? 0.0 : plain
        @test compare(s1, s2, dist(); min_score = 1 / 3) ≈ expected
    end
end
# Search helpers: findnearest and findall.
# `findnearest` is compared against a (match, position) tuple and `findall`
# against a vector of positions within the candidate list.
@testset "Find*" begin
    candidates = ["NewYork", "Newark", "San Francisco"]

    # findnearest: the reported position follows "NewYork" as the list is reordered.
    @test findnearest("New York", candidates, Levenshtein()) == ("NewYork", 1)
    @test findnearest("New York", ["San Francisco", "NewYork", "Newark"], Levenshtein()) == ("NewYork", 2)
    @test findnearest("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3)
    @test findnearest("New York", candidates, Jaro()) == ("NewYork", 1)
    @test findnearest("New York", candidates, QGram(2)) == ("NewYork", 1)

    # findall: with min_score = 0.99 no candidate is expected to qualify.
    @test findall("New York", candidates, Levenshtein()) == [1]
    @test findall("New York", candidates, Jaro()) == [1, 2]
    @test findall("New York", candidates, Jaro(); min_score = 0.99) == Int[]
    @test findall("New York", candidates, QGram(2); min_score = 0.99) == Int[]

    # skipmissing inputs; these cases only run on Julia 1.2 or newer.
    if VERSION >= v"1.2.0"
        with_missing = ["NewYork", "Newark", missing]
        all_missing = Union{AbstractString, Missing}[missing, missing]

        @test findnearest("New York", skipmissing(with_missing), Levenshtein()) == ("NewYork", 1)
        # Nothing to match against: both tuple elements are `nothing`.
        @test findnearest("New York", skipmissing(all_missing), Levenshtein()) == (nothing, nothing)
        @test findall("New York", skipmissing(with_missing), Levenshtein()) == [1]
        @test findall("New York", skipmissing(all_missing), Levenshtein()) == Int[]
    end
end