# StringDistances.jl/test/modifiers.jl
using StringDistances, Test
# Compare
# Pins the value returned by `compare` for several distances. Results are
# floats, so every check uses `≈` with an absolute tolerance instead of `==`.

# Hamming
@test compare("", "abc", Hamming()) ≈ 0.0 atol=1e-4
@test compare("acc", "abc", Hamming()) ≈ 2/3 atol=1e-4
@test compare("saturday", "sunday", Hamming()) ≈ 1/8 atol=1e-4

# QGram
@test compare("", "abc", QGram(1)) ≈ 0.0 atol=1e-4
@test compare("abc", "cba", QGram(1)) ≈ 1.0 atol=1e-4
@test compare("abc", "ccc", QGram(1)) ≈ 1/3 atol=1e-4

# An empty string compared with a non-empty one scores 0.
@test compare("", "abc", Jaccard(2)) ≈ 0.0 atol=1e-4

# Identical strings score 1 for every q-gram based distance.
@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol=1e-4
@test compare("martha", "martha", Cosine(2)) ≈ 1.0 atol=1e-4
@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol=1e-4
@test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol=1e-4
@test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol=1e-4
# Winkler
# Checks `compare` with the `Winkler(Jaro(), 0.1, 0.0)` modifier against
# hard-coded reference values, including the empty-string, identical-string
# and fully-disjoint cases.
@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.9611 atol=1e-4
@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.84 atol=1e-4
@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.81333 atol=1e-4
@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.975 atol=1e-4
@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol=1e-4
@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0)) ≈ 1.0 atol=1e-4
@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol=1e-4

# String pairs and, in the same order, the reference values they are checked
# against. The loop below compares `compare(...)` with `1 - solutions[i]`.
strings = [
    ("martha", "marhta"),
    ("dwayne", "duane"),
    ("dixon", "dicksonx"),
    ("william", "williams"),
    ("", "foo")
]
solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000]

# `solutions` is a 1×5 matrix; `eachindex` yields its linear indices 1..5,
# which line up with the five entries of `strings`.
for i in eachindex(solutions)
    @test compare(strings[i]..., Winkler(Jaro(), 0.1, 0.0)) ≈ (1 - solutions[i]) atol=1e-4
end
# Partial
# Checks `compare` when the distance is wrapped in the `Partial` modifier.
# Expected values are truncated decimals, hence `≈` and not `==`.
@test compare("aa", "aa ", Partial(Jaccard(2))) ≈ 1.0

@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
@test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0
@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444

# One fixed string compared against several close variants.
s = "HSINCHUANG"
@test compare(s, "SINJHUAN", Partial(RatcliffObershelp())) ≈ 0.875
@test compare(s, "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8
@test compare(s, "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8
@test compare(s, "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888

@test compare("New York Yankees", "Yankees", Partial(Hamming())) ≈ 1
@test compare("New York Yankees", "", Partial(Hamming())) ≈ 1
# Token
# Checks `compare` with the token-based modifiers `TokenSort`, `TokenSet`
# and `TokenMax`, alone and combined with `Partial`.
@test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0
@test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094

# Comparing against an empty string scores 0.
@test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0
@test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0

@test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333

# Disabled legacy variants, kept for reference. They use the old
# `@test_approx_eq` macro and `graphemeiterator` inputs.
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("")) 0.0
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("")) 0.0

@test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0

# Non-ASCII (multi-byte) input.
@test compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", TokenMax(RatcliffObershelp())) ≈ 0.06428571428571427