2015-11-04 18:40:30 +01:00
2018-07-04 18:07:26 +02:00
using StringDistances , Test
2015-11-04 18:40:30 +01:00
2018-05-16 00:39:50 +02:00
# Compare
2019-08-17 18:26:24 +02:00
@test compare ( " " , " abc " , Hamming ( ) ) ≈ 0.0 atol = 1e-4
@test compare ( " acc " , " abc " , Hamming ( ) ) ≈ 2 / 3 atol = 1e-4
@test compare ( " saturday " , " sunday " , Hamming ( ) ) ≈ 1 / 8 atol = 1e-4
2018-05-16 00:39:50 +02:00
2019-08-17 18:26:24 +02:00
@test compare ( " " , " abc " , QGram ( 1 ) ) ≈ 0.0 atol = 1e-4
@test compare ( " abc " , " cba " , QGram ( 1 ) ) ≈ 1.0 atol = 1e-4
@test compare ( " abc " , " ccc " , QGram ( 1 ) ) ≈ 1 / 3 atol = 1e-4
2018-05-16 00:39:50 +02:00
2019-08-17 18:26:24 +02:00
@test compare ( " " , " abc " , Jaccard ( 2 ) ) ≈ 0.0 atol = 1e-4
2018-05-16 00:39:50 +02:00
2019-08-17 18:26:24 +02:00
@test compare ( " martha " , " martha " , Jaccard ( 2 ) ) ≈ 1.0 atol = 1e-4
@test compare ( " martha " , " martha " , Cosine ( 2 ) ) ≈ 1.0 atol = 1e-4
@test compare ( " martha " , " martha " , Jaccard ( 2 ) ) ≈ 1.0 atol = 1e-4
@test compare ( " martha " , " martha " , Overlap ( 2 ) ) ≈ 1.0 atol = 1e-4
@test compare ( " martha " , " martha " , SorensenDice ( 2 ) ) ≈ 1.0 atol = 1e-4
2018-05-16 00:47:55 +02:00
2018-05-16 00:39:50 +02:00
# Winkler
2019-08-17 18:26:24 +02:00
@test compare ( " martha " , " marhta " , Winkler ( Jaro ( ) , 0.1 , 0.0 ) ) ≈ 0.9611 atol = 1e-4
@test compare ( " dwayne " , " duane " , Winkler ( Jaro ( ) , 0.1 , 0.0 ) ) ≈ 0.84 atol = 1e-4
@test compare ( " dixon " , " dicksonx " , Winkler ( Jaro ( ) , 0.1 , 0.0 ) ) ≈ 0.81333 atol = 1e-4
@test compare ( " william " , " williams " , Winkler ( Jaro ( ) , 0.1 , 0.0 ) ) ≈ 0.975 atol = 1e-4
@test compare ( " " , " foo " , Winkler ( Jaro ( ) , 0.1 , 0.0 ) ) ≈ 0.0 atol = 1e-4
@test compare ( " a " , " a " , Winkler ( Jaro ( ) , 0.1 , 0.0 ) ) ≈ 1.0 atol = 1e-4
@test compare ( " abc " , " xyz " , Winkler ( Jaro ( ) , 0.1 , 0.0 ) ) ≈ 0.0 atol = 1e-4
2015-11-04 18:40:30 +01:00
strings = [
( " martha " , " marhta " ) ,
( " dwayne " , " duane " ) ,
( " dixon " , " dicksonx " ) ,
( " william " , " williams " ) ,
2017-08-05 20:45:19 +02:00
( " " , " foo " )
2015-11-04 18:40:30 +01:00
]
2017-08-05 20:45:19 +02:00
solutions = [ 0.03888889 0.16000000 0.18666667 0.02500000 1.00000000 ]
2015-11-04 18:40:30 +01:00
for i in 1 : length ( solutions )
2019-08-17 18:26:24 +02:00
@test compare ( strings [ i ] ... , Winkler ( Jaro ( ) , 0.1 , 0.0 ) ) ≈ ( 1 - solutions [ i ] ) atol = 1e-4
2015-11-04 18:40:30 +01:00
end
2018-05-16 00:39:50 +02:00
# Partial
2019-08-17 18:26:24 +02:00
@test compare ( " aa " , " aa " , Partial ( Jaccard ( 2 ) ) ) ≈ 1.0
2015-11-04 18:40:30 +01:00
2019-08-17 18:26:24 +02:00
@test compare ( " New York Yankees " , " Yankees " , Partial ( RatcliffObershelp ( ) ) ) ≈ 1.0
@test compare ( " New York Yankees " , " " , Partial ( RatcliffObershelp ( ) ) ) ≈ 0.0
@test compare ( " mariners vs angels " , " los angeles angels at seattle mariners " , Partial ( RatcliffObershelp ( ) ) ) ≈ 0.444444444444
2015-11-06 16:47:15 +01:00
2015-11-04 18:40:30 +01:00
2015-11-05 16:51:32 +01:00
s = " HSINCHUANG "
2019-08-17 18:26:24 +02:00
@test compare ( s , " SINJHUAN " , Partial ( RatcliffObershelp ( ) ) ) ≈ 0.875
@test compare ( s , " LSINJHUANG DISTRIC " , Partial ( RatcliffObershelp ( ) ) ) ≈ 0.8
@test compare ( s , " SINJHUANG DISTRICT " , Partial ( RatcliffObershelp ( ) ) ) ≈ 0.8
@test compare ( s , " SINJHUANG " , Partial ( RatcliffObershelp ( ) ) ) ≈ 0.8888888888888
2015-11-04 18:40:30 +01:00
2019-08-17 18:26:24 +02:00
@test compare ( " New York Yankees " , " Yankees " , Partial ( Hamming ( ) ) ) ≈ 1
@test compare ( " New York Yankees " , " " , Partial ( Hamming ( ) ) ) ≈ 1
2015-11-04 18:40:30 +01:00
2018-05-16 00:39:50 +02:00
# Token
2019-08-17 18:26:24 +02:00
@test compare ( " New York Mets vs Atlanta Braves " , " Atlanta Braves vs New York Mets " , TokenSort ( RatcliffObershelp ( ) ) ) ≈ 1.0
@test compare ( " mariners vs angels " , " los angeles angels of anaheim at seattle mariners " , TokenSet ( RatcliffObershelp ( ) ) ) ≈ 1.0 - 0.09090909090909094
2015-11-04 18:40:30 +01:00
2019-08-17 18:26:24 +02:00
@test compare ( " New York Mets vs Atlanta Braves " , " " , TokenSort ( RatcliffObershelp ( ) ) ) ≈ 0.0
@test compare ( " mariners vs angels " , " " , TokenSet ( RatcliffObershelp ( ) ) ) ≈ 0.0
2015-11-06 16:47:15 +01:00
2015-11-10 15:47:12 +01:00
2019-08-17 18:26:24 +02:00
@test compare ( " mariners " , " mariner " , TokenMax ( RatcliffObershelp ( ) ) ) ≈ 0.933333333333333
2015-11-10 15:47:12 +01:00
2016-08-31 22:05:38 +02:00
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("")) 0.0
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("")) 0.0
2015-11-10 15:47:12 +01:00
2019-08-17 18:26:24 +02:00
@test compare ( " mariners vs angels " , " los angeles angels at seattle mariners " , TokenSet ( Partial ( RatcliffObershelp ( ) ) ) ) ≈ 1.0
2019-08-14 00:18:04 +02:00
2019-08-14 16:45:16 +02:00
2019-08-17 18:26:24 +02:00
@test compare ( " 为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞 " , " 此前稍早些时候中国商务部发布消息称, 中美经贸高级别磋商双方牵头人通话, 中方就美拟9月1日加征关税进行了严正交涉。 " , TokenMax ( RatcliffObershelp ( ) ) ) ≈ 0.06428571428571427
2019-08-14 16:45:16 +02:00