2015-11-04 18:40:30 +01:00
2020-02-09 19:42:29 +01:00
using StringDistances , Unicode , Test
2015-11-04 18:40:30 +01:00
2019-12-11 20:45:58 +01:00
@testset " Modifiers " begin
2019-12-12 15:38:20 +01:00
# QGram
# Similarity scores returned by `compare` for q-gram based distances.
# Every asserted value is checked to within an absolute tolerance of 1e-4.
@test compare ( " " , " abc " , QGram ( 1 ) ) ≈ 0.0 atol = 1e-4
@test compare ( " abc " , " cba " , QGram ( 1 ) ) ≈ 1.0 atol = 1e-4
@test compare ( " abc " , " ccc " , QGram ( 1 ) ) ≈ 1 / 3 atol = 1e-4
# No @test here: the result is discarded, so this line only checks that the
# call does not throw (presumably a smoke test for non-ASCII input).
compare ( " aüa " , " aua " , TokenMax ( QGram ( 2 ) ) )
@test compare ( " " , " abc " , Jaccard ( 2 ) ) ≈ 0.0 atol = 1e-4
@test compare ( " martha " , " martha " , Jaccard ( 2 ) ) ≈ 1.0 atol = 1e-4
# NOTE(review): the next assertion is identical to the previous one —
# possibly a copy/paste duplicate; confirm whether another distance was intended.
@test compare ( " martha " , " martha " , Jaccard ( 2 ) ) ≈ 1.0 atol = 1e-4
@test compare ( " aa " , " aa " , Partial ( Jaccard ( 2 ) ) ) ≈ 1.0
@test compare ( " martha " , " martha " , Cosine ( 2 ) ) ≈ 1.0 atol = 1e-4
@test compare ( " martha " , " martha " , Overlap ( 2 ) ) ≈ 1.0 atol = 1e-4
@test compare ( " martha " , " martha " , SorensenDice ( 2 ) ) ≈ 1.0 atol = 1e-4
# Jaro
# Result discarded: only checks that the call does not throw.
compare ( " aüa " , " aua " , Jaro ( ) )
# Levenshtein
# Results discarded: only check that the calls do not throw.
compare ( " aüa " , " aua " , Levenshtein ( ) )
compare ( " aüa " , " aua " , DamerauLevenshtein ( ) )
# Winkler
2020-02-09 19:37:37 +01:00
# Jaro wrapped in the Winkler modifier, always with the same explicit keyword
# values (p = 0.1, threshold = 0.0, maxlength = 4). Expected scores are
# checked to within an absolute tolerance of 1e-4.
@test compare ( " martha " , " marhta " , Winkler ( Jaro ( ) , p = 0.1 , threshold = 0.0 , maxlength = 4 ) ) ≈ 0.9611 atol = 1e-4
@test compare ( " dwayne " , " duane " , Winkler ( Jaro ( ) , p = 0.1 , threshold = 0.0 , maxlength = 4 ) ) ≈ 0.84 atol = 1e-4
@test compare ( " dixon " , " dicksonx " , Winkler ( Jaro ( ) , p = 0.1 , threshold = 0.0 , maxlength = 4 ) ) ≈ 0.81333 atol = 1e-4
@test compare ( " william " , " williams " , Winkler ( Jaro ( ) , p = 0.1 , threshold = 0.0 , maxlength = 4 ) ) ≈ 0.975 atol = 1e-4
# Boundary cases: empty string, identical single character, nothing in common.
@test compare ( " " , " foo " , Winkler ( Jaro ( ) , p = 0.1 , threshold = 0.0 , maxlength = 4 ) ) ≈ 0.0 atol = 1e-4
@test compare ( " a " , " a " , Winkler ( Jaro ( ) , p = 0.1 , threshold = 0.0 , maxlength = 4 ) ) ≈ 1.0 atol = 1e-4
@test compare ( " abc " , " xyz " , Winkler ( Jaro ( ) , p = 0.1 , threshold = 0.0 , maxlength = 4 ) ) ≈ 0.0 atol = 1e-4
2019-12-12 15:38:20 +01:00
# RatcliffObershelp
# Comparing against an empty string is expected to score 0.0.
@test compare ( " New York Mets vs Atlanta Braves " , " " , RatcliffObershelp ( ) ) ≈ 0.0
# Score scaled to a percentage and rounded to an Int. The commented-out Python
# snippet at the end of this file calls fuzz.ratio on the same pair of strings —
# presumably the source of the expected value; confirm.
@test round ( Int , 100 * compare ( " 为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞 " , " 此前稍早些时候中国商务部发布消息称, 中美经贸高级别磋商双方牵头人通话, 中方就美拟9月1日加征关税进行了严正交涉。 " , RatcliffObershelp ( ) ) ) == 5
# Result discarded: only checks that the call does not throw.
compare ( " aüa " , " aua " , TokenMax ( RatcliffObershelp ( ) ) )
2020-02-13 15:44:27 +01:00
# Partial modifier around Jaro: comparing against an empty string is expected to score 0.0.
@test compare ( " New York Yankees " , " " , Partial ( Jaro ( ) ) ) ≈ 0.0
2019-12-12 15:38:20 +01:00
# Partial modifier around RatcliffObershelp. The expected values use the
# default tolerance of ≈ (no atol is given on these assertions).
@test compare ( " New York Yankees " , " Yankees " , Partial ( RatcliffObershelp ( ) ) ) ≈ 1.0
@test compare ( " New York Yankees " , " " , Partial ( RatcliffObershelp ( ) ) ) ≈ 0.0
@test compare ( " mariners vs angels " , " los angeles angels at seattle mariners " , Partial ( RatcliffObershelp ( ) ) ) ≈ 0.444444444444
@test compare ( " HSINCHUANG " , " SINJHUAN " , Partial ( RatcliffObershelp ( ) ) ) ≈ 0.875
@test compare ( " HSINCHUANG " , " LSINJHUANG DISTRIC " , Partial ( RatcliffObershelp ( ) ) ) ≈ 0.8
@test compare ( " HSINCHUANG " , " SINJHUANG DISTRICT " , Partial ( RatcliffObershelp ( ) ) ) ≈ 0.8
@test compare ( " HSINCHUANG " , " SINJHUANG " , Partial ( RatcliffObershelp ( ) ) ) ≈ 0.8888888888888
# TokenSort: the same words in a different order are expected to score 1.0.
@test compare ( " New York Mets vs Atlanta Braves " , " Atlanta Braves vs New York Mets " , TokenSort ( RatcliffObershelp ( ) ) ) ≈ 1.0
2020-02-09 19:42:29 +01:00
# Asserts that passing `graphemes(...)` iterators gives the same score as
# passing the plain strings directly.
@test compare ( graphemes ( " New York Mets vs Atlanta Braves " ) , graphemes ( " Atlanta Braves vs New York Mets " ) , Partial ( RatcliffObershelp ( ) ) ) ≈ compare ( " New York Mets vs Atlanta Braves " , " Atlanta Braves vs New York Mets " , Partial ( RatcliffObershelp ( ) ) )
2019-12-12 15:38:20 +01:00
# TokenSet / TokenSort / TokenMax modifiers around RatcliffObershelp,
# including the empty-string case for TokenSort and TokenSet (expected 0.0).
@test compare ( " mariners vs angels " , " los angeles angels of anaheim at seattle mariners " , TokenSet ( RatcliffObershelp ( ) ) ) ≈ 1.0 - 0.09090909090909094
@test compare ( " New York Mets vs Atlanta Braves " , " " , TokenSort ( RatcliffObershelp ( ) ) ) ≈ 0.0
@test compare ( " mariners vs angels " , " " , TokenSet ( RatcliffObershelp ( ) ) ) ≈ 0.0
# Modifiers can be nested: TokenSet applied on top of Partial.
@test compare ( " mariners vs angels " , " los angeles angels at seattle mariners " , TokenSet ( Partial ( RatcliffObershelp ( ) ) ) ) ≈ 1.0
@test compare ( " mariners " , " mariner " , TokenMax ( RatcliffObershelp ( ) ) ) ≈ 0.933333333333333
# Scores scaled to a percentage and rounded to an Int before comparison.
# The commented-out Python snippet at the end of this file calls fuzz.ratio and
# fuzz.partial_ratio on the Chinese pair — presumably these integers mirror
# fuzzywuzzy's output; confirm.
# NOTE(review): the first assertion repeats the RatcliffObershelp check made
# earlier in this testset on the same pair of strings.
@test round ( Int , 100 * compare ( " 为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞 " , " 此前稍早些时候中国商务部发布消息称, 中美经贸高级别磋商双方牵头人通话, 中方就美拟9月1日加征关税进行了严正交涉。 " , RatcliffObershelp ( ) ) ) == 5
@test round ( Int , 100 * compare ( " 为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞 " , " 此前稍早些时候中国商务部发布消息称, 中美经贸高级别磋商双方牵头人通话, 中方就美拟9月1日加征关税进行了严正交涉。 " , Partial ( RatcliffObershelp ( ) ) ) ) == 7
@test round ( Int , 100 * compare ( " mariners " , " mariner are playing tomorrow " , TokenMax ( RatcliffObershelp ( ) ) ) ) == 79
@test round ( Int , 100 * compare ( " mariners " , " mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow " , Partial ( RatcliffObershelp ( ) ) ) ) == 88
@test round ( Int , 100 * compare ( " mariners " , " mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow " , TokenSort ( RatcliffObershelp ( ) ) ) ) == 11
@test round ( Int , 100 * compare ( " mariners " , " are mariner playing tomorrow " , RatcliffObershelp ( ) ) ) == 39
@test round ( Int , 100 * compare ( " mariners " , " are mariner playing tomorrow " , Partial ( RatcliffObershelp ( ) ) ) ) == 88
@test round ( Int , 100 * compare ( " mariners " , " mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow " , TokenSet ( RatcliffObershelp ( ) ) ) ) == 39
@test round ( Int , 100 * compare ( " mariners " , " mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow " , TokenSet ( Partial ( RatcliffObershelp ( ) ) ) ) ) == 88
# not exactly the same because tokenmax uses the max of rounded tokenset etc
# NOTE(review): "the same" presumably refers to the Python fuzzywuzzy result
# (see the snippet at the end of this file) — confirm.
@test round ( Int , 100 * compare ( " mariners " , " mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow " , TokenMax ( RatcliffObershelp ( ) ) ) ) == 52
# check missing
# A `missing` argument is expected to yield `missing`. The check uses ===
# because == involving `missing` itself evaluates to `missing`, not a Bool.
@test compare ( " ok " , missing , Levenshtein ( ) ) === missing
# check min
# Fixture: pairs of strings, each stored as a 2-tuple so it can be splatted
# into `compare` by the `min_score` loop that follows.
strings = [
( " martha " , " marhta " ) ,
( " dwayne " , " duane " ) ,
( " dixon " , " dicksonx " ) ,
( " william " , " williams " ) ,
( " " , " foo " ) ,
( " a " , " a " ) ,
( " abc " , " xyz " ) ,
( " abc " , " ccc " ) ,
( " kitten " , " sitting " ) ,
( " saturday " , " sunday " ) ,
( " hi, my name is " , " my name is " ) ,
( " alborgów " , " amoniak " ) ,
( " cape sand recycling " , " edith ann graham " ) ,
( " jellyifhs " , " jellyfish " ) ,
( " ifhs " , " fish " ) ,
( " leia " , " leela " ) ,
]
# For each edit distance and each fixture pair: when the score computed without
# a threshold is below 1/3, passing `min_score = 1 / 3` must give 0.0;
# otherwise the thresholded score must match the unthresholded one.
for dist in (Levenshtein, DamerauLevenshtein), pair in strings
    plain = compare(pair..., dist())
    thresholded = compare(pair..., dist(); min_score = 1 / 3)
    expected = plain < 1 / 3 ? 0.0 : plain
    @test thresholded ≈ expected
end
2019-08-14 16:45:16 +02:00
2019-12-12 15:38:20 +01:00
# check findmax and findall
2019-12-12 20:48:52 +01:00
# `findmax` is expected to return a (best match, index) tuple; here the best
# candidate sits at position 1 of the list.
@test findmax ( " New York " , [ " NewYork " , " Newark " , " San Francisco " ] , Levenshtein ( ) ) == ( " NewYork " , 1 )
2019-12-13 16:33:06 +01:00
# Same candidates in a different order: the returned index is expected to
# follow the position of the best match (2, then 3).
@test findmax ( " New York " , [ " San Francisco " , " NewYork " , " Newark " ] , Levenshtein ( ) ) == ( " NewYork " , 2 )
@test findmax ( " New York " , [ " Newark " , " San Francisco " , " NewYork " ] , Levenshtein ( ) ) == ( " NewYork " , 3 )
2019-12-12 20:48:52 +01:00
# With `min_score = 0.99` no candidate qualifies, and `findmax` is expected to
# return (nothing, nothing).
@test findmax ( " New York " , [ " NewYork " , " Newark " , " San Francisco " ] , Levenshtein ( ) ; min_score = 0.99 ) == ( nothing , nothing )
@test findmax ( " New York " , [ " NewYork " , " Newark " , " San Francisco " ] , Jaro ( ) ) == ( " NewYork " , 1 )
# `findall` is expected to return the indices of the matching candidates,
# and an empty Int vector when `min_score = 0.99` excludes all of them.
@test findall ( " New York " , [ " NewYork " , " Newark " , " San Francisco " ] , Levenshtein ( ) ) == [ 1 ]
@test findall ( " New York " , [ " NewYork " , " Newark " , " San Francisco " ] , Jaro ( ) ) == [ 1 , 2 ]
@test findall ( " New York " , [ " NewYork " , " Newark " , " San Francisco " ] , Jaro ( ) ; min_score = 0.99 ) == Int [ ]
2019-12-12 22:49:20 +01:00
# The `skipmissing` checks only run on Julia 1.2.0 or newer.
# NOTE(review): the reason for the version guard is not visible here — confirm.
if VERSION >= v " 1.2.0 "
# Missing entries are skipped; an all-missing input is expected to give
# (nothing, nothing) from `findmax` and an empty result from `findall`.
@test findmax ( " New York " , skipmissing ( [ " NewYork " , " Newark " , missing ] ) , Levenshtein ( ) ) == ( " NewYork " , 1 )
@test findmax ( " New York " , skipmissing ( Union { AbstractString , Missing } [ missing , missing ] ) , Levenshtein ( ) ) == ( nothing , nothing )
@test findall ( " New York " , skipmissing ( [ " NewYork " , " Newark " , missing ] ) , Levenshtein ( ) ) == [ 1 ]
@test findall ( " New York " , skipmissing ( Union { AbstractString , Missing } [ missing , missing ] ) , Levenshtein ( ) ) == [ ]
end
2019-08-17 20:38:49 +02:00
2019-12-12 15:38:20 +01:00
end
2019-08-17 20:38:49 +02:00
2019-08-17 21:46:22 +02:00
#= Python code
from fuzzywuzzy import fuzz
fuzz . ratio ( " 为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞 " , " 此前稍早些时候中国商务部发布消息称, 中美经贸高级别磋商双方牵头人通话, 中方就美拟9月1日加征关税进行了严正交涉。 " )
fuzz . partial_ratio ( " 为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞 " , " 此前稍早些时候中国商务部发布消息称, 中美经贸高级别磋商双方牵头人通话, 中方就美拟9月1日加征关税进行了严正交涉。 " )
2019-12-12 15:38:20 +01:00
=#