add tests
parent
fc3fc17992
commit
06d8ef1831
|
@ -5,6 +5,7 @@ version = "0.4.0"
|
|||
[deps]
|
||||
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
||||
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
|
||||
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
|
||||
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
|
||||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||
|
||||
|
|
|
@ -52,7 +52,7 @@ The package includes distance "modifiers", that can be applied to any distance.
|
|||
#> 0.9538461538461539
|
||||
```
|
||||
|
||||
- Modifiers from the Python library [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/). One difference with this Python library is that modifiers are defined for any distance, not just the levenshtein one.
|
||||
- Modifiers from the Python library [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/), that can be applied to any distance.
|
||||
|
||||
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the maximal similarity score between the shorter string and substrings of the longer string.
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ module StringDistances
|
|||
## Export
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
using DataStructures
|
||||
import Base: eltype, length, iterate, ==, hash, isless, convert, show, @deprecate
|
||||
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
|
||||
import IterTools: chain
|
||||
|
|
|
@ -129,35 +129,24 @@ struct TokenSet{T <: PreMetric} <: PreMetric
|
|||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet)
|
||||
v0, v1, v2 = _separate!(split(s1), split(s2))
|
||||
v0, v1, v2 = separate!(SortedSet(split(s1)), SortedSet(split(s2)))
|
||||
s0 = join(v0, " ")
|
||||
s1 = join(Iterators.flatten((v0, v1)), " ")
|
||||
s2 = join(Iterators.flatten((v0, v2)), " ")
|
||||
# otherwise compare("", "a", dist)== 1.0
|
||||
isempty(s0) && return compare(s1, s2, dist.dist)
|
||||
s1 = join(union(v0, v1), " ")
|
||||
s2 = join(union(v0, v2), " ")
|
||||
max(compare(s0, s1, dist.dist),
|
||||
compare(s1, s2, dist.dist),
|
||||
compare(s0, s2, dist.dist))
|
||||
compare(s0, s2, dist.dist),
|
||||
compare(s1, s2, dist.dist))
|
||||
|
||||
end
|
||||
|
||||
# separate 2 vectors in intersection, setdiff1, setdiff2 (all sorted)
|
||||
function _separate!(v1::AbstractVector, v2::AbstractVector)
|
||||
sort!(v1)
|
||||
sort!(v2)
|
||||
out = eltype(v1)[]
|
||||
start = 1
|
||||
i1 = 0
|
||||
while i1 < length(v1)
|
||||
i1 += 1
|
||||
x = v1[i1]
|
||||
i2 = searchsortedfirst(v2, x, start, length(v2), Base.Forward)
|
||||
i2 > length(v2) && break
|
||||
if i2 > 0 && v2[i2] == x
|
||||
deleteat!(v1, i1)
|
||||
deleteat!(v2, i2)
|
||||
# separate 2 sets in intersection, setdiff1, setdiff2 (all sorted)
|
||||
function separate!(v1::SortedSet, v2::SortedSet)
|
||||
out = OrderedSet{eltype(v1)}()
|
||||
for x in v1
|
||||
if x in v2
|
||||
pop!(v1, x)
|
||||
pop!(v2, x)
|
||||
push!(out, x)
|
||||
i1 -= 1
|
||||
start = i2
|
||||
end
|
||||
end
|
||||
return out, v1, v2
|
||||
|
|
|
@ -69,13 +69,17 @@ s = "HSINCHUANG"
|
|||
|
||||
|
||||
@test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0
|
||||
@test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0
|
||||
|
||||
# ADD AGAIN
|
||||
#@test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0
|
||||
|
||||
|
||||
|
||||
|
||||
@test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333
|
||||
|
||||
|
||||
|
||||
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0
|
||||
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094
|
||||
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("")) 0.0
|
||||
|
@ -92,5 +96,26 @@ s = "HSINCHUANG"
|
|||
# test with fuzz ratio
|
||||
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
|
||||
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", Partial(RatcliffObershelp()))) == 7
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 79
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", Partial(RatcliffObershelp()))) == 88
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSort(RatcliffObershelp()))) == 11
|
||||
@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", RatcliffObershelp())) == 39
|
||||
@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", Partial(RatcliffObershelp()))) == 88
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(RatcliffObershelp()))) == 39
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(Partial(RatcliffObershelp())))) == 88
|
||||
# not exactly the same because tokenmax has uses the max of rounded tokenset etc
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 52
|
||||
|
||||
|
||||
#= Python code
|
||||
from fuzzywuzzy import fuzz
|
||||
fuzz.ratio("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。")
|
||||
fuzz.partial_ratio("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。")
|
||||
fuzz.WRatio("mariners", "mariner are playing tomorrow")
|
||||
fuzz.partial_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
fuzz.token_sort_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
fuzz.token_set_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
fuzz.partial_token_set_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
fuzz.WRatio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
fuzz.WRatio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
fuzz.WRatio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
=#
|
||||
|
|
Loading…
Reference in New Issue