change compare

pull/17/head
matthieugomez 2019-08-17 12:26:24 -04:00
parent 9d33299d7b
commit 402d24997f
4 changed files with 74 additions and 77 deletions

View File

@ -6,7 +6,7 @@ module StringDistances
##
##############################################################################
import Base: eltype, length, iterate, ==, hash, isless, convert, show
import Base: eltype, length, iterate, ==, hash, isless, convert, show, @deprecate
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
import IterTools: chain
export

View File

@ -6,21 +6,17 @@
##############################################################################
function compare(s1::AbstractString, s2::AbstractString, dist::PreMetric)
compare(dist, s1, s2)
end
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
1.0 - evaluate(dist, s1, s2)
end
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
s1::AbstractString, s2::AbstractString)
function compare(s1::AbstractString, s2::AbstractString,
dist::Union{Hamming, Levenshtein, DamerauLevenshtein})
len = max(length(s1), length(s2))
len == 0 ? 1.0 : 1.0 - evaluate(dist, s1, s2) / len
end
function compare(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractString)
function compare(s1::AbstractString, s2::AbstractString,
dist::AbstractQGramDistance)
# When string length < q for qgram distance, returns s1 == s2
len1 = length(s1) ; len2 = length(s2)
min(len1, len2) <= (dist.N - 1) && return convert(Float64, s1 == s2)
@ -31,6 +27,8 @@ function compare(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractSt
end
end
@deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist)
##############################################################################
##
## Winkler
@ -46,8 +44,8 @@ end
# restrict to distance between 0 and 1
Winkler(x) = Winkler(x, 0.1, 0.7)
function compare(dist::Winkler, s1::AbstractString, s2::AbstractString)
score = compare(dist.dist, s1, s2)
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
score = compare(s1, s2, dist.dist)
l = common_prefix(s1, s2, 4)[1]
# common prefix adjustment
if score >= dist.boosting_limit
@ -67,16 +65,16 @@ struct Partial{T <: PreMetric} <: PreMetric
end
# general
function compare(dist::Partial, s1::AbstractString, s2::AbstractString)
function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
s2, len2, s1, len1 = reorder(s1, s2)
len1 == len2 && return compare(dist.dist, s1, s2)
len1 == 0 && return compare(dist.dist, "", "")
len1 == len2 && return compare(s1, s2, dist.dist)
len1 == 0 && return compare("", "", dist.dist)
iter = QGramIterator(s2, len2, len1)
out = 0.0
x = iterate(iter)
while x !== nothing
s, state = x
curr = compare(dist.dist, s1, s)
curr = compare(s1, s, dist.dist)
out = max(out, curr)
x = iterate(iter, state)
end
@ -85,9 +83,9 @@ end
# Specialization for RatcliffObershelp distance
# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::AbstractString)
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
s2, len2, s1, len1 = reorder(s1, s2)
len1 == len2 && return compare(dist.dist, s1, s2)
len1 == len2 && return compare(s1, s2, dist.dist)
out = 0.0
result = matching_blocks(s1, s2)
for r in result
@ -103,7 +101,7 @@ function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::Abstr
end
i2_start = nextind(s2, 0, s2_start)
i2_end = nextind(s2, 0, s2_end)
curr = compare(RatcliffObershelp(), s1, SubString(s2, i2_start, i2_end))
curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp())
out = max(out, curr)
end
return out
@ -119,10 +117,10 @@ struct TokenSort{T <: PreMetric} <: PreMetric
dist::T
end
function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString)
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort)
s1 = join(sort!(split(s1)), " ")
s2 = join(sort!(split(s2)), " ")
compare(dist.dist, s1, s2)
compare(s1, s2, dist.dist)
end
##############################################################################
@ -135,18 +133,18 @@ struct TokenSet{T <: PreMetric} <: PreMetric
dist::T
end
function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString)
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet)
v0, v1, v2 = _separate!(split(s1), split(s2))
s0 = join(v0, " ")
s1 = join(Iterators.flatten((v0, v1)), " ")
s2 = join(Iterators.flatten((v0, v2)), " ")
if isempty(s0)
# otherwise compare(dist, "", "a")== 1.0
compare(dist.dist, s1, s2)
# otherwise compare("", "a", dist)== 1.0
compare(s1, s2, dist.dist)
else
max(compare(dist.dist, s0, s1),
compare(dist.dist, s1, s2),
compare(dist.dist, s0, s2))
max(compare(s0, s1, dist.dist),
compare(s1, s2, dist.dist),
compare(s0, s2, dist.dist))
end
end
@ -182,24 +180,24 @@ struct TokenMax{T <: PreMetric} <: PreMetric
dist::T
end
function compare(dist::TokenMax, s1::AbstractString, s2::AbstractString)
dist0 = compare(dist.dist, s1, s2)
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
dist0 = compare(s1, s2, dist.dist)
s2, len2, s1, len1 = reorder(s1, s2)
unbase_scale = 0.95
# if one string is much much shorter than the other
if len2 >= 1.5 * len1
# if strings are of dissimilar length, use partials
partial = compare(Partial(dist.dist), s1, s2)
ptsor = compare(TokenSort(Partial(dist.dist)), s1, s2)
ptser = compare(TokenSet(Partial(dist.dist)), s1, s2)
partial = compare(s1, s2, Partial(dist.dist))
ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)))
ptser = compare(s1, s2, TokenSet(Partial(dist.dist)))
partial_scale = len2 > (8 * len1) ? 0.6 : 0.9
return max(dist0,
partial * partial_scale,
ptsor * unbase_scale * partial_scale,
ptser * unbase_scale * partial_scale)
else
ptsor = compare(TokenSort(dist.dist), s1, s2)
ptser = compare(TokenSet(dist.dist), s1, s2)
ptsor = compare(s1, s2, TokenSort(dist.dist))
ptser = compare(s1, s2, TokenSet(dist.dist))
return max(dist0,
ptsor * unbase_scale,
ptser * unbase_scale)

View File

@ -2,31 +2,31 @@
using StringDistances, Test
# Compare
@test compare(Hamming(), "", "abc") ≈ 0.0 atol = 1e-4
@test compare(Hamming(), "acc", "abc") ≈ 2/3 atol = 1e-4
@test compare(Hamming(), "saturday", "sunday") ≈ 1/8 atol = 1e-4
@test compare("", "abc", Hamming()) ≈ 0.0 atol = 1e-4
@test compare("acc", "abc", Hamming()) ≈ 2/3 atol = 1e-4
@test compare("saturday", "sunday", Hamming()) ≈ 1/8 atol = 1e-4
@test compare(QGram(1), "", "abc") ≈ 0.0 atol = 1e-4
@test compare(QGram(1), "abc", "cba") ≈ 1.0 atol = 1e-4
@test compare(QGram(1), "abc", "ccc") ≈ 1/3 atol = 1e-4
@test compare("", "abc", QGram(1)) ≈ 0.0 atol = 1e-4
@test compare("abc", "cba", QGram(1)) ≈ 1.0 atol = 1e-4
@test compare("abc", "ccc", QGram(1)) ≈ 1/3 atol = 1e-4
@test compare(Jaccard(2), "", "abc") ≈ 0.0 atol = 1e-4
@test compare("", "abc", Jaccard(2)) ≈ 0.0 atol = 1e-4
@test compare(Jaccard(2), "martha", "martha") ≈ 1.0 atol = 1e-4
@test compare(Cosine(2), "martha", "martha") ≈ 1.0 atol = 1e-4
@test compare(Jaccard(2), "martha", "martha") ≈ 1.0 atol = 1e-4
@test compare(Overlap(2), "martha", "martha") ≈ 1.0 atol = 1e-4
@test compare(SorensenDice(2), "martha", "martha") ≈ 1.0 atol = 1e-4
@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4
@test compare("martha", "martha", Cosine(2)) ≈ 1.0 atol = 1e-4
@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4
@test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol = 1e-4
@test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol = 1e-4
# Winkler
@test compare(Winkler(Jaro(), 0.1, 0.0), "martha", "marhta") ≈ 0.9611 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "dwayne", "duane") ≈ 0.84 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "dixon", "dicksonx") ≈ 0.81333 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "william", "williams") ≈ 0.975 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "", "foo") ≈ 0.0 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "a", "a") ≈ 1.0 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "abc", "xyz") ≈ 0.0 atol = 1e-4
@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.9611 atol = 1e-4
@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.84 atol = 1e-4
@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.81333 atol = 1e-4
@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.975 atol = 1e-4
@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol = 1e-4
@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0)) ≈ 1.0 atol = 1e-4
@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol = 1e-4
strings = [
("martha", "marhta"),
@ -37,7 +37,7 @@ strings = [
]
solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000]
for i in 1:length(solutions)
@test compare(Winkler(Jaro(), 0.1, 0.0), strings[i]...) ≈ (1 - solutions[i]) atol = 1e-4
@test compare(strings[i]..., Winkler(Jaro(), 0.1, 0.0)) ≈ (1 - solutions[i]) atol = 1e-4
end
@ -45,37 +45,36 @@ end
# Partial
@test compare(Partial(Jaccard(2)), "aa", "aa ") ≈ 1.0
@test compare("aa", "aa ", Partial(Jaccard(2))) ≈ 1.0
@test compare(Partial(RatcliffObershelp()), "New York Yankees", "Yankees") ≈ 1.0
@test compare(Partial(RatcliffObershelp()), "New York Yankees", "") ≈ 0.0
@test compare(Partial(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 0.444444444444
@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
@test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0
@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444
s = "HSINCHUANG"
@test compare(Partial(RatcliffObershelp()), s, "SINJHUAN") ≈ 0.875
@test compare(Partial(RatcliffObershelp()), s, "LSINJHUANG DISTRIC") ≈ 0.8
@test compare(Partial(RatcliffObershelp()), s, "SINJHUANG DISTRICT") ≈ 0.8
@test compare(Partial(RatcliffObershelp()), s, "SINJHUANG") ≈ 0.8888888888888
@test compare(s, "SINJHUAN", Partial(RatcliffObershelp())) ≈ 0.875
@test compare(s, "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8
@test compare(s, "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8
@test compare(s, "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888
@test compare(Partial(Hamming()), "New York Yankees", "Yankees") ≈ 1
@test compare(Partial(Hamming()), "New York Yankees", "") ≈ 1
@test compare("New York Yankees", "Yankees", Partial(Hamming())) ≈ 1
@test compare("New York Yankees", "", Partial(Hamming())) ≈ 1
# Token
@test compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets") ≈ 1.0
@test compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "los angeles angels of anaheim at seattle mariners") ≈ 1.0 - 0.09090909090909094
@test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0
@test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094
@test compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "") ≈ 0.0
@test compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "") ≈ 0.0
@test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0
@test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0
@test compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "") ≈ 0.0
@test compare(TokenMax(RatcliffObershelp()),"mariners", "mariner") ≈ 0.933333333333333
@test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094
@ -84,9 +83,9 @@ s = "HSINCHUANG"
@test compare(TokenSet(Partial(RatcliffObershelp())),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 1.0
@test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0
@test compare(TokenMax(RatcliffObershelp()), "为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。") ≈ 0.06428571428571427
@test compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", TokenMax(RatcliffObershelp())) ≈ 0.06428571428571427

View File

@ -1,12 +1,12 @@
using StringDistances, Test
# check with weird utf8 strings
compare(TokenMax(RatcliffObershelp()), "aüa", "aua")
compare(TokenMax(QGram(2)), "aüa", "aua")
compare(DamerauLevenshtein(), "aüa", "aua")
compare(Hamming(), "aüa", "aua")
compare(Jaro(), "aüa", "aua")
compare(Levenshtein(), "aüa", "aua")
compare("aüa", "aua", TokenMax(RatcliffObershelp()))
compare("aüa", "aua", TokenMax(QGram(2)))
compare("aüa", "aua", DamerauLevenshtein())
compare("aüa", "aua", Hamming())
compare("aüa", "aua", Jaro())
compare("aüa", "aua", Levenshtein())
s1 = "aü☃"