change compare

pull/17/head
matthieugomez 2019-08-17 12:26:24 -04:00
parent 9d33299d7b
commit 402d24997f
4 changed files with 74 additions and 77 deletions

View File

@ -6,7 +6,7 @@ module StringDistances
## ##
############################################################################## ##############################################################################
import Base: eltype, length, iterate, ==, hash, isless, convert, show import Base: eltype, length, iterate, ==, hash, isless, convert, show, @deprecate
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
import IterTools: chain import IterTools: chain
export export

View File

@ -6,21 +6,17 @@
############################################################################## ##############################################################################
function compare(s1::AbstractString, s2::AbstractString, dist::PreMetric) function compare(s1::AbstractString, s2::AbstractString, dist::PreMetric)
compare(dist, s1, s2)
end
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
1.0 - evaluate(dist, s1, s2) 1.0 - evaluate(dist, s1, s2)
end end
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein}, function compare(s1::AbstractString, s2::AbstractString,
s1::AbstractString, s2::AbstractString) dist::Union{Hamming, Levenshtein, DamerauLevenshtein})
len = max(length(s1), length(s2)) len = max(length(s1), length(s2))
len == 0 ? 1.0 : 1.0 - evaluate(dist, s1, s2) / len len == 0 ? 1.0 : 1.0 - evaluate(dist, s1, s2) / len
end end
function compare(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractString) function compare(s1::AbstractString, s2::AbstractString,
dist::AbstractQGramDistance)
# When string length < q for qgram distance, returns s1 == s2 # When string length < q for qgram distance, returns s1 == s2
len1 = length(s1) ; len2 = length(s2) len1 = length(s1) ; len2 = length(s2)
min(len1, len2) <= (dist.N - 1) && return convert(Float64, s1 == s2) min(len1, len2) <= (dist.N - 1) && return convert(Float64, s1 == s2)
@ -31,6 +27,8 @@ function compare(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractSt
end end
end end
@deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist)
############################################################################## ##############################################################################
## ##
## Winkler ## Winkler
@ -46,8 +44,8 @@ end
# restrict to distance between 0 and 1 # restrict to distance between 0 and 1
Winkler(x) = Winkler(x, 0.1, 0.7) Winkler(x) = Winkler(x, 0.1, 0.7)
function compare(dist::Winkler, s1::AbstractString, s2::AbstractString) function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
score = compare(dist.dist, s1, s2) score = compare(s1, s2, dist.dist)
l = common_prefix(s1, s2, 4)[1] l = common_prefix(s1, s2, 4)[1]
# common prefix adjustment # common prefix adjustment
if score >= dist.boosting_limit if score >= dist.boosting_limit
@ -67,16 +65,16 @@ struct Partial{T <: PreMetric} <: PreMetric
end end
# general # general
function compare(dist::Partial, s1::AbstractString, s2::AbstractString) function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
s2, len2, s1, len1 = reorder(s1, s2) s2, len2, s1, len1 = reorder(s1, s2)
len1 == len2 && return compare(dist.dist, s1, s2) len1 == len2 && return compare(s1, s2, dist.dist)
len1 == 0 && return compare(dist.dist, "", "") len1 == 0 && return compare("", "", dist.dist)
iter = QGramIterator(s2, len2, len1) iter = QGramIterator(s2, len2, len1)
out = 0.0 out = 0.0
x = iterate(iter) x = iterate(iter)
while x !== nothing while x !== nothing
s, state = x s, state = x
curr = compare(dist.dist, s1, s) curr = compare(s1, s, dist.dist)
out = max(out, curr) out = max(out, curr)
x = iterate(iter, state) x = iterate(iter, state)
end end
@ -85,9 +83,9 @@ end
# Specialization for RatcliffObershelp distance # Specialization for RatcliffObershelp distance
# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py # Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::AbstractString) function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
s2, len2, s1, len1 = reorder(s1, s2) s2, len2, s1, len1 = reorder(s1, s2)
len1 == len2 && return compare(dist.dist, s1, s2) len1 == len2 && return compare(s1, s2, dist.dist)
out = 0.0 out = 0.0
result = matching_blocks(s1, s2) result = matching_blocks(s1, s2)
for r in result for r in result
@ -103,7 +101,7 @@ function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::Abstr
end end
i2_start = nextind(s2, 0, s2_start) i2_start = nextind(s2, 0, s2_start)
i2_end = nextind(s2, 0, s2_end) i2_end = nextind(s2, 0, s2_end)
curr = compare(RatcliffObershelp(), s1, SubString(s2, i2_start, i2_end)) curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp())
out = max(out, curr) out = max(out, curr)
end end
return out return out
@ -119,10 +117,10 @@ struct TokenSort{T <: PreMetric} <: PreMetric
dist::T dist::T
end end
function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString) function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort)
s1 = join(sort!(split(s1)), " ") s1 = join(sort!(split(s1)), " ")
s2 = join(sort!(split(s2)), " ") s2 = join(sort!(split(s2)), " ")
compare(dist.dist, s1, s2) compare(s1, s2, dist.dist)
end end
############################################################################## ##############################################################################
@ -135,18 +133,18 @@ struct TokenSet{T <: PreMetric} <: PreMetric
dist::T dist::T
end end
function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString) function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet)
v0, v1, v2 = _separate!(split(s1), split(s2)) v0, v1, v2 = _separate!(split(s1), split(s2))
s0 = join(v0, " ") s0 = join(v0, " ")
s1 = join(Iterators.flatten((v0, v1)), " ") s1 = join(Iterators.flatten((v0, v1)), " ")
s2 = join(Iterators.flatten((v0, v2)), " ") s2 = join(Iterators.flatten((v0, v2)), " ")
if isempty(s0) if isempty(s0)
# otherwise compare(dist, "", "a")== 1.0 # otherwise compare("", "a", dist)== 1.0
compare(dist.dist, s1, s2) compare(s1, s2, dist.dist)
else else
max(compare(dist.dist, s0, s1), max(compare(s0, s1, dist.dist),
compare(dist.dist, s1, s2), compare(s1, s2, dist.dist),
compare(dist.dist, s0, s2)) compare(s0, s2, dist.dist))
end end
end end
@ -182,24 +180,24 @@ struct TokenMax{T <: PreMetric} <: PreMetric
dist::T dist::T
end end
function compare(dist::TokenMax, s1::AbstractString, s2::AbstractString) function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
dist0 = compare(dist.dist, s1, s2) dist0 = compare(s1, s2, dist.dist)
s2, len2, s1, len1 = reorder(s1, s2) s2, len2, s1, len1 = reorder(s1, s2)
unbase_scale = 0.95 unbase_scale = 0.95
# if one string is much much shorter than the other # if one string is much much shorter than the other
if len2 >= 1.5 * len1 if len2 >= 1.5 * len1
# if strings are of dissimilar length, use partials # if strings are of dissimilar length, use partials
partial = compare(Partial(dist.dist), s1, s2) partial = compare(s1, s2, Partial(dist.dist))
ptsor = compare(TokenSort(Partial(dist.dist)), s1, s2) ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)))
ptser = compare(TokenSet(Partial(dist.dist)), s1, s2) ptser = compare(s1, s2, TokenSet(Partial(dist.dist)))
partial_scale = len2 > (8 * len1) ? 0.6 : 0.9 partial_scale = len2 > (8 * len1) ? 0.6 : 0.9
return max(dist0, return max(dist0,
partial * partial_scale, partial * partial_scale,
ptsor * unbase_scale * partial_scale, ptsor * unbase_scale * partial_scale,
ptser * unbase_scale * partial_scale) ptser * unbase_scale * partial_scale)
else else
ptsor = compare(TokenSort(dist.dist), s1, s2) ptsor = compare(s1, s2, TokenSort(dist.dist))
ptser = compare(TokenSet(dist.dist), s1, s2) ptser = compare(s1, s2, TokenSet(dist.dist))
return max(dist0, return max(dist0,
ptsor * unbase_scale, ptsor * unbase_scale,
ptser * unbase_scale) ptser * unbase_scale)

View File

@ -2,31 +2,31 @@
using StringDistances, Test using StringDistances, Test
# Compare # Compare
@test compare(Hamming(), "", "abc") ≈ 0.0 atol = 1e-4 @test compare("", "abc", Hamming()) ≈ 0.0 atol = 1e-4
@test compare(Hamming(), "acc", "abc") ≈ 2/3 atol = 1e-4 @test compare("acc", "abc", Hamming()) ≈ 2/3 atol = 1e-4
@test compare(Hamming(), "saturday", "sunday") ≈ 1/8 atol = 1e-4 @test compare("saturday", "sunday", Hamming()) ≈ 1/8 atol = 1e-4
@test compare(QGram(1), "", "abc") ≈ 0.0 atol = 1e-4 @test compare("", "abc", QGram(1)) ≈ 0.0 atol = 1e-4
@test compare(QGram(1), "abc", "cba") ≈ 1.0 atol = 1e-4 @test compare("abc", "cba", QGram(1)) ≈ 1.0 atol = 1e-4
@test compare(QGram(1), "abc", "ccc") ≈ 1/3 atol = 1e-4 @test compare("abc", "ccc", QGram(1)) ≈ 1/3 atol = 1e-4
@test compare(Jaccard(2), "", "abc") ≈ 0.0 atol = 1e-4 @test compare("", "abc", Jaccard(2)) ≈ 0.0 atol = 1e-4
@test compare(Jaccard(2), "martha", "martha") ≈ 1.0 atol = 1e-4 @test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4
@test compare(Cosine(2), "martha", "martha") ≈ 1.0 atol = 1e-4 @test compare("martha", "martha", Cosine(2)) ≈ 1.0 atol = 1e-4
@test compare(Jaccard(2), "martha", "martha") ≈ 1.0 atol = 1e-4 @test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4
@test compare(Overlap(2), "martha", "martha") ≈ 1.0 atol = 1e-4 @test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol = 1e-4
@test compare(SorensenDice(2), "martha", "martha") ≈ 1.0 atol = 1e-4 @test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol = 1e-4
# Winkler # Winkler
@test compare(Winkler(Jaro(), 0.1, 0.0), "martha", "marhta") ≈ 0.9611 atol = 1e-4 @test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.9611 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "dwayne", "duane") ≈ 0.84 atol = 1e-4 @test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.84 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "dixon", "dicksonx") ≈ 0.81333 atol = 1e-4 @test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.81333 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "william", "williams") ≈ 0.975 atol = 1e-4 @test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.975 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "", "foo") ≈ 0.0 atol = 1e-4 @test compare("", "foo", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "a", "a") ≈ 1.0 atol = 1e-4 @test compare("a", "a", Winkler(Jaro(), 0.1, 0.0)) ≈ 1.0 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "abc", "xyz") ≈ 0.0 atol = 1e-4 @test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol = 1e-4
strings = [ strings = [
("martha", "marhta"), ("martha", "marhta"),
@ -37,7 +37,7 @@ strings = [
] ]
solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000] solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000]
for i in 1:length(solutions) for i in 1:length(solutions)
@test compare(Winkler(Jaro(), 0.1, 0.0), strings[i]...) ≈ (1 - solutions[i]) atol = 1e-4 @test compare(strings[i]..., Winkler(Jaro(), 0.1, 0.0)) ≈ (1 - solutions[i]) atol = 1e-4
end end
@ -45,37 +45,36 @@ end
# Partial # Partial
@test compare(Partial(Jaccard(2)), "aa", "aa ") ≈ 1.0 @test compare("aa", "aa ", Partial(Jaccard(2))) ≈ 1.0
@test compare(Partial(RatcliffObershelp()), "New York Yankees", "Yankees") ≈ 1.0 @test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
@test compare(Partial(RatcliffObershelp()), "New York Yankees", "") ≈ 0.0 @test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0
@test compare(Partial(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 0.444444444444 @test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444
s = "HSINCHUANG" s = "HSINCHUANG"
@test compare(Partial(RatcliffObershelp()), s, "SINJHUAN") ≈ 0.875 @test compare(s, "SINJHUAN", Partial(RatcliffObershelp())) ≈ 0.875
@test compare(Partial(RatcliffObershelp()), s, "LSINJHUANG DISTRIC") ≈ 0.8 @test compare(s, "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8
@test compare(Partial(RatcliffObershelp()), s, "SINJHUANG DISTRICT") ≈ 0.8 @test compare(s, "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8
@test compare(Partial(RatcliffObershelp()), s, "SINJHUANG") ≈ 0.8888888888888 @test compare(s, "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888
@test compare(Partial(Hamming()), "New York Yankees", "Yankees") ≈ 1 @test compare("New York Yankees", "Yankees", Partial(Hamming())) ≈ 1
@test compare(Partial(Hamming()), "New York Yankees", "") ≈ 1 @test compare("New York Yankees", "", Partial(Hamming())) ≈ 1
# Token # Token
@test compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets") ≈ 1.0 @test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0
@test compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "los angeles angels of anaheim at seattle mariners") ≈ 1.0 - 0.09090909090909094 @test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094
@test compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "") ≈ 0.0 @test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0
@test compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "") ≈ 0.0 @test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0
@test compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "") ≈ 0.0
@test compare(TokenMax(RatcliffObershelp()),"mariners", "mariner") ≈ 0.933333333333333 @test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0 #@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094 #@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094
@ -84,9 +83,9 @@ s = "HSINCHUANG"
@test compare(TokenSet(Partial(RatcliffObershelp())),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 1.0 @test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0
@test compare(TokenMax(RatcliffObershelp()), "为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。") ≈ 0.06428571428571427 @test compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", TokenMax(RatcliffObershelp())) ≈ 0.06428571428571427

View File

@ -1,12 +1,12 @@
using StringDistances, Test using StringDistances, Test
# check with weird utf8 strings # check with weird utf8 strings
compare(TokenMax(RatcliffObershelp()), "aüa", "aua") compare("aüa", "aua", TokenMax(RatcliffObershelp()))
compare(TokenMax(QGram(2)), "aüa", "aua") compare("aüa", "aua", TokenMax(QGram(2)))
compare(DamerauLevenshtein(), "aüa", "aua") compare("aüa", "aua", DamerauLevenshtein())
compare(Hamming(), "aüa", "aua") compare("aüa", "aua", Hamming())
compare(Jaro(), "aüa", "aua") compare("aüa", "aua", Jaro())
compare(Levenshtein(), "aüa", "aua") compare("aüa", "aua", Levenshtein())
s1 = "aü☃" s1 = "aü☃"