change compare
parent
9d33299d7b
commit
402d24997f
|
@ -6,7 +6,7 @@ module StringDistances
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
import Base: eltype, length, iterate, ==, hash, isless, convert, show
|
||||
import Base: eltype, length, iterate, ==, hash, isless, convert, show, @deprecate
|
||||
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
|
||||
import IterTools: chain
|
||||
export
|
||||
|
|
|
@ -6,21 +6,17 @@
|
|||
##############################################################################
|
||||
|
||||
# Generic similarity score for any PreMetric, shown in both argument orders.
# NOTE(review): this span comes from a rendered diff whose +/- markers were lost,
# so pre-change and post-change lines are interleaved; confirm against the
# repository which of the lines below survive in the final file.
#
# Strings-first method: compare(s1, s2, dist).
function compare(s1::AbstractString, s2::AbstractString, dist::PreMetric)
|
||||
# Forwards to the distance-first method with the same three arguments.
compare(dist, s1, s2)
|
||||
end
|
||||

||||

||||
# Distance-first method: compare(dist, s1, s2).
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
|
||||
# The similarity is 1.0 minus whatever evaluate(dist, s1, s2) returns.
1.0 - evaluate(dist, s1, s2)
|
||||
end
|
||||
|
||||
# Similarity for Hamming, Levenshtein and DamerauLevenshtein: the distance is
# divided by the length of the longer string before being subtracted from 1.0.
# NOTE(review): two signatures appear because this is a rendered diff; the
# distance-first one is presumably the pre-change form and the strings-first one
# the post-change form (the file's @deprecate line points callers to the
# strings-first order) - confirm against the repository.
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
|
||||
s1::AbstractString, s2::AbstractString)
|
||||
function compare(s1::AbstractString, s2::AbstractString,
|
||||
dist::Union{Hamming, Levenshtein, DamerauLevenshtein})
|
||||
# Normalizing constant: number of characters in the longer of the two strings.
len = max(length(s1), length(s2))
|
||||
# Two empty strings score 1.0; this guard also avoids dividing by zero.
len == 0 ? 1.0 : 1.0 - evaluate(dist, s1, s2) / len
|
||||
end
|
||||
|
||||
function compare(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractString)
|
||||
function compare(s1::AbstractString, s2::AbstractString,
|
||||
dist::AbstractQGramDistance)
|
||||
# When either string has fewer than q characters (q = dist.N), return 1.0 if s1 == s2 and 0.0 otherwise
|
||||
len1 = length(s1) ; len2 = length(s2)
|
||||
min(len1, len2) <= (dist.N - 1) && return convert(Float64, s1 == s2)
|
||||
|
@ -31,6 +27,8 @@ function compare(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractSt
|
|||
end
|
||||
end
|
||||
|
||||
# Keep the distance-first call order available as a deprecated method: calls of
# the form compare(dist, s1, s2) are redirected to compare(s1, s2, dist).
@deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist)
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Winkler
|
||||
|
@ -46,8 +44,8 @@ end
|
|||
# restrict to distance between 0 and 1
|
||||
# Convenience constructor: wrap `x` in a Winkler, passing 0.1 and 0.7 as the two
# remaining positional arguments.
# NOTE(review): presumably these are the scaling factor and the boosting limit -
# confirm the field order against the struct definition.
function Winkler(x)
    return Winkler(x, 0.1, 0.7)
end
|
||||
|
||||
function compare(dist::Winkler, s1::AbstractString, s2::AbstractString)
|
||||
score = compare(dist.dist, s1, s2)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
|
||||
score = compare(s1, s2, dist.dist)
|
||||
l = common_prefix(s1, s2, 4)[1]
|
||||
# common prefix adjustment
|
||||
if score >= dist.boosting_limit
|
||||
|
@ -67,16 +65,16 @@ struct Partial{T <: PreMetric} <: PreMetric
|
|||
end
|
||||
|
||||
# general
|
||||
function compare(dist::Partial, s1::AbstractString, s2::AbstractString)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
len1 == len2 && return compare(dist.dist, s1, s2)
|
||||
len1 == 0 && return compare(dist.dist, "", "")
|
||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||
len1 == 0 && return compare("", "", dist.dist)
|
||||
iter = QGramIterator(s2, len2, len1)
|
||||
out = 0.0
|
||||
x = iterate(iter)
|
||||
while x !== nothing
|
||||
s, state = x
|
||||
curr = compare(dist.dist, s1, s)
|
||||
curr = compare(s1, s, dist.dist)
|
||||
out = max(out, curr)
|
||||
x = iterate(iter, state)
|
||||
end
|
||||
|
@ -85,9 +83,9 @@ end
|
|||
|
||||
# Specialization for RatcliffObershelp distance
|
||||
# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
|
||||
function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::AbstractString)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
len1 == len2 && return compare(dist.dist, s1, s2)
|
||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||
out = 0.0
|
||||
result = matching_blocks(s1, s2)
|
||||
for r in result
|
||||
|
@ -103,7 +101,7 @@ function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::Abstr
|
|||
end
|
||||
i2_start = nextind(s2, 0, s2_start)
|
||||
i2_end = nextind(s2, 0, s2_end)
|
||||
curr = compare(RatcliffObershelp(), s1, SubString(s2, i2_start, i2_end))
|
||||
curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp())
|
||||
out = max(out, curr)
|
||||
end
|
||||
return out
|
||||
|
@ -119,10 +117,10 @@ struct TokenSort{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
# TokenSort similarity: make the comparison insensitive to word order by sorting
# the tokens of each string before handing them to the wrapped distance
# `dist.dist`.
# NOTE(review): this is a rendered diff, so the signature and the final call each
# appear twice; the distance-first lines are presumably the pre-change form and
# the strings-first lines the post-change form - confirm against the repository.
function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort)
|
||||
# split with no delimiter breaks on whitespace; the tokens are sorted in place
# and re-joined with single spaces.
s1 = join(sort!(split(s1)), " ")
|
||||
s2 = join(sort!(split(s2)), " ")
|
||||
# Score the two normalized strings with the wrapped distance.
compare(dist.dist, s1, s2)
|
||||
compare(s1, s2, dist.dist)
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
|
@ -135,18 +133,18 @@ struct TokenSet{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
# TokenSet similarity: tokenize both strings, rebuild three strings from the
# token groups returned by _separate!, and score them with the wrapped distance
# `dist.dist`.
# NOTE(review): this is a rendered diff, so the signature, the inline comment and
# every compare call appear twice; the distance-first lines are presumably the
# pre-change form and the strings-first lines the post-change form - confirm
# against the repository.
function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet)
|
||||
# presumably v0 holds the tokens common to both strings and v1 / v2 the tokens
# found only in s1 / s2 - TODO confirm against _separate!.
v0, v1, v2 = _separate!(split(s1), split(s2))
|
||||
# s0 is built from v0 alone; s1 and s2 are rebuilt as v0 followed by v1 and v2
# respectively, each joined with single spaces.
s0 = join(v0, " ")
|
||||
s1 = join(Iterators.flatten((v0, v1)), " ")
|
||||
s2 = join(Iterators.flatten((v0, v2)), " ")
|
||||
# When v0 contributes nothing, s0 is "" and only the s1-vs-s2 comparison is
# used; the three-way max below would otherwise include comparisons against "".
if isempty(s0)
|
||||
# otherwise compare(dist, "", "a")== 1.0
|
||||
compare(dist.dist, s1, s2)
|
||||
# otherwise compare("", "a", dist)== 1.0
|
||||
compare(s1, s2, dist.dist)
|
||||
else
|
||||
# Best score among the three pairings of s0, s1 and s2.
max(compare(dist.dist, s0, s1),
|
||||
compare(dist.dist, s1, s2),
|
||||
compare(dist.dist, s0, s2))
|
||||
max(compare(s0, s1, dist.dist),
|
||||
compare(s1, s2, dist.dist),
|
||||
compare(s0, s2, dist.dist))
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -182,24 +180,24 @@ struct TokenMax{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(dist::TokenMax, s1::AbstractString, s2::AbstractString)
|
||||
dist0 = compare(dist.dist, s1, s2)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
|
||||
dist0 = compare(s1, s2, dist.dist)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
unbase_scale = 0.95
|
||||
# if one string is much shorter than the other (the longer is at least 1.5x its length)
|
||||
if len2 >= 1.5 * len1
|
||||
# if strings are of dissimilar length, use partials
|
||||
partial = compare(Partial(dist.dist), s1, s2)
|
||||
ptsor = compare(TokenSort(Partial(dist.dist)), s1, s2)
|
||||
ptser = compare(TokenSet(Partial(dist.dist)), s1, s2)
|
||||
partial = compare(s1, s2, Partial(dist.dist))
|
||||
ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)))
|
||||
ptser = compare(s1, s2, TokenSet(Partial(dist.dist)))
|
||||
partial_scale = len2 > (8 * len1) ? 0.6 : 0.9
|
||||
return max(dist0,
|
||||
partial * partial_scale,
|
||||
ptsor * unbase_scale * partial_scale,
|
||||
ptser * unbase_scale * partial_scale)
|
||||
else
|
||||
ptsor = compare(TokenSort(dist.dist), s1, s2)
|
||||
ptser = compare(TokenSet(dist.dist), s1, s2)
|
||||
ptsor = compare(s1, s2, TokenSort(dist.dist))
|
||||
ptser = compare(s1, s2, TokenSet(dist.dist))
|
||||
return max(dist0,
|
||||
ptsor * unbase_scale,
|
||||
ptser * unbase_scale)
|
||||
|
|
|
@ -2,31 +2,31 @@
|
|||
using StringDistances, Test
|
||||
|
||||
# Compare
|
||||
@test compare(Hamming(), "", "abc") ≈ 0.0 atol = 1e-4
|
||||
@test compare(Hamming(), "acc", "abc") ≈ 2/3 atol = 1e-4
|
||||
@test compare(Hamming(), "saturday", "sunday") ≈ 1/8 atol = 1e-4
|
||||
@test compare("", "abc", Hamming()) ≈ 0.0 atol = 1e-4
|
||||
@test compare("acc", "abc", Hamming()) ≈ 2/3 atol = 1e-4
|
||||
@test compare("saturday", "sunday", Hamming()) ≈ 1/8 atol = 1e-4
|
||||
|
||||
@test compare(QGram(1), "", "abc") ≈ 0.0 atol = 1e-4
|
||||
@test compare(QGram(1), "abc", "cba") ≈ 1.0 atol = 1e-4
|
||||
@test compare(QGram(1), "abc", "ccc") ≈ 1/3 atol = 1e-4
|
||||
@test compare("", "abc", QGram(1)) ≈ 0.0 atol = 1e-4
|
||||
@test compare("abc", "cba", QGram(1)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("abc", "ccc", QGram(1)) ≈ 1/3 atol = 1e-4
|
||||
|
||||
@test compare(Jaccard(2), "", "abc") ≈ 0.0 atol = 1e-4
|
||||
@test compare("", "abc", Jaccard(2)) ≈ 0.0 atol = 1e-4
|
||||
|
||||
@test compare(Jaccard(2), "martha", "martha") ≈ 1.0 atol = 1e-4
|
||||
@test compare(Cosine(2), "martha", "martha") ≈ 1.0 atol = 1e-4
|
||||
@test compare(Jaccard(2), "martha", "martha") ≈ 1.0 atol = 1e-4
|
||||
@test compare(Overlap(2), "martha", "martha") ≈ 1.0 atol = 1e-4
|
||||
@test compare(SorensenDice(2), "martha", "martha") ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", Cosine(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol = 1e-4
|
||||
|
||||
|
||||
# Winkler
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "martha", "marhta") ≈ 0.9611 atol = 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "dwayne", "duane") ≈ 0.84 atol = 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "dixon", "dicksonx") ≈ 0.81333 atol = 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "william", "williams") ≈ 0.975 atol = 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "", "foo") ≈ 0.0 atol = 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "a", "a") ≈ 1.0 atol = 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "abc", "xyz") ≈ 0.0 atol = 1e-4
|
||||
@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.9611 atol = 1e-4
|
||||
@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.84 atol = 1e-4
|
||||
@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.81333 atol = 1e-4
|
||||
@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.975 atol = 1e-4
|
||||
@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol = 1e-4
|
||||
@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol = 1e-4
|
||||
|
||||
strings = [
|
||||
("martha", "marhta"),
|
||||
|
@ -37,7 +37,7 @@ strings = [
|
|||
]
|
||||
solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000]
|
||||
for i in 1:length(solutions)
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), strings[i]...) ≈ (1 - solutions[i]) atol = 1e-4
|
||||
@test compare(strings[i]..., Winkler(Jaro(), 0.1, 0.0)) ≈ (1 - solutions[i]) atol = 1e-4
|
||||
end
|
||||
|
||||
|
||||
|
@ -45,37 +45,36 @@ end
|
|||
|
||||
|
||||
# Partial
|
||||
@test compare(Partial(Jaccard(2)), "aa", "aa ") ≈ 1.0
|
||||
@test compare("aa", "aa ", Partial(Jaccard(2))) ≈ 1.0
|
||||
|
||||
@test compare(Partial(RatcliffObershelp()), "New York Yankees", "Yankees") ≈ 1.0
|
||||
@test compare(Partial(RatcliffObershelp()), "New York Yankees", "") ≈ 0.0
|
||||
@test compare(Partial(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 0.444444444444
|
||||
@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
|
||||
@test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0
|
||||
@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444
|
||||
|
||||
|
||||
s = "HSINCHUANG"
|
||||
@test compare(Partial(RatcliffObershelp()), s, "SINJHUAN") ≈ 0.875
|
||||
@test compare(Partial(RatcliffObershelp()), s, "LSINJHUANG DISTRIC") ≈ 0.8
|
||||
@test compare(Partial(RatcliffObershelp()), s, "SINJHUANG DISTRICT") ≈ 0.8
|
||||
@test compare(Partial(RatcliffObershelp()), s, "SINJHUANG") ≈ 0.8888888888888
|
||||
@test compare(s, "SINJHUAN", Partial(RatcliffObershelp())) ≈ 0.875
|
||||
@test compare(s, "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8
|
||||
@test compare(s, "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8
|
||||
@test compare(s, "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888
|
||||
|
||||
@test compare(Partial(Hamming()), "New York Yankees", "Yankees") ≈ 1
|
||||
@test compare(Partial(Hamming()), "New York Yankees", "") ≈ 1
|
||||
@test compare("New York Yankees", "Yankees", Partial(Hamming())) ≈ 1
|
||||
@test compare("New York Yankees", "", Partial(Hamming())) ≈ 1
|
||||
|
||||
|
||||
|
||||
# Token
|
||||
@test compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets") ≈ 1.0
|
||||
@test compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "los angeles angels of anaheim at seattle mariners") ≈ 1.0 - 0.09090909090909094
|
||||
@test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0
|
||||
@test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094
|
||||
|
||||
|
||||
@test compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "") ≈ 0.0
|
||||
@test compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "") ≈ 0.0
|
||||
@test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0
|
||||
@test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0
|
||||
|
||||
|
||||
@test compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "") ≈ 0.0
|
||||
|
||||
|
||||
@test compare(TokenMax(RatcliffObershelp()),"mariners", "mariner") ≈ 0.933333333333333
|
||||
@test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333
|
||||
|
||||
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0
|
||||
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094
|
||||
|
@ -84,9 +83,9 @@ s = "HSINCHUANG"
|
|||
|
||||
|
||||
|
||||
@test compare(TokenSet(Partial(RatcliffObershelp())),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 1.0
|
||||
@test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0
|
||||
|
||||
|
||||
@test compare(TokenMax(RatcliffObershelp()), "为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。") ≈ 0.06428571428571427
|
||||
@test compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", TokenMax(RatcliffObershelp())) ≈ 0.06428571428571427
|
||||
|
||||
|
||||
|
|
12
test/utf8.jl
12
test/utf8.jl
|
@ -1,12 +1,12 @@
|
|||
using StringDistances, Test
|
||||
|
||||
# check with weird utf8 strings
|
||||
compare(TokenMax(RatcliffObershelp()), "aüa", "aua")
|
||||
compare(TokenMax(QGram(2)), "aüa", "aua")
|
||||
compare(DamerauLevenshtein(), "aüa", "aua")
|
||||
compare(Hamming(), "aüa", "aua")
|
||||
compare(Jaro(), "aüa", "aua")
|
||||
compare(Levenshtein(), "aüa", "aua")
|
||||
compare("aüa", "aua", TokenMax(RatcliffObershelp()))
|
||||
compare("aüa", "aua", TokenMax(QGram(2)))
|
||||
compare("aüa", "aua", DamerauLevenshtein())
|
||||
compare("aüa", "aua", Hamming())
|
||||
compare("aüa", "aua", Jaro())
|
||||
compare("aüa", "aua", Levenshtein())
|
||||
|
||||
|
||||
s1 = "aü☃"
|
||||
|
|
Loading…
Reference in New Issue