diff --git a/src/StringDistances.jl b/src/StringDistances.jl index aa9133a..9218349 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -6,7 +6,7 @@ module StringDistances ## ############################################################################## -import Base: eltype, length, iterate, ==, hash, isless, convert, show +import Base: eltype, length, iterate, ==, hash, isless, convert, show, @deprecate import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric import IterTools: chain export diff --git a/src/compare.jl b/src/compare.jl index 65aca93..ea22af5 100755 --- a/src/compare.jl +++ b/src/compare.jl @@ -6,21 +6,17 @@ ############################################################################## function compare(s1::AbstractString, s2::AbstractString, dist::PreMetric) - compare(dist, s1, s2) -end - - -function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) 1.0 - evaluate(dist, s1, s2) end -function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein}, - s1::AbstractString, s2::AbstractString) +function compare(s1::AbstractString, s2::AbstractString, + dist::Union{Hamming, Levenshtein, DamerauLevenshtein}) len = max(length(s1), length(s2)) len == 0 ? 1.0 : 1.0 - evaluate(dist, s1, s2) / len end -function compare(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractString) +function compare(s1::AbstractString, s2::AbstractString, + dist::AbstractQGramDistance) # When string length < q for qgram distance, returns s1 == s2 len1 = length(s1) ; len2 = length(s2) min(len1, len2) <= (dist.N - 1) && return convert(Float64, s1 == s2) @@ -31,6 +27,8 @@ function compare(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractSt end end +@deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist) + ############################################################################## ## ## Winkler @@ -46,8 +44,8 @@ end # restrict to distance between 0 and 1 Winkler(x) = Winkler(x, 0.1, 0.7) -function compare(dist::Winkler, s1::AbstractString, s2::AbstractString) - score = compare(dist.dist, s1, s2) +function compare(s1::AbstractString, s2::AbstractString, dist::Winkler) + score = compare(s1, s2, dist.dist) l = common_prefix(s1, s2, 4)[1] # common prefix adjustment if score >= dist.boosting_limit @@ -67,16 +65,16 @@ struct Partial{T <: PreMetric} <: PreMetric end # general -function compare(dist::Partial, s1::AbstractString, s2::AbstractString) +function compare(s1::AbstractString, s2::AbstractString, dist::Partial) s2, len2, s1, len1 = reorder(s1, s2) - len1 == len2 && return compare(dist.dist, s1, s2) - len1 == 0 && return compare(dist.dist, "", "") + len1 == len2 && return compare(s1, s2, dist.dist) + len1 == 0 && return compare("", "", dist.dist) iter = QGramIterator(s2, len2, len1) out = 0.0 x = iterate(iter) while x !== nothing s, state = x - curr = compare(dist.dist, s1, s) + curr = compare(s1, s, dist.dist) out = max(out, curr) x = iterate(iter, state) end @@ -85,9 +83,9 @@ end # Specialization for RatcliffObershelp distance # Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py -function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::AbstractString) +function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}) s2, len2, s1, len1 = reorder(s1, s2) - len1 == len2 && return compare(dist.dist, s1, s2) + len1 == len2 && return compare(s1, s2, dist.dist) out = 0.0 result = matching_blocks(s1, s2) for r in result @@ -103,7 +101,7 @@ function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::Abstr end i2_start = nextind(s2, 0, s2_start) i2_end = nextind(s2, 0, s2_end) - curr = compare(RatcliffObershelp(), s1, SubString(s2, i2_start, i2_end)) + curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp()) out = max(out, curr) end return out @@ -119,10 +117,10 @@ struct TokenSort{T <: PreMetric} <: PreMetric dist::T end -function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString) +function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort) s1 = join(sort!(split(s1)), " ") s2 = join(sort!(split(s2)), " ") - compare(dist.dist, s1, s2) + compare(s1, s2, dist.dist) end ############################################################################## @@ -135,18 +133,18 @@ struct TokenSet{T <: PreMetric} <: PreMetric dist::T end -function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString) +function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet) v0, v1, v2 = _separate!(split(s1), split(s2)) s0 = join(v0, " ") s1 = join(Iterators.flatten((v0, v1)), " ") s2 = join(Iterators.flatten((v0, v2)), " ") if isempty(s0) - # otherwise compare(dist, "", "a")== 1.0 - compare(dist.dist, s1, s2) + # otherwise compare("", "a", dist)== 1.0 + compare(s1, s2, dist.dist) else - max(compare(dist.dist, s0, s1), - compare(dist.dist, s1, s2), - compare(dist.dist, s0, s2)) + max(compare(s0, s1, dist.dist), + compare(s1, s2, dist.dist), + compare(s0, s2, dist.dist)) end end @@ -182,24 +180,24 @@ struct TokenMax{T <: PreMetric} <: PreMetric dist::T end -function compare(dist::TokenMax, s1::AbstractString, s2::AbstractString) - dist0 = compare(dist.dist, s1, s2) +function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax) + dist0 = compare(s1, s2, dist.dist) s2, len2, s1, len1 = reorder(s1, s2) unbase_scale = 0.95 # if one string is much much shorter than the other if len2 >= 1.5 * len1 # if strings are of dissimilar length, use partials - partial = compare(Partial(dist.dist), s1, s2) - ptsor = compare(TokenSort(Partial(dist.dist)), s1, s2) - ptser = compare(TokenSet(Partial(dist.dist)), s1, s2) + partial = compare(s1, s2, Partial(dist.dist)) + ptsor = compare(s1, s2, TokenSort(Partial(dist.dist))) + ptser = compare(s1, s2, TokenSet(Partial(dist.dist))) partial_scale = len2 > (8 * len1) ? 0.6 : 0.9 return max(dist0, partial * partial_scale, ptsor * unbase_scale * partial_scale, ptser * unbase_scale * partial_scale) else - ptsor = compare(TokenSort(dist.dist), s1, s2) - ptser = compare(TokenSet(dist.dist), s1, s2) + ptsor = compare(s1, s2, TokenSort(dist.dist)) + ptser = compare(s1, s2, TokenSet(dist.dist)) return max(dist0, ptsor * unbase_scale, ptser * unbase_scale) diff --git a/test/modifiers.jl b/test/modifiers.jl index 7c43ba0..24185a8 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -2,31 +2,31 @@ using StringDistances, Test # Compare -@test compare(Hamming(), "", "abc") ≈ 0.0 atol = 1e-4 -@test compare(Hamming(), "acc", "abc") ≈ 2/3 atol = 1e-4 -@test compare(Hamming(), "saturday", "sunday") ≈ 1/8 atol = 1e-4 +@test compare("", "abc", Hamming()) ≈ 0.0 atol = 1e-4 +@test compare("acc", "abc", Hamming()) ≈ 2/3 atol = 1e-4 +@test compare("saturday", "sunday", Hamming()) ≈ 1/8 atol = 1e-4 -@test compare(QGram(1), "", "abc") ≈ 0.0 atol = 1e-4 -@test compare(QGram(1), "abc", "cba") ≈ 1.0 atol = 1e-4 -@test compare(QGram(1), "abc", "ccc") ≈ 1/3 atol = 1e-4 +@test compare("", "abc", QGram(1)) ≈ 0.0 atol = 1e-4 +@test compare("abc", "cba", QGram(1)) ≈ 1.0 atol = 1e-4 +@test compare("abc", "ccc", QGram(1)) ≈ 1/3 atol = 1e-4 -@test compare(Jaccard(2), "", "abc") ≈ 0.0 atol = 1e-4 +@test compare("", "abc", Jaccard(2)) ≈ 0.0 atol = 1e-4 -@test compare(Jaccard(2), "martha", "martha") ≈ 1.0 atol = 1e-4 -@test compare(Cosine(2), "martha", "martha") ≈ 1.0 atol = 1e-4 -@test compare(Jaccard(2), "martha", "martha") ≈ 1.0 atol = 1e-4 -@test compare(Overlap(2), "martha", "martha") ≈ 1.0 atol = 1e-4 -@test compare(SorensenDice(2), "martha", "martha") ≈ 1.0 atol = 1e-4 +@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4 +@test compare("martha", "martha", Cosine(2)) ≈ 1.0 atol = 1e-4 +@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4 +@test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol = 1e-4 +@test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol = 1e-4 # Winkler -@test compare(Winkler(Jaro(), 0.1, 0.0), "martha", "marhta") ≈ 0.9611 atol = 1e-4 -@test compare(Winkler(Jaro(), 0.1, 0.0), "dwayne", "duane") ≈ 0.84 atol = 1e-4 -@test compare(Winkler(Jaro(), 0.1, 0.0), "dixon", "dicksonx") ≈ 0.81333 atol = 1e-4 -@test compare(Winkler(Jaro(), 0.1, 0.0), "william", "williams") ≈ 0.975 atol = 1e-4 -@test compare(Winkler(Jaro(), 0.1, 0.0), "", "foo") ≈ 0.0 atol = 1e-4 -@test compare(Winkler(Jaro(), 0.1, 0.0), "a", "a") ≈ 1.0 atol = 1e-4 -@test compare(Winkler(Jaro(), 0.1, 0.0), "abc", "xyz") ≈ 0.0 atol = 1e-4 +@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.9611 atol = 1e-4 +@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.84 atol = 1e-4 +@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.81333 atol = 1e-4 +@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.975 atol = 1e-4 +@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol = 1e-4 +@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0)) ≈ 1.0 atol = 1e-4 +@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol = 1e-4 strings = [ ("martha", "marhta"), @@ -37,7 +37,7 @@ strings = [ ] solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000] for i in 1:length(solutions) - @test compare(Winkler(Jaro(), 0.1, 0.0), strings[i]...) ≈ (1 - solutions[i]) atol = 1e-4 + @test compare(strings[i]..., Winkler(Jaro(), 0.1, 0.0)) ≈ (1 - solutions[i]) atol = 1e-4 end @@ -45,37 +45,36 @@ end # Partial -@test compare(Partial(Jaccard(2)), "aa", "aa ") ≈ 1.0 +@test compare("aa", "aa ", Partial(Jaccard(2))) ≈ 1.0 -@test compare(Partial(RatcliffObershelp()), "New York Yankees", "Yankees") ≈ 1.0 -@test compare(Partial(RatcliffObershelp()), "New York Yankees", "") ≈ 0.0 -@test compare(Partial(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 0.444444444444 +@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0 +@test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0 +@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444 s = "HSINCHUANG" -@test compare(Partial(RatcliffObershelp()), s, "SINJHUAN") ≈ 0.875 -@test compare(Partial(RatcliffObershelp()), s, "LSINJHUANG DISTRIC") ≈ 0.8 -@test compare(Partial(RatcliffObershelp()), s, "SINJHUANG DISTRICT") ≈ 0.8 -@test compare(Partial(RatcliffObershelp()), s, "SINJHUANG") ≈ 0.8888888888888 +@test compare(s, "SINJHUAN", Partial(RatcliffObershelp())) ≈ 0.875 +@test compare(s, "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8 +@test compare(s, "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8 +@test compare(s, "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888 -@test compare(Partial(Hamming()), "New York Yankees", "Yankees") ≈ 1 -@test compare(Partial(Hamming()), "New York Yankees", "") ≈ 1 +@test compare("New York Yankees", "Yankees", Partial(Hamming())) ≈ 1 +@test compare("New York Yankees", "", Partial(Hamming())) ≈ 1 # Token -@test compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets") ≈ 1.0 -@test compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "los angeles angels of anaheim at seattle mariners") ≈ 1.0 - 0.09090909090909094 +@test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0 +@test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094 -@test compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "") ≈ 0.0 -@test compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "") ≈ 0.0 +@test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0 +@test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0 -@test compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "") ≈ 0.0 -@test compare(TokenMax(RatcliffObershelp()),"mariners", "mariner") ≈ 0.933333333333333 +@test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333 #@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0 #@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094 @@ -84,9 +83,9 @@ s = "HSINCHUANG" -@test compare(TokenSet(Partial(RatcliffObershelp())),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 1.0 +@test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0 -@test compare(TokenMax(RatcliffObershelp()), "为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。") ≈ 0.06428571428571427 +@test compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", TokenMax(RatcliffObershelp())) ≈ 0.06428571428571427 diff --git a/test/utf8.jl b/test/utf8.jl index 6e48e6d..36e47ec 100644 --- a/test/utf8.jl +++ b/test/utf8.jl @@ -1,12 +1,12 @@ using StringDistances, Test # check with weird utf8 strings -compare(TokenMax(RatcliffObershelp()), "aüa", "aua") -compare(TokenMax(QGram(2)), "aüa", "aua") -compare(DamerauLevenshtein(), "aüa", "aua") -compare(Hamming(), "aüa", "aua") -compare(Jaro(), "aüa", "aua") -compare(Levenshtein(), "aüa", "aua") +compare("aüa", "aua", TokenMax(RatcliffObershelp())) +compare("aüa", "aua", TokenMax(QGram(2))) +compare("aüa", "aua", DamerauLevenshtein()) +compare("aüa", "aua", Hamming()) +compare("aüa", "aua", Jaro()) +compare("aüa", "aua", Levenshtein()) s1 = "aü☃"