From e0f5c4f711153fe3ff5fa8279cd1d5796c0c3917 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Thu, 5 Nov 2015 20:40:56 -0500 Subject: [PATCH] separate set distances --- README.md | 6 ++--- src/distances/qgram.jl | 36 ++++++++++++++----------- src/modifiers/compare.jl | 28 ++++++++++++-------- test/distances.jl | 57 +++++++++++++++------------------------- 4 files changed, 62 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index ec51976..3d15846 100644 --- a/README.md +++ b/README.md @@ -111,10 +111,9 @@ The package defines a number of ways to modify string metrics: ## Tips - Each distance is tailored to a specific problem. Edit distances works well with local spelling errors, the Ratcliff-Obsershelp distance works well with edited texts, the Jaro Winkler distance was invented for short strings such as person names, the QGrams distances works well with strings composed of multiple words with fluctuating orderings. -- Standardize strings before comparing them (lowercase, punctuation, whitespaces, accents, abbreviations...) -- Most distances will perform poorly when comparing company or individual names, where each string is composed of multiple words. +- Most distances perform poorly when comparing company or individual names, where each string is composed of multiple words. - - While word ordering is mostly irrelevant in this situation, edit distances heavily penalize different word orders. Instead, use either a distance robust to word order (like QGram distances), or compose a distance with `TokenSort`, which reorders the words alphabetically. + - While word ordering is mostly irrelevant in this situation, edit distances heavily penalize different orderings. Instead, use either a distance robust to word order (like QGram distances), or compose a distance with `TokenSort`, which reorders the words alphabetically. ```julia compare(RatcliffObershelp(), "mariners vs angels", "angels vs mariners") @@ -125,6 +124,7 @@ The package defines a number of ways to modify string metrics: #> 0.8125 ``` - General words (like "bank", "company") may appear in one string but no the other. One solution is to abbreviate these common names first to diminish their importance (ie "bk" "co"). Another solution is to use something like the `Partial` or `TokenSet` modifiers. +- Standardize strings before comparing them (lowercase, punctuation, whitespaces, accents, abbreviations...) diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index a3291ce..dff826b 100644 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -85,6 +85,17 @@ function PairIterator(s1::AbstractString, s2::AbstractString, len1::Integer, len sort2 = sort(QGramIterator(s2, len2, q)) PairIterator(sort1, sort2) end + +############################################################################## +## +## Evaluate Qgram distance on strings calls evaluate on space of qgrams +## +############################################################################## + +function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) + evaluate(dist, PairIterator(s1, s2, len1, len2, dist.q)) +end + ############################################################################## ## ## q-gram @@ -99,9 +110,9 @@ immutable QGram{T <: Integer} <: AbstractQGram end QGram() = QGram(2) -function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function evaluate(dist::QGram, setiterator) n = 0 - for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q) + for (n1, n2) in setiterator n += abs(n1 - n2) end return n @@ -119,10 +130,9 @@ immutable Cosine{T <: Integer} <: AbstractQGram end Cosine() = Cosine(2) -function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) - len1 <= (dist.q - 1) && return convert(Float64, s1 != s2) +function evaluate(dist::Cosine, setiterator) norm1, norm2, prodnorm = 0, 0, 0 - for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q) + for (n1, n2) in setiterator norm1 += n1^2 norm2 += n2^2 prodnorm += n1 * n2 @@ -146,10 +156,9 @@ immutable Jaccard{T <: Integer} <: AbstractQGram end Jaccard() = Jaccard(2) -function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) - len1 <= (dist.q - 1) && return convert(Float64, s1 != s2) +function evaluate(dist::Jaccard, setiterator) ndistinct1, ndistinct2, nintersect = 0, 0, 0 - for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q) + for (n1, n2) in setiterator ndistinct1 += n1 > 0 ndistinct2 += n2 > 0 nintersect += (n1 > 0) & (n2 > 0) @@ -157,7 +166,6 @@ function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::I return 1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect) end - ############################################################################## ## ## SorensenDice @@ -170,10 +178,9 @@ immutable SorensenDice{T <: Integer} <: AbstractQGram end SorensenDice() = SorensenDice(2) -function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) - len1 <= (dist.q - 1) && return convert(Float64, s1 != s2) +function evaluate(dist::SorensenDice, setiterator) ndistinct1, ndistinct2, nintersect = 0, 0, 0 - for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q) + for (n1, n2) in setiterator ndistinct1 += n1 > 0 ndistinct2 += n2 > 0 nintersect += (n1 > 0) & (n2 > 0) @@ -193,10 +200,9 @@ immutable Overlap{T <: Integer} <: AbstractQGram end Overlap() = Overlap(2) -function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) - len1 <= (dist.q - 1) && return convert(Float64, s1 != s2) +function evaluate(dist::Overlap, setiterator) ndistinct1, ndistinct2, nintersect = 0, 0, 0 - for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q) + for (n1, n2) in setiterator ndistinct1 += n1 > 0 ndistinct2 += n2 > 0 nintersect += (n1 > 0) & (n2 > 0) diff --git a/src/modifiers/compare.jl b/src/modifiers/compare.jl index 11672b9..7c63fc4 100644 --- a/src/modifiers/compare.jl +++ b/src/modifiers/compare.jl @@ -13,24 +13,30 @@ function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) end end - - -function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString, + len1::Integer, len2::Integer) 1.0 - evaluate(dist, s1, s2, len1, len2) end -function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein}, s1::AbstractString, s2::AbstractString, +function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein}, + s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) distance = evaluate(dist, s1, s2, len1, len2) - return len2 == 0 ? 1.0 : 1.0 - distance / len2 + len2 == 0 ? 1.0 : 1.0 - distance / len2 end -function compare(dist::QGram, s1::AbstractString, s2::AbstractString, +# while q gram definition are not modified for smaller string (the set is just considered as empty, which leads to NaN values), compare always returns a Float64 value between 0 and 1 +function compare(dist::AbstractQGram, + s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) + len1 <= (dist.q - 1) && return convert(Float64, s1 == s2) + evaluate(dist, s1, s2, len1, len2) +end + +function compare(dist::QGram, + s1::AbstractString, s2::AbstractString, + len1::Integer, len2::Integer) + len1 <= (dist.q - 1) && return convert(Float64, s1 == s2) distance = evaluate(dist, s1, s2, len1, len2) - if len1 <= (dist.q - 1) - return s1 == s2 ? 1.0 : 0.0 - else - return 1 - distance / (len1 + len2 - 2 * dist.q + 2) - end + 1 - distance / (len1 + len2 - 2 * dist.q + 2) end \ No newline at end of file diff --git a/test/distances.jl b/test/distances.jl index 58414a9..fd11b2c 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -37,28 +37,35 @@ using StringDistances, Base.Test @test evaluate(QGram(1), "", "abc") == 3 @test evaluate(QGram(1), "abc", "cba") == 0 @test evaluate(QGram(1), "abc", "ccc") == 4 - -@test_approx_eq_eps evaluate(Cosine(2), "", "abc") 1 1e-4 +@test isnan(evaluate(Cosine(2), "", "abc")) @test_approx_eq_eps evaluate(Cosine(2), "abc", "ccc") 1 1e-4 @test_approx_eq_eps evaluate(Cosine(2), "leia", "leela") 0.7113249 1e-4 - - -@test_approx_eq_eps evaluate(Jaccard(1), "", "abc") 1.0 1e-4 +@test_approx_eq evaluate(Jaccard(1), "", "abc") 1.0 @test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4 @test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4 - -@test_approx_eq_eps evaluate(Jaccard(1), "", "abc") 1.0 1e-4 -@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4 -@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4 - @test_approx_eq_eps evaluate(SorensenDice(1), "night", "nacht") 0.4 1e-4 @test_approx_eq_eps evaluate(SorensenDice(2), "night", "nacht") 0.75 1e-4 - @test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4 @test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4 +Set([(1,1,3) (4,5,1) (6,6,1)]) +@test matching_blocks("dwayne", "duane") == +Set([(5,4,2) (1,1,1) (3,3,1)]) +@test matching_blocks("dixon", "dicksonx") == +Set([(1,1,2) (4,6,2)]) +@test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154 +@test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579 +@test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666 +@test_approx_eq evaluate(RatcliffObershelp(), "", "pencilvaneya") 1.0 +@test_approx_eq evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963 +@test_approx_eq evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") 0.3913043478260869 +@test_approx_eq evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") 0.24137931034482762 + + + + strings = [ ("martha", "marhta"), ("dwayne", "duane") , @@ -85,9 +92,9 @@ for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]), (Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.24722222 0.16190476 0.48809524 0.49166667 0.07407407 0.16666667 0.21666667]), (QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]), (QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]), - (Jaccard(1), [0.0000000 0.4285714 0.3750000 0.1666667 1.0 0.0000000 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0000000 0.0000000 0.2500000]), - (Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 0.0 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]), - (Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 1.0 0.0 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249])) + (Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]), + (Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]), + (Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249])) t, solution = x for i in 1:length(solution) @test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4 @@ -124,25 +131,3 @@ stringdist(strings[1,], strings[2,], method = "qgram", q = 1) -Set([(1,1,3) -(4,5,1) -(6,6,1) -]) -@test matching_blocks("dwayne", "duane") == -Set([(5,4,2) -(1,1,1) -(3,3,1)]) -@test matching_blocks("dixon", "dicksonx") == -Set([(1,1,2) - (4,6,2) - ]) - - -@test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154 -@test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579 -@test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666 -@test_approx_eq evaluate(RatcliffObershelp(), "", "pencilvaneya") 1.0 -@test_approx_eq evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963 -@test_approx_eq evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") 0.3913043478260869 -@test_approx_eq evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") 0.24137931034482762 -