separate set distances

pull/3/head
matthieugomez 2015-11-05 20:40:56 -05:00
parent f1b5671a63
commit e0f5c4f711
4 changed files with 62 additions and 65 deletions

View File

@ -111,10 +111,9 @@ The package defines a number of ways to modify string metrics:
## Tips
- Each distance is tailored to a specific problem. Edit distances work well with local spelling errors, the Ratcliff-Obershelp distance works well with edited texts, the Jaro-Winkler distance was invented for short strings such as person names, and the QGram distances work well with strings composed of multiple words with fluctuating orderings.
- Standardize strings before comparing them (lowercase, punctuation, whitespaces, accents, abbreviations...)
- Most distances will perform poorly when comparing company or individual names, where each string is composed of multiple words.
- Most distances perform poorly when comparing company or individual names, where each string is composed of multiple words.
- While word ordering is mostly irrelevant in this situation, edit distances heavily penalize different word orders. Instead, use either a distance robust to word order (like QGram distances), or compose a distance with `TokenSort`, which reorders the words alphabetically.
- While word ordering is mostly irrelevant in this situation, edit distances heavily penalize different orderings. Instead, use either a distance robust to word order (like QGram distances), or compose a distance with `TokenSort`, which reorders the words alphabetically.
```julia
compare(RatcliffObershelp(), "mariners vs angels", "angels vs mariners")
@ -125,6 +124,7 @@ The package defines a number of ways to modify string metrics:
#> 0.8125
```
- General words (like "bank", "company") may appear in one string but not the other. One solution is to abbreviate these common names first to diminish their importance (e.g. "bk", "co"). Another solution is to use something like the `Partial` or `TokenSet` modifiers.
- Standardize strings before comparing them (lowercase, punctuation, whitespaces, accents, abbreviations...)

View File

@ -85,6 +85,17 @@ function PairIterator(s1::AbstractString, s2::AbstractString, len1::Integer, len
sort2 = sort(QGramIterator(s2, len2, q))
PairIterator(sort1, sort2)
end
##############################################################################
##
## Evaluating a q-gram distance on two strings delegates to `evaluate` on their paired q-gram counts
##
##############################################################################
# String-level entry point shared by the q-gram distances: build the paired
# q-gram iterator for the two strings (using the distance's own `q`) and pass
# it to the distance-specific `evaluate(dist, iterator)` method.
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString,
                  len1::Integer, len2::Integer)
    qgram_pairs = PairIterator(s1, s2, len1, len2, dist.q)
    return evaluate(dist, qgram_pairs)
end
##############################################################################
##
## q-gram
@ -99,9 +110,9 @@ immutable QGram{T <: Integer} <: AbstractQGram
end
QGram() = QGram(2)
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function evaluate(dist::QGram, setiterator)
n = 0
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
for (n1, n2) in setiterator
n += abs(n1 - n2)
end
return n
@ -119,10 +130,9 @@ immutable Cosine{T <: Integer} <: AbstractQGram
end
Cosine() = Cosine(2)
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)
function evaluate(dist::Cosine, setiterator)
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
for (n1, n2) in setiterator
norm1 += n1^2
norm2 += n2^2
prodnorm += n1 * n2
@ -146,10 +156,9 @@ immutable Jaccard{T <: Integer} <: AbstractQGram
end
Jaccard() = Jaccard(2)
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)
function evaluate(dist::Jaccard, setiterator)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
for (n1, n2) in setiterator
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
@ -157,7 +166,6 @@ function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::I
return 1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
end
##############################################################################
##
## SorensenDice
@ -170,10 +178,9 @@ immutable SorensenDice{T <: Integer} <: AbstractQGram
end
SorensenDice() = SorensenDice(2)
function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)
function evaluate(dist::SorensenDice, setiterator)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
for (n1, n2) in setiterator
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
@ -193,10 +200,9 @@ immutable Overlap{T <: Integer} <: AbstractQGram
end
Overlap() = Overlap(2)
function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)
function evaluate(dist::Overlap, setiterator)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
for (n1, n2) in setiterator
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)

View File

@ -13,24 +13,30 @@ function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
end
end
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
# Generic similarity score: one minus the distance that `evaluate` returns for
# the same arguments.
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    distance = evaluate(dist, s1, s2, len1, len2)
    return 1.0 - distance
end
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein}, s1::AbstractString, s2::AbstractString,
# Similarity score for the edit distances: the raw distance is divided by
# `len2` and subtracted from one.
# NOTE(review): presumably `len2` is the length of the longer string, so the
# ratio stays within [0, 1] -- confirm against the caller.
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    distance = evaluate(dist, s1, s2, len1, len2)
    # Guard the division: when `len2` is zero the score is defined as 1.0.
    return len2 == 0 ? 1.0 : 1.0 - distance / len2
end
function compare(dist::QGram, s1::AbstractString, s2::AbstractString,
# While the q-gram distance definitions are not modified for short strings (the q-gram set is simply considered empty, which leads to NaN values), compare always returns a Float64 value between 0 and 1.
# Similarity score for q-gram based distances.
#
# When `len1` is smaller than `dist.q` the result is 1.0 if the two strings are
# equal and 0.0 otherwise; `evaluate` is not called in that case.
# NOTE(review): presumably `len1` is the length of the shorter string -- confirm
# against the caller.
#
# Otherwise the result is one minus the distance, so that larger values mean
# "more similar", consistent with the short-string branch and with the generic
# `compare` method for `PreMetric`.
function compare(dist::AbstractQGram,
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
    return 1.0 - evaluate(dist, s1, s2, len1, len2)
end
# Similarity score for the q-gram counting distance.
#
# When `len1` is smaller than `dist.q` the result is 1.0 if the two strings are
# equal and 0.0 otherwise. Otherwise the distance is divided by
# (len1 - q + 1) + (len2 - q + 1) -- presumably the total number of q-grams in
# the two strings -- and subtracted from one.
function compare(dist::QGram,
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    # The short-string case is fully handled here, so no second check is needed
    # after the distance has been computed.
    len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
    distance = evaluate(dist, s1, s2, len1, len2)
    return 1 - distance / (len1 + len2 - 2 * dist.q + 2)
end

View File

@ -37,28 +37,35 @@ using StringDistances, Base.Test
# QGram(1): exact integer results for the q-gram counting distance.
@test evaluate(QGram(1), "", "abc") == 3
@test evaluate(QGram(1), "abc", "cba") == 0
@test evaluate(QGram(1), "abc", "ccc") == 4
# Cosine(2).
# NOTE(review): the next two assertions use the same inputs ("", "abc") but
# expect different results (approximately 1 vs NaN); only one of them can hold.
@test_approx_eq_eps evaluate(Cosine(2), "", "abc") 1 1e-4
@test isnan(evaluate(Cosine(2), "", "abc"))
@test_approx_eq_eps evaluate(Cosine(2), "abc", "ccc") 1 1e-4
@test_approx_eq_eps evaluate(Cosine(2), "leia", "leela") 0.7113249 1e-4
# Jaccard(1) and Jaccard(2).
# NOTE(review): several of the assertions below are repeated verbatim.
@test_approx_eq_eps evaluate(Jaccard(1), "", "abc") 1.0 1e-4
@test_approx_eq evaluate(Jaccard(1), "", "abc") 1.0
@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4
@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4
@test_approx_eq_eps evaluate(Jaccard(1), "", "abc") 1.0 1e-4
@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4
@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4
# SorensenDice(1) and SorensenDice(2).
@test_approx_eq_eps evaluate(SorensenDice(1), "night", "nacht") 0.4 1e-4
@test_approx_eq_eps evaluate(SorensenDice(2), "night", "nacht") 0.75 1e-4
# Overlap(1).
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
Set([(1,1,3) (4,5,1) (6,6,1)])
# Matching blocks found between two strings.
# NOTE(review): each tuple looks like (start in s1, start in s2, length) --
# confirm against the definition of `matching_blocks`.
@test matching_blocks("dwayne", "duane") ==
Set([(5,4,2) (1,1,1) (3,3,1)])
@test matching_blocks("dixon", "dicksonx") ==
Set([(1,1,2) (4,6,2)])
# RatcliffObershelp distances; several expected values are written as one
# minus a reference value.
@test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154
@test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579
@test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666
@test_approx_eq evaluate(RatcliffObershelp(), "", "pencilvaneya") 1.0
@test_approx_eq evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963
@test_approx_eq evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") 0.3913043478260869
@test_approx_eq evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") 0.24137931034482762
strings = [
("martha", "marhta"),
("dwayne", "duane") ,
@ -85,9 +92,9 @@ for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.24722222 0.16190476 0.48809524 0.49166667 0.07407407 0.16666667 0.21666667]),
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
(Jaccard(1), [0.0000000 0.4285714 0.3750000 0.1666667 1.0 0.0000000 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0000000 0.0000000 0.2500000]),
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 0.0 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 1.0 0.0 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
(Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]),
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
t, solution = x
for i in 1:length(solution)
@test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4
@ -124,25 +131,3 @@ stringdist(strings[1,], strings[2,], method = "qgram", q = 1)
Set([(1,1,3)
(4,5,1)
(6,6,1)
])
@test matching_blocks("dwayne", "duane") ==
Set([(5,4,2)
(1,1,1)
(3,3,1)])
@test matching_blocks("dixon", "dicksonx") ==
Set([(1,1,2)
(4,6,2)
])
@test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154
@test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579
@test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666
@test_approx_eq evaluate(RatcliffObershelp(), "", "pencilvaneya") 1.0
@test_approx_eq evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963
@test_approx_eq evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") 0.3913043478260869
@test_approx_eq evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") 0.24137931034482762