separate set distances
parent
f1b5671a63
commit
e0f5c4f711
|
@ -111,10 +111,9 @@ The package defines a number of ways to modify string metrics:
|
||||||
## Tips
|
## Tips
|
||||||
|
|
||||||
- Each distance is tailored to a specific problem. Edit distances work well with local spelling errors, the Ratcliff-Obershelp distance works well with edited texts, the Jaro-Winkler distance was invented for short strings such as person names, and the QGram distances work well with strings composed of multiple words with fluctuating orderings.
|
- Each distance is tailored to a specific problem. Edit distances work well with local spelling errors, the Ratcliff-Obershelp distance works well with edited texts, the Jaro-Winkler distance was invented for short strings such as person names, and the QGram distances work well with strings composed of multiple words with fluctuating orderings.
|
||||||
- Standardize strings before comparing them (lowercase, punctuation, whitespaces, accents, abbreviations...)
|
- Most distances perform poorly when comparing company or individual names, where each string is composed of multiple words.
|
||||||
- Most distances will perform poorly when comparing company or individual names, where each string is composed of multiple words.
|
|
||||||
|
|
||||||
- While word ordering is mostly irrelevant in this situation, edit distances heavily penalize different word orders. Instead, use either a distance robust to word order (like QGram distances), or compose a distance with `TokenSort`, which reorders the words alphabetically.
|
- While word ordering is mostly irrelevant in this situation, edit distances heavily penalize different orderings. Instead, use either a distance robust to word order (like QGram distances), or compose a distance with `TokenSort`, which reorders the words alphabetically.
|
||||||
|
|
||||||
```julia
|
```julia
|
||||||
compare(RatcliffObershelp(), "mariners vs angels", "angels vs mariners")
|
compare(RatcliffObershelp(), "mariners vs angels", "angels vs mariners")
|
||||||
|
@ -125,6 +124,7 @@ The package defines a number of ways to modify string metrics:
|
||||||
#> 0.8125
|
#> 0.8125
|
||||||
```
|
```
|
||||||
- General words (like "bank", "company") may appear in one string but not the other. One solution is to abbreviate these common names first to diminish their importance (i.e. "bk", "co"). Another solution is to use something like the `Partial` or `TokenSet` modifiers.
|
- General words (like "bank", "company") may appear in one string but not the other. One solution is to abbreviate these common names first to diminish their importance (i.e. "bk", "co"). Another solution is to use something like the `Partial` or `TokenSet` modifiers.
|
||||||
|
- Standardize strings before comparing them (lowercase, punctuation, whitespaces, accents, abbreviations...)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -85,6 +85,17 @@ function PairIterator(s1::AbstractString, s2::AbstractString, len1::Integer, len
|
||||||
sort2 = sort(QGramIterator(s2, len2, q))
|
sort2 = sort(QGramIterator(s2, len2, q))
|
||||||
PairIterator(sort1, sort2)
|
PairIterator(sort1, sort2)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
##
|
||||||
|
## Evaluating a q-gram distance on strings calls evaluate on the space of q-grams
|
||||||
|
##
|
||||||
|
##############################################################################
|
||||||
|
|
||||||
|
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||||
|
evaluate(dist, PairIterator(s1, s2, len1, len2, dist.q))
|
||||||
|
end
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## q-gram
|
## q-gram
|
||||||
|
@ -99,9 +110,9 @@ immutable QGram{T <: Integer} <: AbstractQGram
|
||||||
end
|
end
|
||||||
QGram() = QGram(2)
|
QGram() = QGram(2)
|
||||||
|
|
||||||
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
function evaluate(dist::QGram, setiterator)
|
||||||
n = 0
|
n = 0
|
||||||
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
|
for (n1, n2) in setiterator
|
||||||
n += abs(n1 - n2)
|
n += abs(n1 - n2)
|
||||||
end
|
end
|
||||||
return n
|
return n
|
||||||
|
@ -119,10 +130,9 @@ immutable Cosine{T <: Integer} <: AbstractQGram
|
||||||
end
|
end
|
||||||
Cosine() = Cosine(2)
|
Cosine() = Cosine(2)
|
||||||
|
|
||||||
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
function evaluate(dist::Cosine, setiterator)
|
||||||
len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)
|
|
||||||
norm1, norm2, prodnorm = 0, 0, 0
|
norm1, norm2, prodnorm = 0, 0, 0
|
||||||
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
|
for (n1, n2) in setiterator
|
||||||
norm1 += n1^2
|
norm1 += n1^2
|
||||||
norm2 += n2^2
|
norm2 += n2^2
|
||||||
prodnorm += n1 * n2
|
prodnorm += n1 * n2
|
||||||
|
@ -146,10 +156,9 @@ immutable Jaccard{T <: Integer} <: AbstractQGram
|
||||||
end
|
end
|
||||||
Jaccard() = Jaccard(2)
|
Jaccard() = Jaccard(2)
|
||||||
|
|
||||||
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
function evaluate(dist::Jaccard, setiterator)
|
||||||
len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)
|
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
|
for (n1, n2) in setiterator
|
||||||
ndistinct1 += n1 > 0
|
ndistinct1 += n1 > 0
|
||||||
ndistinct2 += n2 > 0
|
ndistinct2 += n2 > 0
|
||||||
nintersect += (n1 > 0) & (n2 > 0)
|
nintersect += (n1 > 0) & (n2 > 0)
|
||||||
|
@ -157,7 +166,6 @@ function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::I
|
||||||
return 1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
|
return 1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## SorensenDice
|
## SorensenDice
|
||||||
|
@ -170,10 +178,9 @@ immutable SorensenDice{T <: Integer} <: AbstractQGram
|
||||||
end
|
end
|
||||||
SorensenDice() = SorensenDice(2)
|
SorensenDice() = SorensenDice(2)
|
||||||
|
|
||||||
function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
function evaluate(dist::SorensenDice, setiterator)
|
||||||
len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)
|
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
|
for (n1, n2) in setiterator
|
||||||
ndistinct1 += n1 > 0
|
ndistinct1 += n1 > 0
|
||||||
ndistinct2 += n2 > 0
|
ndistinct2 += n2 > 0
|
||||||
nintersect += (n1 > 0) & (n2 > 0)
|
nintersect += (n1 > 0) & (n2 > 0)
|
||||||
|
@ -193,10 +200,9 @@ immutable Overlap{T <: Integer} <: AbstractQGram
|
||||||
end
|
end
|
||||||
Overlap() = Overlap(2)
|
Overlap() = Overlap(2)
|
||||||
|
|
||||||
function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
function evaluate(dist::Overlap, setiterator)
|
||||||
len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)
|
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
|
for (n1, n2) in setiterator
|
||||||
ndistinct1 += n1 > 0
|
ndistinct1 += n1 > 0
|
||||||
ndistinct2 += n2 > 0
|
ndistinct2 += n2 > 0
|
||||||
nintersect += (n1 > 0) & (n2 > 0)
|
nintersect += (n1 > 0) & (n2 > 0)
|
||||||
|
|
|
@ -13,24 +13,30 @@ function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
|
||||||
|
len1::Integer, len2::Integer)
|
||||||
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
|
||||||
1.0 - evaluate(dist, s1, s2, len1, len2)
|
1.0 - evaluate(dist, s1, s2, len1, len2)
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein}, s1::AbstractString, s2::AbstractString,
|
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
|
||||||
|
s1::AbstractString, s2::AbstractString,
|
||||||
len1::Integer, len2::Integer)
|
len1::Integer, len2::Integer)
|
||||||
distance = evaluate(dist, s1, s2, len1, len2)
|
distance = evaluate(dist, s1, s2, len1, len2)
|
||||||
return len2 == 0 ? 1.0 : 1.0 - distance / len2
|
len2 == 0 ? 1.0 : 1.0 - distance / len2
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(dist::QGram, s1::AbstractString, s2::AbstractString,
|
# While the q-gram distance definitions are not modified for strings shorter than q (the q-gram set is just considered empty, which leads to NaN values), compare always returns a Float64 value between 0 and 1
|
||||||
|
function compare(dist::AbstractQGram,
|
||||||
|
s1::AbstractString, s2::AbstractString,
|
||||||
len1::Integer, len2::Integer)
|
len1::Integer, len2::Integer)
|
||||||
distance = evaluate(dist, s1, s2, len1, len2)
|
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
||||||
if len1 <= (dist.q - 1)
|
evaluate(dist, s1, s2, len1, len2)
|
||||||
return s1 == s2 ? 1.0 : 0.0
|
end
|
||||||
else
|
|
||||||
return 1 - distance / (len1 + len2 - 2 * dist.q + 2)
|
function compare(dist::QGram,
|
||||||
end
|
s1::AbstractString, s2::AbstractString,
|
||||||
|
len1::Integer, len2::Integer)
|
||||||
|
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
||||||
|
distance = evaluate(dist, s1, s2, len1, len2)
|
||||||
|
1 - distance / (len1 + len2 - 2 * dist.q + 2)
|
||||||
end
|
end
|
|
@ -37,28 +37,35 @@ using StringDistances, Base.Test
|
||||||
@test evaluate(QGram(1), "", "abc") == 3
|
@test evaluate(QGram(1), "", "abc") == 3
|
||||||
@test evaluate(QGram(1), "abc", "cba") == 0
|
@test evaluate(QGram(1), "abc", "cba") == 0
|
||||||
@test evaluate(QGram(1), "abc", "ccc") == 4
|
@test evaluate(QGram(1), "abc", "ccc") == 4
|
||||||
|
@test isnan(evaluate(Cosine(2), "", "abc"))
|
||||||
@test_approx_eq_eps evaluate(Cosine(2), "", "abc") 1 1e-4
|
|
||||||
@test_approx_eq_eps evaluate(Cosine(2), "abc", "ccc") 1 1e-4
|
@test_approx_eq_eps evaluate(Cosine(2), "abc", "ccc") 1 1e-4
|
||||||
@test_approx_eq_eps evaluate(Cosine(2), "leia", "leela") 0.7113249 1e-4
|
@test_approx_eq_eps evaluate(Cosine(2), "leia", "leela") 0.7113249 1e-4
|
||||||
|
@test_approx_eq evaluate(Jaccard(1), "", "abc") 1.0
|
||||||
|
|
||||||
@test_approx_eq_eps evaluate(Jaccard(1), "", "abc") 1.0 1e-4
|
|
||||||
@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4
|
@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4
|
||||||
@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4
|
@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4
|
||||||
|
|
||||||
@test_approx_eq_eps evaluate(Jaccard(1), "", "abc") 1.0 1e-4
|
|
||||||
@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4
|
|
||||||
@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4
|
|
||||||
|
|
||||||
@test_approx_eq_eps evaluate(SorensenDice(1), "night", "nacht") 0.4 1e-4
|
@test_approx_eq_eps evaluate(SorensenDice(1), "night", "nacht") 0.4 1e-4
|
||||||
@test_approx_eq_eps evaluate(SorensenDice(2), "night", "nacht") 0.75 1e-4
|
@test_approx_eq_eps evaluate(SorensenDice(2), "night", "nacht") 0.75 1e-4
|
||||||
|
|
||||||
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
|
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
|
||||||
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
|
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Set([(1,1,3) (4,5,1) (6,6,1)])
|
||||||
|
@test matching_blocks("dwayne", "duane") ==
|
||||||
|
Set([(5,4,2) (1,1,1) (3,3,1)])
|
||||||
|
@test matching_blocks("dixon", "dicksonx") ==
|
||||||
|
Set([(1,1,2) (4,6,2)])
|
||||||
|
@test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154
|
||||||
|
@test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579
|
||||||
|
@test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666
|
||||||
|
@test_approx_eq evaluate(RatcliffObershelp(), "", "pencilvaneya") 1.0
|
||||||
|
@test_approx_eq evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963
|
||||||
|
@test_approx_eq evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") 0.3913043478260869
|
||||||
|
@test_approx_eq evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") 0.24137931034482762
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
strings = [
|
strings = [
|
||||||
("martha", "marhta"),
|
("martha", "marhta"),
|
||||||
("dwayne", "duane") ,
|
("dwayne", "duane") ,
|
||||||
|
@ -85,9 +92,9 @@ for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
|
||||||
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.24722222 0.16190476 0.48809524 0.49166667 0.07407407 0.16666667 0.21666667]),
|
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.24722222 0.16190476 0.48809524 0.49166667 0.07407407 0.16666667 0.21666667]),
|
||||||
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
|
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
|
||||||
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
|
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
|
||||||
(Jaccard(1), [0.0000000 0.4285714 0.3750000 0.1666667 1.0 0.0000000 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0000000 0.0000000 0.2500000]),
|
(Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]),
|
||||||
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 0.0 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
|
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
|
||||||
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 1.0 0.0 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
|
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
|
||||||
t, solution = x
|
t, solution = x
|
||||||
for i in 1:length(solution)
|
for i in 1:length(solution)
|
||||||
@test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4
|
@test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4
|
||||||
|
@ -124,25 +131,3 @@ stringdist(strings[1,], strings[2,], method = "qgram", q = 1)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Set([(1,1,3)
|
|
||||||
(4,5,1)
|
|
||||||
(6,6,1)
|
|
||||||
])
|
|
||||||
@test matching_blocks("dwayne", "duane") ==
|
|
||||||
Set([(5,4,2)
|
|
||||||
(1,1,1)
|
|
||||||
(3,3,1)])
|
|
||||||
@test matching_blocks("dixon", "dicksonx") ==
|
|
||||||
Set([(1,1,2)
|
|
||||||
(4,6,2)
|
|
||||||
])
|
|
||||||
|
|
||||||
|
|
||||||
@test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154
|
|
||||||
@test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579
|
|
||||||
@test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666
|
|
||||||
@test_approx_eq evaluate(RatcliffObershelp(), "", "pencilvaneya") 1.0
|
|
||||||
@test_approx_eq evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963
|
|
||||||
@test_approx_eq evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") 0.3913043478260869
|
|
||||||
@test_approx_eq evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") 0.24137931034482762
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue