separate set distances
parent
f1b5671a63
commit
e0f5c4f711
|
@ -111,10 +111,9 @@ The package defines a number of ways to modify string metrics:
|
|||
## Tips
|
||||
|
||||
- Each distance is tailored to a specific problem. Edit distances work well with local spelling errors, the Ratcliff-Obershelp distance works well with edited texts, the Jaro-Winkler distance was invented for short strings such as person names, and the QGram distances work well with strings composed of multiple words with fluctuating orderings.
|
||||
- Standardize strings before comparing them (lowercase, punctuation, whitespaces, accents, abbreviations...)
|
||||
- Most distances will perform poorly when comparing company or individual names, where each string is composed of multiple words.
|
||||
- Most distances perform poorly when comparing company or individual names, where each string is composed of multiple words.
|
||||
|
||||
- While word ordering is mostly irrelevant in this situation, edit distances heavily penalize different word orders. Instead, use either a distance robust to word order (like QGram distances), or compose a distance with `TokenSort`, which reorders the words alphabetically.
|
||||
- While word ordering is mostly irrelevant in this situation, edit distances heavily penalize different orderings. Instead, use either a distance robust to word order (like QGram distances), or compose a distance with `TokenSort`, which reorders the words alphabetically.
|
||||
|
||||
```julia
|
||||
compare(RatcliffObershelp(), "mariners vs angels", "angels vs mariners")
|
||||
|
@ -125,6 +124,7 @@ The package defines a number of ways to modify string metrics:
|
|||
#> 0.8125
|
||||
```
|
||||
- General words (like "bank", "company") may appear in one string but not the other. One solution is to abbreviate these common names first to diminish their importance (i.e. "bk", "co"). Another solution is to use something like the `Partial` or `TokenSet` modifiers.
|
||||
- Standardize strings before comparing them (lowercase, punctuation, whitespaces, accents, abbreviations...)
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -85,6 +85,17 @@ function PairIterator(s1::AbstractString, s2::AbstractString, len1::Integer, len
|
|||
sort2 = sort(QGramIterator(s2, len2, q))
|
||||
PairIterator(sort1, sort2)
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Evaluating a q-gram distance on strings calls evaluate on the space of q-grams
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
# String-level entry point shared by every AbstractQGram distance.
# Builds a PairIterator from s1 and s2 using dist.q and forwards it to the
# iterator-based evaluate(dist, iterator) method of the concrete distance.
# NOTE(review): len1 and len2 are presumably the lengths of s1 and s2 — confirm
# against the callers.
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
evaluate(dist, PairIterator(s1, s2, len1, len2, dist.q))
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## q-gram
|
||||
|
@ -99,9 +110,9 @@ immutable QGram{T <: Integer} <: AbstractQGram
|
|||
end
|
||||
QGram() = QGram(2)
|
||||
|
||||
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::QGram, setiterator)
|
||||
n = 0
|
||||
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
|
||||
for (n1, n2) in setiterator
|
||||
n += abs(n1 - n2)
|
||||
end
|
||||
return n
|
||||
|
@ -119,10 +130,9 @@ immutable Cosine{T <: Integer} <: AbstractQGram
|
|||
end
|
||||
Cosine() = Cosine(2)
|
||||
|
||||
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)
|
||||
function evaluate(dist::Cosine, setiterator)
|
||||
norm1, norm2, prodnorm = 0, 0, 0
|
||||
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
|
||||
for (n1, n2) in setiterator
|
||||
norm1 += n1^2
|
||||
norm2 += n2^2
|
||||
prodnorm += n1 * n2
|
||||
|
@ -146,10 +156,9 @@ immutable Jaccard{T <: Integer} <: AbstractQGram
|
|||
end
|
||||
Jaccard() = Jaccard(2)
|
||||
|
||||
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)
|
||||
function evaluate(dist::Jaccard, setiterator)
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
|
||||
for (n1, n2) in setiterator
|
||||
ndistinct1 += n1 > 0
|
||||
ndistinct2 += n2 > 0
|
||||
nintersect += (n1 > 0) & (n2 > 0)
|
||||
|
@ -157,7 +166,6 @@ function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::I
|
|||
return 1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
|
||||
end
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## SorensenDice
|
||||
|
@ -170,10 +178,9 @@ immutable SorensenDice{T <: Integer} <: AbstractQGram
|
|||
end
|
||||
SorensenDice() = SorensenDice(2)
|
||||
|
||||
function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)
|
||||
function evaluate(dist::SorensenDice, setiterator)
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
|
||||
for (n1, n2) in setiterator
|
||||
ndistinct1 += n1 > 0
|
||||
ndistinct2 += n2 > 0
|
||||
nintersect += (n1 > 0) & (n2 > 0)
|
||||
|
@ -193,10 +200,9 @@ immutable Overlap{T <: Integer} <: AbstractQGram
|
|||
end
|
||||
Overlap() = Overlap(2)
|
||||
|
||||
function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)
|
||||
function evaluate(dist::Overlap, setiterator)
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)
|
||||
for (n1, n2) in setiterator
|
||||
ndistinct1 += n1 > 0
|
||||
ndistinct2 += n2 > 0
|
||||
nintersect += (n1 > 0) & (n2 > 0)
|
||||
|
|
|
@ -13,24 +13,30 @@ function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
|
|||
end
|
||||
end
|
||||
|
||||
|
||||
|
||||
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
|
||||
len1::Integer, len2::Integer)
|
||||
1.0 - evaluate(dist, s1, s2, len1, len2)
|
||||
end
|
||||
|
||||
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein}, s1::AbstractString, s2::AbstractString,
|
||||
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
|
||||
s1::AbstractString, s2::AbstractString,
|
||||
len1::Integer, len2::Integer)
|
||||
distance = evaluate(dist, s1, s2, len1, len2)
|
||||
return len2 == 0 ? 1.0 : 1.0 - distance / len2
|
||||
len2 == 0 ? 1.0 : 1.0 - distance / len2
|
||||
end
|
||||
|
||||
function compare(dist::QGram, s1::AbstractString, s2::AbstractString,
|
||||
# While the q-gram distance definitions are not modified for strings shorter than q (the q-gram set is just considered empty, which leads to NaN values), compare always returns a Float64 value between 0 and 1.
|
||||
function compare(dist::AbstractQGram,
|
||||
s1::AbstractString, s2::AbstractString,
|
||||
len1::Integer, len2::Integer)
|
||||
distance = evaluate(dist, s1, s2, len1, len2)
|
||||
if len1 <= (dist.q - 1)
|
||||
return s1 == s2 ? 1.0 : 0.0
|
||||
else
|
||||
return 1 - distance / (len1 + len2 - 2 * dist.q + 2)
|
||||
end
|
||||
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
||||
evaluate(dist, s1, s2, len1, len2)
|
||||
end
|
||||
|
||||
# Similarity score derived from the QGram distance.
# When len1 <= dist.q - 1, the result is 1.0 if s1 == s2 and 0.0 otherwise,
# and evaluate is not called. Otherwise the distance returned by evaluate is
# divided by len1 + len2 - 2 * dist.q + 2 and the quotient is subtracted from 1.
# NOTE(review): len1 and len2 are presumably the lengths of s1 and s2, in which
# case the divisor equals (len1 - q + 1) + (len2 - q + 1), i.e. the total number
# of q-grams in both strings — confirm against the callers.
function compare(dist::QGram,
|
||||
s1::AbstractString, s2::AbstractString,
|
||||
len1::Integer, len2::Integer)
|
||||
# Short-string guard: only len1 is tested here.
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
||||
distance = evaluate(dist, s1, s2, len1, len2)
|
||||
# Rescale the raw distance and turn it into a similarity.
1 - distance / (len1 + len2 - 2 * dist.q + 2)
|
||||
end
|
|
@ -37,28 +37,35 @@ using StringDistances, Base.Test
|
|||
@test evaluate(QGram(1), "", "abc") == 3
|
||||
@test evaluate(QGram(1), "abc", "cba") == 0
|
||||
@test evaluate(QGram(1), "abc", "ccc") == 4
|
||||
|
||||
@test_approx_eq_eps evaluate(Cosine(2), "", "abc") 1 1e-4
|
||||
@test isnan(evaluate(Cosine(2), "", "abc"))
|
||||
@test_approx_eq_eps evaluate(Cosine(2), "abc", "ccc") 1 1e-4
|
||||
@test_approx_eq_eps evaluate(Cosine(2), "leia", "leela") 0.7113249 1e-4
|
||||
|
||||
|
||||
@test_approx_eq_eps evaluate(Jaccard(1), "", "abc") 1.0 1e-4
|
||||
@test_approx_eq evaluate(Jaccard(1), "", "abc") 1.0
|
||||
@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4
|
||||
@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4
|
||||
|
||||
@test_approx_eq_eps evaluate(Jaccard(1), "", "abc") 1.0 1e-4
|
||||
@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4
|
||||
@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4
|
||||
|
||||
@test_approx_eq_eps evaluate(SorensenDice(1), "night", "nacht") 0.4 1e-4
|
||||
@test_approx_eq_eps evaluate(SorensenDice(2), "night", "nacht") 0.75 1e-4
|
||||
|
||||
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
|
||||
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
|
||||
|
||||
|
||||
|
||||
Set([(1,1,3) (4,5,1) (6,6,1)])
|
||||
@test matching_blocks("dwayne", "duane") ==
|
||||
Set([(5,4,2) (1,1,1) (3,3,1)])
|
||||
@test matching_blocks("dixon", "dicksonx") ==
|
||||
Set([(1,1,2) (4,6,2)])
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "", "pencilvaneya") 1.0
|
||||
@test_approx_eq evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") 0.3913043478260869
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") 0.24137931034482762
|
||||
|
||||
|
||||
|
||||
|
||||
strings = [
|
||||
("martha", "marhta"),
|
||||
("dwayne", "duane") ,
|
||||
|
@ -85,9 +92,9 @@ for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
|
|||
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.24722222 0.16190476 0.48809524 0.49166667 0.07407407 0.16666667 0.21666667]),
|
||||
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
|
||||
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
|
||||
(Jaccard(1), [0.0000000 0.4285714 0.3750000 0.1666667 1.0 0.0000000 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0000000 0.0000000 0.2500000]),
|
||||
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 0.0 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
|
||||
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 1.0 0.0 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
|
||||
(Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]),
|
||||
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
|
||||
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
|
||||
t, solution = x
|
||||
for i in 1:length(solution)
|
||||
@test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4
|
||||
|
@ -124,25 +131,3 @@ stringdist(strings[1,], strings[2,], method = "qgram", q = 1)
|
|||
|
||||
|
||||
|
||||
Set([(1,1,3)
|
||||
(4,5,1)
|
||||
(6,6,1)
|
||||
])
|
||||
@test matching_blocks("dwayne", "duane") ==
|
||||
Set([(5,4,2)
|
||||
(1,1,1)
|
||||
(3,3,1)])
|
||||
@test matching_blocks("dixon", "dicksonx") ==
|
||||
Set([(1,1,2)
|
||||
(4,6,2)
|
||||
])
|
||||
|
||||
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "", "pencilvaneya") 1.0
|
||||
@test_approx_eq evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") 0.3913043478260869
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") 0.24137931034482762
|
||||
|
||||
|
|
Loading…
Reference in New Issue