more examples

pull/3/head
matthieugomez 2015-11-05 10:51:32 -05:00
parent 99d77a585b
commit 44b05e3db9
6 changed files with 101 additions and 27 deletions

View File

@ -9,25 +9,23 @@ This Julia package computes various distances between strings.
## Distances
#### Edit Distances
- Hamming Distance
- Jaro Distance
- Levenshtein Distance
- Damerau-Levenshtein Distance
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) (similar to the Python library [difflib](https://docs.python.org/2/library/difflib.html))
- [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance)
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
#### Q-Grams Distances
Q-gram distances compare the set of all substrings of length `q` in each string.
- QGram Distance
- Cosine Distance
- Jaccard Distance
A good reference for q-gram distances is the article written for the R package `stringdist`:
*The stringdist Package for Approximate String Matching* Mark P.J. van der Loo
- [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity)
- [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index)
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient)
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
#### Others
- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) is based on the length of matching subsequences. It is used in the Python library [difflib](https://docs.python.org/2/library/difflib.html).
## Syntax
#### evaluate
The function `evaluate` returns the literal distance between two strings (a value of 0 being identical). While some distances are bounded by 1, other distances like `Hamming`, `Levenshtein`, `Damerau-Levenshtein`, `QGram` can be higher than 1.
@ -52,7 +50,7 @@ compare(QGram(2), "martha", "marhta")
## Modifiers
The package defines a number of types to modify string metrics:
The package defines a number of ways to modify string metrics:
- [Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) boosts the similarity score of strings with common prefixes
@ -76,13 +74,17 @@ The package defines a number of types to modify string metrics:
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in string lengths. The function returns the maximal similarity score between the shorter string and all substrings of the longer string.
```julia
compare(Partial(Hamming()), "New York Yankees", "Yankees")
compare(Levenshtein(), "New York Yankees", "Yankees")
#> 0.4375
compare(Partial(Levenshtein()), "New York Yankees", "Yankees")
#> 1.0
```
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word order by reordering words alphabetically.
```julia
compare(RatcliffObershelp(), "mariners vs angels", "angels vs mariners")
#> 0.44444
compare(TokenSort(RatcliffObershelp()),"mariners vs angels", "angels vs mariners")
#> 1.0
```
@ -90,7 +92,23 @@ The package defines a number of types to modify string metrics:
- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers.
```julia
compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "los angeles angels of anaheim at seattle mariners")
compare(Jaro(),"mariners vs angels", "los angeles angels at seattle mariners")
#> 0.559904
compare(TokenSet(Jaro()),"mariners vs angels", "los angeles angels at seattle mariners")
#> 0.944444
```
You can compose multiple modifiers:
```julia
compare(Winkler(Partial(Jaro())),"mariners vs angels", "los angeles angels at seattle mariners")
#> 0.7378917378917379
compare(TokenSet(Partial(RatcliffObershelp())),"mariners vs angels", "los angeles angels at seattle mariners")
#> 1.0
```
## References
A good reference for some distances in this package is the article written for the R package `stringdist`:
*The stringdist Package for Approximate String Matching* Mark P.J. van der Loo

View File

@ -20,6 +20,8 @@ Jaro,
QGram,
Cosine,
Jaccard,
SorensenDice,
Overlap,
longest_common_substring,
matching_blocks,
RatcliffObershelp,

View File

@ -107,10 +107,6 @@ function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString, len1::Int
return n
end
# Convenience wrapper around `evaluate` for the `QGram` distance.
# `q` is forwarded to the `QGram` constructor (default 2).
function qgram(s1::AbstractString, s2::AbstractString; q::Integer = 2)
    dist = QGram(q)
    # The arguments are already constrained to AbstractString by the signature,
    # so no further type assertion is needed at the call site.
    return evaluate(dist, s1, s2)
end
##############################################################################
##
## cosine
@ -134,9 +130,7 @@ function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString, len1::In
return 1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
end
# Convenience wrapper around `evaluate` for the `Cosine` distance.
# `q` is forwarded to the `Cosine` constructor (default 2).
function cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2)
    dist = Cosine(q)
    return evaluate(dist, s1, s2)
end
##############################################################################
##
@ -163,6 +157,50 @@ function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::I
return 1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
end
# Convenience wrapper around `evaluate` for the `Jaccard` distance.
# `q` is forwarded to the `Jaccard` constructor (default 2).
function jaccard(s1::AbstractString, s2::AbstractString; q::Integer = 2)
    dist = Jaccard(q)
    return evaluate(dist, s1, s2)
end
##############################################################################
##
## SorensenDice
##
## 1 - 2 * |intersect(Q(s1, q), Q(s2, q))| / (|Q(s1, q)| + |Q(s2, q)|)
##############################################################################
# Sorensen-Dice q-gram distance. The field `q` is the q-gram length and may be
# of any Integer type.
immutable SorensenDice{I <: Integer} <: AbstractQGram
    q::I
end

# Zero-argument constructor: defaults to q = 2.
function SorensenDice()
    return SorensenDice(2)
end
# Sorensen-Dice distance between `s1` and `s2`:
#     1 - 2 * (#shared q-grams) / (#distinct q-grams in s1 + #distinct q-grams in s2)
#
# `len1` / `len2` are supplied by the caller; presumably the character lengths
# of `s1` / `s2` (TODO confirm against the calling method).
# Returns a Float64.
function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
    # Fewer than `q` characters available: fall back to plain (in)equality,
    # i.e. 0.0 for equal strings and 1.0 otherwise.
    if len1 <= (dist.q - 1)
        return convert(Float64, s1 != s2)
    end

    distinct1 = 0
    distinct2 = 0
    shared = 0
    # Each pair (c1, c2) is tested with `> 0`; presumably these are the
    # occurrence counts of one q-gram in s1 and s2 (verify against PairIterator).
    for (c1, c2) in PairIterator(s1, s2, len1, len2, dist.q)
        in1 = c1 > 0
        in2 = c2 > 0
        distinct1 += in1
        distinct2 += in2
        shared += in1 & in2
    end
    return 1.0 - 2.0 * shared / (distinct1 + distinct2)
end
##############################################################################
##
## overlap
##
## 1 - |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q)|)
##############################################################################
# Overlap q-gram distance. The field `q` is the q-gram length and may be of any
# Integer type.
immutable Overlap{I <: Integer} <: AbstractQGram
    q::I
end

# Zero-argument constructor: defaults to q = 2.
function Overlap()
    return Overlap(2)
end
# Overlap distance between `s1` and `s2`:
#     1 - (#shared q-grams) / min(#distinct q-grams in s1, #distinct q-grams in s2)
#
# `len1` / `len2` are supplied by the caller; presumably the character lengths
# of `s1` / `s2` (TODO confirm against the calling method).
# Returns a Float64.
function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
    # Fewer than `q` characters available: fall back to plain (in)equality,
    # i.e. 0.0 for equal strings and 1.0 otherwise.
    if len1 <= (dist.q - 1)
        return convert(Float64, s1 != s2)
    end

    distinct1 = 0
    distinct2 = 0
    shared = 0
    # Each pair (c1, c2) is tested with `> 0`; presumably these are the
    # occurrence counts of one q-gram in s1 and s2 (verify against PairIterator).
    for (c1, c2) in PairIterator(s1, s2, len1, len2, dist.q)
        in1 = c1 > 0
        in2 = c2 > 0
        distinct1 += in1
        distinct2 += in2
        shared += in1 & in2
    end
    return 1.0 - shared / min(distinct1, distinct2)
end

View File

@ -31,7 +31,7 @@ function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::Abstr
out = 0.0
result = matching_blocks(s1, s2)
for r in result
s2_start = max(1, r[2] - r[1] + 1)
s2_start = max(0, r[2] - r[1]) + 1
s2_end = s2_start + len1 - 1
i2_start = chr2ind(s2, s2_start)
i2_end = s2_end == len2 ? endof(s2) : (chr2ind(s2, s2_end + 1) - 1)

View File

@ -47,6 +47,17 @@ using StringDistances, Base.Test
# Jaccard distance on 1-grams and 2-grams, plus an empty-string case; each
# check uses an absolute tolerance of 1e-4.
@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4
@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4
@test_approx_eq_eps evaluate(Jaccard(1), "", "abc") 1.0 1e-4
# NOTE(review): the next two checks repeat the first two Jaccard assertions above verbatim.
@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4
@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4
# Sorensen-Dice distance on 1-grams and 2-grams of the same string pair.
@test_approx_eq_eps evaluate(SorensenDice(1), "night", "nacht") 0.4 1e-4
@test_approx_eq_eps evaluate(SorensenDice(2), "night", "nacht") 0.75 1e-4
# Overlap distance on 1-grams.
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
strings = [
("martha", "marhta"),

View File

@ -47,6 +47,11 @@ end
# Partial(RatcliffObershelp): a second string contained in the first scores 1.0;
# an empty second string scores 0.0.
@test_approx_eq compare(Partial(RatcliffObershelp()), "New York Yankees", "Yankees") 1.0
@test_approx_eq compare(Partial(RatcliffObershelp()), "New York Yankees", "") 0.0
# Further Partial(RatcliffObershelp) cases against one fixed 10-character string,
# with second strings both shorter and longer than it.
s = "HSINCHUANG"
@test_approx_eq compare(Partial(RatcliffObershelp()), s, "SINJHUAN") 0.875
@test_approx_eq compare(Partial(RatcliffObershelp()), s, "LSINJHUANG DISTRIC") 0.8
@test_approx_eq compare(Partial(RatcliffObershelp()), s, "SINJHUANG DISTRICT") 0.8
@test_approx_eq compare(Partial(RatcliffObershelp()), s, "SINJHUANG") 0.8888888888888
# Partial(Hamming): both the contained-substring case and the empty-string case
# are expected to score 1.
@test_approx_eq compare(Partial(Hamming()), "New York Yankees", "Yankees") 1
@test_approx_eq compare(Partial(Hamming()), "New York Yankees", "") 1