more examples

2015-11-05 10:51:32 -05:00 · 2015-11-05 10:51:32 -05:00 · 44b05e3db9
parent 99d77a585b
commit 44b05e3db9
6 changed files with 101 additions and 27 deletions
--- a/README.md
+++ b/README.md
@ -9,25 +9,23 @@ This Julia package computes various distances between strings.
 ## Distances

 #### Edit Distances
- Hamming Distance
- Jaro Distance
- Levenshtein Distance
- Damerau-Levenshtein Distance
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) (similar to the Python library [difflib](https://docs.python.org/2/library/difflib.html))
+- [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance)
+- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
+- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)

 #### Q-Grams Distances
+Q-gram distances compare the set of all substrings of length `q` in each string.
 - QGram Distance
- Cosine Distance
- Jaccard Distance
-
-A good reference for q-gram distances is the article written for the R package `stringdist`:
-*The stringdist Package for Approximate String Matching* Mark P.J. van der Loo
+- [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity)
+- [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index)
+- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient)
+- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)

+#### Others
+- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)
+- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) is based on the length of matching subsequences. It is used in the Python library [difflib](https://docs.python.org/2/library/difflib.html).

 ## Syntax
-
-
-
 #### evaluate
 The function `evaluate` returns the literal distance between two strings (a value of 0 being identical). While some distances are bounded by 1, other distances like `Hamming`, `Levenshtein`, `Damerau-Levenshtein`, and `Jaccard` can be higher than 1.

@ -52,7 +50,7 @@ compare(QGram(2), "martha", "marhta")

 ## Modifiers

-The package defines a number of types to modify string metrics:
+The package defines a number of ways to modify string metrics:

 - [Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) boosts the similarity score of strings with common prefixes

@ -76,13 +74,17 @@ The package defines a number of types to modify string metrics:
 	- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in string lengths. The function returns the maximal similarity score between the shorter string and all substrings of the longer string. 	

 		```julia
-		compare(Partial(Hamming()), "New York Yankees", "Yankees")
+		compare(Levenshtein(), "New York Yankees", "Yankees")
+		#> 0.4375
+		compare(Partial(Levenshtein()), "New York Yankees", "Yankees")
 		#> 1.0
 		```

 	- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word order by reordering words alphabetically.

 		```julia
+		compare(RatcliffObershelp(), "mariners vs angels", "angels vs mariners")
+		#> 0.44444
 		compare(TokenSort(RatcliffObershelp()),"mariners vs angels", "angels vs mariners")
 		#> 1.0
 		```
@ -90,7 +92,23 @@ The package defines a number of types to modify string metrics:
 	- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers.

 		```julia
-		compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "los angeles angels of anaheim at seattle mariners")
+		compare(Jaro(),"mariners vs angels", "los angeles angels at seattle mariners")
+		#> 0.559904
+		compare(TokenSet(Jaro()),"mariners vs angels", "los angeles angels at seattle mariners")
+		#> 0.944444
 		```


+You can compose multiple modifiers:
+```julia
+compare(Winkler(Partial(Jaro())),"mariners vs angels", "los angeles angels at seattle mariners")
+#> 0.7378917378917379
+compare(TokenSet(Partial(RatcliffObershelp())),"mariners vs angels", "los angeles angels at seattle mariners")
+#> 1.0
+```
+
+## References
+A good reference for some distances in this package is the article written for the R package `stringdist`:
+*The stringdist Package for Approximate String Matching* Mark P.J. van der Loo
+
+
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -20,6 +20,8 @@ Jaro,
 QGram,
 Cosine,
 Jaccard,
+SorensenDice,
+Overlap,
 longest_common_substring,
 matching_blocks,
 RatcliffObershelp,
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -107,10 +107,6 @@ function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString, len1::Int
 	return n
 end

-function qgram(s1::AbstractString, s2::AbstractString; q::Integer = 2)
-	evaluate(QGram(q), s1::AbstractString, s2::AbstractString)
-end
-
 ##############################################################################
 ##
 ## cosine 
@ -134,9 +130,7 @@ function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString, len1::In
 	return 1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
 end

-function cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2)
-	evaluate(Cosine(q), s1::AbstractString, s2::AbstractString)
-end
+

 ##############################################################################
 ##
@ -163,6 +157,50 @@ function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::I
 	return 1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
 end

-function jaccard(s1::AbstractString, s2::AbstractString; q::Integer = 2)
-	evaluate(Jaccard(q), s1::AbstractString, s2::AbstractString)
-end
+
+##############################################################################
+##
+## SorensenDice
+##
+## 1 - 2 * |intersect(Q(s1, q), Q(s2, q))| / (|Q(s1, q)| + |Q(s2, q)|)
+##############################################################################
+
+immutable SorensenDice{T <: Integer} <: AbstractQGram  # Sorensen-Dice q-gram distance; T is the integer type of q
+	q::T  # read as dist.q by evaluate(::SorensenDice, ...) as the q-gram length
+end
+SorensenDice() = SorensenDice(2)  # default constructor: q = 2
+
+function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)  # Sorensen-Dice distance between s1 and s2 for q = dist.q; returns a Float64
+	len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)  # len1 < q: return 0.0 for equal strings, 1.0 otherwise. NOTE(review): only len1 is checked — presumably callers pass the shorter string first; confirm
+	ndistinct1, ndistinct2, nintersect = 0, 0, 0
+	for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)  # PairIterator is defined outside this view; presumably yields, per distinct q-gram, its count in s1 and in s2 — confirm
+		ndistinct1 += n1 > 0  # counts pairs with a positive s1 entry
+		ndistinct2 += n2 > 0  # counts pairs with a positive s2 entry
+		nintersect += (n1 > 0) & (n2 > 0)  # counts pairs positive on both sides
+	end
+	return 1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)  # 1 - 2 * nintersect / (ndistinct1 + ndistinct2)
+end
+
+##############################################################################
+##
+## overlap
+##
+## 1 - |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q)|)
+##############################################################################
+
+immutable Overlap{T <: Integer} <: AbstractQGram  # Overlap q-gram distance; T is the integer type of q
+	q::T  # read as dist.q by evaluate(::Overlap, ...) as the q-gram length
+end
+Overlap() = Overlap(2)  # default constructor: q = 2
+
+function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)  # Overlap distance between s1 and s2 for q = dist.q; returns a Float64
+	len1 <= (dist.q - 1) && return convert(Float64, s1 != s2)  # len1 < q: return 0.0 for equal strings, 1.0 otherwise. NOTE(review): only len1 is checked — presumably callers pass the shorter string first; confirm
+	ndistinct1, ndistinct2, nintersect = 0, 0, 0
+	for (n1, n2) in PairIterator(s1, s2, len1, len2, dist.q)  # PairIterator is defined outside this view; presumably yields, per distinct q-gram, its count in s1 and in s2 — confirm
+		ndistinct1 += n1 > 0  # counts pairs with a positive s1 entry
+		ndistinct2 += n2 > 0  # counts pairs with a positive s2 entry
+		nintersect += (n1 > 0) & (n2 > 0)  # counts pairs positive on both sides
+	end
+	return 1.0 - nintersect / min(ndistinct1, ndistinct2)  # 1 - nintersect / min(ndistinct1, ndistinct2); a zero minimum would give 0/0 = NaN — TODO confirm the guard above rules that out
+end
+
--- a/src/modifiers/partial.jl
+++ b/src/modifiers/partial.jl
@ -31,7 +31,7 @@ function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::Abstr
    out = 0.0
    result = matching_blocks(s1, s2)
    for r in result
-        s2_start = max(1, r[2] - r[1] + 1)
+        s2_start = max(0, r[2] - r[1]) + 1
        s2_end = s2_start + len1 - 1
        i2_start =  chr2ind(s2, s2_start)
        i2_end = s2_end == len2 ? endof(s2) : (chr2ind(s2, s2_end + 1) - 1)
--- a/test/distances.jl
+++ b/test/distances.jl
@ -47,6 +47,17 @@ using StringDistances, Base.Test
@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4
@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4

+@test_approx_eq_eps evaluate(Jaccard(1), "", "abc") 1.0 1e-4
+@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4  # NOTE(review): identical to the existing assertion in the context lines just above — likely an accidental duplicate
+@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4  # NOTE(review): also identical to an existing assertion just above
+
+@test_approx_eq_eps evaluate(SorensenDice(1), "night", "nacht") 0.4 1e-4  # 3 shared characters out of 5 + 5 distinct: 1 - 2*3/10
+@test_approx_eq_eps evaluate(SorensenDice(2), "night", "nacht") 0.75 1e-4  # 1 shared bigram ("ht") out of 4 + 4 distinct: 1 - 2*1/8
+
+@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4  # 3 shared characters, min(5, 5) distinct: 1 - 3/5
+@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4  # 4 shared characters, min(6, 5) distinct: 1 - 4/5
+
+

 strings = [
 ("martha", "marhta"),
--- a/test/modifiers.jl
+++ b/test/modifiers.jl
@ -47,6 +47,11 @@ end
@test_approx_eq compare(Partial(RatcliffObershelp()), "New York Yankees",  "Yankees") 1.0
@test_approx_eq compare(Partial(RatcliffObershelp()), "New York Yankees",  "") 0.0

+s = "HSINCHUANG"  # common first argument for the Partial(RatcliffObershelp) cases below
+@test_approx_eq compare(Partial(RatcliffObershelp()), s, "SINJHUAN") 0.875
+@test_approx_eq compare(Partial(RatcliffObershelp()), s, "LSINJHUANG DISTRIC") 0.8
+@test_approx_eq compare(Partial(RatcliffObershelp()), s, "SINJHUANG DISTRICT") 0.8
+@test_approx_eq compare(Partial(RatcliffObershelp()), s,  "SINJHUANG") 0.8888888888888

@test_approx_eq compare(Partial(Hamming()), "New York Yankees",  "Yankees") 1
@test_approx_eq compare(Partial(Hamming()), "New York Yankees",  "") 1