diff --git a/README.md b/README.md index f80ddbb..f686be2 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ # StringDistances - [x] Hamming Distance -- [x] Jaro Distance and Jaro-Winkler Distance +- [x] Jaro-Winkler Distance - [x] Levenshtein Distance - [x] Damerau-Levenshtein Distance - [x] Qgram Distance diff --git a/src/StringDistances.jl b/src/StringDistances.jl index ee9ea67..bc5056c 100644 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -12,8 +12,8 @@ import Distances: evaluate, Hamming, hamming export evaluate, Hamming, hamming, Levenshtein, levenshtein, -JaroWinkler, jaro_winkler, jaro, DamerauLevenshtein, damerau_levenshtein, +JaroWinkler, jaro_winkler, QGram, qgram, Cosine, cosine, Jaccard, jaccard diff --git a/src/edit_distances.jl b/src/edit_distances.jl index d3dd1f7..cd24d38 100644 --- a/src/edit_distances.jl +++ b/src/edit_distances.jl @@ -32,13 +32,15 @@ hamming(s1::AbstractString, s2::AbstractString) = evaluate(Hamming(), s1, s2) function common_prefix(s1::AbstractString, s2::AbstractString) start1 = start(s1) start2 = start(s2) + k = 0 while !done(s1, start1) ch1, nextstart1 = next(s1, start1) ch2, nextstart2 = next(s2, start2) ch1 != ch2 && break + k += 1 start1, start2 = nextstart1, nextstart2 end - return start1, start2 + return k, start1, start2 end type Levenshtein end @@ -47,13 +49,13 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString) len1 > len2 && return evaluate(dist, s2, s1) len2 == 0 && return 0 - start1, start2 = common_prefix(s1, s2) - done(s1, start1) && return len2 + k, start1, start2 = common_prefix(s1, s2) + done(s1, start1) && return len2 - k # distance initialized to first row of matrix # => distance between "" and s2[1:i} - v0 = Array(Int, len2) - @inbounds for i2 in 1:len2 + v0 = Array(Int, len2 - k) + @inbounds for i2 in 1:(len2 - k) v0[i2] = i2 end current = zero(0) @@ -93,14 +95,14 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri len1 > len2 && return evaluate(dist, s2, s1) len2 == 0 && return 0 - start1, start2 = common_prefix(s1, s2) - done(s1, start1) && return len2 + k, start1, start2 = common_prefix(s1, s2) + done(s1, start1) && return len2 - k - v0 = Array(Int, len2) - @inbounds for i2 in 1:len2 + v0 = Array(Int, len2 - k) + @inbounds for i2 in 1:(len2 - k) v0[i2] = i2 end - v2 = Array(Int, len2) + v2 = Array(Int, len2 - k) ch1, = next(s1, start1) current = 0 @@ -158,12 +160,12 @@ damerau_levenshtein(s1::AbstractString, s2::AbstractString) = evaluate(DamerauLe ## ############################################################################## -type JaroWinkler{T1 <: Real, T2 <: Real, T3 <: Integer} +type JaroWinkler{T1 <: Real, T2 <: Real, T3 <: Real} scaling_factor::T1 # scaling factor. Default to 0.1 boosting_threshold::T2 # boost threshold. Default to 0.7 long_threshold::T3 # long string adjustment. Default to 5 end -JaroWinkler() = JaroWinkler(0.1, 0.7, 5) +JaroWinkler() = JaroWinkler(0.1, 0.25, 5) function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString) len1, len2 = length(s1), length(s2) @@ -234,6 +236,6 @@ function jaro_winkler(s1::AbstractString, s2::AbstractString; evaluate(JaroWinkler(scaling_factor, boosting_threshold, long_threshold), s1, s2) end -jaro(s1::AbstractString, s2::AbstractString) = evaluate(JaroWinkler(0.0, 0.0, 0), s1, s2) +jaro(s1::AbstractString, s2::AbstractString) = evaluate(JaroWinkler(0.1, 0.0, Inf), s1, s2) diff --git a/src/qgrams_distances.jl b/src/qgrams_distances.jl index bab0095..a990285 100644 --- a/src/qgrams_distances.jl +++ b/src/qgrams_distances.jl @@ -42,7 +42,7 @@ function Base.push!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv) return bag end -function Base.delete!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv) +function Base.delete!{Tv, Ti}(bag::Bag{Tv, Ti}, x) v = get(bag.dict, x, zero(Ti)) if v > zero(Ti) bag.dict[x] = v - one(Ti) @@ -120,7 +120,7 @@ function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString) numerator += v1 * get(bag2.dict, k, 0) end denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict))) - denominator == 0 ? 1.0 : 1.0 - numerator / denominator + return denominator == 0 ? convert(Float64, 1 - (s1 == s2)) : 1.0 - numerator / denominator end cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Cosine(q), s1::AbstractString, s2::AbstractString) @@ -132,6 +132,8 @@ cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Cosine ## Denote Q(s, q) the set of tuple of length q in s ## 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))| ## +## return 1.0 if smaller than qgram +## ############################################################################## type Jaccard{T <: Integer} @@ -156,7 +158,7 @@ function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString) end end denominator = length(set1) + length(set2) - numerator - return 1.0 - numerator / denominator + return denominator == 0 ? convert(Float64, 1 - (s1 == s2)) : 1.0 - numerator / denominator end jaccard(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Jaccard(q), s1::AbstractString, s2::AbstractString) \ No newline at end of file diff --git a/test/distances.jl b/test/distances.jl index 09502d9..9cd8b82 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -56,3 +56,78 @@ using StringDistances, Base.Test @test_approx_eq_eps evaluate(Cosine(2), "", "abc") 1 1e-4 @test_approx_eq_eps evaluate(Cosine(2), "abc", "ccc") 1 1e-4 @test_approx_eq_eps evaluate(Cosine(2), "leia", "leela") 0.7113249 1e-4 + + + + + + + + + + + +strings = [ +("martha", "marhta"), +("dwayne", "duane") , +("dixon", "dicksonx"), +("william", "williams"), +("", "foo"), +("a", "a"), +("abc", "xyz"), +("abc", "ccc"), +("kitten", "sitting"), +("saturday", "sunday"), +("hi, my name is", "my name is"), +("alborgów", "amoniak"), +("cape sand recycling ", "edith ann graham"), +( "jellyifhs", "jellyfish"), +("ifhs", "fish"), +("leia", "leela"), +] + + + +#solution hamming + + +for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]), + (DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]), + (JaroWinkler(0.1, 0, Inf), [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.22250000 0.16190476 0.43928571 0.49166667 0.04444444 0.16666667 0.17333333]), + (QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]), + (QGram(2), [ 6 7 7 1 1 0 4 4 7 8 4 13 32 8 6 5]), + (Jaccard(1), [0.0000000 0.4285714 0.3750000 0.1666667 1.0 0.0000000 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0000000 0.0000000 0.2500000]), + (Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 0.0 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]), + (Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 1.0 0.0 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249])) + t, solution = x + for i in 1:length(solution) + @test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4 + end +end + + +#= R test +library(stringdist) +strings = matrix(data = c( +"martha", "marhta", +"dwayne", "duane", +"dixon", "dicksonx", +"william", "williams", +"", "foo", +"a", "a", +"abc", "xyz", +"abc", "ccc", +"kitten", "sitting", +"saturday", "sunday", +"hi, my name is", "my name is", +"alborgów", "amoniak", +"cape sand recycling ", "edith ann graham", + "jellyifhs", "jellyfish", +"ifhs", "fish", +"leia", "leela"), +nrow = 2 +) +stringdist(strings[1,], strings[2,], method = "jw", p = 0.1) +stringdist(strings[1,], strings[2,], method = "qgram", p = 0.1) + +=#