diff --git a/src/edit_distances.jl b/src/edit_distances.jl index 02a544a..6ce494c 100644 --- a/src/edit_distances.jl +++ b/src/edit_distances.jl @@ -21,7 +21,7 @@ hamming(s1::AbstractString, s2::AbstractString) = evaluate(Hamming(), s1, s2) ## ## Levenshtein and Damerau Levenshtein ## Source Levenshtein: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html -## Source DamerauLevenshtein: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html +## Source DamerauLevenshtein: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html ## ############################################################################## @@ -162,7 +162,7 @@ end function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString) length(s1) > length(s2) && return evaluate(dist, s2, s1) - length(s2) == 0 && return 0.0 + length(s2) == 0 && return 1.0 maxdist = max(0, div(length(s2), 2) - 1) m = 0 # matching characters @@ -186,7 +186,7 @@ function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString) end end end - m == 0.0 && return 0.0 + m == 0.0 && return 1.0 score = (m / length(s1) + m / length(s2) + (m - t) / m) / 3.0 # common prefix adjustment @@ -205,7 +205,7 @@ function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString) score += (1 - score) * (m - (l + 1)) / (length(s1) + length(s2) - (2 * (l - 1))) end end - return score + return 1 - score end function jaro_winkler(s1::AbstractString, s2::AbstractString; diff --git a/src/qgrams_distances.jl b/src/qgrams_distances.jl index 8276995..728f5c6 100644 --- a/src/qgrams_distances.jl +++ b/src/qgrams_distances.jl @@ -35,9 +35,9 @@ Base.in{Tv, Ti}(x::Tv, bag::Bag{Tv, Ti}) = get(bag.dict, x, 0) > 0 function Base.length(bag::Bag) v = values(bag.dict) if isempty(v) - 0 + return 0 else - mapreduce(x -> max(x, 0), +, values(bag.dict)) + return mapreduce(x -> max(x, 0), +, values(bag.dict)) end end @@ -101,8 +101,8 @@ function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString) count += bag1.dict[x] * bag2.dict[x] end end - - return 1.0 - count / (sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict)))) + denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict))) + denominator == 0 ? 1.0 : 1.0 - count / denominator end cosine(s1::AbstractString, s2::AbstractString; q = 2) = evaluate(Cosine(q), s1, s2) diff --git a/test/distances.jl b/test/distances.jl index 0647595..304de9e 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -2,13 +2,13 @@ using StringDistances, Base.Test -@test_approx_eq_eps jaro_winkler("MARTHA", "MARHTA", boosting_threshold = 0.0, long_threshold = 100) 0.9611 1e-4 -@test_approx_eq_eps jaro_winkler("DWAYNE", "DUANE", boosting_threshold = 0.0, long_threshold = 100) 0.84 1e-4 -@test_approx_eq_eps jaro_winkler("DIXON", "DICKSONX", boosting_threshold = 0.0, long_threshold = 100) 0.81333 1e-4 -@test_approx_eq_eps jaro_winkler("William", "Williams", boosting_threshold = 0.0, long_threshold = 100) 0.975 1e-4 -@test_approx_eq_eps jaro_winkler("", "foo", boosting_threshold = 0.0, long_threshold = 100) 0.0 1e-4 -@test_approx_eq_eps jaro_winkler("a", "a", boosting_threshold = 0.0, long_threshold = 100) 1.0 1e-4 -@test_approx_eq_eps jaro_winkler("abc", "xyz", boosting_threshold = 0.0, long_threshold = 100) 0.0 1e-4 +@test_approx_eq_eps jaro_winkler("MARTHA", "MARHTA", boosting_threshold = 0.0, long_threshold = 100) 1 - 0.9611 1e-4 +@test_approx_eq_eps jaro_winkler("DWAYNE", "DUANE", boosting_threshold = 0.0, long_threshold = 100) 1 - 0.84 1e-4 +@test_approx_eq_eps jaro_winkler("DIXON", "DICKSONX", boosting_threshold = 0.0, long_threshold = 100) 1 - 0.81333 1e-4 +@test_approx_eq_eps jaro_winkler("William", "Williams", boosting_threshold = 0.0, long_threshold = 100) 1 - 0.975 1e-4 +@test_approx_eq_eps jaro_winkler("", "foo", boosting_threshold = 0.0, long_threshold = 100) 1.0 1e-4 +@test_approx_eq_eps jaro_winkler("a", "a", boosting_threshold = 0.0, long_threshold = 100) 0.0 1e-4 +@test_approx_eq_eps jaro_winkler("abc", "xyz", boosting_threshold = 0.0, long_threshold = 100) 1.0 1e-4 @@ -50,6 +50,6 @@ using StringDistances, Base.Test @test_approx_eq_eps jaccard("leia", "leela", q = 2) 0.83333 1e-4 -@test_approx_eq_eps cosine("", "abc", q = 2) NaN 1e-4 +@test_approx_eq_eps cosine("", "abc", q = 2) 1 1e-4 @test_approx_eq_eps cosine("abc", "ccc", q = 2) 1 1e-4 -@test_approx_eq_eps cosine("leia", "leela", q = 2) 0.7113249 1e-4 \ No newline at end of file +@test_approx_eq_eps cosine("leia", "leela", q = 2) 0.7113249 1e-4