pull/1/head
matthieugomez 2015-10-23 17:05:11 -04:00
parent ee90780451
commit 1edff24a1d
3 changed files with 17 additions and 17 deletions

View File

@ -21,7 +21,7 @@ hamming(s1::AbstractString, s2::AbstractString) = evaluate(Hamming(), s1, s2)
##
## Levenshtein and Damerau Levenshtein
## Source Levenshtein: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
## Source DamerauLevenshtein: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
##
##############################################################################
@ -162,7 +162,7 @@ end
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
length(s1) > length(s2) && return evaluate(dist, s2, s1)
length(s2) == 0 && return 0.0
length(s2) == 0 && return 1.0
maxdist = max(0, div(length(s2), 2) - 1)
m = 0 # matching characters
@ -186,7 +186,7 @@ function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
end
end
end
m == 0.0 && return 0.0
m == 0.0 && return 1.0
score = (m / length(s1) + m / length(s2) + (m - t) / m) / 3.0
# common prefix adjustment
@ -205,7 +205,7 @@ function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
score += (1 - score) * (m - (l + 1)) / (length(s1) + length(s2) - (2 * (l - 1)))
end
end
return score
return 1 - score
end
function jaro_winkler(s1::AbstractString, s2::AbstractString;

View File

@ -35,9 +35,9 @@ Base.in{Tv, Ti}(x::Tv, bag::Bag{Tv, Ti}) = get(bag.dict, x, 0) > 0
# Total number of items held by `bag`: the sum of the counts stored in
# `bag.dict`, where any negative count is treated as zero.
# Returns 0 for a bag with no entries.
function Base.length(bag::Bag)
    counts = values(bag.dict)
    # `mapreduce` is given no initial value, so the empty case is handled
    # explicitly instead of being passed to the reduction.
    isempty(counts) && return 0
    # Clamp each count at zero before summing so negative entries do not
    # reduce the total.
    return mapreduce(x -> max(x, 0), +, counts)
end
@ -101,8 +101,8 @@ function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
count += bag1.dict[x] * bag2.dict[x]
end
end
return 1.0 - count / (sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict))))
denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict)))
denominator == 0 ? 1.0 : 1.0 - count / denominator
end
# Convenience wrapper: builds a `Cosine` distance from `q` (default 2) and
# evaluates it on the two strings.
# NOTE(review): `q` is presumably the q-gram size; confirm against `Cosine`.
function cosine(s1::AbstractString, s2::AbstractString; q = 2)
    dist = Cosine(q)
    return evaluate(dist, s1, s2)
end

View File

@ -2,13 +2,13 @@
using StringDistances, Base.Test
# Jaro-Winkler checks, all with boosting_threshold = 0.0 and long_threshold = 100.
#
# First group: expected values written as similarity scores
# (identical strings -> 1.0; empty input or no shared characters -> 0.0).
# NOTE(review): this group and the group below assert different values for
# the very same calls, so both cannot pass together. This looks like the
# before/after of a change from similarity to distance; confirm which set
# is current and drop the other.
@test_approx_eq_eps jaro_winkler("MARTHA", "MARHTA", boosting_threshold = 0.0, long_threshold = 100) 0.9611 1e-4
@test_approx_eq_eps jaro_winkler("DWAYNE", "DUANE", boosting_threshold = 0.0, long_threshold = 100) 0.84 1e-4
@test_approx_eq_eps jaro_winkler("DIXON", "DICKSONX", boosting_threshold = 0.0, long_threshold = 100) 0.81333 1e-4
@test_approx_eq_eps jaro_winkler("William", "Williams", boosting_threshold = 0.0, long_threshold = 100) 0.975 1e-4
@test_approx_eq_eps jaro_winkler("", "foo", boosting_threshold = 0.0, long_threshold = 100) 0.0 1e-4
@test_approx_eq_eps jaro_winkler("a", "a", boosting_threshold = 0.0, long_threshold = 100) 1.0 1e-4
@test_approx_eq_eps jaro_winkler("abc", "xyz", boosting_threshold = 0.0, long_threshold = 100) 0.0 1e-4
# Second group: the same inputs with expected values written as
# 1 - similarity (identical strings -> 0.0; empty input or no shared
# characters -> 1.0).
@test_approx_eq_eps jaro_winkler("MARTHA", "MARHTA", boosting_threshold = 0.0, long_threshold = 100) 1 - 0.9611 1e-4
@test_approx_eq_eps jaro_winkler("DWAYNE", "DUANE", boosting_threshold = 0.0, long_threshold = 100) 1 - 0.84 1e-4
@test_approx_eq_eps jaro_winkler("DIXON", "DICKSONX", boosting_threshold = 0.0, long_threshold = 100) 1 - 0.81333 1e-4
@test_approx_eq_eps jaro_winkler("William", "Williams", boosting_threshold = 0.0, long_threshold = 100) 1 - 0.975 1e-4
@test_approx_eq_eps jaro_winkler("", "foo", boosting_threshold = 0.0, long_threshold = 100) 1.0 1e-4
@test_approx_eq_eps jaro_winkler("a", "a", boosting_threshold = 0.0, long_threshold = 100) 0.0 1e-4
@test_approx_eq_eps jaro_winkler("abc", "xyz", boosting_threshold = 0.0, long_threshold = 100) 1.0 1e-4
@ -50,6 +50,6 @@ using StringDistances, Base.Test
# Checks for the jaccard and cosine distances, all called with q = 2.
@test_approx_eq_eps jaccard("leia", "leela", q = 2) 0.83333 1e-4
# NOTE(review): the next two lines expect different results (NaN, then 1)
# for the same call with an empty first string. Only one can match the
# implementation; confirm which is current and remove the other.
@test_approx_eq_eps cosine("", "abc", q = 2) NaN 1e-4
@test_approx_eq_eps cosine("", "abc", q = 2) 1 1e-4
@test_approx_eq_eps cosine("abc", "ccc", q = 2) 1 1e-4
# NOTE(review): the following assertion appears twice verbatim.
@test_approx_eq_eps cosine("leia", "leela", q = 2) 0.7113249 1e-4
@test_approx_eq_eps cosine("leia", "leela", q = 2) 0.7113249 1e-4