update
parent
ee90780451
commit
1edff24a1d
|
@ -162,7 +162,7 @@ end
|
||||||
|
|
||||||
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
||||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
||||||
length(s2) == 0 && return 0.0
|
length(s2) == 0 && return 1.0
|
||||||
|
|
||||||
maxdist = max(0, div(length(s2), 2) - 1)
|
maxdist = max(0, div(length(s2), 2) - 1)
|
||||||
m = 0 # matching characters
|
m = 0 # matching characters
|
||||||
|
@ -186,7 +186,7 @@ function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
m == 0.0 && return 0.0
|
m == 0.0 && return 1.0
|
||||||
score = (m / length(s1) + m / length(s2) + (m - t) / m) / 3.0
|
score = (m / length(s1) + m / length(s2) + (m - t) / m) / 3.0
|
||||||
|
|
||||||
# common prefix adjustment
|
# common prefix adjustment
|
||||||
|
@ -205,7 +205,7 @@ function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
||||||
score += (1 - score) * (m - (l + 1)) / (length(s1) + length(s2) - (2 * (l - 1)))
|
score += (1 - score) * (m - (l + 1)) / (length(s1) + length(s2) - (2 * (l - 1)))
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
return score
|
return 1 - score
|
||||||
end
|
end
|
||||||
|
|
||||||
function jaro_winkler(s1::AbstractString, s2::AbstractString;
|
function jaro_winkler(s1::AbstractString, s2::AbstractString;
|
||||||
|
|
|
@ -35,9 +35,9 @@ Base.in{Tv, Ti}(x::Tv, bag::Bag{Tv, Ti}) = get(bag.dict, x, 0) > 0
|
||||||
function Base.length(bag::Bag)
|
function Base.length(bag::Bag)
|
||||||
v = values(bag.dict)
|
v = values(bag.dict)
|
||||||
if isempty(v)
|
if isempty(v)
|
||||||
0
|
return 0
|
||||||
else
|
else
|
||||||
mapreduce(x -> max(x, 0), +, values(bag.dict))
|
return mapreduce(x -> max(x, 0), +, values(bag.dict))
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -101,8 +101,8 @@ function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
|
||||||
count += bag1.dict[x] * bag2.dict[x]
|
count += bag1.dict[x] * bag2.dict[x]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict)))
|
||||||
return 1.0 - count / (sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict))))
|
denominator == 0 ? 1.0 : 1.0 - count / denominator
|
||||||
end
|
end
|
||||||
|
|
||||||
cosine(s1::AbstractString, s2::AbstractString; q = 2) = evaluate(Cosine(q), s1, s2)
|
cosine(s1::AbstractString, s2::AbstractString; q = 2) = evaluate(Cosine(q), s1, s2)
|
||||||
|
|
|
@ -2,13 +2,13 @@
|
||||||
using StringDistances, Base.Test
|
using StringDistances, Base.Test
|
||||||
|
|
||||||
|
|
||||||
@test_approx_eq_eps jaro_winkler("MARTHA", "MARHTA", boosting_threshold = 0.0, long_threshold = 100) 0.9611 1e-4
|
@test_approx_eq_eps jaro_winkler("MARTHA", "MARHTA", boosting_threshold = 0.0, long_threshold = 100) 1 - 0.9611 1e-4
|
||||||
@test_approx_eq_eps jaro_winkler("DWAYNE", "DUANE", boosting_threshold = 0.0, long_threshold = 100) 0.84 1e-4
|
@test_approx_eq_eps jaro_winkler("DWAYNE", "DUANE", boosting_threshold = 0.0, long_threshold = 100) 1 - 0.84 1e-4
|
||||||
@test_approx_eq_eps jaro_winkler("DIXON", "DICKSONX", boosting_threshold = 0.0, long_threshold = 100) 0.81333 1e-4
|
@test_approx_eq_eps jaro_winkler("DIXON", "DICKSONX", boosting_threshold = 0.0, long_threshold = 100) 1 - 0.81333 1e-4
|
||||||
@test_approx_eq_eps jaro_winkler("William", "Williams", boosting_threshold = 0.0, long_threshold = 100) 0.975 1e-4
|
@test_approx_eq_eps jaro_winkler("William", "Williams", boosting_threshold = 0.0, long_threshold = 100) 1 - 0.975 1e-4
|
||||||
@test_approx_eq_eps jaro_winkler("", "foo", boosting_threshold = 0.0, long_threshold = 100) 0.0 1e-4
|
@test_approx_eq_eps jaro_winkler("", "foo", boosting_threshold = 0.0, long_threshold = 100) 1.0 1e-4
|
||||||
@test_approx_eq_eps jaro_winkler("a", "a", boosting_threshold = 0.0, long_threshold = 100) 1.0 1e-4
|
@test_approx_eq_eps jaro_winkler("a", "a", boosting_threshold = 0.0, long_threshold = 100) 0.0 1e-4
|
||||||
@test_approx_eq_eps jaro_winkler("abc", "xyz", boosting_threshold = 0.0, long_threshold = 100) 0.0 1e-4
|
@test_approx_eq_eps jaro_winkler("abc", "xyz", boosting_threshold = 0.0, long_threshold = 100) 1.0 1e-4
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -50,6 +50,6 @@ using StringDistances, Base.Test
|
||||||
@test_approx_eq_eps jaccard("leia", "leela", q = 2) 0.83333 1e-4
|
@test_approx_eq_eps jaccard("leia", "leela", q = 2) 0.83333 1e-4
|
||||||
|
|
||||||
|
|
||||||
@test_approx_eq_eps cosine("", "abc", q = 2) NaN 1e-4
|
@test_approx_eq_eps cosine("", "abc", q = 2) 1 1e-4
|
||||||
@test_approx_eq_eps cosine("abc", "ccc", q = 2) 1 1e-4
|
@test_approx_eq_eps cosine("abc", "ccc", q = 2) 1 1e-4
|
||||||
@test_approx_eq_eps cosine("leia", "leela", q = 2) 0.7113249 1e-4
|
@test_approx_eq_eps cosine("leia", "leela", q = 2) 0.7113249 1e-4
|
Loading…
Reference in New Issue