add tests
parent
3ecd33e732
commit
6bb6d706aa
10
README.md
10
README.md
|
@ -1,2 +1,12 @@
|
||||||
# StringDistances
|
# StringDistances
|
||||||
String Distances in Julia
|
String Distances in Julia
|
||||||
|
|
||||||
|
- Hamming Distance
|
||||||
|
- Jaro Distance
|
||||||
|
- Jaro-Winkler Distance. Options are `scaling_factor` (scaling factor, default 0.1), `boosting_threshold` (boost threshold, default 0.7), and `long_threshold` (minimum length for long string adjustment, default 5)
|
||||||
|
- Levenshtein Distance
|
||||||
|
|
||||||
|
|
||||||
|
To be implemented
|
||||||
|
- Damerau-Levenshtein Distance
|
||||||
|
- qgram
|
|
@ -74,9 +74,9 @@ levenshtein(s1::AbstractString, s2::AbstractString) = evaluate(Levenshtein(), s1
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
type JaroWinkler{T1 <: Number, T2 <: Number, T3 <: Integer}
|
type JaroWinkler{T1 <: Number, T2 <: Number, T3 <: Integer}
|
||||||
p::T1 # scaling factor. Default to 0.1
|
scaling_factor::T1 # scaling factor. Default to 0.1
|
||||||
b::T2 # boost threshold. Default to 0.7
|
boosting_threshold::T2 # boost threshold. Default to 0.7
|
||||||
long::T3 # long string adjustment. Default to 5
|
long_threshold::T3 # long string adjustment. Default to 5
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
||||||
|
@ -108,25 +108,25 @@ function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
||||||
|
|
||||||
score = (m / length(s1) + m / length(s2) + (m - t) / m) / 3.0
|
score = (m / length(s1) + m / length(s2) + (m - t) / m) / 3.0
|
||||||
# common prefix adjustment
|
# common prefix adjustment
|
||||||
if (dist.p > 0 && score >= dist.b) || (length(s1) >= dist.long)
|
if (dist.scaling_factor > 0 && score >= dist.boosting_threshold) || (length(s1) >= dist.long_threshold)
|
||||||
l = 0
|
l = 0
|
||||||
last = min(4, length(s1))
|
last = min(4, length(s1))
|
||||||
while l < last && s1[l+1] == s2[l+1]
|
while l < last && s1[l+1] == s2[l+1]
|
||||||
l += 1
|
l += 1
|
||||||
end
|
end
|
||||||
# common prefix adjustment
|
# common prefix adjustment
|
||||||
if (dist.p > 0 && score >= dist.b)
|
if (dist.scaling_factor > 0 && score >= dist.boosting_threshold)
|
||||||
score += l * (1 - score) * dist.p
|
score += l * (1 - score) * dist.scaling_factor
|
||||||
end
|
end
|
||||||
# longer string adjustment
|
# longer string adjustment
|
||||||
if (length(s1) >= dist.long) && (m - l >= 2) && ((m - l) >= (length(s1) - l) / 2)
|
if (length(s1) >= dist.long_threshold) && (m - l >= 2) && ((m - l) >= (length(s1) - l) / 2)
|
||||||
score += (1 - score) * (m - (l + 1)) / (length(s1) + length(s2) - (2 * (l - 1)))
|
score += (1 - score) * (m - (l + 1)) / (length(s1) + length(s2) - (2 * (l - 1)))
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
return score
|
return score
|
||||||
end
|
end
|
||||||
|
|
||||||
jaro_winkler(s1::AbstractString, s2::AbstractString; p = 0.1, b = 0.7, long = 5) = evaluate(JaroWinkler(p, b, long), s1, s2)
|
jaro_winkler(s1::AbstractString, s2::AbstractString; scaling_factor = 0.1, boosting_threshold = 0.7, long_threshold = 5) = evaluate(JaroWinkler(scaling_factor, boosting_threshold, long_threshold), s1, s2)
|
||||||
jaro(s1::AbstractString, s2::AbstractString) = evaluate(JaroWinkler(0.0, 0.0, 0), s1, s2)
|
jaro(s1::AbstractString, s2::AbstractString) = evaluate(JaroWinkler(0.0, 0.0, 0), s1, s2)
|
||||||
|
|
||||||
end # module FixedEffectModels
|
end # module FixedEffectModels
|
|
@ -2,13 +2,13 @@
|
||||||
using Base.Test
|
using Base.Test
|
||||||
|
|
||||||
|
|
||||||
@test_approx_eq_eps jaro_winkler("MARTHA", "MARHTA", b = 0.0, long = 100) 0.9611 1e-4
|
@test_approx_eq_eps jaro_winkler("MARTHA", "MARHTA", boosting_threshold = 0.0, long_threshold = 100) 0.9611 1e-4
|
||||||
@test_approx_eq_eps jaro_winkler("DWAYNE", "DUANE", b = 0.0, long = 100) 0.84 1e-4
|
@test_approx_eq_eps jaro_winkler("DWAYNE", "DUANE", boosting_threshold = 0.0, long_threshold = 100) 0.84 1e-4
|
||||||
@test_approx_eq_eps jaro_winkler("DIXON", "DICKSONX", b = 0.0, long = 100) 0.81333 1e-4
|
@test_approx_eq_eps jaro_winkler("DIXON", "DICKSONX", boosting_threshold = 0.0, long_threshold = 100) 0.81333 1e-4
|
||||||
@test_approx_eq_eps jaro_winkler("William", "Williams", b = 0.0, long = 100) 0.975 1e-4
|
@test_approx_eq_eps jaro_winkler("William", "Williams", boosting_threshold = 0.0, long_threshold = 100) 0.975 1e-4
|
||||||
@test_approx_eq_eps jaro_winkler("", "foo", b = 0.0, long = 100) 0.0 1e-4
|
@test_approx_eq_eps jaro_winkler("", "foo", boosting_threshold = 0.0, long_threshold = 100) 0.0 1e-4
|
||||||
@test_approx_eq_eps jaro_winkler("a", "a", b = 0.0, long = 100) 1.0 1e-4
|
@test_approx_eq_eps jaro_winkler("a", "a", boosting_threshold = 0.0, long_threshold = 100) 1.0 1e-4
|
||||||
@test_approx_eq_eps jaro_winkler("abc", "xyz", b = 0.0, long = 100) 0.0 1e-4
|
@test_approx_eq_eps jaro_winkler("abc", "xyz", boosting_threshold = 0.0, long_threshold = 100) 0.0 1e-4
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue