add tests
parent
37c8d848cd
commit
c883a6a5ba
|
@ -5,7 +5,7 @@
|
||||||
# StringDistances
|
# StringDistances
|
||||||
|
|
||||||
- [x] Hamming Distance
|
- [x] Hamming Distance
|
||||||
- [x] Jaro Distance and Jaro-Winkler Distance
|
- [x] Jaro-Winkler Distance
|
||||||
- [x] Levenshtein Distance
|
- [x] Levenshtein Distance
|
||||||
- [x] Damerau-Levenshtein Distance
|
- [x] Damerau-Levenshtein Distance
|
||||||
- [x] Qgram Distance
|
- [x] Qgram Distance
|
||||||
|
|
|
@ -12,8 +12,8 @@ import Distances: evaluate, Hamming, hamming
|
||||||
export evaluate,
|
export evaluate,
|
||||||
Hamming, hamming,
|
Hamming, hamming,
|
||||||
Levenshtein, levenshtein,
|
Levenshtein, levenshtein,
|
||||||
JaroWinkler, jaro_winkler, jaro,
|
|
||||||
DamerauLevenshtein, damerau_levenshtein,
|
DamerauLevenshtein, damerau_levenshtein,
|
||||||
|
JaroWinkler, jaro_winkler,
|
||||||
QGram, qgram,
|
QGram, qgram,
|
||||||
Cosine, cosine,
|
Cosine, cosine,
|
||||||
Jaccard, jaccard
|
Jaccard, jaccard
|
||||||
|
|
|
@ -32,13 +32,15 @@ hamming(s1::AbstractString, s2::AbstractString) = evaluate(Hamming(), s1, s2)
|
||||||
function common_prefix(s1::AbstractString, s2::AbstractString)
|
function common_prefix(s1::AbstractString, s2::AbstractString)
|
||||||
start1 = start(s1)
|
start1 = start(s1)
|
||||||
start2 = start(s2)
|
start2 = start(s2)
|
||||||
|
k = 0
|
||||||
while !done(s1, start1)
|
while !done(s1, start1)
|
||||||
ch1, nextstart1 = next(s1, start1)
|
ch1, nextstart1 = next(s1, start1)
|
||||||
ch2, nextstart2 = next(s2, start2)
|
ch2, nextstart2 = next(s2, start2)
|
||||||
ch1 != ch2 && break
|
ch1 != ch2 && break
|
||||||
|
k += 1
|
||||||
start1, start2 = nextstart1, nextstart2
|
start1, start2 = nextstart1, nextstart2
|
||||||
end
|
end
|
||||||
return start1, start2
|
return k, start1, start2
|
||||||
end
|
end
|
||||||
|
|
||||||
type Levenshtein end
|
type Levenshtein end
|
||||||
|
@ -47,13 +49,13 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
|
||||||
len1 > len2 && return evaluate(dist, s2, s1)
|
len1 > len2 && return evaluate(dist, s2, s1)
|
||||||
len2 == 0 && return 0
|
len2 == 0 && return 0
|
||||||
|
|
||||||
start1, start2 = common_prefix(s1, s2)
|
k, start1, start2 = common_prefix(s1, s2)
|
||||||
done(s1, start1) && return len2
|
done(s1, start1) && return len2 - k
|
||||||
|
|
||||||
# distance initialized to first row of matrix
|
# distance initialized to first row of matrix
|
||||||
# => distance between "" and s2[1:i]
|
# => distance between "" and s2[1:i]
|
||||||
v0 = Array(Int, len2)
|
v0 = Array(Int, len2 - k)
|
||||||
@inbounds for i2 in 1:len2
|
@inbounds for i2 in 1:(len2 - k)
|
||||||
v0[i2] = i2
|
v0[i2] = i2
|
||||||
end
|
end
|
||||||
current = zero(0)
|
current = zero(0)
|
||||||
|
@ -93,14 +95,14 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
||||||
len1 > len2 && return evaluate(dist, s2, s1)
|
len1 > len2 && return evaluate(dist, s2, s1)
|
||||||
len2 == 0 && return 0
|
len2 == 0 && return 0
|
||||||
|
|
||||||
start1, start2 = common_prefix(s1, s2)
|
k, start1, start2 = common_prefix(s1, s2)
|
||||||
done(s1, start1) && return len2
|
done(s1, start1) && return len2 - k
|
||||||
|
|
||||||
v0 = Array(Int, len2)
|
v0 = Array(Int, len2 - k)
|
||||||
@inbounds for i2 in 1:len2
|
@inbounds for i2 in 1:(len2 - k)
|
||||||
v0[i2] = i2
|
v0[i2] = i2
|
||||||
end
|
end
|
||||||
v2 = Array(Int, len2)
|
v2 = Array(Int, len2 - k)
|
||||||
|
|
||||||
ch1, = next(s1, start1)
|
ch1, = next(s1, start1)
|
||||||
current = 0
|
current = 0
|
||||||
|
@ -158,12 +160,12 @@ damerau_levenshtein(s1::AbstractString, s2::AbstractString) = evaluate(DamerauLe
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
type JaroWinkler{T1 <: Real, T2 <: Real, T3 <: Integer}
|
type JaroWinkler{T1 <: Real, T2 <: Real, T3 <: Real}
|
||||||
scaling_factor::T1 # scaling factor. Default to 0.1
|
scaling_factor::T1 # scaling factor. Default to 0.1
|
||||||
boosting_threshold::T2 # boost threshold. Default to 0.7
|
boosting_threshold::T2 # boost threshold. Default to 0.7
|
||||||
long_threshold::T3 # long string adjustment. Default to 5
|
long_threshold::T3 # long string adjustment. Default to 5
|
||||||
end
|
end
|
||||||
JaroWinkler() = JaroWinkler(0.1, 0.7, 5)
|
JaroWinkler() = JaroWinkler(0.1, 0.25, 5)
|
||||||
|
|
||||||
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
|
@ -234,6 +236,6 @@ function jaro_winkler(s1::AbstractString, s2::AbstractString;
|
||||||
evaluate(JaroWinkler(scaling_factor, boosting_threshold, long_threshold), s1, s2)
|
evaluate(JaroWinkler(scaling_factor, boosting_threshold, long_threshold), s1, s2)
|
||||||
end
|
end
|
||||||
|
|
||||||
jaro(s1::AbstractString, s2::AbstractString) = evaluate(JaroWinkler(0.0, 0.0, 0), s1, s2)
|
jaro(s1::AbstractString, s2::AbstractString) = evaluate(JaroWinkler(0.1, 0.0, Inf), s1, s2)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -42,7 +42,7 @@ function Base.push!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
|
||||||
return bag
|
return bag
|
||||||
end
|
end
|
||||||
|
|
||||||
function Base.delete!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
|
function Base.delete!{Tv, Ti}(bag::Bag{Tv, Ti}, x)
|
||||||
v = get(bag.dict, x, zero(Ti))
|
v = get(bag.dict, x, zero(Ti))
|
||||||
if v > zero(Ti)
|
if v > zero(Ti)
|
||||||
bag.dict[x] = v - one(Ti)
|
bag.dict[x] = v - one(Ti)
|
||||||
|
@ -120,7 +120,7 @@ function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
|
||||||
numerator += v1 * get(bag2.dict, k, 0)
|
numerator += v1 * get(bag2.dict, k, 0)
|
||||||
end
|
end
|
||||||
denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict)))
|
denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict)))
|
||||||
denominator == 0 ? 1.0 : 1.0 - numerator / denominator
|
return denominator == 0 ? convert(Float64, 1 - (s1 == s2)) : 1.0 - numerator / denominator
|
||||||
end
|
end
|
||||||
|
|
||||||
cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Cosine(q), s1::AbstractString, s2::AbstractString)
|
cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Cosine(q), s1::AbstractString, s2::AbstractString)
|
||||||
|
@ -132,6 +132,8 @@ cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Cosine
|
||||||
## Denote Q(s, q) the set of tuple of length q in s
|
## Denote Q(s, q) the set of tuple of length q in s
|
||||||
## 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))|
|
## 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))|
|
||||||
##
|
##
|
||||||
|
## Returns 1.0 if either string is shorter than q (it has no q-grams), unless the two strings are equal (then 0.0)
|
||||||
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
type Jaccard{T <: Integer}
|
type Jaccard{T <: Integer}
|
||||||
|
@ -156,7 +158,7 @@ function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
denominator = length(set1) + length(set2) - numerator
|
denominator = length(set1) + length(set2) - numerator
|
||||||
return 1.0 - numerator / denominator
|
return denominator == 0 ? convert(Float64, 1 - (s1 == s2)) : 1.0 - numerator / denominator
|
||||||
end
|
end
|
||||||
|
|
||||||
jaccard(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Jaccard(q), s1::AbstractString, s2::AbstractString)
|
jaccard(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Jaccard(q), s1::AbstractString, s2::AbstractString)
|
|
@ -56,3 +56,78 @@ using StringDistances, Base.Test
|
||||||
@test_approx_eq_eps evaluate(Cosine(2), "", "abc") 1 1e-4
|
@test_approx_eq_eps evaluate(Cosine(2), "", "abc") 1 1e-4
|
||||||
@test_approx_eq_eps evaluate(Cosine(2), "abc", "ccc") 1 1e-4
|
@test_approx_eq_eps evaluate(Cosine(2), "abc", "ccc") 1 1e-4
|
||||||
@test_approx_eq_eps evaluate(Cosine(2), "leia", "leela") 0.7113249 1e-4
|
@test_approx_eq_eps evaluate(Cosine(2), "leia", "leela") 0.7113249 1e-4
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
strings = [
|
||||||
|
("martha", "marhta"),
|
||||||
|
("dwayne", "duane") ,
|
||||||
|
("dixon", "dicksonx"),
|
||||||
|
("william", "williams"),
|
||||||
|
("", "foo"),
|
||||||
|
("a", "a"),
|
||||||
|
("abc", "xyz"),
|
||||||
|
("abc", "ccc"),
|
||||||
|
("kitten", "sitting"),
|
||||||
|
("saturday", "sunday"),
|
||||||
|
("hi, my name is", "my name is"),
|
||||||
|
("alborgów", "amoniak"),
|
||||||
|
("cape sand recycling ", "edith ann graham"),
|
||||||
|
( "jellyifhs", "jellyfish"),
|
||||||
|
("ifhs", "fish"),
|
||||||
|
("leia", "leela"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#solution hamming
|
||||||
|
|
||||||
|
|
||||||
|
for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
|
||||||
|
(DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
|
||||||
|
(JaroWinkler(0.1, 0, Inf), [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.22250000 0.16190476 0.43928571 0.49166667 0.04444444 0.16666667 0.17333333]),
|
||||||
|
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
|
||||||
|
(QGram(2), [ 6 7 7 1 1 0 4 4 7 8 4 13 32 8 6 5]),
|
||||||
|
(Jaccard(1), [0.0000000 0.4285714 0.3750000 0.1666667 1.0 0.0000000 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0000000 0.0000000 0.2500000]),
|
||||||
|
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 0.0 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
|
||||||
|
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 1.0 0.0 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
|
||||||
|
t, solution = x
|
||||||
|
for i in 1:length(solution)
|
||||||
|
@test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
#= R test
|
||||||
|
library(stringdist)
|
||||||
|
strings = matrix(data = c(
|
||||||
|
"martha", "marhta",
|
||||||
|
"dwayne", "duane",
|
||||||
|
"dixon", "dicksonx",
|
||||||
|
"william", "williams",
|
||||||
|
"", "foo",
|
||||||
|
"a", "a",
|
||||||
|
"abc", "xyz",
|
||||||
|
"abc", "ccc",
|
||||||
|
"kitten", "sitting",
|
||||||
|
"saturday", "sunday",
|
||||||
|
"hi, my name is", "my name is",
|
||||||
|
"alborgów", "amoniak",
|
||||||
|
"cape sand recycling ", "edith ann graham",
|
||||||
|
"jellyifhs", "jellyfish",
|
||||||
|
"ifhs", "fish",
|
||||||
|
"leia", "leela"),
|
||||||
|
nrow = 2
|
||||||
|
)
|
||||||
|
stringdist(strings[1,], strings[2,], method = "jw", p = 0.1)
|
||||||
|
stringdist(strings[1,], strings[2,], method = "qgram", q = 1)
|
||||||
|
|
||||||
|
=#
|
||||||
|
|
Loading…
Reference in New Issue