add tests

pull/1/head
matthieugomez 2015-10-24 20:35:40 -04:00
parent 37c8d848cd
commit c883a6a5ba
5 changed files with 97 additions and 18 deletions

View File

@ -5,7 +5,7 @@
# StringDistances
- [x] Hamming Distance
- [x] Jaro Distance and Jaro-Winkler Distance
- [x] Jaro-Winkler Distance
- [x] Levenshtein Distance
- [x] Damerau-Levenshtein Distance
- [x] Qgram Distance

View File

@ -12,8 +12,8 @@ import Distances: evaluate, Hamming, hamming
export evaluate,
Hamming, hamming,
Levenshtein, levenshtein,
JaroWinkler, jaro_winkler, jaro,
DamerauLevenshtein, damerau_levenshtein,
JaroWinkler, jaro_winkler,
QGram, qgram,
Cosine, cosine,
Jaccard, jaccard

View File

@ -32,13 +32,15 @@ hamming(s1::AbstractString, s2::AbstractString) = evaluate(Hamming(), s1, s2)
function common_prefix(s1::AbstractString, s2::AbstractString)
start1 = start(s1)
start2 = start(s2)
k = 0
while !done(s1, start1)
ch1, nextstart1 = next(s1, start1)
ch2, nextstart2 = next(s2, start2)
ch1 != ch2 && break
k += 1
start1, start2 = nextstart1, nextstart2
end
return start1, start2
return k, start1, start2
end
type Levenshtein end
@ -47,13 +49,13 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
len1 > len2 && return evaluate(dist, s2, s1)
len2 == 0 && return 0
start1, start2 = common_prefix(s1, s2)
done(s1, start1) && return len2
k, start1, start2 = common_prefix(s1, s2)
done(s1, start1) && return len2 - k
# distance initialized to first row of matrix
# => distance between "" and s2[1:i]
v0 = Array(Int, len2)
@inbounds for i2 in 1:len2
v0 = Array(Int, len2 - k)
@inbounds for i2 in 1:(len2 - k)
v0[i2] = i2
end
current = zero(0)
@ -93,14 +95,14 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
len1 > len2 && return evaluate(dist, s2, s1)
len2 == 0 && return 0
start1, start2 = common_prefix(s1, s2)
done(s1, start1) && return len2
k, start1, start2 = common_prefix(s1, s2)
done(s1, start1) && return len2 - k
v0 = Array(Int, len2)
@inbounds for i2 in 1:len2
v0 = Array(Int, len2 - k)
@inbounds for i2 in 1:(len2 - k)
v0[i2] = i2
end
v2 = Array(Int, len2)
v2 = Array(Int, len2 - k)
ch1, = next(s1, start1)
current = 0
@ -158,12 +160,12 @@ damerau_levenshtein(s1::AbstractString, s2::AbstractString) = evaluate(DamerauLe
##
##############################################################################
type JaroWinkler{T1 <: Real, T2 <: Real, T3 <: Integer}
type JaroWinkler{T1 <: Real, T2 <: Real, T3 <: Real}
scaling_factor::T1 # scaling factor. Default to 0.1
boosting_threshold::T2 # boost threshold. Default to 0.7
long_threshold::T3 # long string adjustment. Default to 5
end
JaroWinkler() = JaroWinkler(0.1, 0.7, 5)
JaroWinkler() = JaroWinkler(0.1, 0.25, 5)
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
len1, len2 = length(s1), length(s2)
@ -234,6 +236,6 @@ function jaro_winkler(s1::AbstractString, s2::AbstractString;
evaluate(JaroWinkler(scaling_factor, boosting_threshold, long_threshold), s1, s2)
end
jaro(s1::AbstractString, s2::AbstractString) = evaluate(JaroWinkler(0.0, 0.0, 0), s1, s2)
jaro(s1::AbstractString, s2::AbstractString) = evaluate(JaroWinkler(0.1, 0.0, Inf), s1, s2)

View File

@ -42,7 +42,7 @@ function Base.push!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
return bag
end
function Base.delete!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
function Base.delete!{Tv, Ti}(bag::Bag{Tv, Ti}, x)
v = get(bag.dict, x, zero(Ti))
if v > zero(Ti)
bag.dict[x] = v - one(Ti)
@ -120,7 +120,7 @@ function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
numerator += v1 * get(bag2.dict, k, 0)
end
denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict)))
denominator == 0 ? 1.0 : 1.0 - numerator / denominator
return denominator == 0 ? convert(Float64, 1 - (s1 == s2)) : 1.0 - numerator / denominator
end
cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Cosine(q), s1::AbstractString, s2::AbstractString)
@ -132,6 +132,8 @@ cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Cosine
## Denote Q(s, q) the set of tuple of length q in s
## 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))|
##
## If neither string has a q-gram (both shorter than q), return 0.0 when the strings are equal and 1.0 otherwise
##
##############################################################################
type Jaccard{T <: Integer}
@ -156,7 +158,7 @@ function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
end
end
denominator = length(set1) + length(set2) - numerator
return 1.0 - numerator / denominator
return denominator == 0 ? convert(Float64, 1 - (s1 == s2)) : 1.0 - numerator / denominator
end
jaccard(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Jaccard(q), s1::AbstractString, s2::AbstractString)

View File

@ -56,3 +56,78 @@ using StringDistances, Base.Test
@test_approx_eq_eps evaluate(Cosine(2), "", "abc") 1 1e-4
@test_approx_eq_eps evaluate(Cosine(2), "abc", "ccc") 1 1e-4
@test_approx_eq_eps evaluate(Cosine(2), "leia", "leela") 0.7113249 1e-4
strings = [
("martha", "marhta"),
("dwayne", "duane") ,
("dixon", "dicksonx"),
("william", "williams"),
("", "foo"),
("a", "a"),
("abc", "xyz"),
("abc", "ccc"),
("kitten", "sitting"),
("saturday", "sunday"),
("hi, my name is", "my name is"),
("alborgów", "amoniak"),
("cape sand recycling ", "edith ann graham"),
( "jellyifhs", "jellyfish"),
("ifhs", "fish"),
("leia", "leela"),
]
# Table-driven check of every distance against reference values.
# Each row pairs a distance object with its expected results; entry i of a row
# is the expected distance for the pair strings[i]. Values are compared with an
# absolute tolerance of 1e-4.
# NOTE(review): the reference values were presumably produced with R's
# `stringdist` package (see the R snippet after this loop) -- confirm.
for (metric, expected) in (
        (Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
        (DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
        (JaroWinkler(0.1, 0, Inf), [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.22250000 0.16190476 0.43928571 0.49166667 0.04444444 0.16666667 0.17333333]),
        (QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
        (QGram(2), [ 6 7 7 1 1 0 4 4 7 8 4 13 32 8 6 5]),
        (Jaccard(1), [0.0000000 0.4285714 0.3750000 0.1666667 1.0 0.0000000 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0000000 0.0000000 0.2500000]),
        (Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 0.0 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
        (Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 1.0 0.0 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]),
    )
    # Walk the expected row by linear index so that entry i is matched with strings[i].
    for (i, want) in enumerate(expected)
        @test_approx_eq_eps evaluate(metric, strings[i]...) want 1e-4
    end
end
#= R test
library(stringdist)
strings = matrix(data = c(
"martha", "marhta",
"dwayne", "duane",
"dixon", "dicksonx",
"william", "williams",
"", "foo",
"a", "a",
"abc", "xyz",
"abc", "ccc",
"kitten", "sitting",
"saturday", "sunday",
"hi, my name is", "my name is",
"alborgów", "amoniak",
"cape sand recycling ", "edith ann graham",
"jellyifhs", "jellyfish",
"ifhs", "fish",
"leia", "leela"),
nrow = 2
)
stringdist(strings[1,], strings[2,], method = "jw", p = 0.1)
stringdist(strings[1,], strings[2,], method = "qgram", p = 0.1)
=#