impose same type

pull/1/head
matthieugomez 2015-10-23 18:47:43 -04:00
parent 1edff24a1d
commit c8dfb12d13
3 changed files with 60 additions and 37 deletions

31
benchmark/benchmark.jl Normal file
View File

@ -0,0 +1,31 @@
using StringDistances
# Benchmark inputs: two vectors of 100_000 random strings each. Every string's
# length is sampled from 5:25 and passed to `randstring`.
x = map(randstring, rand(5:25,100_000))
y = map(randstring, rand(5:25,100_000))
# Evaluate the distance `t` on every pair `(x[i], y[i])` and collect the results.
#
# Arguments:
# - `out`: element type of the result vector (e.g. `Int` or `Float64`)
# - `t`: distance object forwarded to `StringDistances.evaluate`
# - `x`, `y`: indexable collections; `y` is indexed over `1:length(x)`, so it
#   must hold at least as many elements as `x`
#
# Returns the vector of computed distances, so the benchmarked work is
# observable by the caller instead of being silently discarded.
function f(out, t, x, y)
    d = Array(out, length(x))
    for i in 1:length(x)
        d[i] = StringDistances.evaluate(t, x[i], y[i])
    end
    return d
end
# Timings of `f` for three distances, to compare with the R package stringdist
# (the equivalent R code is in the block comment at the end of this file).
# I get 0.12 vs 0.10 in stringdist
# NOTE(review): the unit and which distance these numbers refer to are not
# stated here — presumably seconds; confirm against the post below.
# http://www.markvanderloo.eu/yaRb/2013/09/07/a-bit-of-benchmarking-with-string-distances/
# NOTE(review): the first `@time` call presumably also measures compilation
# of `f` — confirm the quoted numbers come from a warmed-up run.
@time f(Int, Levenshtein(), x, y)
@time f(Float64, Jaccard(2), x, y)
@time f(Float64, Cosine(2), x, y)
#= Rcode
library(stringdist)
x <- sapply(sample(5:25,1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
y <- sapply(sample(5:25,1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
stringdist(x,y,method='lv')
stringdist(x,y,method='jaccard', q = 2)
stringdist(x,y,method='cosine', q = 2)
=#

View File

@ -5,7 +5,7 @@
##
##############################################################################
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
function evaluate{T}(dist::Hamming, s1::T, s2::T)
length(s1) > length(s2) && return evaluate(dist, s2, s1)
count = 0
@inbounds for i in 1:length(s1)
@ -15,7 +15,7 @@ function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
return count
end
hamming(s1::AbstractString, s2::AbstractString) = evaluate(Hamming(), s1, s2)
hamming{T}(s1::T, s2::T) = evaluate(Hamming(), s1, s2)
##############################################################################
##
@ -25,7 +25,7 @@ hamming(s1::AbstractString, s2::AbstractString) = evaluate(Hamming(), s1, s2)
##
##############################################################################
function common_suffix(s1::AbstractString, s2::AbstractString)
function common_suffix{T}(s1::T, s2::T)
len1 = length(s1)
len2 = length(s2)
while ((len1 > 0) && (s1[len1] == s2[len2]))
@ -35,7 +35,7 @@ function common_suffix(s1::AbstractString, s2::AbstractString)
return len1, len2
end
function common_prefix(s1::AbstractString, s2::AbstractString, len1::Int, len2::Int)
function common_prefix{T}(s1::T, s2::T, len1::Int, len2::Int)
start = 0
len1 == 0 && return len1, len2, start
if (s1[start + 1] == s2[start + 1])
@ -51,7 +51,7 @@ end
type Levenshtein end
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
function evaluate{T}(dist::Levenshtein, s1::T, s2::T)
length(s1) > length(s2) && return evaluate(dist, s2, s1)
length(s2) == 0 && return 0
@ -89,11 +89,11 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
return current
end
levenshtein(s1::AbstractString, s2::AbstractString) = evaluate(Levenshtein(), s1, s2)
levenshtein{T}(s1::T, s2::T) = evaluate(Levenshtein(), s1, s2)
type DamerauLevenshtein end
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
function evaluate{T}(dist::DamerauLevenshtein, s1::T, s2::T)
length(s1) > length(s2) && return evaluate(dist, s2, s1)
length(s2) == 0 && return 0
@ -146,7 +146,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
return current
end
damerau_levenshtein(s1::AbstractString, s2::AbstractString) = evaluate(DamerauLevenshtein(), s1, s2)
damerau_levenshtein{T}(s1::T, s2::T) = evaluate(DamerauLevenshtein(), s1, s2)
##############################################################################
##
@ -160,7 +160,7 @@ type JaroWinkler{T1 <: Number, T2 <: Number, T3 <: Integer}
long_threshold::T3 # long string adjustment. Default to 5
end
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
function evaluate{T}(dist::JaroWinkler, s1::T, s2::T)
length(s1) > length(s2) && return evaluate(dist, s2, s1)
length(s2) == 0 && return 1.0
@ -208,11 +208,11 @@ function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
return 1 - score
end
function jaro_winkler(s1::AbstractString, s2::AbstractString;
function jaro_winkler{T}(s1::T, s2::T;
scaling_factor = 0.1, boosting_threshold = 0.7, long_threshold = 5)
evaluate(JaroWinkler(scaling_factor, boosting_threshold, long_threshold), s1, s2)
end
jaro(s1::AbstractString, s2::AbstractString) = evaluate(JaroWinkler(0.0, 0.0, 0), s1, s2)
jaro{T}(s1::T, s2::T) = evaluate(JaroWinkler(0.0, 0.0, 0), s1, s2)

View File

@ -26,28 +26,22 @@ function Base.push!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
end
function Base.pop!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
bag.dict[x] -= 1
bag.dict[x] = max(0, bag.dict[x] - 1)
return x
end
Base.in{Tv, Ti}(x::Tv, bag::Bag{Tv, Ti}) = get(bag.dict, x, 0) > 0
function Base.length(bag::Bag)
v = values(bag.dict)
if isempty(v)
return 0
else
return mapreduce(x -> max(x, 0), +, values(bag.dict))
end
end
Base.length(bag::Bag) = sum(values(bag.dict))
function Bag(s::AbstractString, q::Integer)
bag = Bag{typeof(s), Int}()
for i in 1:(length(s) - q + 1)
bag = Bag{typeof(s), UInt}()
@inbounds for i in 1:(length(s) - q + 1)
push!(bag, s[i:(i + q - 1)])
end
return bag
end
##############################################################################
##
## q-gram
@ -58,7 +52,7 @@ type QGram{T <: Integer}
q::T
end
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString)
function evaluate{T}(dist::QGram, s1::T, s2::T)
length(s1) > length(s2) && return evaluate(dist, s2, s1)
length(s2) == 0 && return 0
@ -66,7 +60,7 @@ function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString)
count = 0
n1 = length(s1) - dist.q + 1
for i1 in 1:n1
ch = s1[i1:(i1 + dist.q - 1)]
@inbounds ch = s1[i1:(i1 + dist.q - 1)]
if ch in bag
pop!(bag, ch)
count += 1
@ -76,7 +70,7 @@ function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString)
return n1 - count + length(bag)
end
qgram(s1::AbstractString, s2::AbstractString; q = 2) = evaluate(QGram(q), s1, s2)
qgram{T}(s1::T, s2::T; q = 2) = evaluate(QGram(q), s1, s2)
##############################################################################
##
@ -88,7 +82,7 @@ type Cosine{T <: Integer}
q::T
end
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
function evaluate{T}(dist::Cosine, s1::T, s2::T)
length(s1) > length(s2) && return evaluate(dist, s2, s1)
length(s2) == 0 && return 0.0
@ -96,16 +90,14 @@ function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
bag1 = Bag(s1, dist.q)
count = 0
for x in keys(bag1.dict)
if x in bag2
count += bag1.dict[x] * bag2.dict[x]
end
for (k, v1) in bag1.dict
count += v1 * get(bag2.dict, k, 0)
end
denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict)))
denominator == 0 ? 1.0 : 1.0 - count / denominator
end
cosine(s1::AbstractString, s2::AbstractString; q = 2) = evaluate(Cosine(q), s1, s2)
cosine{T}(s1::T, s2::T; q = 2) = evaluate(Cosine(q), s1, s2)
##############################################################################
##
@ -120,20 +112,20 @@ type Jaccard{T <: Integer}
q::T
end
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
function evaluate{T}(dist::Jaccard, s1::T, s2::T)
length(s1) > length(s2) && return evaluate(dist, s2, s1)
length(s2) == 0 && return 0.0
n2 = length(s2) - dist.q + 1
n1 = length(s1) - dist.q + 1
set2 = Set{typeof(s2)}()
for i2 in 1:n2
set2 = Set{T}()
@inbounds for i2 in 1:n2
push!(set2, s2[i2:(i2 + dist.q - 1)])
end
set1 = Set{typeof(s1)}()
for i1 in 1:n1
set1 = Set{T}()
@inbounds for i1 in 1:n1
push!(set1, s1[i1:(i1 + dist.q - 1)])
end
@ -147,7 +139,7 @@ function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
return 1.0 - n_intersect / (length(set1) + length(set2) - n_intersect)
end
jaccard(s1::AbstractString, s2::AbstractString; q = 2) = evaluate(Jaccard(q), s1, s2)
jaccard{T}(s1::T, s2::T; q = 2) = evaluate(Jaccard(q), s1, s2)