simplify jaccard

pull/1/head
matthieugomez 2015-10-23 19:09:42 -04:00
parent c8dfb12d13
commit a3a8d2daca
2 changed files with 13 additions and 16 deletions

View File

@ -5,7 +5,7 @@ x = map(randstring, rand(5:25,100_000))
y = map(randstring, rand(5:25,100_000))
function f(out, t, x, y)
d = Array(out, length(x))
for i in 1:length(x)
@inbounds for i in 1:length(x)
d[i] = StringDistances.evaluate(t, x[i], y[i])
end
end
@ -16,9 +16,7 @@ end
@time f(Int, Levenshtein(), x, y)
@time f(Float64, Jaccard(2), x, y)
@time f(Float64, Cosine(2), x, y)
@time f(Float64, Cosine(2), x, y)
#= Rcode
library(stringdist)
x <- sapply(sample(5:25,1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))

View File

@ -25,14 +25,15 @@ function Base.push!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
return bag
end
function Base.pop!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
bag.dict[x] = max(0, bag.dict[x] - 1)
function Base.delete!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
v = get(bag.dict, x, 0)
if v > 0
bag.dict[x] = v - 1
end
return x
end
Base.in{Tv, Ti}(x::Tv, bag::Bag{Tv, Ti}) = get(bag.dict, x, 0) > 0
Base.length(bag::Bag) = sum(values(bag.dict))
Base.length(bag::Bag) = convert(Int, sum(values(bag.dict)))
function Bag(s::AbstractString, q::Integer)
bag = Bag{typeof(s), UInt}()
@ -55,19 +56,17 @@ end
function evaluate{T}(dist::QGram, s1::T, s2::T)
length(s1) > length(s2) && return evaluate(dist, s2, s1)
length(s2) == 0 && return 0
n2 = length(s2) - dist.q + 1
bag = Bag(s2, dist.q)
count = 0
n1 = length(s1) - dist.q + 1
for i1 in 1:n1
@inbounds ch = s1[i1:(i1 + dist.q - 1)]
if ch in bag
pop!(bag, ch)
count += 1
end
delete!(bag, ch)
end
return n1 - count + length(bag)
# number of unmatched q-grams in s1: n1 - (n2 - length(bag))
# number of unmatched q-grams in s2: length(bag)
return n1 - n2 + 2 * length(bag)
end
qgram{T}(s1::T, s2::T; q = 2) = evaluate(QGram(q), s1, s2)