simplify jaccard
parent
c8dfb12d13
commit
a3a8d2daca
|
@ -5,7 +5,7 @@ x = map(randstring, rand(5:25,100_000))
|
|||
y = map(randstring, rand(5:25,100_000))
|
||||
function f(out, t, x, y)
|
||||
d = Array(out, length(x))
|
||||
for i in 1:length(x)
|
||||
@inbounds for i in 1:length(x)
|
||||
d[i] = StringDistances.evaluate(t, x[i], y[i])
|
||||
end
|
||||
end
|
||||
|
@ -16,9 +16,7 @@ end
|
|||
@time f(Int, Levenshtein(), x, y)
|
||||
@time f(Float64, Jaccard(2), x, y)
|
||||
@time f(Float64, Cosine(2), x, y)
|
||||
|
||||
|
||||
|
||||
@time f(Float64, Cosine(2), x, y)
|
||||
#= Rcode
|
||||
library(stringdist)
|
||||
x <- sapply(sample(5:25,1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
|
||||
|
|
|
@ -25,14 +25,15 @@ function Base.push!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
|
|||
return bag
|
||||
end
|
||||
|
||||
function Base.pop!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
|
||||
bag.dict[x] = max(0, bag.dict[x] - 1)
|
||||
function Base.delete!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
|
||||
v = get(bag.dict, x, 0)
|
||||
if v > 0
|
||||
bag.dict[x] = v - 1
|
||||
end
|
||||
return x
|
||||
end
|
||||
|
||||
Base.in{Tv, Ti}(x::Tv, bag::Bag{Tv, Ti}) = get(bag.dict, x, 0) > 0
|
||||
|
||||
Base.length(bag::Bag) = sum(values(bag.dict))
|
||||
Base.length(bag::Bag) = convert(Int, sum(values(bag.dict)))
|
||||
|
||||
function Bag(s::AbstractString, q::Integer)
|
||||
bag = Bag{typeof(s), UInt}()
|
||||
|
@ -55,19 +56,17 @@ end
|
|||
function evaluate{T}(dist::QGram, s1::T, s2::T)
|
||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
||||
length(s2) == 0 && return 0
|
||||
|
||||
n2 = length(s2) - dist.q + 1
|
||||
bag = Bag(s2, dist.q)
|
||||
count = 0
|
||||
n1 = length(s1) - dist.q + 1
|
||||
for i1 in 1:n1
|
||||
@inbounds ch = s1[i1:(i1 + dist.q - 1)]
|
||||
if ch in bag
|
||||
pop!(bag, ch)
|
||||
count += 1
|
||||
end
|
||||
delete!(bag, ch)
|
||||
end
|
||||
|
||||
return n1 - count + length(bag)
|
||||
# number non matched in s1 : n1 - (n2 - length(bag))
|
||||
# number non matched in s2 : length(bag)
|
||||
return n1 - n2 + 2 * length(bag)
|
||||
end
|
||||
|
||||
qgram{T}(s1::T, s2::T; q = 2) = evaluate(QGram(q), s1, s2)
|
||||
|
|
Loading…
Reference in New Issue