update with dictionary + faster

pull/7/head
matthieugomez 2018-07-04 17:27:40 -04:00
parent 9fb8c589f6
commit 0d1f2e7e9f
6 changed files with 79 additions and 40 deletions

View File

@ -1,6 +1,5 @@
language: julia
julia:
- 0.6
- nightly
script:
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi

View File

@ -0,0 +1 @@
@time f(Jaccard(2), x, y)

View File

@ -1,26 +1,23 @@
using DataStructures, StringDistances
using StringDistances
x = map(randstring, rand(5:25,100_000))
y = map(randstring, rand(5:25,100_000))
function f(out, t, x, y)
d = Array(out, length(x))
@inbounds for i in 1:length(x)
d[i] = evaluate(t, x[i], y[i])
end
x = map(Base.randstring, rand(5:25,500_000))
y = map(Base.randstring, rand(5:25,500_000))
# Benchmark helper: evaluate the distance `t` on every aligned pair `(x[i], y[i])`
# and collect the results into a vector, indexed by the positions of `x`.
function f(t, x, y)
    return map(1:length(x)) do i
        evaluate(t, x[i], y[i])
    end
end
# similar
@time f(Int, Levenshtein(), x, y)
@time f(Float64, Jaro(), x, y)
# same speed as StringDist
@time f(Levenshtein(), x, y)
@time f(Jaro(), x, y)
@time f(RatcliffObershelp(), x, y)
# 2x slower compared to StringDist
@time f(Int, QGram(2), x, y)
@time f(Float64, Cosine(2), x, y)
@time f(Float64, Jaccard(2), x, y)
# 4x slower compared to StringDist
@time f(Jaccard(2), x, y)
@time f(Cosine(2), x, y)
@time f(QGram(2), x, y)
#
@time f(Float64, RatcliffObershelp(), x, y)
@ -29,8 +26,8 @@ end
#= Rcode
library(stringdist)
x <- sapply(sample(5:25,1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
y <- sapply(sample(5:25,1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
x <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
y <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
system.time(stringdist(x,y,method='lv', nthread = 1))
system.time(stringdist(x,y,method='jaccard', nthread = 1))
system.time(stringdist(x,y,method='cosine', nthread = 1))

View File

@ -69,7 +69,7 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString)
iter = QGramIterator(s2, len2, len1)
out = 0.0
x = iterate(iter)
while x != nothing
while x !== nothing
s, state = x
curr = compare(dist.dist, s1, s)
out = max(out, curr)

View File

@ -7,7 +7,7 @@ function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1
x1 = iterate(s1)
x2 = iterate(s2)
l = 0
while (x1 != nothing) && (x2 != nothing) && (l < lim || lim < 0)
while (x1 !== nothing) && (x2 !== nothing) && (l < lim || lim < 0)
ch1, state1 = x1
ch2, state2 = x2
ch1 != ch2 && break
@ -53,13 +53,13 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
v0 = collect(1:(len2 - k))
current = 0
i1 = 1
while x1 != nothing
while x1 !== nothing
ch1, state1 = x1
left = (i1 - 1)
current = (i1 - 1)
i2 = 1
x2 = x2start
while x2 != nothing
while x2 !== nothing
ch2, state2 = x2
# update
above, current, left = current, left, v0[i2]
@ -98,7 +98,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
i1 = 1
current = i1
prevch1, = x1
while (x1 != nothing)
while (x1 !== nothing)
ch1, state1 = x1
left = (i1 - 1)
current = i1
@ -106,7 +106,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
prevch2, = x2start
x2 = x2start
i2 = 1
while (x2 != nothing)
while (x2 !== nothing)
ch2, state2 = x2
above = current
thisTransCost = nextTransCost
@ -166,7 +166,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
i2 = 1
x1 = iterate(s1)
x2 = iterate(s2)
while (x1 != nothing)
while (x1 !== nothing)
ch1, state1 = x1
if i2 <= i1 - maxdist - 1
ch2, state2 = x2
@ -175,7 +175,8 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
end
i2curr = i2
x2curr = x2
while (x2curr != nothing) && i2curr <= i1 + maxdist
while (x2curr !== nothing)
(i2curr > i1 + maxdist) && break
ch2, state2 = x2curr
if ch1 == ch2 && !flag[i2curr]
m += 1

View File

@ -22,6 +22,15 @@ end
# Element type when the first type parameter `S` is already a SubString type:
# the iterator yields values of type `S` itself.
function Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T}
    return S
end
# Element type for any other `S`: the iterator yields `SubString{S}` values.
function Base.eltype(qgram::QGramIterator{S, T}) where {S, T}
    return SubString{S}
end
# Number of elements produced: `l - q + 1`, clamped at zero when `q` exceeds `l`.
function Base.length(qgram::QGramIterator)
    n = qgram.l - qgram.q + 1
    return max(n, 0)
end
##############################################################################
##
## CountIterator that uses Binary Search
##
## For each element in union{v1, v2}, this iterator outputs the number of times it appears in v1 and the number of times it appears in v2
## v1 and v2 must be sorted vectors
##
##############################################################################
function Base.collect(qgram::QGramIterator)
x = Array{eltype(qgram)}(undef, length(qgram))
i = 0
@ -33,19 +42,17 @@ function Base.collect(qgram::QGramIterator)
end
# Materialize the q-grams into a vector and return that vector sorted.
function Base.sort(qgram::QGramIterator)
    grams = collect(qgram)
    return sort!(grams)
end
##############################################################################
##
## For each element in union{v1, v2}, this iterator outputs the number of times it appears in v1 and the number of times it appears in v2
## v1 and v2 must be sorted vectors
##
##############################################################################
struct CountIterator{T1 <: AbstractVector, T2 <: AbstractVector}
v1::T1
v2::T2
# Holds the two vectors `v1` and `v2` whose elements are counted against each other.
# Per the section comment, both vectors are expected to be sorted.
struct CountIteratorBinary{A, B}
    v1::Vector{A}
    v2::Vector{B}
end
function Base.iterate(s::CountIterator, state = (1, 1))
# Build a CountIteratorBinary from two q-gram iterators by sorting the q-grams
# of each one (first `s1`, then `s2`) and storing the two sorted vectors.
function CountIteratorBinary(s1::QGramIterator, s2::QGramIterator)
    sorted1 = sort(s1)
    sorted2 = sort(s2)
    return CountIteratorBinary(sorted1, sorted2)
end
function Base.iterate(s::CountIteratorBinary, state = (1, 1))
state1, state2 = state
iter1 = state2 > length(s.v2)
iter2 = state1 > length(s.v1)
@ -66,6 +73,40 @@ function Base.iterate(s::CountIterator, state = (1, 1))
end
##############################################################################
##
## CountIterator that uses a Dictionary
##
## For each element in union{v1, v2}, this iterator outputs the number of times it appears in v1 and the number of times it appears in v2
## Unlike the sorted-vector version, v1 and v2 do not need to be sorted
##
##############################################################################
# Thin wrapper around a single field `d`, parametrized on the type of that field.
# NOTE(review): the two-argument method taking QGramIterators returns `values(d)`
# directly rather than an instance of this struct — confirm this is intended.
struct CountIteratorDictionary{D}
    d::D
end
# Count, for every q-gram occurring in `s1` or `s2`, how many times it appears in each.
#
# Returns the values of a `Dict` mapping each q-gram to a tuple
# `(count in s1, count in s2)`; the q-grams themselves are not returned.
# The inputs are consumed in iteration order and do not need to be sorted.
function CountIteratorDictionary(s1::QGramIterator, s2::QGramIterator)
    d = Dict{eltype(s1), Tuple{Int, Int}}()
    for ch1 in s1
        # `get` with a default avoids the separate haskey + getindex lookups.
        n1, n2 = get(d, ch1, (0, 0))
        d[ch1] = (n1 + 1, n2)
    end
    for ch2 in s2
        n1, n2 = get(d, ch2, (0, 0))
        d[ch2] = (n1, n2 + 1)
    end
    return values(d)
end
##############################################################################
##
@ -75,9 +116,9 @@ end
abstract type AbstractQGram <: SemiMetric end
# Evaluate a q-gram based distance between `s1` and `s2`.
#
# A QGramIterator with q = `dist.q` is built over each string; the per-q-gram
# counts gathered by CountIteratorDictionary are then handed to the
# `evaluate(dist, counts)` method of the concrete distance type.
# The earlier sort-based path (sorting both q-gram lists and evaluating on
# CountIterator) was dropped here: its result was discarded, so it only cost time.
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
    qgrams1 = QGramIterator(s1, length(s1), dist.q)
    qgrams2 = QGramIterator(s2, length(s2), dist.q)
    return evaluate(dist, CountIteratorDictionary(qgrams1, qgrams2))
end
##############################################################################