update with dictionary + faster
parent
9fb8c589f6
commit
0d1f2e7e9f
|
@ -1,6 +1,5 @@
|
|||
language: julia
|
||||
julia:
|
||||
- 0.6
|
||||
- nightly
|
||||
script:
|
||||
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
@time f(Jaccard(2), x, y)
|
|
@ -1,26 +1,23 @@
|
|||
|
||||
using DataStructures, StringDistances
|
||||
using StringDistances
|
||||
|
||||
x = map(randstring, rand(5:25,100_000))
|
||||
y = map(randstring, rand(5:25,100_000))
|
||||
function f(out, t, x, y)
|
||||
d = Array(out, length(x))
|
||||
@inbounds for i in 1:length(x)
|
||||
d[i] = evaluate(t, x[i], y[i])
|
||||
end
|
||||
x = map(Base.randstring, rand(5:25,500_000))
|
||||
y = map(Base.randstring, rand(5:25,500_000))
|
||||
# Benchmark helper: evaluate the distance `t` on every pair (x[i], y[i]) for
# i in 1:length(x) and collect the results into a vector.
function f(t, x, y)
    return map(i -> evaluate(t, x[i], y[i]), 1:length(x))
end
|
||||
|
||||
# similar
|
||||
@time f(Int, Levenshtein(), x, y)
|
||||
@time f(Float64, Jaro(), x, y)
|
||||
# same speed as StringDist
|
||||
@time f(Levenshtein(), x, y)
|
||||
@time f(Jaro(), x, y)
|
||||
@time f(RatcliffObershelp(), x, y)
|
||||
|
||||
# 2x slower compared to StringDist
|
||||
@time f(Int, QGram(2), x, y)
|
||||
@time f(Float64, Cosine(2), x, y)
|
||||
@time f(Float64, Jaccard(2), x, y)
|
||||
# 4x slower compared to StringDist
|
||||
@time f(Jaccard(2), x, y)
|
||||
@time f(Cosine(2), x, y)
|
||||
@time f(QGram(2), x, y)
|
||||
|
||||
#
|
||||
@time f(Float64, RatcliffObershelp(), x, y)
|
||||
|
||||
|
||||
|
||||
|
@ -29,8 +26,8 @@ end
|
|||
|
||||
#= Rcode
|
||||
library(stringdist)
|
||||
x <- sapply(sample(5:25,1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
|
||||
y <- sapply(sample(5:25,1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
|
||||
x <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
|
||||
y <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
|
||||
system.time(stringdist(x,y,method='lv', nthread = 1))
|
||||
system.time(stringdist(x,y,method='jaccard', nthread = 1))
|
||||
system.time(stringdist(x,y,method='cosine', nthread = 1))
|
||||
|
|
|
@ -69,7 +69,7 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString)
|
|||
iter = QGramIterator(s2, len2, len1)
|
||||
out = 0.0
|
||||
x = iterate(iter)
|
||||
while x != nothing
|
||||
while x !== nothing
|
||||
s, state = x
|
||||
curr = compare(dist.dist, s1, s)
|
||||
out = max(out, curr)
|
||||
|
|
|
@ -7,7 +7,7 @@ function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1
|
|||
x1 = iterate(s1)
|
||||
x2 = iterate(s2)
|
||||
l = 0
|
||||
while (x1 != nothing) && (x2 != nothing) && (l < lim || lim < 0)
|
||||
while (x1 !== nothing) && (x2 !== nothing) && (l < lim || lim < 0)
|
||||
ch1, state1 = x1
|
||||
ch2, state2 = x2
|
||||
ch1 != ch2 && break
|
||||
|
@ -53,13 +53,13 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
|
|||
v0 = collect(1:(len2 - k))
|
||||
current = 0
|
||||
i1 = 1
|
||||
while x1 != nothing
|
||||
while x1 !== nothing
|
||||
ch1, state1 = x1
|
||||
left = (i1 - 1)
|
||||
current = (i1 - 1)
|
||||
i2 = 1
|
||||
x2 = x2start
|
||||
while x2 != nothing
|
||||
while x2 !== nothing
|
||||
ch2, state2 = x2
|
||||
# update
|
||||
above, current, left = current, left, v0[i2]
|
||||
|
@ -98,7 +98,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
|||
i1 = 1
|
||||
current = i1
|
||||
prevch1, = x1
|
||||
while (x1 != nothing)
|
||||
while (x1 !== nothing)
|
||||
ch1, state1 = x1
|
||||
left = (i1 - 1)
|
||||
current = i1
|
||||
|
@ -106,7 +106,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
|||
prevch2, = x2start
|
||||
x2 = x2start
|
||||
i2 = 1
|
||||
while (x2 != nothing)
|
||||
while (x2 !== nothing)
|
||||
ch2, state2 = x2
|
||||
above = current
|
||||
thisTransCost = nextTransCost
|
||||
|
@ -166,7 +166,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
|||
i2 = 1
|
||||
x1 = iterate(s1)
|
||||
x2 = iterate(s2)
|
||||
while (x1 != nothing)
|
||||
while (x1 !== nothing)
|
||||
ch1, state1 = x1
|
||||
if i2 <= i1 - maxdist - 1
|
||||
ch2, state2 = x2
|
||||
|
@ -175,7 +175,8 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
|||
end
|
||||
i2curr = i2
|
||||
x2curr = x2
|
||||
while (x2curr != nothing) && i2curr <= i1 + maxdist
|
||||
while (x2curr !== nothing)
|
||||
(i2curr > i1 + maxdist) && break
|
||||
ch2, state2 = x2curr
|
||||
if ch1 == ch2 && !flag[i2curr]
|
||||
m += 1
|
||||
|
|
|
@ -22,6 +22,15 @@ end
|
|||
Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S
|
||||
Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S}
|
||||
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## CountIterator that uses Binary Search
|
||||
##
|
||||
## For each element in union{v1, v2}, this iterator outputs the number of times it appears in v1 and the number of times it appears in v2
|
||||
## v1 and v2 must be sorted vectors
|
||||
##
|
||||
##############################################################################
|
||||
function Base.collect(qgram::QGramIterator)
|
||||
x = Array{eltype(qgram)}(undef, length(qgram))
|
||||
i = 0
|
||||
|
@ -33,19 +42,17 @@ function Base.collect(qgram::QGramIterator)
|
|||
end
|
||||
Base.sort(qgram::QGramIterator) = sort!(collect(qgram))
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## For each element in union{v1, v2}, this iterator outputs the number of times it appears in v1 and the number of times it appears in v2
|
||||
## v1 and v2 must be sorted vectors
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
struct CountIterator{T1 <: AbstractVector, T2 <: AbstractVector}
|
||||
v1::T1
|
||||
v2::T2
|
||||
# Holds the two vectors walked by `Base.iterate(::CountIteratorBinary)`.
# Per the section header, `v1` and `v2` are expected to be sorted.
struct CountIteratorBinary{T1, T2}
    v1::Vector{T1}
    v2::Vector{T2}
end
|
||||
|
||||
function Base.iterate(s::CountIterator, state = (1, 1))
|
||||
# Build a CountIteratorBinary from two q-gram iterators by sorting each one
# (`sort` on a QGramIterator collects it and sorts the collected vector).
function CountIteratorBinary(s1::QGramIterator, s2::QGramIterator)
    sorted1 = sort(s1)
    sorted2 = sort(s2)
    return CountIteratorBinary(sorted1, sorted2)
end
|
||||
|
||||
function Base.iterate(s::CountIteratorBinary, state = (1, 1))
|
||||
state1, state2 = state
|
||||
iter1 = state2 > length(s.v2)
|
||||
iter2 = state1 > length(s.v1)
|
||||
|
@ -66,6 +73,40 @@ function Base.iterate(s::CountIterator, state = (1, 1))
|
|||
end
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## CountIterator that uses a Dictionary
|
||||
##
|
||||
## For each element in union{v1, v2}, this iterator outputs the number of times it appears in v1 and the number of times it appears in v2
|
||||
## Counts are accumulated in a Dict, so the inputs do not need to be sorted
|
||||
##
|
||||
##############################################################################
|
||||
# Wrapper with a single field `d`.
# NOTE(review): the `CountIteratorDictionary(s1::QGramIterator, s2::QGramIterator)`
# method in this file returns `values(d)` rather than an instance of this
# struct -- confirm whether the wrapper is meant to be instantiated at all.
struct CountIteratorDictionary{T}
    d::T
end
|
||||
|
||||
# Count q-gram occurrences in both iterators.
#
# Builds a Dict keyed by q-gram whose value is the pair
# (occurrences in `s1`, occurrences in `s2`) and returns `values(...)` of that
# Dict, i.e. an iterable of the count pairs (not a CountIteratorDictionary
# instance).
function CountIteratorDictionary(s1::QGramIterator, s2::QGramIterator)
    counts = Dict{eltype(s1), Tuple{Int, Int}}()
    # First pass: only `s1` has been seen so far, so the second count is 0.
    for gram in s1
        n1, _ = get(counts, gram, (0, 0))
        counts[gram] = (n1 + 1, 0)
    end
    # Second pass: bump the `s2` count, keeping whatever `s1` contributed.
    for gram in s2
        n1, n2 = get(counts, gram, (0, 0))
        counts[gram] = (n1, n2 + 1)
    end
    return values(counts)
end
|
||||
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
|
@ -75,9 +116,9 @@ end
|
|||
abstract type AbstractQGram <: SemiMetric end
|
||||
|
||||
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
|
||||
sort1 = sort(QGramIterator(s1, length(s1), dist.q))
|
||||
sort2 = sort(QGramIterator(s2, length(s2), dist.q))
|
||||
evaluate(dist, CountIterator(sort1, sort2))
|
||||
evaluate(dist,
|
||||
CountIteratorDictionary(QGramIterator(s1, length(s1), dist.q),
|
||||
QGramIterator(s2, length(s2), dist.q)))
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
|
|
Loading…
Reference in New Issue