From 0d1f2e7e9f0943d26a17edbd889cf6664982558b Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Wed, 4 Jul 2018 17:27:40 -0400 Subject: [PATCH] update with dictionary + faster --- .travis.yml | 1 - benchmark/.sublime2Terminal.jl | 1 + benchmark/benchmark.jl | 33 ++++++++--------- src/compare.jl | 2 +- src/distances/edit.jl | 15 ++++---- src/distances/qgram.jl | 67 +++++++++++++++++++++++++++------- 6 files changed, 79 insertions(+), 40 deletions(-) create mode 100644 benchmark/.sublime2Terminal.jl diff --git a/.travis.yml b/.travis.yml index dbd44b1..a69e3d5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ language: julia julia: -- 0.6 - nightly script: - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi diff --git a/benchmark/.sublime2Terminal.jl b/benchmark/.sublime2Terminal.jl new file mode 100644 index 0000000..1185d75 --- /dev/null +++ b/benchmark/.sublime2Terminal.jl @@ -0,0 +1 @@ +@time f(Jaccard(2), x, y) diff --git a/benchmark/benchmark.jl b/benchmark/benchmark.jl index 001c7f4..ca4a394 100644 --- a/benchmark/benchmark.jl +++ b/benchmark/benchmark.jl @@ -1,26 +1,23 @@ -using DataStructures, StringDistances +using StringDistances -x = map(randstring, rand(5:25,100_000)) -y = map(randstring, rand(5:25,100_000)) -function f(out, t, x, y) - d = Array(out, length(x)) - @inbounds for i in 1:length(x) - d[i] = evaluate(t, x[i], y[i]) - end +x = map(Base.randstring, rand(5:25,500_000)) +y = map(Base.randstring, rand(5:25,500_000)) +function f(t, x, y) + [evaluate(t, x[i], y[i]) for i in 1:length(x)] end -# similar -@time f(Int, Levenshtein(), x, y) -@time f(Float64, Jaro(), x, y) +# same speed as StringDist +@time f(Levenshtein(), x, y) +@time f(Jaro(), x, y) +@time f(RatcliffObershelp(), x, y) -# 2x slower compared to StringDist -@time f(Int, QGram(2), x, y) -@time f(Float64, Cosine(2), x, y) -@time f(Float64, Jaccard(2), x, y) +# 4x slower compared to StringDist +@time f(Jaccard(2), x, y) +@time f(Cosine(2), x, y) +@time f(QGram(2), x, y) # 
-@time f(Float64, RatcliffObershelp(), x, y) @@ -29,8 +26,8 @@ end #= Rcode library(stringdist) -x <- sapply(sample(5:25,1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse="")) -y <- sapply(sample(5:25,1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse="")) +x <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse="")) +y <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse="")) system.time(stringdist(x,y,method='lv', nthread = 1)) system.time(stringdist(x,y,method='jaccard', nthread = 1)) system.time(stringdist(x,y,method='cosine', nthread = 1)) diff --git a/src/compare.jl b/src/compare.jl index 6abd4ea..d14322d 100644 --- a/src/compare.jl +++ b/src/compare.jl @@ -69,7 +69,7 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString) iter = QGramIterator(s2, len2, len1) out = 0.0 x = iterate(iter) - while x != nothing + while x !== nothing s, state = x curr = compare(dist.dist, s1, s) out = max(out, curr) diff --git a/src/distances/edit.jl b/src/distances/edit.jl index a03c980..6c0f955 100644 --- a/src/distances/edit.jl +++ b/src/distances/edit.jl @@ -7,7 +7,7 @@ function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1 x1 = iterate(s1) x2 = iterate(s2) l = 0 - while (x1 != nothing) && (x2 != nothing) && (l < lim || lim < 0) + while (x1 !== nothing) && (x2 !== nothing) && (l < lim || lim < 0) ch1, state1 = x1 ch2, state2 = x2 ch1 != ch2 && break @@ -53,13 +53,13 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString) v0 = collect(1:(len2 - k)) current = 0 i1 = 1 - while x1 != nothing + while x1 !== nothing ch1, state1 = x1 left = (i1 - 1) current = (i1 - 1) i2 = 1 x2 = x2start - while x2 != nothing + while x2 !== nothing ch2, state2 = x2 # update above, current, left = current, left, v0[i2] @@ -98,7 +98,7 @@ function evaluate(dist::DamerauLevenshtein, 
s1::AbstractString, s2::AbstractStri i1 = 1 current = i1 prevch1, = x1 - while (x1 != nothing) + while (x1 !== nothing) ch1, state1 = x1 left = (i1 - 1) current = i1 @@ -106,7 +106,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri prevch2, = x2start x2 = x2start i2 = 1 - while (x2 != nothing) + while (x2 !== nothing) ch2, state2 = x2 above = current thisTransCost = nextTransCost @@ -166,7 +166,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) i2 = 1 x1 = iterate(s1) x2 = iterate(s2) - while (x1 != nothing) + while (x1 !== nothing) ch1, state1 = x1 if i2 <= i1 - maxdist - 1 ch2, state2 = x2 @@ -175,7 +175,8 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) end i2curr = i2 x2curr = x2 - while (x2curr != nothing) && i2curr <= i1 + maxdist + while (x2curr !== nothing) + (i2curr > i1 + maxdist) && break ch2, state2 = x2curr if ch1 == ch2 && !flag[i2curr] m += 1 diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index 982c0b8..18ddf8a 100644 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -22,6 +22,15 @@ end Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S} Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0) + +############################################################################## +## +## CountIteratorBinary: count iterator that uses binary search +## +## For each element in union{v1, v2}, this iterator outputs the number of times it appears in v1 and the number of times it appears in v2 +## v1 and v2 must be sorted vectors +## +############################################################################## function Base.collect(qgram::QGramIterator) x = Array{eltype(qgram)}(undef, length(qgram)) i = 0 @@ -33,19 +42,17 @@ function Base.collect(qgram::QGramIterator) end Base.sort(qgram::QGramIterator) = sort!(collect(qgram))
-############################################################################## -## -## For each element in union{v1, v2}, this iterator output numbers of times it appears in v1 and the number of times it appears in v2 -## v1 and v2 must be sorted vectors -## -############################################################################## -struct CountIterator{T1 <: AbstractVector, T2 <: AbstractVector} - v1::T1 - v2::T2 +struct CountIteratorBinary{T1, T2} + v1::Vector{T1} + v2::Vector{T2} end -function Base.iterate(s::CountIterator, state = (1, 1)) +function CountIteratorBinary(s1::QGramIterator, s2::QGramIterator) + CountIteratorBinary(sort(s1), sort(s2)) +end + +function Base.iterate(s::CountIteratorBinary, state = (1, 1)) state1, state2 = state iter1 = state2 > length(s.v2) iter2 = state1 > length(s.v1) @@ -66,6 +73,40 @@ function Base.iterate(s::CountIterator, state = (1, 1)) end +############################################################################## +## +## CountIteratorDictionary: count iterator that uses a dictionary +## +## For each element in union{s1, s2}, this iterator outputs the number of times it appears in s1 and the number of times it appears in s2 +## s1 and s2 are QGramIterators and need not be sorted; the constructor returns values(d), i.e. the (count in s1, count in s2) tuples +## +############################################################################## +struct CountIteratorDictionary{T} + d::T +end + +function CountIteratorDictionary(s1::QGramIterator, s2::QGramIterator) + d = Dict{eltype(s1), Tuple{Int, Int}}() + for ch1 in s1 + if haskey(d, ch1) + t = d[ch1] + d[ch1] = (t[1] + 1, 0) + else + d[ch1] = (1, 0) + end + end + for ch2 in s2 + if haskey(d, ch2) + t = d[ch2] + d[ch2] = (t[1], t[2] + 1) + else + d[ch2] = (0, 1) + end + end + return values(d) +end + + ############################################################################## ## @@ -75,9 +116,9 @@ end abstract type AbstractQGram <: SemiMetric end function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString) - sort1 = sort(QGramIterator(s1, length(s1), dist.q)) - sort2 =
sort(QGramIterator(s2, length(s2), dist.q)) - evaluate(dist, CountIterator(sort1, sort2)) + evaluate(dist, + CountIteratorDictionary(QGramIterator(s1, length(s1), dist.q), + QGramIterator(s2, length(s2), dist.q))) end ##############################################################################