update with dictionary + faster

2018-07-04 17:27:40 -04:00 · 2018-07-04 17:27:40 -04:00 · 0d1f2e7e9f
parent 9fb8c589f6
commit 0d1f2e7e9f
6 changed files with 79 additions and 40 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -1,6 +1,5 @@
 language: julia
 julia:
- 0.6
 - nightly
 script:
 - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
--- a/benchmark/.sublime2Terminal.jl
+++ b/benchmark/.sublime2Terminal.jl
@ -0,0 +1 @@
+@time f(Jaccard(2), x, y)
--- a/benchmark/benchmark.jl
+++ b/benchmark/benchmark.jl
@ -1,26 +1,23 @@

-using DataStructures, StringDistances
+using StringDistances

-x = map(randstring, rand(5:25,100_000))
-y = map(randstring, rand(5:25,100_000))
-function f(out, t, x, y)
-    d = Array(out, length(x))
-    @inbounds for i in 1:length(x)
-        d[i] = evaluate(t, x[i], y[i])
-    end
+x = map(Base.randstring, rand(5:25,500_000))
+y = map(Base.randstring, rand(5:25,500_000))
+function f(t, x, y)
+    [evaluate(t, x[i], y[i]) for i in 1:length(x)]
 end

-# similar
-@time f(Int, Levenshtein(), x, y)
-@time f(Float64, Jaro(), x, y)
+# same speed as the R package stringdist
+@time f(Levenshtein(), x, y)
+@time f(Jaro(), x, y)
+@time f(RatcliffObershelp(), x, y)

-# 2x slower compared to StringDist
-@time f(Int, QGram(2), x, y)
-@time f(Float64, Cosine(2), x, y)
-@time f(Float64, Jaccard(2), x, y)
+# 4x slower compared to the R package stringdist
+@time f(Jaccard(2), x, y)
+@time f(Cosine(2), x, y)
+@time f(QGram(2), x, y)

 #
-@time f(Float64, RatcliffObershelp(), x, y)



@ -29,8 +26,8 @@ end

 #= Rcode
 library(stringdist)
-x <- sapply(sample(5:25,1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse="")) 
-y <- sapply(sample(5:25,1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
+x <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse="")) 
+y <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
 system.time(stringdist(x,y,method='lv', nthread = 1))
 system.time(stringdist(x,y,method='jaccard', nthread = 1))
 system.time(stringdist(x,y,method='cosine', nthread = 1))
--- a/src/compare.jl
+++ b/src/compare.jl
@ -69,7 +69,7 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString)
    iter = QGramIterator(s2, len2, len1)
    out = 0.0
    x = iterate(iter)
-    while x != nothing
+    while x !== nothing
        s, state = x
        curr = compare(dist.dist, s1, s)
        out = max(out, curr)
--- a/src/distances/edit.jl
+++ b/src/distances/edit.jl
@ -7,7 +7,7 @@ function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1
    x1 = iterate(s1)
    x2 = iterate(s2)
    l = 0
-    while (x1 != nothing) && (x2 != nothing) && (l < lim || lim < 0)
+    while (x1 !== nothing) && (x2 !== nothing) && (l < lim || lim < 0)
        ch1, state1 = x1
        ch2, state2 = x2
        ch1 != ch2 && break
@ -53,13 +53,13 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
    v0 = collect(1:(len2 - k))
    current = 0
    i1 = 1
-    while x1 != nothing
+    while x1 !== nothing
        ch1, state1 = x1
        left = (i1 - 1)
        current = (i1 - 1)
        i2 = 1
        x2 = x2start
-        while x2 != nothing
+        while x2 !== nothing
            ch2, state2 = x2
            #  update
            above, current, left = current, left, v0[i2]
@ -98,7 +98,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
    i1 = 1
    current = i1
    prevch1, = x1
-    while (x1 != nothing)
+    while (x1 !== nothing)
        ch1, state1 = x1
        left = (i1 - 1) 
        current = i1 
@ -106,7 +106,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
        prevch2, = x2start
        x2 = x2start
        i2 = 1
-        while (x2 != nothing)
+        while (x2 !== nothing)
            ch2, state2 = x2
            above = current
            thisTransCost = nextTransCost
@ -166,7 +166,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
    i2 = 1
    x1 = iterate(s1)
    x2 = iterate(s2)
-    while (x1 != nothing)
+    while (x1 !== nothing)
        ch1, state1 = x1
        if i2 <= i1 - maxdist - 1
            ch2, state2 = x2
@ -175,7 +175,8 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
        end 
        i2curr = i2
        x2curr = x2
-        while (x2curr != nothing) && i2curr <= i1 + maxdist
+        while (x2curr !== nothing)
+            (i2curr > i1 + maxdist) && break
            ch2, state2 = x2curr
            if ch1 == ch2 && !flag[i2curr] 
                m += 1
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -22,6 +22,15 @@ end
 Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S
 Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S}
 Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
+
+##############################################################################
+##
+## CountIteratorBinary: counting iterator that uses binary search
+##
+## For each element in union{v1, v2}, this iterator outputs the number of times it appears in v1 and the number of times it appears in v2
+## v1 and v2 must be sorted vectors
+##
+##############################################################################
 function Base.collect(qgram::QGramIterator)
 	x = Array{eltype(qgram)}(undef, length(qgram))
 	i = 0
@ -33,19 +42,17 @@ function Base.collect(qgram::QGramIterator)
 end
 Base.sort(qgram::QGramIterator) = sort!(collect(qgram))

-##############################################################################
-##
-## For each element in union{v1, v2}, this iterator output numbers of times it appears in v1 and the number of times it appears in v2
-## v1 and v2 must be sorted vectors
-##
-##############################################################################

-struct CountIterator{T1 <: AbstractVector, T2 <: AbstractVector}
-	v1::T1
-	v2::T2
+struct CountIteratorBinary{T1, T2}
+	v1::Vector{T1}
+	v2::Vector{T2}
 end

-function Base.iterate(s::CountIterator, state = (1, 1))
+function CountIteratorBinary(s1::QGramIterator, s2::QGramIterator)
+	CountIteratorBinary(sort(s1), sort(s2))
+end
+
+function Base.iterate(s::CountIteratorBinary, state = (1, 1))
 	state1, state2 = state
 	iter1 = state2 > length(s.v2)
 	iter2 = state1 > length(s.v1)
@ -66,6 +73,40 @@ function Base.iterate(s::CountIterator, state = (1, 1))
 end


+##############################################################################
+##
+## CountIteratorDictionary: counting iterator that uses a dictionary
+##
+## For each q-gram appearing in s1 or s2, this yields a tuple (number of times it appears in s1, number of times it appears in s2)
+## s1 and s2 do not need to be sorted; the tuples are yielded in the dictionary's iteration order
+##
+##############################################################################
+struct CountIteratorDictionary{T}
+	d::T
+end
+
+function CountIteratorDictionary(s1::QGramIterator, s2::QGramIterator)
+	d = Dict{eltype(s1), Tuple{Int, Int}}()
+	for ch1 in s1
+		if haskey(d, ch1)
+			t = d[ch1]
+			d[ch1] = (t[1] + 1, 0)
+		else
+			d[ch1] = (1, 0)
+		end
+	end
+	for ch2 in s2
+		if haskey(d, ch2)
+			t = d[ch2]
+			d[ch2] = (t[1], t[2] + 1)
+		else
+			d[ch2] = (0, 1)
+		end
+	end
+	return values(d)
+end
+
+

 ##############################################################################
 ##
@ -75,9 +116,9 @@ end
 abstract type AbstractQGram <: SemiMetric end

 function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
-	sort1 = sort(QGramIterator(s1, length(s1), dist.q))
-	sort2 = sort(QGramIterator(s2, length(s2), dist.q))
-	evaluate(dist, CountIterator(sort1, sort2))
+	evaluate(dist, 
+		CountIteratorDictionary(QGramIterator(s1, length(s1), dist.q), 
+		QGramIterator(s2, length(s2), dist.q)))
 end

 ##############################################################################