use binary search

tree
matthieugomez 2015-11-02 12:52:23 -05:00
parent d5b9905d72
commit 5efbfd27a1
4 changed files with 127 additions and 69 deletions

View File

@ -1,5 +1,5 @@
using StringDistances
using DataStructures, StringDistances
# Benchmark inputs: two vectors of 100_000 random strings, each string having
# a length drawn at random from 5:25.
x = map(randstring, rand(5:25,100_000))
y = map(randstring, rand(5:25,100_000))
@ -33,4 +33,30 @@ system.time(stringdist(x,y,method='jaccard'))
system.time(stringdist(x,y,method='cosine'))
system.time(stringdist(x,y,method='qgram'))
=#
=#
# Benchmark driver: evaluates the Jaccard(2) distance for every pair
# (x[i], y[i]), handing two preallocated 25-slot q-gram buffers to each call.
# The distances are stored in a local array that is not returned; the function
# exists only to be timed.
# NOTE(review): assumes an `evaluate` method that accepts the two trailing
# buffer arguments exists — confirm against the Jaccard implementation.
function f(x, y)
    d = Array(Float64, length(x))
    sort1 = Array(SubString{ASCIIString}, 25)
    sort2 = Array(SubString{ASCIIString}, 25)
    @inbounds for i in 1:length(x)
        # Fixed: the two buffer arguments were missing their separating comma,
        # which made the call a syntax error.
        d[i] = evaluate(Jaccard(2), x[i], y[i], sort1, sort2)
    end
end
# Time the buffer-based variant (a first call also includes compilation time).
@time f(x, y)
# Benchmark driver, Set-based variant: evaluates the Jaccard(2) distance for
# every pair (x[i], y[i]), passing the same two reusable Sets of substrings to
# each call. The distances are kept in a local array and nothing is returned.
function g(x, y)
    npairs = length(x)
    distances = Array(Float64, npairs)
    qgrams1 = Set{SubString{ASCIIString}}()
    qgrams2 = Set{SubString{ASCIIString}}()
    @inbounds for i in 1:npairs
        left, right = x[i], y[i]
        distances[i] = evaluate(Jaccard(2), left, right, qgrams1, qgrams2)
    end
    return nothing
end
# Time the Set-based variant (a first call also includes compilation time).
@time g(x, y)

View File

@ -9,7 +9,6 @@ module StringDistances
##############################################################################
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
import DataStructures: OrderedSet, OrderedDict
export evaluate,
Hamming, hamming,
Levenshtein, levenshtein,
@ -24,12 +23,12 @@ Winkler
# 1. only do the switch once
# 2. precomputes length(s1), length(s2)
# Generic entry point: measures both strings once, then calls the
# length-aware method with the shorter string (and its length) first.
# In the variant that takes `x...`, any trailing arguments are forwarded
# unchanged to the length-aware method.
function evaluate(dist::PreMetric, s1::AbstractString, s2::AbstractString)
function evaluate(dist::PreMetric, s1::AbstractString, s2::AbstractString, x...)
len1, len2 = length(s1), length(s2)
# Swap the operands when s1 is the longer string.
if len1 > len2
return evaluate(dist, s2, s1, len2, len1)
return evaluate(dist, s2, s1, len2, len1, x...)
else
return evaluate(dist, s1, s2, len1, len2)
return evaluate(dist, s1, s2, len1, len2, x...)
end
end

View File

@ -24,53 +24,49 @@ function Base.done(qgram::QGramIterator, state)
done(qgram.s, idend)
end
# Element type reported for the iterator: a SubString over the wrapped string's type.
Base.eltype(qgram::QGramIterator) = SubString{typeof(qgram.s)}
Base.length(qgram::QGramIterator) = length(qgram.s) - qgram.q + 1
# Number of q-grams; clamped at 0 so that a string shorter than q does not
# report a negative length.
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)
# Materialize the q-grams of `qiter` into a freshly allocated vector whose
# element type and length are taken from `eltype(qiter)` and `length(qiter)`.
# Elements are stored in iteration order.
function Base.collect(qiter::QGramIterator)
    grams = Array(eltype(qiter), length(qiter))
    @inbounds for (slot, gram) in enumerate(qiter)
        grams[slot] = gram
    end
    return grams
end
##############################################################################
##
## A Bag is a Set that allows duplicated values
## Implemented as Dictionary from elements => number of duplicates
## Define some operations on sorted vectors that represent q-grams
##
##############################################################################
type Bag{Tv, Ti <: Integer}
dict::Dict{Tv, Ti}
Bag() = new(Dict{Tv, Ti}())
# Euclidean norm of the multiplicities in a sorted vector: each run of equal
# values contributes the square of its length, and the square root of the
# total is returned. `v` is expected to be sorted in `Base.Forward` order,
# because run boundaries are located by binary search.
function _norm2(v::AbstractVector)
    n = length(v)
    sumsq = 0
    lo = 1
    while lo <= n
        # Last index of the run of values equal to v[lo], searched within lo:n.
        hi = searchsortedlast(v, v[lo], lo, n, Base.Forward)
        runlen = hi - lo + 1
        sumsq += runlen^2
        lo = hi + 1
    end
    return sqrt(sumsq)
end
function Base.push!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
bag.dict[x] = get(bag.dict, x, zero(Ti)) + one(Ti)
return bag
# Count the distinct values in a sorted vector by hopping from the start of
# each run of equal values to the start of the next one. `v` is expected to be
# sorted in `Base.Forward` order, because run boundaries are located by
# binary search.
function _ndistinct(v::AbstractVector)
    n = length(v)
    nruns = 0
    lo = 1
    while lo <= n
        # Jump past the last index holding the same value as v[lo].
        lo = searchsortedlast(v, v[lo], lo, n, Base.Forward) + 1
        nruns += 1
    end
    return nruns
end
# Forward a capacity hint to the dictionary that backs the bag.
function Base.sizehint!(bag::Bag, i::Integer)
    return sizehint!(bag.dict, i)
end
# Remove one occurrence of `x` from the bag: the stored count is decremented
# only when it is positive. A key that is absent (or already at zero) leaves
# the bag unchanged. Note that the return value is `x`, not the bag.
function Base.delete!{Tv, Ti}(bag::Bag{Tv, Ti}, x)
    count = get(bag.dict, x, zero(Ti))
    if count > zero(Ti)
        bag.dict[x] = count - one(Ti)
    end
    return x
end
# Total number of elements in the bag, duplicates included: the sum of all
# stored counts, converted to Int.
function Base.length(bag::Bag)
    total = sum(values(bag.dict))
    return convert(Int, total)
end
# Build a bag (with UInt counts) holding every q-gram produced by `s`.
# The backing storage is given a size hint of `length(s)` before filling.
function Bag(s::QGramIterator)
    counts = Bag{eltype(s), UInt}()
    sizehint!(counts, length(s))
    for gram in s
        push!(counts, gram)
    end
    return counts
end
# Build the set of distinct q-grams produced by `s`.
# The set is given a size hint of `length(s)` before filling.
function Base.Set(s::QGramIterator)
    distinct = Set{eltype(s)}()
    sizehint!(distinct, length(s))
    for gram in s
        push!(distinct, gram)
    end
    return distinct
end
##############################################################################
##
## q-gram
@ -89,15 +85,27 @@ function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString, len1::Int
len2 == 0 && return 0
q1 = QGramIterator(s1, dist.q)
q2 = QGramIterator(s2, dist.q)
sort1 = sort!(collect(q1))
lenq1 = length(sort1)
bag2 = Bag(q2)
for ch in q1
delete!(bag2, ch)
end
# number non matched in s1 : n1 - (n2 - length(bag))
# number non matched in s2 : length(bag)
return length(q1) - length(q2) + 2 * length(bag2)
q2 = QGramIterator(s2, dist.q)
sort2 = sort!(collect(q2))
lenq2 = length(sort2)
numerator = 0
i1start = 1
i2start = 1
while i1start <= lenq1
ch1 = sort1[i1start]
i1end = searchsortedlast(sort1, ch1, i1start, lenq1, Base.Forward)
i2range = searchsorted(sort2, ch1, i2start, lenq2, Base.Forward)
numerator += first(i2range) - i2start
numerator += abs((i1end - i1start + 1) - length(i2range))
i1start = i1end + 1
i2start = last(i2range) + 1
end
numerator += lenq2 - i2start + 1
return numerator
end
# Convenience wrapper: evaluate the QGram distance between `s1` and `s2`,
# with the q-gram parameter given by keyword `q` (default 2).
function qgram(s1::AbstractString, s2::AbstractString; q::Integer = 2)
    return evaluate(QGram(q), s1, s2)
end
@ -114,19 +122,33 @@ type Cosine{T <: Integer} <: SemiMetric
end
# Zero-argument constructor: equivalent to Cosine(2).
Cosine() = Cosine(2)
# Cosine distance between the q-gram count profiles of s1 and s2, where the
# q-gram parameter is dist.q. Returns 0.0 immediately when len2 == 0.
# When the denominator is zero, the result is 0.0 if s1 == s2 and 1.0 otherwise.
# NOTE(review): presumably len2 is the length of the longer string, as the
# generic entry point orders its arguments that way — confirm.
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
len2 == 0 && return 0.0
q1 = QGramIterator(s1, dist.q)
q2 = QGramIterator(s2, dist.q)
# Sorted q-grams of s1; equal q-grams become contiguous runs.
sort1 = sort!(collect(q1))
lenq1 = length(sort1)
q2 = QGramIterator(s2, dist.q)
# Sorted q-grams of s2.
sort2 = sort!(collect(q2))
lenq2 = length(sort2)
bag1 = Bag(q1)
bag2 = Bag(q2)
numerator = 0
for (k, v1) in bag1.dict
numerator += v1 * get(bag2.dict, k, 0)
norm1 = 0
i1start = 1
i2start = 1
# Walk sort1 one run of equal q-grams at a time.
while i1start <= lenq1
ch1 = sort1[i1start]
# End of the current run in sort1.
i1end = searchsortedlast(sort1, ch1, i1start, lenq1, Base.Forward)
# Range of the same q-gram in sort2, searched from i2start onward.
i2range = searchsorted(sort2, ch1, i2start, lenq2, Base.Forward)
# Dot-product term: (count in s1) * (count in s2).
numerator += (i1end - i1start + 1) * length(i2range)
# Squared count in s1, accumulated for the norm of the first profile.
norm1 += (i1end - i1start + 1)^2
i1start = i1end + 1
i2start = last(i2range) + 1
end
denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict)))
denominator = sqrt(norm1) * _norm2(sort2)
return denominator != 0 ? 1.0 - numerator / denominator : s1 == s2 ? 0.0 : 1.0
end
@ -142,7 +164,6 @@ cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Cosine
## return 1.0 if smaller than qgram
##
##############################################################################
# Jaccard distance type, parameterised by an integer `q` that is passed to
# QGramIterator as the q-gram parameter when the distance is evaluated.
type Jaccard{T <: Integer} <: SemiMetric
q::T
end
@ -152,17 +173,29 @@ function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::I
len2 == 0 && return 0.0
q1 = QGramIterator(s1, dist.q)
q2 = QGramIterator(s2, dist.q)
sort1 = sort!(collect(q1))
lenq1 = length(q1)
q2 = QGramIterator(s2, dist.q)
sort2 = sort!(collect(q2))
lenq2 = length(q2)
set1 = Set(q1)
set2 = Set(q2)
numerator = 0
for x in set1
if x in set2
numerator += 1
end
i1start = 1
i2start = 1
norm1 = 0
while i1start <= lenq1
ch1 = sort1[i1start]
i1end = searchsortedlast(sort1, ch1, i1start, lenq1, Base.Forward)
i2range = searchsorted(sort2, ch1, i2start, lenq2, Base.Forward)
numerator += length(i2range) > 0
norm1 += 1
i1start = i1end + 1
i2start = last(i2range) + 1
end
denominator = length(set1) + length(set2) - numerator
norm2 = _ndistinct(sort2)
denominator = norm1 + norm2 - numerator
return denominator != 0 ? 1.0 - numerator / denominator : s1 == s2 ? 0.0 : 1.0
end

View File

@ -109,7 +109,7 @@ for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.24722222 0.16190476 0.48809524 0.49166667 0.07407407 0.16666667 0.21666667]),
(Winkler(Jaro(), 0.1, 1.0), [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.22250000 0.16190476 0.43928571 0.49166667 0.04444444 0.16666667 0.17333333]),
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
(QGram(2), [ 6 7 7 1 1 0 4 4 7 8 4 13 32 8 6 5]),
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
(Jaccard(1), [0.0000000 0.4285714 0.3750000 0.1666667 1.0 0.0000000 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0000000 0.0000000 0.2500000]),
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 0.0 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 1.0 0.0 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))