use binary search
parent
d5b9905d72
commit
5efbfd27a1
|
@ -1,5 +1,5 @@
|
|||
|
||||
using StringDistances
|
||||
using DataStructures, StringDistances
|
||||
|
||||
x = map(randstring, rand(5:25,100_000))
|
||||
y = map(randstring, rand(5:25,100_000))
|
||||
|
@ -33,4 +33,30 @@ system.time(stringdist(x,y,method='jaccard'))
|
|||
system.time(stringdist(x,y,method='cosine'))
|
||||
system.time(stringdist(x,y,method='qgram'))
|
||||
|
||||
=#
|
||||
=#
|
||||
|
||||
|
||||
|
||||
# Benchmark driver: evaluates the bigram Jaccard distance of every pair
# (x[i], y[i]), handing `evaluate` two preallocated 25-slot vectors of
# substrings as trailing arguments.
# The distances are stored in a local vector that is not returned; the
# function is only meant to be timed.
function f(x, y)
    d = Array(Float64, length(x))
    sort1 = Array(SubString{ASCIIString}, 25)
    sort2 = Array(SubString{ASCIIString}, 25)
    @inbounds for i in 1:length(x)
        # The two scratch vectors are separate arguments and must be
        # comma-separated; without the comma the call does not parse.
        d[i] = evaluate(Jaccard(2), x[i], y[i], sort1, sort2)
    end
end
|
||||
@time f(x, y)
|
||||
|
||||
|
||||
|
||||
|
||||
# Benchmark driver: computes the bigram Jaccard distance of every pair
# (x[k], y[k]), passing two initially empty sets of substrings to `evaluate`
# as trailing arguments. The results are stored in a local vector that is
# not returned.
function g(x, y)
    out = Array(Float64, length(x))
    scratch1 = Set{SubString{ASCIIString}}()
    scratch2 = Set{SubString{ASCIIString}}()
    @inbounds for k = 1:length(x)
        out[k] = evaluate(Jaccard(2), x[k], y[k], scratch1, scratch2)
    end
end
|
||||
@time g(x, y)
|
||||
|
||||
|
|
|
@ -9,7 +9,6 @@ module StringDistances
|
|||
##############################################################################
|
||||
|
||||
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
|
||||
import DataStructures: OrderedSet, OrderedDict
|
||||
export evaluate,
|
||||
Hamming, hamming,
|
||||
Levenshtein, levenshtein,
|
||||
|
@ -24,12 +23,12 @@ Winkler
|
|||
|
||||
# 1. only do the switch once
|
||||
# 2. precomputes length(s1), length(s2)
|
||||
# Generic entry point: measures both strings once, then forwards to the
# length-aware `evaluate` method with the shorter string in first position.
# NOTE(review): two forms of the header and of each `return` appear below
# (with and without the trailing `x...`); presumably only one form belongs in
# the final method — confirm.
function evaluate(dist::PreMetric, s1::AbstractString, s2::AbstractString)
|
||||
# Form accepting extra trailing arguments `x...`, which are forwarded unchanged.
function evaluate(dist::PreMetric, s1::AbstractString, s2::AbstractString, x...)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
# s1 is longer: swap the strings (and their lengths) so the shorter comes first.
if len1 > len2
|
||||
return evaluate(dist, s2, s1, len2, len1)
|
||||
return evaluate(dist, s2, s1, len2, len1, x...)
|
||||
else
|
||||
return evaluate(dist, s1, s2, len1, len2)
|
||||
return evaluate(dist, s1, s2, len1, len2, x...)
|
||||
end
|
||||
end
|
||||
|
||||
|
|
157
src/qgram.jl
157
src/qgram.jl
|
@ -24,53 +24,49 @@ function Base.done(qgram::QGramIterator, state)
|
|||
done(qgram.s, idend)
|
||||
end
|
||||
# Element type of the iterator: a SubString over the type of the wrapped string.
function Base.eltype(qgram::QGramIterator)
    return SubString{typeof(qgram.s)}
end
|
||||
# Reported length of the iterator: length(s) - q + 1. This form becomes
# negative when the string has fewer than q - 1 characters.
Base.length(qgram::QGramIterator) = length(qgram.s) - qgram.q + 1
|
||||
# Same count, clamped at zero so the reported length is never negative.
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)
|
||||
|
||||
# Materialize the elements of `qiter` into a vector that is preallocated from
# `length(qiter)` and filled in iteration order.
function Base.collect(qiter::QGramIterator)
    grams = Array(eltype(qiter), length(qiter))
    @inbounds for (pos, gram) in enumerate(qiter)
        grams[pos] = gram
    end
    return grams
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## A Bag is a Set that allows duplicated values
|
||||
## Implemented as Dictionary from elements => number of duplicates
|
||||
## Define some operations on sorted vectors that represent q-grams
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
type Bag{Tv, Ti <: Integer}
|
||||
dict::Dict{Tv, Ti}
|
||||
Bag() = new(Dict{Tv, Ti}())
|
||||
# Euclidean norm of the multiplicity profile of `v`: every run of equal
# elements contributes the square of its length, and the square root of the
# total is returned. Run ends are located by binary search, so `v` is expected
# to be sorted in `Base.Forward` order.
function _norm2(v::AbstractVector)
    total = 0
    n = length(v)
    lo = 1
    while lo <= n
        # Last index in lo:n holding a value equal to v[lo].
        hi = searchsortedlast(v, v[lo], lo, n, Base.Forward)
        runlen = hi - lo + 1
        total += runlen^2
        lo = hi + 1
    end
    return sqrt(total)
end
|
||||
|
||||
function Base.push!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv)
|
||||
bag.dict[x] = get(bag.dict, x, zero(Ti)) + one(Ti)
|
||||
return bag
|
||||
# Number of distinct values in `v`, counted as the number of runs of equal
# elements. Run ends are located by binary search, so `v` is expected to be
# sorted in `Base.Forward` order.
function _ndistinct(v::AbstractVector)
    nruns = 0
    n = length(v)
    lo = 1
    while lo <= n
        # Jump straight past the run of values equal to v[lo].
        lo = searchsortedlast(v, v[lo], lo, n, Base.Forward) + 1
        nruns += 1
    end
    return nruns
end
|
||||
# Forward the capacity hint to the bag's backing dictionary.
function Base.sizehint!(bag::Bag, i::Integer)
    return sizehint!(bag.dict, i)
end
|
||||
|
||||
# Remove one occurrence of `x` from the bag: a positive stored count is
# decremented by one, while a missing or zero count leaves the dictionary
# untouched. Returns `x`.
function Base.delete!{Tv, Ti}(bag::Bag{Tv, Ti}, x)
    count = get(bag.dict, x, zero(Ti))
    count > zero(Ti) && (bag.dict[x] = count - one(Ti))
    return x
end
|
||||
|
||||
# Total number of stored items, duplicates included: the sum of all counts
# in the backing dictionary, converted to Int.
function Base.length(bag::Bag)
    return convert(Int, sum(values(bag.dict)))
end
|
||||
|
||||
# Build a bag with UInt counts from the elements of `s`, reserving room for
# `length(s)` entries before inserting them one by one.
function Bag(s::QGramIterator)
    counts = Bag{eltype(s), UInt}()
    sizehint!(counts, length(s))
    for gram in s
        push!(counts, gram)
    end
    return counts
end
|
||||
|
||||
# Build a set from the elements of `s`, reserving room for `length(s)`
# entries before inserting them one by one.
function Base.Set(s::QGramIterator)
    grams = Set{eltype(s)}()
    sizehint!(grams, length(s))
    for gram in s
        push!(grams, gram)
    end
    return grams
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## q-gram
|
||||
|
@ -89,15 +85,27 @@ function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString, len1::Int
|
|||
len2 == 0 && return 0
|
||||
|
||||
q1 = QGramIterator(s1, dist.q)
|
||||
q2 = QGramIterator(s2, dist.q)
|
||||
sort1 = sort!(collect(q1))
|
||||
lenq1 = length(sort1)
|
||||
|
||||
bag2 = Bag(q2)
|
||||
for ch in q1
|
||||
delete!(bag2, ch)
|
||||
end
|
||||
# number non matched in s1 : n1 - (n2 - length(bag))
|
||||
# number non matched in s2 : length(bag)
|
||||
return length(q1) - length(q2) + 2 * length(bag2)
|
||||
q2 = QGramIterator(s2, dist.q)
|
||||
sort2 = sort!(collect(q2))
|
||||
lenq2 = length(sort2)
|
||||
|
||||
numerator = 0
|
||||
i1start = 1
|
||||
i2start = 1
|
||||
while i1start <= lenq1
|
||||
ch1 = sort1[i1start]
|
||||
i1end = searchsortedlast(sort1, ch1, i1start, lenq1, Base.Forward)
|
||||
i2range = searchsorted(sort2, ch1, i2start, lenq2, Base.Forward)
|
||||
numerator += first(i2range) - i2start
|
||||
numerator += abs((i1end - i1start + 1) - length(i2range))
|
||||
i1start = i1end + 1
|
||||
i2start = last(i2range) + 1
|
||||
end
|
||||
numerator += lenq2 - i2start + 1
|
||||
return numerator
|
||||
end
|
||||
|
||||
# Convenience wrapper: evaluate the QGram distance between `s1` and `s2`,
# using grams of size `q` (2 by default).
function qgram(s1::AbstractString, s2::AbstractString; q::Integer = 2)
    return evaluate(QGram(q), s1, s2)
end
|
||||
|
@ -114,19 +122,33 @@ type Cosine{T <: Integer} <: SemiMetric
|
|||
end
|
||||
# Zero-argument constructor: defaults the gram size to 2.
function Cosine()
    return Cosine(2)
end
|
||||
|
||||
|
||||
# Cosine distance between the q-gram multiplicity profiles of s1 and s2:
# 1 - (sum of products of matching q-gram counts) / (product of the two norms).
# Returns 0.0 immediately when len2 == 0. When the denominator is zero the
# result is 0.0 for equal strings and 1.0 otherwise.
# NOTE(review): this span interleaves a Bag-based body (bag1/bag2, sumabs2)
# with a sorted-vector body (sort1/sort2, binary search); presumably only one
# of the two belongs in the final method — confirm.
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
len2 == 0 && return 0.0
|
||||
|
||||
q1 = QGramIterator(s1, dist.q)
|
||||
q2 = QGramIterator(s2, dist.q)
|
||||
# q-grams of s1, collected into a vector and sorted in place.
sort1 = sort!(collect(q1))
|
||||
lenq1 = length(sort1)
|
||||
|
||||
q2 = QGramIterator(s2, dist.q)
|
||||
# q-grams of s2, collected into a vector and sorted in place.
sort2 = sort!(collect(q2))
|
||||
lenq2 = length(sort2)
|
||||
|
||||
bag1 = Bag(q1)
|
||||
bag2 = Bag(q2)
|
||||
numerator = 0
|
||||
for (k, v1) in bag1.dict
|
||||
numerator += v1 * get(bag2.dict, k, 0)
|
||||
norm1 = 0
|
||||
i1start = 1
|
||||
i2start = 1
|
||||
# Walk sort1 one run of equal q-grams at a time; for each run, find the
# matching run in sort2 by binary search restricted to i2start:lenq2.
while i1start <= lenq1
|
||||
ch1 = sort1[i1start]
|
||||
# Last index of the current run of ch1 in sort1.
i1end = searchsortedlast(sort1, ch1, i1start, lenq1, Base.Forward)
|
||||
# Index range of ch1 in sort2 (empty when ch1 does not occur there).
i2range = searchsorted(sort2, ch1, i2start, lenq2, Base.Forward)
|
||||
# Count in s1 times count in s2 for this q-gram.
numerator += (i1end - i1start + 1) * length(i2range)
|
||||
# Squared count in s1, accumulated for the norm of the s1 profile.
norm1 += (i1end - i1start + 1)^2
|
||||
i1start = i1end + 1
|
||||
# Resume the next search in sort2 after the range just examined.
i2start = last(i2range) + 1
|
||||
end
|
||||
denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict)))
|
||||
|
||||
denominator = sqrt(norm1) * _norm2(sort2)
|
||||
return denominator != 0 ? 1.0 - numerator / denominator : s1 == s2 ? 0.0 : 1.0
|
||||
end
|
||||
|
||||
|
@ -142,7 +164,6 @@ cosine(s1::AbstractString, s2::AbstractString; q::Integer = 2) = evaluate(Cosine
|
|||
## return 1.0 if smaller than qgram
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
# Distance type parameterized by an integer `q`, stored as its only field.
type Jaccard{T<:Integer} <: SemiMetric
    q::T
end
|
||||
|
@ -152,17 +173,29 @@ function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString, len1::I
|
|||
len2 == 0 && return 0.0
|
||||
|
||||
q1 = QGramIterator(s1, dist.q)
|
||||
q2 = QGramIterator(s2, dist.q)
|
||||
sort1 = sort!(collect(q1))
|
||||
lenq1 = length(q1)
|
||||
|
||||
q2 = QGramIterator(s2, dist.q)
|
||||
sort2 = sort!(collect(q2))
|
||||
lenq2 = length(q2)
|
||||
|
||||
set1 = Set(q1)
|
||||
set2 = Set(q2)
|
||||
numerator = 0
|
||||
for x in set1
|
||||
if x in set2
|
||||
numerator += 1
|
||||
end
|
||||
i1start = 1
|
||||
i2start = 1
|
||||
norm1 = 0
|
||||
while i1start <= lenq1
|
||||
ch1 = sort1[i1start]
|
||||
i1end = searchsortedlast(sort1, ch1, i1start, lenq1, Base.Forward)
|
||||
i2range = searchsorted(sort2, ch1, i2start, lenq2, Base.Forward)
|
||||
numerator += length(i2range) > 0
|
||||
norm1 += 1
|
||||
i1start = i1end + 1
|
||||
i2start = last(i2range) + 1
|
||||
end
|
||||
denominator = length(set1) + length(set2) - numerator
|
||||
|
||||
norm2 = _ndistinct(sort2)
|
||||
denominator = norm1 + norm2 - numerator
|
||||
return denominator != 0 ? 1.0 - numerator / denominator : s1 == s2 ? 0.0 : 1.0
|
||||
end
|
||||
|
||||
|
|
|
@ -109,7 +109,7 @@ for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
|
|||
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.24722222 0.16190476 0.48809524 0.49166667 0.07407407 0.16666667 0.21666667]),
|
||||
(Winkler(Jaro(), 0.1, 1.0), [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.22250000 0.16190476 0.43928571 0.49166667 0.04444444 0.16666667 0.17333333]),
|
||||
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
|
||||
(QGram(2), [ 6 7 7 1 1 0 4 4 7 8 4 13 32 8 6 5]),
|
||||
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
|
||||
(Jaccard(1), [0.0000000 0.4285714 0.3750000 0.1666667 1.0 0.0000000 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0000000 0.0000000 0.2500000]),
|
||||
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 0.0 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
|
||||
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 1.0 0.0 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
|
||||
|
|
Loading…
Reference in New Issue