encode dist.q into type + faster dict

pull/17/head
matthieugomez 2019-01-23 17:22:08 -05:00
parent 2834265e96
commit e24e758289
6 changed files with 59 additions and 127 deletions

View File

@ -1,3 +1,3 @@
julia 0.7
Distances
IterTools
IterTools

View File

@ -1,4 +1,3 @@
module StringDistances
##############################################################################
@ -28,16 +27,11 @@ TokenSort,
TokenSet,
TokenMax
##############################################################################
##
## include
##
##############################################################################
include("utils.jl")
include("distances/edit.jl")
include("distances/qgram.jl")

View File

@ -1,6 +1,3 @@
\
##############################################################################
##
## compare
@ -21,15 +18,14 @@ end
# Similarity score for q-gram based distances, computed as 1 minus the distance.
# For `QGram` the raw distance is first divided by len1 + len2 - 2q + 2 (q being the
# q-gram length of `dist`); for every other `AbstractQGram` the value returned by
# `evaluate` is used as is.
function compare(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
# When the shorter string has fewer than q characters, fall back to exact equality:
# 1.0 if s1 == s2, 0.0 otherwise
len1 = length(s1) ; len2 = length(s2)
min(len1, len2) <= (dist.q - 1) && return convert(Float64, s1 == s2)
min(len1, len2) <= (param(dist) - 1) && return convert(Float64, s1 == s2)
if typeof(dist) <: QGram
# len1 + len2 - 2q + 2 == (len1 - q + 1) + (len2 - q + 1)
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * param(dist) + 2)
else
1 - evaluate(dist, s1, s2)
end
end
##############################################################################
##
## Winkler
@ -70,7 +66,7 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString)
s2, len2, s1, len1 = reorder(s1, s2)
len1 == len2 && return compare(dist.dist, s1, s2)
len1 == 0 && return compare(dist.dist, "", "")
iter = QGramIterator(s2, len2, len1)
iter = QGramIterator{typeof(s2), len1}(s2, len2)
out = 0.0
x = iterate(iter)
while x !== nothing

View File

@ -4,73 +4,22 @@
##
##############################################################################
# Iterator over the q-grams of the string `s`, i.e. every run of q consecutive
# characters, produced in order of starting position.
struct QGramIterator{S <: AbstractString, T <: Integer}
struct QGramIterator{S <: AbstractString, N}
s::S # string whose q-grams are produced
l::Int # length of string
q::T # length of q-grams
end
# Return the q-gram length N carried in the iterator's type parameters.
param(x::QGramIterator{S, N}) where {S, N} = N
# Iteration protocol for `QGramIterator`.
#
# The state is the pair `(istart, iend)`: the byte indices of the first and of the
# last character of the current q-gram. Each step returns the q-gram
# `qgram.s[istart:iend]` and moves both indices forward by one character.
#
# When the string has fewer than N characters there is no q-gram at all. The initial
# `iend` is then set to `ncodeunits(qgram.s) + 1`, which is always past the end.
# (`lastindex(qgram.s) + 1` is not: if the last character is multi-byte it still
# points inside the string, so the stop test fails and the slice below throws a
# `StringIndexError`.)
function Base.iterate(qgram::QGramIterator{S, N},
    state = (1, qgram.l < N ? ncodeunits(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N}
    istart, iend = state
    # `iend` past the last code unit means the window has run off the string.
    iend > ncodeunits(qgram.s) && return nothing
    element = qgram.s[istart:iend]
    # Slide the window by exactly one character at both ends.
    nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
    return element, nextstate
end
# Element type: a SubString of the underlying string type S ...
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
# ... except when S is itself a SubString, in which case elements have type S.
Base.eltype(qgram::QGramIterator{S}) where {S <: SubString} = S
# Number of q-grams: l - q + 1, or 0 when the string is shorter than q.
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
##############################################################################
##
## CountedIterator that uses Binary Search
##
## For each element in union(v1, v2), this iterator outputs the number of times it appears in v1 and the number of times it appears in v2
## v1 and v2 must be sorted vectors
##
##############################################################################
# Pair of vectors whose elements are counted jointly when iterating.
struct CountIteratorBinary{T1, T2}
v1::Vector{T1} # first collection; must be sorted
v2::Vector{T2} # second collection; must be sorted
end
# Materialize every q-gram of `qgram` into a vector, in iteration order.
# The output is allocated up front from `eltype` and `length` of the iterator.
function Base.collect(qgram::QGramIterator)
    grams = Vector{eltype(qgram)}(undef, length(qgram))
    for (k, gram) in enumerate(qgram)
        @inbounds grams[k] = gram
    end
    return grams
end
# Sorted vector of all q-grams produced by `qgram`.
function Base.sort(qgram::QGramIterator)
    grams = collect(qgram)
    return sort!(grams)
end
# Build a `CountIteratorBinary` from two q-gram iterators by collecting and sorting
# the q-grams of each one.
CountIteratorBinary(s1::QGramIterator, s2::QGramIterator) =
    CountIteratorBinary(sort(s1), sort(s2))
# Iteration protocol for `CountIteratorBinary`.
#
# The state is a pair of positions, one in each sorted vector. Each step takes the
# smallest element not yet consumed, skips over its whole run of equal values in
# each vector with `searchsortedlast`, and returns the two run lengths, i.e. how many
# times that element occurs in `v1` and in `v2`.
function Base.iterate(s::CountIteratorBinary, state = (1, 1))
    pos1, pos2 = state
    n1 = length(s.v1)
    n2 = length(s.v2)
    exhausted1 = pos1 > n1
    exhausted2 = pos2 > n2
    exhausted1 && exhausted2 && return nothing
    if exhausted2
        # Only v1 has elements left: consume from v1 alone.
        x1 = s.v1[pos1]
        advance1, advance2 = true, false
    elseif exhausted1
        # Only v2 has elements left: consume from v2 alone.
        x2 = s.v2[pos2]
        advance1, advance2 = false, true
    else
        # Both have elements: consume the smaller one, or both when they are equal.
        x1 = s.v1[pos1]
        x2 = s.v2[pos2]
        advance1 = x1 <= x2
        advance2 = x2 <= x1
    end
    next1 = advance1 ? searchsortedlast(s.v1, x1, pos1, n1, Base.Forward) + 1 : pos1
    next2 = advance2 ? searchsortedlast(s.v2, x2, pos2, n2, Base.Forward) + 1 : pos2
    return ((next1 - pos1, next2 - pos2), (next1, next2))
end
# Element type is declared to be the type S of the underlying string.
Base.eltype(qgram::QGramIterator{S}) where {S} = S
# Number of q-grams: l - N + 1, or 0 when the string is shorter than N.
Base.length(qgram::QGramIterator{S, N}) where {S, N} = max(qgram.l - N + 1, 0)
##############################################################################
@ -85,22 +34,37 @@ struct CountIteratorDictionary{T}
d::T
end
function CountIteratorDictionary(s1::QGramIterator, s2::QGramIterator)
d = Dict{eltype(s1), Tuple{Int, Int}}()
for ch1 in s1
if haskey(d, ch1)
t = d[ch1]
d[ch1] = (t[1] + 1, 0)
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
function CountIteratorDictionary(s1::QGramIterator{S1, N}, s2::QGramIterator{S2, N}) where {S1, S2, N}
K = eltype(s1)
d = Dict{K, NTuple{2, UInt8}}()
sizehint!(d, length(s1))
for ch10 in s1
ch1 = convert(K, ch10)
if !isequal(ch1, ch10)
throw(ArgumentError("$(limitrepr(ch10)) is not a valid key for type $K"))
end
index = Base.ht_keyindex2!(d, ch1)
if index > 0
d.age += 1
@inbounds d.keys[index] = ch1
@inbounds d.vals[index] = (d.vals[index][1] + UInt8(1), UInt8(0))
else
d[ch1] = (1, 0)
Base._setindex!(d, (UInt8(1), UInt8(0)), ch1, -index)
end
end
for ch2 in s2
if haskey(d, ch2)
t = d[ch2]
d[ch2] = (t[1], t[2] + 1)
for ch20 in s2
ch2 = convert(K, ch20)
if !isequal(ch2, ch20)
throw(ArgumentError("$(limitrepr(ch20)) is not a valid key for type $K"))
end
index = Base.ht_keyindex2!(d, ch2)
if index > 0
d.age += 1
@inbounds d.keys[index] = ch2
@inbounds d.vals[index] = (d.vals[index][1], d.vals[index][2] + UInt8(1))
else
d[ch2] = (0, 1)
Base._setindex!(d, (UInt8(0), UInt8(1)), ch2, -index)
end
end
return values(d)
@ -113,12 +77,14 @@ end
## Distance on strings is computed by set distance on qgram sets
##
##############################################################################
# Supertype of the distances that are computed from q-gram counts.
abstract type AbstractQGram <: SemiMetric end
abstract type AbstractQGram{N} <: SemiMetric end
# Return the type parameter N of a q-gram distance; it is used as the q-gram length.
param(x::AbstractQGram{N}) where N = N
# Evaluate a q-gram distance between two strings: wrap each string in a q-gram
# iterator, combine the two iterators into a count iterator, and hand the result to
# the distance-specific `evaluate(dist, countiterator)` method.
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
evaluate(dist,
CountIteratorBinary(QGramIterator(s1, length(s1), dist.q),
QGramIterator(s2, length(s2), dist.q)))
CountIteratorDictionary(QGramIterator{typeof(s1), param(dist)}(s1, length(s1)),
QGramIterator{typeof(s2), param(dist)}(s2, length(s2))))
end
##############################################################################
@ -130,15 +96,14 @@ end
##
##############################################################################
struct QGram{T <: Integer} <: AbstractQGram
q::T
end
QGram() = QGram(2)
# Q-gram distance; the q-gram length is carried as the type parameter N.
struct QGram{N} <: AbstractQGram{N} end
# `QGram(x)` builds `QGram{x}()`. The length defaults to 2, like the other q-gram
# distances (`Cosine`, `Jaccard`, `SorensenDice`, `Overlap`), so `QGram()` keeps working.
QGram(x::Integer = 2) = QGram{x}()
# Q-gram distance: the sum, over every (n1, n2) pair yielded by `countiterator`, of
# the absolute difference between the two counts.
function evaluate(dist::QGram, countiterator)
n = 0
for (n1, n2) in countiterator
n += abs(n1 - n2)
# Convert to Int before subtracting: if the counts are unsigned (e.g. UInt8), a
# negative difference would otherwise wrap around.
n += abs(Int(n1) - Int(n2))
end
n
end
@ -150,10 +115,9 @@ end
## 1 - v(s1, p).v(s2, p) / (||v(s1, p)|| * ||v(s2, p)||)
##############################################################################
struct Cosine{T <: Integer} <: AbstractQGram
q::T
end
Cosine() = Cosine(2)
# Cosine distance on q-grams; the q-gram length is carried as the type parameter N.
struct Cosine{N} <: AbstractQGram{N} end
# `Cosine(n)` builds `Cosine{n}()`; n defaults to 2.
Cosine(n::Integer = 2) = Cosine{n}()
function evaluate(dist::Cosine, countiterator)
norm1, norm2, prodnorm = 0, 0, 0
@ -174,10 +138,9 @@ end
##
##############################################################################
struct Jaccard{T <: Integer} <: AbstractQGram
q::T
end
Jaccard() = Jaccard(2)
# Jaccard distance on q-grams; the q-gram length is carried as the type parameter N.
struct Jaccard{N} <: AbstractQGram{N} end
# `Jaccard(n)` builds `Jaccard{n}()`; n defaults to 2.
Jaccard(n::Integer = 2) = Jaccard{n}()
function evaluate(dist::Jaccard, countiterator)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
@ -196,10 +159,9 @@ end
## 1 - 2 * |intersect(Q(s1, q), Q(s2, q))| / (|Q(s1, q)| + |Q(s2, q)|)
##############################################################################
struct SorensenDice{T <: Integer} <: AbstractQGram
q::T
end
SorensenDice() = SorensenDice(2)
# Sorensen-Dice distance on q-grams; the q-gram length is carried as the type parameter N.
struct SorensenDice{N} <: AbstractQGram{N} end
# `SorensenDice(n)` builds `SorensenDice{n}()`; n defaults to 2.
SorensenDice(n::Integer = 2) = SorensenDice{n}()
function evaluate(dist::SorensenDice, countiterator)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
@ -218,10 +180,9 @@ end
## 1 - |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q)|)
##############################################################################
struct Overlap{T <: Integer} <: AbstractQGram
q::T
end
Overlap() = Overlap(2)
# Overlap distance on q-grams; the q-gram length is carried as the type parameter N.
struct Overlap{N} <: AbstractQGram{N} end
# `Overlap(n)` builds `Overlap{n}()`; n defaults to 2.
Overlap(n::Integer = 2) = Overlap{n}()
function evaluate(dist::Overlap, countiterator)
ndistinct1, ndistinct2, nintersect = 0, 0, 0

View File

@ -1,18 +0,0 @@
using StringDistances, Test
# Levenshtein: empty inputs, one-sided empty input, single insertion, mixed edits,
# and a non-ASCII string
@test evaluate(Levenshtein(), "", "") == 0
@test evaluate(Levenshtein(), "abc", "") == 3
@test evaluate(Levenshtein(), "", "abc") == 3
@test evaluate(Levenshtein(), "bc", "abc") == 1
@test evaluate(Levenshtein(), "kitten", "sitting") == 3
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
# DamerauLevenshtein: same basic cases, plus swapping two adjacent characters
# ("fuor" vs "four") counting as a single edit
@test evaluate(DamerauLevenshtein(), "", "") == 0
@test evaluate(DamerauLevenshtein(), "abc", "") == 3
@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1

View File

@ -1,4 +1,3 @@
using StringDistances, Test
# check with weird utf8 strings