update 0.7

pull/7/head
matthieugomez 2018-07-04 12:33:13 -04:00
parent 2389f6f178
commit ba5a54fa84
5 changed files with 48 additions and 48 deletions

View File

@ -7,7 +7,7 @@ module StringDistances
## Export
##
##############################################################################
import Base: eltype, length, start, done, next, ==, hash, isless, convert, show, endof
import Base: eltype, length, iterate, ==, hash, isless, convert, show, endof
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
import IterTools: chain
export

View File

@ -67,11 +67,11 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString)
len1 == len2 && return compare(dist.dist, s1, s2)
len1 == 0 && return compare(dist.dist, "", "")
iter = QGramIterator(s2, len2, len1)
state = start(iter)
s, state = next(iter, state)
state = iterate(iter)
s, state = iterate(iter, state)
out = compare(dist.dist, s1, s)
while !done(iter, state)
s, state = next(iter, state)
while state != nothing
s, state = iterate(iter, state)
curr = compare(dist.dist, s1, s)
out = max(out, curr)
end

View File

@ -2,9 +2,9 @@
# Returned positions are character numbers, not string indices (the two differ for Unicode strings)
function longest_common_substring(s1::AbstractString, s2::AbstractString)
if length(s1) > length(s2)
start2, start1, size= longest_common_substring(s2, s1)
start2, start1, len = longest_common_substring(s2, s1)
else
start1, start2, size = 0, 0, 0
start1, start2, len = 0, 0, 0
p = zeros(Int, length(s2))
i1 = 0
for ch1 in s1
@ -17,23 +17,23 @@ function longest_common_substring(s1::AbstractString, s2::AbstractString)
if ch1 == ch2
newp = oldp > 0 ? oldp : i2
currentlength = (i2 - newp + 1)
if currentlength > size
start1, start2, size = i1 - currentlength + 1, newp, currentlength
if currentlength > len
start1, start2, len = i1 - currentlength + 1, newp, currentlength
end
end
p[i2], oldp = newp, p[i2]
end
end
end
return start1, start2, size
return start1, start2, len
end
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, start1::Integer, start2::Integer)
a = longest_common_substring(s1, s2)
if a[3] > 0
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
s1before = SubString(s1, start(s1), chr2ind(s1, a[1]) - 1)
s2before = SubString(s2, start(s2), chr2ind(s2, a[2]) - 1)
s1before = SubString(s1, iterate(s1), chr2ind(s1, a[1]) - 1)
s2before = SubString(s2, iterate(s2), chr2ind(s2, a[2]) - 1)
matching_blocks!(x, s1before, s2before, start1, start2)
if (a[1] + a[3]) <= endof(s1) && (a[2] + a[3]) <= endof(s2)
s1after = SubString(s1, chr2ind(s1, a[1] + a[3]), endof(s1))

View File

@ -4,17 +4,19 @@
##############################################################################
# Count the leading characters that `s1` and `s2` have in common.
# Scanning stops at the first differing character, when either string is
# exhausted, or once `lim` characters have matched (a negative `lim`, which is
# the default, disables the limit).
# Returns a 3-tuple: the number of matching characters and the iteration
# states of `s1` and `s2` at the point where scanning stopped.
# NOTE(review): this span is a rendered diff hunk. The `start1`/`start2` lines
# appear to be the removed version and the `state1`/`state2`, `ncu1`/`ncu2`
# lines their replacement — confirm against the commit.
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
start1 = firstindex(s1)
start2 = firstindex(s2)
# code-unit counts are computed once here instead of in every loop test
ncu1 = ncodeunits(s1)
ncu2 = ncodeunits(s2)
state1 = firstindex(s1)
state2 = firstindex(s2)
l = 0
while (start1 <= ncodeunits(s1)) && (start2 <= ncodeunits(s2)) && (l < lim || lim < 0)
ch1, nextstart1 = iterate(s1, start1)
ch2, nextstart2 = iterate(s2, start2)
while (state1 <= ncu1) && (state2 <= ncu2) && (l < lim || lim < 0)
ch1, nextstate1 = iterate(s1, state1)
ch2, nextstate2 = iterate(s2, state2)
# the first mismatch ends the common prefix; the states are left pointing at it
ch1 != ch2 && break
l += 1
start1, start2 = nextstart1, nextstart2
state1, state2 = nextstate1, nextstate2
end
return l, start1, start2
return l, state1, state2
end
##############################################################################
@ -44,27 +46,28 @@ struct Levenshtein <: SemiMetric end
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
s2, len2, s1, len1 = reorder(s1, s2)
(start1 > ncodeunits(s1)) && return len2 - k
k, start1, start2 = common_prefix(s1, s2)
ncu1 = ncodeunits(s1)
ncu2 = ncodeunits(s2)
(start1 > ncu1) && return len2 - k
# distance initialized to first row of matrix
# => distance between "" and s2[1:i]
v0 = Array{Int}(undef, len2 - k)
@inbounds for i2 in 1:(len2 - k)
for i2 in 1:(len2 - k)
v0[i2] = i2
end
current = zero(0)
state1 = start1
i1 = 0
while state1 <= ncodeunits(s1)
while state1 <= ncu1
i1 += 1
ch1, state1 = iterate(s1, i1)
left = (i1 - 1)
current = (i1 - 1)
state2 = start2
i2 = 0
while state2 <= ncodeunits(s1)
while state2 <= ncu2
i2 += 1
ch2, state2 = iterate(s2, state2)
# update
@ -91,11 +94,12 @@ end
struct DamerauLevenshtein <: SemiMetric end
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
s2, len2, s1, len1 = reorder(s1, s2)
ncu1 = ncodeunits(s1)
ncu2 = ncodeunits(s2)
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
s2, len2, s1, len1 = reorder(s1, s2)
(start1 > ncodeunits(s1)) && return len2 - k
(start1 > ncu1) && return len2 - k
v0 = Array{Int}(undef, len2 - k)
@inbounds for i2 in 1:(len2 - k)
@ -107,7 +111,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
current = 0
state1 = start1
i1 = 0
while state1 <= ncodeunits(s1)
while state1 <= ncu1
i1 += 1
prevch1 = ch1
ch1, state1 = iterate(s1, i1)
@ -117,7 +121,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
nextTransCost = 0
state2 = start2
i2 = 0
while state2 <= ncodeunits(s2)
while state2 <= ncu2
i2 += 1
prevch2 = ch2
ch2, state2 = iterate(s2, state2)
@ -161,6 +165,8 @@ struct Jaro <: SemiMetric end
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
s2, len2, s1, len1 = reorder(s1, s2)
ncu1 = ncodeunits(s1)
ncu2 = ncodeunits(s2)
# If both are empty, m = 0, so the result should be 1.0 according to Wikipedia. This line is added so that this is not the case.
len2 == 0 && return 0.0
maxdist = max(0, div(len2, 2) - 1)
@ -172,7 +178,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
startstate2 = firstindex(s2)
starti2 = 0
i1_match = fill!(Array{typeof(state1)}(undef, len1), state1)
while state1 <= ncodeunits(s1)
while state1 <= ncu1
ch1, newstate1 = iterate(s1, i1)
i1 += 1
if starti2 < i1 - maxdist - 1

View File

@ -10,22 +10,19 @@ struct QGramIterator{S <: AbstractString, T <: Integer}
q::T # length of q-grams
end
# Initial iteration state of a QGramIterator: a (start, end) index pair.
# The start is 1. The end is `chr2ind(qgram.s, qgram.q)`, or one past
# `endof(qgram.s)` when `qgram.l < qgram.q`.
# NOTE(review): the two headers are the old (`Base.start`) and new
# (`Base.iterate`) signature of the same method as rendered by the diff.
# NOTE(review): the body returns only the state tuple, whereas Julia's
# one-argument `iterate` is documented to return `(item, state)` or
# `nothing` — confirm this is intended.
# NOTE(review): `endof` and `chr2ind` are pre-0.7 names — confirm they are
# still wanted in a commit titled "update 0.7".
function Base.start(qgram::QGramIterator)
function Base.iterate(qgram::QGramIterator)
(1, qgram.l < qgram.q ? endof(qgram.s) + 1 : chr2ind(qgram.s, qgram.q))
end
# Produce the q-gram delimited by `state` and the state of the following one.
# `state` is an (istart, iend) index pair into `qgram.s`. The element is the
# SubString between those indices, and both indices are advanced with `nextind`.
# NOTE(review): the two headers are the old (`Base.next`) and new
# (`Base.iterate`) signature of the same method as rendered by the diff.
function Base.next(qgram::QGramIterator, state)
function Base.iterate(qgram::QGramIterator, state)
istart, iend = state
# end-of-iteration test: the end index has moved past the string
# NOTE(review): `nchodeunits` looks like a typo for `ncodeunits`, which is
# the name used elsewhere in this diff — confirm and fix.
iend > nchodeunits(qgram.s) && return nothing
element = SubString(qgram.s, istart, iend)
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
element, nextstate
end
# Old-protocol termination test for QGramIterator: iteration is finished when
# the underlying string reports `done` at the end index held in `state`.
# NOTE(review): the hunk header (-10,22 +10,19) and the removal of
# `start, done, next` from the `import Base:` line suggest this method is
# deleted by the commit — confirm.
function Base.done(qgram::QGramIterator, state)
istart, idend = state
done(qgram.s, idend)
end
# Element type of a QGramIterator: when the wrapped string type `S` is already
# a SubString the elements are of type `S`; otherwise they are `SubString{S}`.
Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S
Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S}
# Number of q-grams: `l - q + 1`, clamped to 0 when `l` is smaller than `q`.
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
function Base.collect(qgram::QGramIterator)
x = Array{eltype(qgram)}(length(qgram))
@ -45,16 +42,17 @@ Base.sort(qgram::QGramIterator) = sort!(collect(qgram))
##
##############################################################################
# Holds two vectors, `v1` and `v2`, that are iterated over jointly.
# `evaluate(dist::AbstractQGram, s1, s2)` builds one from the two sorted
# q-gram collections.
# NOTE(review): the two headers are the old, misspelled name
# (`CountInterator`) and the renamed type (`CountIterator`) as rendered by
# the diff.
struct CountInterator{T1 <: AbstractVector, T2 <: AbstractVector}
struct CountIterator{T1 <: AbstractVector, T2 <: AbstractVector}
v1::T1
v2::T2
end
# Initial state: position 1 in each of the two wrapped vectors.
# NOTE(review): the first line is the old `Base.start` method and the second
# its `Base.iterate` replacement. The replacement returns only the state
# tuple, whereas Julia's one-argument `iterate` is documented to return
# `(item, state)` or `nothing` — confirm this is intended.
Base.start(s::CountInterator) = (1, 1)
Base.iterate(s::CountIterator) = (1, 1)
function Base.next(s::CountInterator, state)
function Base.iterate(s::CountIterator, state)
state1, state2 = state
iter1 = done(s.v2, state2)
iter2 = done(s.v1, state1)
state2 > s.v2 && state1 > s.v1 && nothing
iter1 = state2 > length(s.v2)
iter2 = state1 > length(s.v1)
if iter1
@inbounds x1 = s.v1[state1]
elseif iter2
@ -70,10 +68,6 @@ function Base.next(s::CountInterator, state)
((nextstate1 - state1, nextstate2 - state2), (nextstate1, nextstate2))
end
# Old-protocol termination test: iteration is finished only when both wrapped
# vectors report `done` at their respective positions in `state`.
# NOTE(review): the hunk header (-70,10 +68,6) suggests this method is
# deleted by the commit — confirm.
function Base.done(s::CountInterator, state)
state1, state2 = state
done(s.v2, state2) && done(s.v1, state1)
end
##############################################################################
##
@ -85,7 +79,7 @@ abstract type AbstractQGram <: SemiMetric end
# Evaluate a q-gram distance between two strings.
# The q-grams of each string (of length `dist.q`) are collected and sorted,
# and the two sorted collections are wrapped in a count iterator that is
# passed to the two-argument `evaluate(dist, ...)` method.
# NOTE(review): the two final calls are the old (`CountInterator`) and
# renamed (`CountIterator`) constructor as rendered by the diff.
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
sort1 = sort(QGramIterator(s1, length(s1), dist.q))
sort2 = sort(QGramIterator(s2, length(s2), dist.q))
evaluate(dist, CountInterator(sort1, sort2))
evaluate(dist, CountIterator(sort1, sort2))
end
##############################################################################