update 0.7
parent
2389f6f178
commit
ba5a54fa84
|
@ -7,7 +7,7 @@ module StringDistances
|
|||
## Export
|
||||
##
|
||||
##############################################################################
|
||||
import Base: eltype, length, start, done, next, ==, hash, isless, convert, show, endof
|
||||
import Base: eltype, length, iterate, ==, hash, isless, convert, show, endof
|
||||
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
|
||||
import IterTools: chain
|
||||
export
|
||||
|
|
|
@ -67,11 +67,11 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString)
|
|||
len1 == len2 && return compare(dist.dist, s1, s2)
|
||||
len1 == 0 && return compare(dist.dist, "", "")
|
||||
iter = QGramIterator(s2, len2, len1)
|
||||
state = start(iter)
|
||||
s, state = next(iter, state)
|
||||
state = iterate(iter)
|
||||
s, state = iterate(iter, state)
|
||||
out = compare(dist.dist, s1, s)
|
||||
while !done(iter, state)
|
||||
s, state = next(iter, state)
|
||||
while state != nothing
|
||||
s, state = iterate(iter, state)
|
||||
curr = compare(dist.dist, s1, s)
|
||||
out = max(out, curr)
|
||||
end
|
||||
|
|
|
@ -2,9 +2,9 @@
|
|||
# Indexes refer to character number, not index (differ for Unicode strings)
|
||||
function longest_common_substring(s1::AbstractString, s2::AbstractString)
|
||||
if length(s1) > length(s2)
|
||||
start2, start1, size= longest_common_substring(s2, s1)
|
||||
start2, start1, len = longest_common_substring(s2, s1)
|
||||
else
|
||||
start1, start2, size = 0, 0, 0
|
||||
start1, start2, len = 0, 0, 0
|
||||
p = zeros(Int, length(s2))
|
||||
i1 = 0
|
||||
for ch1 in s1
|
||||
|
@ -17,23 +17,23 @@ function longest_common_substring(s1::AbstractString, s2::AbstractString)
|
|||
if ch1 == ch2
|
||||
newp = oldp > 0 ? oldp : i2
|
||||
currentlength = (i2 - newp + 1)
|
||||
if currentlength > size
|
||||
start1, start2, size = i1 - currentlength + 1, newp, currentlength
|
||||
if currentlength > len
|
||||
start1, start2, len = i1 - currentlength + 1, newp, currentlength
|
||||
end
|
||||
end
|
||||
p[i2], oldp = newp, p[i2]
|
||||
end
|
||||
end
|
||||
end
|
||||
return start1, start2, size
|
||||
return start1, start2, len
|
||||
end
|
||||
|
||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, start1::Integer, start2::Integer)
|
||||
a = longest_common_substring(s1, s2)
|
||||
if a[3] > 0
|
||||
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
||||
s1before = SubString(s1, start(s1), chr2ind(s1, a[1]) - 1)
|
||||
s2before = SubString(s2, start(s2), chr2ind(s2, a[2]) - 1)
|
||||
s1before = SubString(s1, iterate(s1), chr2ind(s1, a[1]) - 1)
|
||||
s2before = SubString(s2, iterate(s2), chr2ind(s2, a[2]) - 1)
|
||||
matching_blocks!(x, s1before, s2before, start1, start2)
|
||||
if (a[1] + a[3]) <= endof(s1) && (a[2] + a[3]) <= endof(s2)
|
||||
s1after = SubString(s1, chr2ind(s1, a[1] + a[3]), endof(s1))
|
||||
|
|
|
@ -4,17 +4,19 @@
|
|||
##############################################################################
|
||||
|
||||
"""
    common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)

Count the leading characters shared by `s1` and `s2`.

Characters are compared pairwise from the start of both strings until they
differ, one of the strings is exhausted, or `lim` characters have matched.
A negative `lim` (the default) means "no limit".

Return the tuple `(l, state1, state2)` where `l` is the number of matching
characters and `state1` / `state2` are the iteration states (string indices)
at which `s1` / `s2` resume right after the common prefix.
"""
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
    # Computed once so the loop condition does not re-evaluate them on every pass.
    ncu1 = ncodeunits(s1)
    ncu2 = ncodeunits(s2)
    state1 = firstindex(s1)
    state2 = firstindex(s2)
    l = 0
    while (state1 <= ncu1) && (state2 <= ncu2) && (l < lim || lim < 0)
        ch1, nextstate1 = iterate(s1, state1)
        ch2, nextstate2 = iterate(s2, state2)
        ch1 != ch2 && break
        l += 1
        # Only advance once the characters are known to match, so the returned
        # states point at the first differing character.
        state1, state2 = nextstate1, nextstate2
    end
    return l, state1, state2
end
|
||||
|
||||
##############################################################################
|
||||
|
@ -44,27 +46,28 @@ struct Levenshtein <: SemiMetric end
|
|||
|
||||
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
|
||||
# prefix common to both strings can be ignored
|
||||
k, start1, start2 = common_prefix(s1, s2)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
(start1 > ncodeunits(s1)) && return len2 - k
|
||||
|
||||
k, start1, start2 = common_prefix(s1, s2)
|
||||
ncu1 = ncodeunits(s1)
|
||||
ncu2 = ncodeunits(s2)
|
||||
(start1 > ncu1) && return len2 - k
|
||||
# distance initialized to first row of matrix
|
||||
# => distance between "" and s2[1:i]
|
||||
v0 = Array{Int}(undef, len2 - k)
|
||||
@inbounds for i2 in 1:(len2 - k)
|
||||
for i2 in 1:(len2 - k)
|
||||
v0[i2] = i2
|
||||
end
|
||||
current = zero(0)
|
||||
state1 = start1
|
||||
i1 = 0
|
||||
while state1 <= ncodeunits(s1)
|
||||
while state1 <= ncu1
|
||||
i1 += 1
|
||||
ch1, state1 = iterate(s1, i1)
|
||||
left = (i1 - 1)
|
||||
current = (i1 - 1)
|
||||
state2 = start2
|
||||
i2 = 0
|
||||
while state2 <= ncodeunits(s1)
|
||||
while state2 <= ncu2
|
||||
i2 += 1
|
||||
ch2, state2 = iterate(s2, state2)
|
||||
# update
|
||||
|
@ -91,11 +94,12 @@ end
|
|||
struct DamerauLevenshtein <: SemiMetric end
|
||||
|
||||
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
|
||||
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
ncu1 = ncodeunits(s1)
|
||||
ncu2 = ncodeunits(s2)
|
||||
# prefix common to both strings can be ignored
|
||||
k, start1, start2 = common_prefix(s1, s2)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
(start1 > ncodeunits(s1)) && return len2 - k
|
||||
(start1 > ncu1) && return len2 - k
|
||||
|
||||
v0 = Array{Int}(undef, len2 - k)
|
||||
@inbounds for i2 in 1:(len2 - k)
|
||||
|
@ -107,7 +111,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
|||
current = 0
|
||||
state1 = start1
|
||||
i1 = 0
|
||||
while state1 <= ncodeunits(s1)
|
||||
while state1 <= ncu1
|
||||
i1 += 1
|
||||
prevch1 = ch1
|
||||
ch1, state1 = iterate(s1, i1)
|
||||
|
@ -117,7 +121,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
|||
nextTransCost = 0
|
||||
state2 = start2
|
||||
i2 = 0
|
||||
while state2 <= ncodeunits(s2)
|
||||
while state2 <= ncu2
|
||||
i2 += 1
|
||||
prevch2 = ch2
|
||||
ch2, state2 = iterate(s2, state2)
|
||||
|
@ -161,6 +165,8 @@ struct Jaro <: SemiMetric end
|
|||
|
||||
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
ncu1 = ncodeunits(s1)
|
||||
ncu2 = ncodeunits(s2)
|
||||
# If both strings are empty, m = 0, so the result should be 1.0 according to Wikipedia. The line below is added so that this is not the case.
|
||||
len2 == 0 && return 0.0
|
||||
maxdist = max(0, div(len2, 2) - 1)
|
||||
|
@ -172,7 +178,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
|||
startstate2 = firstindex(s2)
|
||||
starti2 = 0
|
||||
i1_match = fill!(Array{typeof(state1)}(undef, len1), state1)
|
||||
while state1 <= ncodeunits(s1)
|
||||
while state1 <= ncu1
|
||||
ch1, newstate1 = iterate(s1, i1)
|
||||
i1 += 1
|
||||
if starti2 < i1 - maxdist - 1
|
||||
|
|
|
@ -10,22 +10,19 @@ struct QGramIterator{S <: AbstractString, T <: Integer}
|
|||
q::T # length of q-grams
|
||||
end
|
||||
|
||||
# Iteration over the q-grams of `qgram.s`, using the `iterate` protocol of
# Julia >= 0.7. The state is the pair `(istart, iend)`: the string indices of
# the first and last character of the q-gram that will be produced next.
# End of iteration is signalled by returning `nothing`, so no separate
# `done` method is needed.

"""
    Base.iterate(qgram::QGramIterator)

Return `(element, state)` for the first q-gram of `qgram.s`, or `nothing`
when there is none.

NOTE(review): `qgram.l` is presumably the number of characters of `qgram.s`
(visible callers pass `length(s)`); confirm against the struct definition.
"""
function Base.iterate(qgram::QGramIterator)
    # When the string is shorter than one q-gram, start with `iend` strictly
    # past the last code unit so the two-argument method stops immediately.
    # `ncodeunits + 1` is used rather than "last index + 1" because the latter
    # is still inside the string when the final character is multi-byte.
    iend = qgram.l < qgram.q ? ncodeunits(qgram.s) + 1 : nextind(qgram.s, 0, qgram.q)
    return iterate(qgram, (1, iend))
end

"""
    Base.iterate(qgram::QGramIterator, state)

Given `state == (istart, iend)`, return the q-gram `SubString(qgram.s, istart, iend)`
together with the state advanced by one character at both ends, or `nothing`
once `iend` has moved past the end of the string.
"""
function Base.iterate(qgram::QGramIterator, state)
    istart, iend = state
    iend > ncodeunits(qgram.s) && return nothing
    element = SubString(qgram.s, istart, iend)
    # Slide the window one character (not one byte) to the right.
    nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
    return element, nextstate
end
|
||||
Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S
|
||||
Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S}
|
||||
|
||||
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
||||
function Base.collect(qgram::QGramIterator)
|
||||
x = Array{eltype(qgram)}(length(qgram))
|
||||
|
@ -45,16 +42,17 @@ Base.sort(qgram::QGramIterator) = sort!(collect(qgram))
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
"""
    CountIterator(v1, v2)

Pair of vectors that is iterated jointly. In this file it is constructed from
the two sorted q-gram vectors of the strings being compared.

# Fields
- `v1`: first vector.
- `v2`: second vector.
"""
struct CountIterator{T1 <: AbstractVector, T2 <: AbstractVector}
    v1::T1
    v2::T2
end

# Former (misspelled) name, kept as an alias so existing references still resolve.
const CountInterator = CountIterator
|
||||
# Begin iteration with both cursors on the first element of `v1` and `v2`.
# The `iterate` protocol requires this method to return `(element, state)` or
# `nothing`, not a bare state, so it delegates to the two-argument method with
# the initial state `(1, 1)`.
Base.iterate(s::CountIterator) = iterate(s, (1, 1))
|
||||
|
||||
function Base.next(s::CountInterator, state)
|
||||
function Base.iterate(s::CountIterator, state)
|
||||
state1, state2 = state
|
||||
iter1 = done(s.v2, state2)
|
||||
iter2 = done(s.v1, state1)
|
||||
state2 > s.v2 && state1 > s.v1 && nothing
|
||||
iter1 = state2 > length(s.v2)
|
||||
iter2 = state1 > length(s.v1)
|
||||
if iter1
|
||||
@inbounds x1 = s.v1[state1]
|
||||
elseif iter2
|
||||
|
@ -70,10 +68,6 @@ function Base.next(s::CountInterator, state)
|
|||
((nextstate1 - state1, nextstate2 - state2), (nextstate1, nextstate2))
|
||||
end
|
||||
|
||||
function Base.done(s::CountInterator, state)
|
||||
state1, state2 = state
|
||||
done(s.v2, state2) && done(s.v1, state1)
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
|
@ -85,7 +79,7 @@ abstract type AbstractQGram <: SemiMetric end
|
|||
"""
    evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)

Evaluate the q-gram distance `dist` between `s1` and `s2`.

Each string is turned into its q-grams of length `dist.q`, the q-grams are
sorted, and the result of `evaluate(dist, CountIterator(sort1, sort2))` on the
two sorted collections is returned.
"""
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
    sort1 = sort(QGramIterator(s1, length(s1), dist.q))
    sort2 = sort(QGramIterator(s2, length(s2), dist.q))
    return evaluate(dist, CountIterator(sort1, sort2))
end
|
||||
|
||||
##############################################################################
|
||||
|
|
Loading…
Reference in New Issue