update to 0.7

pull/7/head
matthieugomez 2018-07-04 14:02:50 -04:00
parent ba5a54fa84
commit 69a008fcf8
6 changed files with 87 additions and 71 deletions

View File

@ -67,13 +67,13 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString)
len1 == len2 && return compare(dist.dist, s1, s2)
len1 == 0 && return compare(dist.dist, "", "")
iter = QGramIterator(s2, len2, len1)
state = iterate(iter)
s, state = iterate(iter, state)
out = compare(dist.dist, s1, s)
while state != nothing
s, state = iterate(iter, state)
out = 0.0
x = iterate(iter)
while x != nothing
s, state = x
curr = compare(dist.dist, s1, s)
out = max(out, curr)
x = iterate(iter, state)
end
return out
end
@ -97,7 +97,7 @@ function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::Abstr
s2_end += len2 - s2_end
end
i2_start = nextind(s2, 0, s2_start)
i2_end = s2_end == len2 ? endof(s2) : (nextind(s2, 0, s2_end + 1) - 1)
i2_end = s2_end == len2 ? lastindex(s2) : (nextind(s2, 0, s2_end + 1) - 1)
curr = compare(RatcliffObershelp(), s1, SubString(s2, i2_start, i2_end))
out = max(out, curr)
end
@ -115,12 +115,8 @@ struct TokenSort{T <: PreMetric} <: PreMetric
end
"""
    compare(dist::TokenSort, s1::AbstractString, s2::AbstractString)

Compare `s1` and `s2` with the wrapped `dist.dist` after sorting the
whitespace-separated tokens of each string and rejoining them with single spaces.
"""
function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString)
    # `split` with no delimiter splits on whitespace, so a string without any
    # whitespace comes back unchanged; no separate "contains a delimiter" guard
    # (previously written with `search` / `Base._default_delims`) is needed.
    s1 = join(sort!(split(s1)), " ")
    s2 = join(sort!(split(s2)), " ")
    return compare(dist.dist, s1, s2)
end
@ -137,8 +133,8 @@ end
function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString)
v0, v1, v2 = _separate!(split(s1), split(s2))
s0 = join(v0, " ")
s1 = join(chain(v0, v1), " ")
s2 = join(chain(v0, v2), " ")
s1 = join(Iterators.flatten((v0, v1)), " ")
s2 = join(Iterators.flatten((v0, v2)), " ")
if isempty(s0)
# otherwise compare(dist, "", "a")== 1.0
compare(dist.dist, s1, s2)

View File

@ -32,12 +32,12 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
a = longest_common_substring(s1, s2)
if a[3] > 0
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
s1before = SubString(s1, iterate(s1), chr2ind(s1, a[1]) - 1)
s2before = SubString(s2, iterate(s2), chr2ind(s2, a[2]) - 1)
s1before = SubString(s1, firstindex(s1), nextind(s1, 0, a[1]) - 1)
s2before = SubString(s2, firstindex(s2), nextind(s2, 0, a[2]) - 1)
matching_blocks!(x, s1before, s2before, start1, start2)
if (a[1] + a[3]) <= endof(s1) && (a[2] + a[3]) <= endof(s2)
s1after = SubString(s1, chr2ind(s1, a[1] + a[3]), endof(s1))
s2after = SubString(s2, chr2ind(s2, a[2] + a[3]), endof(s2))
if (a[1] + a[3]) <= lastindex(s1) && (a[2] + a[3]) <= lastindex(s2)
s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))
matching_blocks!(x, s1after, s2after, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
end
end

View File

@ -4,19 +4,23 @@
##############################################################################
"""
    common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)

Count the leading characters shared by `s1` and `s2`.

At most `lim` characters are compared; a negative `lim` means no limit.

Returns a tuple `(l, out1, out2)` where `l` is the number of matching leading
characters and `out1` / `out2` are the iteration states of `s1` / `s2` just past
the last matched character. When nothing matches, they are `firstindex(s1)` and
`firstindex(s2)`, so callers can resume with `iterate(s1, out1)`.
"""
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
    # Defaults used when the loop body never runs.
    out1 = firstindex(s1)
    out2 = firstindex(s2)
    x1 = iterate(s1)
    x2 = iterate(s2)
    l = 0
    while (x1 !== nothing) && (x2 !== nothing) && (l < lim || lim < 0)
        ch1, state1 = x1
        ch2, state2 = x2
        ch1 != ch2 && break
        # Only advance the reported states once the characters are known to match.
        out1 = state1
        out2 = state2
        x1 = iterate(s1, state1)
        x2 = iterate(s2, state2)
        l += 1
    end
    return l, out1, out2
end
##############################################################################
@ -48,28 +52,26 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
# prefix common to both strings can be ignored
s2, len2, s1, len1 = reorder(s1, s2)
k, start1, start2 = common_prefix(s1, s2)
ncu1 = ncodeunits(s1)
ncu2 = ncodeunits(s2)
(start1 > ncu1) && return len2 - k
x1 = iterate(s1, start1)
(x1 == nothing) && return len2 - k
# distance initialized to first row of matrix
# => distance between "" and s2[1:i]
v0 = Array{Int}(undef, len2 - k)
for i2 in 1:(len2 - k)
v0[i2] = i2
end
current = zero(0)
state1 = start1
current = 0
i1 = 0
while state1 <= ncu1
while x1 != nothing
i1 += 1
ch1, state1 = iterate(s1, i1)
ch1, state1 = x1
left = (i1 - 1)
current = (i1 - 1)
state2 = start2
i2 = 0
while state2 <= ncu2
x2 = iterate(s2, start2)
while x2 != nothing
i2 += 1
ch2, state2 = iterate(s2, state2)
ch2, state2 = x2
# update
above, current, left = current, left, v0[i2]
if ch1 != ch2
@ -79,7 +81,9 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
left + 1)
end
v0[i2] = current
x2 = iterate(s2, state2)
end
x1 = iterate(s1, state1)
end
return current
end
@ -95,36 +99,32 @@ struct DamerauLevenshtein <: SemiMetric end
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
s2, len2, s1, len1 = reorder(s1, s2)
ncu1 = ncodeunits(s1)
ncu2 = ncodeunits(s2)
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
(start1 > ncu1) && return len2 - k
k, state1, start2 = common_prefix(s1, s2)
x1 = iterate(s1, state1)
(x1 == nothing) && return len2 - k
v0 = Array{Int}(undef, len2 - k)
@inbounds for i2 in 1:(len2 - k)
v0[i2] = i2
end
v2 = Array{Int}(undef, len2 - k)
ch1, = iterate(s1, start1)
current = 0
state1 = start1
i1 = 0
while state1 <= ncu1
ch1 = first(s1)
while (x1 != nothing)
i1 += 1
prevch1 = ch1
ch1, state1 = iterate(s1, i1)
ch2, = iterate(s2, start2)
ch1, state1 = x1
x2 = iterate(s2, start2)
left = (i1 - 1)
current = i1
nextTransCost = 0
state2 = start2
ch2, = x2
i2 = 0
while state2 <= ncu2
while (x2 != nothing)
i2 += 1
prevch2 = ch2
ch2, state2 = iterate(s2, state2)
ch2, state2 = x2
above = current
thisTransCost = nextTransCost
nextTransCost = v2[i2]
@ -150,7 +150,9 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
end
end
v0[i2] = current
x2 = iterate(s2, state2)
end
x1 = iterate(s1, state1)
end
return current
end
@ -165,8 +167,6 @@ struct Jaro <: SemiMetric end
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
s2, len2, s1, len1 = reorder(s1, s2)
ncu1 = ncodeunits(s1)
ncu2 = ncodeunits(s2)
# If both strings are empty, m = 0, which would give 1.0 according to Wikipedia; this line returns 0.0 instead.
len2 == 0 && return 0.0
maxdist = max(0, div(len2, 2) - 1)
@ -174,21 +174,22 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
m = 0
flag = fill(false, len2)
i1 = 0
state1 = firstindex(s1)
startstate2 = firstindex(s2)
starti2 = 0
i1_match = fill!(Array{typeof(state1)}(undef, len1), state1)
while state1 <= ncu1
ch1, newstate1 = iterate(s1, i1)
state1 = firstindex(s1)
i1_match = fill!(Array{Int}(undef, len1), state1)
x1 = iterate(s1)
while (x1 != nothing)
ch1, newstate1 = x1
i1 += 1
if starti2 < i1 - maxdist - 1
startstate2 = iterate(s2, startstate2)
startstate2 = nextind(s2, startstate2)
starti2 += 1
end
i2 = starti2
state2 = startstate2
while state2 <= len2 && i2 <= i1 + maxdist
ch2, state2 = iterate(s2, state2)
x2 = iterate(s2, startstate2)
while (x2 != nothing) && i2 <= i1 + maxdist
ch2, state2 = x2
i2 += 1
if ch1 == ch2 && !flag[i2]
m += 1
@ -196,8 +197,10 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
i1_match[m] = state1
break
end
x2 = iterate(s2, state2)
end
state1 = newstate1
x1 = iterate(s1, state1)
end
# count t transpositions
t = 0

View File

@ -11,12 +11,10 @@ struct QGramIterator{S <: AbstractString, T <: Integer}
end
function Base.iterate(qgram::QGramIterator)
(1, qgram.l < qgram.q ? endof(qgram.s) + 1 : chr2ind(qgram.s, qgram.q))
end
function Base.iterate(qgram::QGramIterator, state)
function Base.iterate(qgram::QGramIterator,
state = (1, qgram.l < qgram.q ? lastindex(qgram.s) + 1 : nextind(qgram.s, 0, qgram.q)))
istart, iend = state
iend > nchodeunits(qgram.s) && return nothing
iend > ncodeunits(qgram.s) && return nothing
element = SubString(qgram.s, istart, iend)
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
element, nextstate
@ -25,7 +23,7 @@ Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S
Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S}
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
function Base.collect(qgram::QGramIterator)
x = Array{eltype(qgram)}(length(qgram))
x = Array{eltype(qgram)}(undef, length(qgram))
i = 0
for q in qgram
i += 1
@ -46,11 +44,10 @@ struct CountIterator{T1 <: AbstractVector, T2 <: AbstractVector}
v1::T1
v2::T2
end
Base.iterate(s::CountIterator) = (1, 1)
function Base.iterate(s::CountIterator, state)
function Base.iterate(s::CountIterator, state = (1, 1))
state1, state2 = state
state2 > s.v2 && state1 > s.v1 && nothing
state2 > length(s.v2) && state1 > length(s.v1) && return nothing
iter1 = state2 > length(s.v2)
iter2 = state1 > length(s.v1)
if iter1

18
test/.sublime2Terminal.jl Normal file
View File

@ -0,0 +1,18 @@
using StringDistances, Test
# Edit-distance expectations, checked in order as (distance, s1, s2, expected).
for (dist, s1, s2, expected) in (
    (Levenshtein(), "", "", 0),
    (Levenshtein(), "abc", "", 3),
    (Levenshtein(), "", "abc", 3),
    (Levenshtein(), "bc", "abc", 1),
    (Levenshtein(), "kitten", "sitting", 3),
    (Levenshtein(), "saturday", "sunday", 3),
    (Levenshtein(), "hi, my name is", "my name is", 4),
    (Levenshtein(), "alborgów", "amoniak", 6),
    (DamerauLevenshtein(), "", "", 0),
    (DamerauLevenshtein(), "abc", "", 3),
    (DamerauLevenshtein(), "bc", "abc", 1),
    (DamerauLevenshtein(), "fuor", "four", 1),
)
    @test evaluate(dist, s1, s2) == expected
end

View File

@ -30,6 +30,7 @@ using StringDistances, Test
@test evaluate(Hamming(), "testing", "this is a test") == 13
@test evaluate(Hamming(), "saturday", "sunday") == 7
@test evaluate(QGram(1), "abc", "abc") == 0
@test evaluate(QGram(1), "", "abc") == 3
@test evaluate(QGram(1), "abc", "cba") == 0
@test evaluate(QGram(1), "abc", "ccc") == 4
@ -54,6 +55,7 @@ using StringDistances, Test
# Floating-point distances are compared with `≈` (isapprox), not exact equality.
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777