slower but simpler iteration
parent
d80071590b
commit
49b1f3b439
|
@ -1,3 +1,4 @@
|
||||||
benchmark/
|
benchmark/
|
||||||
PC25
|
PC25
|
||||||
Manifest.toml
|
Manifest.toml
|
||||||
|
draft
|
||||||
|
|
148
src/edit.jl
148
src/edit.jl
|
@ -27,34 +27,20 @@ function (dist::Jaro)(s1, s2)
|
||||||
ch1_match = Vector{eltype(s1)}(undef, len1)
|
ch1_match = Vector{eltype(s1)}(undef, len1)
|
||||||
# m counts number matching characters
|
# m counts number matching characters
|
||||||
m = 0
|
m = 0
|
||||||
i1 = 1
|
i1 = 0
|
||||||
i2 = 1
|
for ch1 in s1
|
||||||
x1 = iterate(s1)
|
i1 += 1
|
||||||
x2 = iterate(s2)
|
i2 = 0
|
||||||
while x1 !== nothing
|
for ch2 in s2
|
||||||
ch1, state1 = x1
|
|
||||||
if i2 <= i1 - maxdist - 1
|
|
||||||
ch2, state2 = x2
|
|
||||||
i2 += 1
|
i2 += 1
|
||||||
x2 = iterate(s2, state2)
|
i2 > i1 + maxdist && break
|
||||||
end
|
if (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2]
|
||||||
i2curr = i2
|
|
||||||
x2curr = x2
|
|
||||||
while x2curr !== nothing
|
|
||||||
i2curr > i1 + maxdist && break
|
|
||||||
ch2, state2 = x2curr
|
|
||||||
if (ch1 == ch2) && !flag[i2curr]
|
|
||||||
m += 1
|
m += 1
|
||||||
flag[i2curr] = true
|
flag[i2] = true
|
||||||
ch1_match[m] = ch1
|
ch1_match[m] = ch1
|
||||||
break
|
break
|
||||||
end
|
end
|
||||||
x2curr = iterate(s2, state2)
|
|
||||||
i2curr += 1
|
|
||||||
end
|
end
|
||||||
x1 = iterate(s1, state1)
|
|
||||||
i1 += 1
|
|
||||||
prevstate1 = state1
|
|
||||||
end
|
end
|
||||||
m == 0 && return 1.0
|
m == 0 && return 1.0
|
||||||
# t counts number of transpositions
|
# t counts number of transpositions
|
||||||
|
@ -91,35 +77,35 @@ function (dist::Levenshtein)(s1, s2, max_dist = nothing)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
||||||
# prefix common to both strings can be ignored
|
# prefix common to both strings can be ignored
|
||||||
k, x1, x2start = common_prefix(s1, s2)
|
k = common_prefix(s1, s2)
|
||||||
x1 === nothing && return len2 - k
|
(k == length(s1)) && return len2 - k
|
||||||
# distance initialized to first row of matrix
|
# distance initialized to first row of matrix
|
||||||
# => distance between "" and s2[1:i}
|
# => distance between "" and s2[1:i}
|
||||||
v = collect(1:(len2-k))
|
v = collect(1:(len2-k))
|
||||||
current = 0
|
current = 0
|
||||||
i1 = 1
|
i1 = 0
|
||||||
while x1 !== nothing
|
left = 0
|
||||||
ch1, state1 = x1
|
current = 0
|
||||||
left = i1 - 1
|
min_dist = 0
|
||||||
current = i1 - 1
|
for ch1 in s1
|
||||||
min_dist = i1 - 2
|
i1 += 1
|
||||||
i2 = 1
|
i1 <= k && continue
|
||||||
x2 = x2start
|
left = i1 - k - 1
|
||||||
while x2 !== nothing
|
current = i1 - k - 1
|
||||||
ch2, state2 = x2
|
min_dist = i1 - k - 2
|
||||||
|
i2 = 0
|
||||||
|
for ch2 in s2
|
||||||
|
i2 += 1
|
||||||
|
i2 <= k && continue
|
||||||
# update
|
# update
|
||||||
above, current, left = current, left, v[i2]
|
above, current, left = current, left, v[i2 - k]
|
||||||
if ch1 != ch2
|
if ch1 != ch2
|
||||||
current = min(current + 1, above + 1, left + 1)
|
current = min(current + 1, above + 1, left + 1)
|
||||||
end
|
end
|
||||||
min_dist = min(min_dist, left)
|
min_dist = min(min_dist, left)
|
||||||
v[i2] = current
|
v[i2 - k] = current
|
||||||
x2 = iterate(s2, state2)
|
|
||||||
i2 += 1
|
|
||||||
end
|
end
|
||||||
max_dist !== nothing && min_dist > max_dist && return max_dist + 1
|
max_dist !== nothing && min_dist > max_dist && return max_dist + 1
|
||||||
x1 = iterate(s1, state1)
|
|
||||||
i1 += 1
|
|
||||||
end
|
end
|
||||||
max_dist !== nothing && current > max_dist && return max_dist + 1
|
max_dist !== nothing && current > max_dist && return max_dist + 1
|
||||||
return current
|
return current
|
||||||
|
@ -144,8 +130,8 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
||||||
# prefix common to both strings can be ignored
|
# prefix common to both strings can be ignored
|
||||||
k, x1, x2start = common_prefix(s1, s2)
|
k = common_prefix(s1, s2)
|
||||||
x1 === nothing && return len2 - k
|
(k == length(s1)) && return len2 - k
|
||||||
v = collect(1:(len2-k))
|
v = collect(1:(len2-k))
|
||||||
w = similar(v)
|
w = similar(v)
|
||||||
if max_dist !== nothing
|
if max_dist !== nothing
|
||||||
|
@ -153,57 +139,55 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
|
||||||
i2_start = 1
|
i2_start = 1
|
||||||
i2_end = max_dist
|
i2_end = max_dist
|
||||||
end
|
end
|
||||||
i1 = 1
|
i1 = 0
|
||||||
current = i1
|
current = i1
|
||||||
prevch1, = x1
|
prevch1 = first(s1)
|
||||||
while x1 !== nothing
|
prevch2 = first(s2)
|
||||||
ch1, state1 = x1
|
for ch1 in s1
|
||||||
left = i1 - 1
|
i1 += 1
|
||||||
current = i1
|
i1 <= k && continue
|
||||||
|
left = i1 - k - 1
|
||||||
|
current = i1 - k
|
||||||
nextTransCost = 0
|
nextTransCost = 0
|
||||||
prevch2, = x2start
|
|
||||||
if max_dist !== nothing
|
if max_dist !== nothing
|
||||||
i2_start += (i1 > offset) ? 1 : 0
|
i2_start += (i1 > offset) ? 1 : 0
|
||||||
i2_end = min(i2_end + 1, len2)
|
i2_end = min(i2_end + 1, len2)
|
||||||
end
|
end
|
||||||
x2 = x2start
|
i2 = 0
|
||||||
i2 = 1
|
for ch2 in s2
|
||||||
while x2 !== nothing
|
i2 += 1
|
||||||
ch2, state2 = x2
|
if (i2 <= k) || ((max_dist !== nothing) && !(i2_start <= i2 <= i2_end))
|
||||||
if max_dist === nothing || (i2_start <= i2 <= i2_end)
|
prevch2 = ch2
|
||||||
above = current
|
continue
|
||||||
thisTransCost = nextTransCost
|
end
|
||||||
nextTransCost = w[i2]
|
above = current
|
||||||
# cost of diagonal (substitution)
|
thisTransCost = nextTransCost
|
||||||
w[i2] = current = left
|
nextTransCost = w[i2 - k]
|
||||||
# left now equals current cost (which will be diagonal at next iteration)
|
# cost of diagonal (substitution)
|
||||||
left = v[i2]
|
w[i2 - k] = current = left
|
||||||
if ch1 != ch2
|
# left now equals current cost (which will be diagonal at next iteration)
|
||||||
# insertion
|
left = v[i2 - k]
|
||||||
if left < current
|
if ch1 != ch2
|
||||||
current = left
|
# insertion
|
||||||
end
|
if left < current
|
||||||
# deletion
|
current = left
|
||||||
if above < current
|
end
|
||||||
current = above
|
# deletion
|
||||||
end
|
if above < current
|
||||||
current += 1
|
current = above
|
||||||
if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)
|
end
|
||||||
thisTransCost += 1
|
current += 1
|
||||||
if thisTransCost < current
|
if (i1 > 1 + k) & (i2 > 1 + k) & (ch1 == prevch2) & (prevch1 == ch2)
|
||||||
current = thisTransCost
|
thisTransCost += 1
|
||||||
end
|
if thisTransCost < current
|
||||||
|
current = thisTransCost
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
v[i2] = current
|
|
||||||
end
|
end
|
||||||
x2 = iterate(s2, state2)
|
v[i2 - k] = current
|
||||||
i2 += 1
|
|
||||||
prevch2 = ch2
|
prevch2 = ch2
|
||||||
end
|
end
|
||||||
max_dist !== nothing && v[i1 + len2 - len1] > max_dist && return max_dist + 1
|
max_dist !== nothing && v[i1 - k + len2 - len1] > max_dist && return max_dist + 1
|
||||||
x1 = iterate(s1, state1)
|
|
||||||
i1 += 1
|
|
||||||
prevch1 = ch1
|
prevch1 = ch1
|
||||||
end
|
end
|
||||||
max_dist !== nothing && current > max_dist && return max_dist + 1
|
max_dist !== nothing && current > max_dist && return max_dist + 1
|
||||||
|
|
15
src/utils.jl
15
src/utils.jl
|
@ -43,18 +43,12 @@ function reorder(s1, s2)
|
||||||
end
|
end
|
||||||
|
|
||||||
function common_prefix(s1, s2)
|
function common_prefix(s1, s2)
|
||||||
x1 = iterate(s1)
|
|
||||||
x2 = iterate(s2)
|
|
||||||
l = 0
|
l = 0
|
||||||
while (x1 !== nothing) & (x2 !== nothing)
|
for (ch1, ch2) in zip(s1, s2)
|
||||||
ch1, state1 = x1
|
|
||||||
ch2, state2 = x2
|
|
||||||
ch1 != ch2 && break
|
ch1 != ch2 && break
|
||||||
l += 1
|
l += 1
|
||||||
x1 = iterate(s1, state1)
|
|
||||||
x2 = iterate(s2, state2)
|
|
||||||
end
|
end
|
||||||
return l, x1, x2
|
return l
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
@ -69,7 +63,12 @@ end
|
||||||
function _drop(s, n::Integer)
|
function _drop(s, n::Integer)
|
||||||
Base.Iterators.drop(s, n)
|
Base.Iterators.drop(s, n)
|
||||||
end
|
end
|
||||||
|
|
||||||
function _drop(s::AbstractString, n::Integer)
|
function _drop(s::AbstractString, n::Integer)
|
||||||
|
SubString(s, nextind(s, 0, n + 1), lastindex(s))
|
||||||
|
end
|
||||||
|
|
||||||
|
function _drop(s::StringWithLength, n::Integer)
|
||||||
StringWithLength(SubString(s, nextind(s, 0, n + 1), lastindex(s)), length(s) - n)
|
StringWithLength(SubString(s, nextind(s, 0, n + 1), lastindex(s)), length(s) - n)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -27,7 +27,7 @@ using StringDistances, Unicode, Test
|
||||||
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
|
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
|
||||||
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
|
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
|
||||||
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
|
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
|
||||||
@test evaluate(Levenshtein(), [1, 2, 3], [1,2, 4]) == 1
|
@test evaluate(Levenshtein(), [1, 2, 3], [1, 2, 4]) == 1
|
||||||
@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak")
|
@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak")
|
||||||
@test Levenshtein()("", "abc") == 3
|
@test Levenshtein()("", "abc") == 3
|
||||||
@test result_type(Levenshtein(), "hello", "world") == Int
|
@test result_type(Levenshtein(), "hello", "world") == Int
|
||||||
|
|
Loading…
Reference in New Issue