slower but simpler iteration

compathelper/new_version/2020-05-20-12-03-08-092-188304956
matthieugomez 2020-02-18 08:18:45 -05:00
parent d80071590b
commit 49b1f3b439
4 changed files with 75 additions and 91 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
benchmark/
PC25
Manifest.toml
draft

View File

@ -27,34 +27,20 @@ function (dist::Jaro)(s1, s2)
ch1_match = Vector{eltype(s1)}(undef, len1)
# m counts number matching characters
m = 0
i1 = 1
i2 = 1
x1 = iterate(s1)
x2 = iterate(s2)
while x1 !== nothing
ch1, state1 = x1
if i2 <= i1 - maxdist - 1
ch2, state2 = x2
i1 = 0
for ch1 in s1
i1 += 1
i2 = 0
for ch2 in s2
i2 += 1
x2 = iterate(s2, state2)
end
i2curr = i2
x2curr = x2
while x2curr !== nothing
i2curr > i1 + maxdist && break
ch2, state2 = x2curr
if (ch1 == ch2) && !flag[i2curr]
i2 > i1 + maxdist && break
if (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2]
m += 1
flag[i2curr] = true
flag[i2] = true
ch1_match[m] = ch1
break
end
x2curr = iterate(s2, state2)
i2curr += 1
end
x1 = iterate(s1, state1)
i1 += 1
prevstate1 = state1
end
m == 0 && return 1.0
# t counts number of transpositions
@ -91,35 +77,35 @@ function (dist::Levenshtein)(s1, s2, max_dist = nothing)
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = common_prefix(s1, s2)
x1 === nothing && return len2 - k
k = common_prefix(s1, s2)
(k == length(s1)) && return len2 - k
# distance initialized to first row of matrix
# => distance between "" and s2[1:i}
v = collect(1:(len2-k))
current = 0
i1 = 1
while x1 !== nothing
ch1, state1 = x1
left = i1 - 1
current = i1 - 1
min_dist = i1 - 2
i2 = 1
x2 = x2start
while x2 !== nothing
ch2, state2 = x2
i1 = 0
left = 0
current = 0
min_dist = 0
for ch1 in s1
i1 += 1
i1 <= k && continue
left = i1 - k - 1
current = i1 - k - 1
min_dist = i1 - k - 2
i2 = 0
for ch2 in s2
i2 += 1
i2 <= k && continue
# update
above, current, left = current, left, v[i2]
above, current, left = current, left, v[i2 - k]
if ch1 != ch2
current = min(current + 1, above + 1, left + 1)
end
min_dist = min(min_dist, left)
v[i2] = current
x2 = iterate(s2, state2)
i2 += 1
v[i2 - k] = current
end
max_dist !== nothing && min_dist > max_dist && return max_dist + 1
x1 = iterate(s1, state1)
i1 += 1
end
max_dist !== nothing && current > max_dist && return max_dist + 1
return current
@ -144,8 +130,8 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = common_prefix(s1, s2)
x1 === nothing && return len2 - k
k = common_prefix(s1, s2)
(k == length(s1)) && return len2 - k
v = collect(1:(len2-k))
w = similar(v)
if max_dist !== nothing
@ -153,57 +139,55 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
i2_start = 1
i2_end = max_dist
end
i1 = 1
i1 = 0
current = i1
prevch1, = x1
while x1 !== nothing
ch1, state1 = x1
left = i1 - 1
current = i1
prevch1 = first(s1)
prevch2 = first(s2)
for ch1 in s1
i1 += 1
i1 <= k && continue
left = i1 - k - 1
current = i1 - k
nextTransCost = 0
prevch2, = x2start
if max_dist !== nothing
i2_start += (i1 > offset) ? 1 : 0
i2_end = min(i2_end + 1, len2)
end
x2 = x2start
i2 = 1
while x2 !== nothing
ch2, state2 = x2
if max_dist === nothing || (i2_start <= i2 <= i2_end)
above = current
thisTransCost = nextTransCost
nextTransCost = w[i2]
# cost of diagonal (substitution)
w[i2] = current = left
# left now equals current cost (which will be diagonal at next iteration)
left = v[i2]
if ch1 != ch2
# insertion
if left < current
current = left
end
# deletion
if above < current
current = above
end
current += 1
if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)
thisTransCost += 1
if thisTransCost < current
current = thisTransCost
end
i2 = 0
for ch2 in s2
i2 += 1
if (i2 <= k) || ((max_dist !== nothing) && !(i2_start <= i2 <= i2_end))
prevch2 = ch2
continue
end
above = current
thisTransCost = nextTransCost
nextTransCost = w[i2 - k]
# cost of diagonal (substitution)
w[i2 - k] = current = left
# left now equals current cost (which will be diagonal at next iteration)
left = v[i2 - k]
if ch1 != ch2
# insertion
if left < current
current = left
end
# deletion
if above < current
current = above
end
current += 1
if (i1 > 1 + k) & (i2 > 1 + k) & (ch1 == prevch2) & (prevch1 == ch2)
thisTransCost += 1
if thisTransCost < current
current = thisTransCost
end
end
v[i2] = current
end
x2 = iterate(s2, state2)
i2 += 1
v[i2 - k] = current
prevch2 = ch2
end
max_dist !== nothing && v[i1 + len2 - len1] > max_dist && return max_dist + 1
x1 = iterate(s1, state1)
i1 += 1
max_dist !== nothing && v[i1 - k + len2 - len1] > max_dist && return max_dist + 1
prevch1 = ch1
end
max_dist !== nothing && current > max_dist && return max_dist + 1

View File

@ -43,18 +43,12 @@ function reorder(s1, s2)
end
function common_prefix(s1, s2)
x1 = iterate(s1)
x2 = iterate(s2)
l = 0
while (x1 !== nothing) & (x2 !== nothing)
ch1, state1 = x1
ch2, state2 = x2
for (ch1, ch2) in zip(s1, s2)
ch1 != ch2 && break
l += 1
x1 = iterate(s1, state1)
x2 = iterate(s2, state2)
end
return l, x1, x2
return l
end
@ -69,7 +63,12 @@ end
function _drop(s, n::Integer)
Base.Iterators.drop(s, n)
end
function _drop(s::AbstractString, n::Integer)
SubString(s, nextind(s, 0, n + 1), lastindex(s))
end
function _drop(s::StringWithLength, n::Integer)
StringWithLength(SubString(s, nextind(s, 0, n + 1), lastindex(s)), length(s) - n)
end

View File

@ -27,7 +27,7 @@ using StringDistances, Unicode, Test
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
@test evaluate(Levenshtein(), [1, 2, 3], [1,2, 4]) == 1
@test evaluate(Levenshtein(), [1, 2, 3], [1, 2, 4]) == 1
@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak")
@test Levenshtein()("", "abc") == 3
@test result_type(Levenshtein(), "hello", "world") == Int