slower but simpler iteration

compathelper/new_version/2020-05-20-12-03-08-092-188304956
matthieugomez 2020-02-18 08:18:45 -05:00
parent d80071590b
commit 49b1f3b439
4 changed files with 75 additions and 91 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
benchmark/ benchmark/
PC25 PC25
Manifest.toml Manifest.toml
draft

View File

@ -27,34 +27,20 @@ function (dist::Jaro)(s1, s2)
ch1_match = Vector{eltype(s1)}(undef, len1) ch1_match = Vector{eltype(s1)}(undef, len1)
# m counts number matching characters # m counts number matching characters
m = 0 m = 0
i1 = 1 i1 = 0
i2 = 1 for ch1 in s1
x1 = iterate(s1) i1 += 1
x2 = iterate(s2) i2 = 0
while x1 !== nothing for ch2 in s2
ch1, state1 = x1
if i2 <= i1 - maxdist - 1
ch2, state2 = x2
i2 += 1 i2 += 1
x2 = iterate(s2, state2) i2 > i1 + maxdist && break
end if (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2]
i2curr = i2
x2curr = x2
while x2curr !== nothing
i2curr > i1 + maxdist && break
ch2, state2 = x2curr
if (ch1 == ch2) && !flag[i2curr]
m += 1 m += 1
flag[i2curr] = true flag[i2] = true
ch1_match[m] = ch1 ch1_match[m] = ch1
break break
end end
x2curr = iterate(s2, state2)
i2curr += 1
end end
x1 = iterate(s1, state1)
i1 += 1
prevstate1 = state1
end end
m == 0 && return 1.0 m == 0 && return 1.0
# t counts number of transpositions # t counts number of transpositions
@ -91,35 +77,35 @@ function (dist::Levenshtein)(s1, s2, max_dist = nothing)
len1, len2 = length(s1), length(s2) len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored # prefix common to both strings can be ignored
k, x1, x2start = common_prefix(s1, s2) k = common_prefix(s1, s2)
x1 === nothing && return len2 - k (k == length(s1)) && return len2 - k
# distance initialized to first row of matrix # distance initialized to first row of matrix
# => distance between "" and s2[1:i} # => distance between "" and s2[1:i}
v = collect(1:(len2-k)) v = collect(1:(len2-k))
current = 0 current = 0
i1 = 1 i1 = 0
while x1 !== nothing left = 0
ch1, state1 = x1 current = 0
left = i1 - 1 min_dist = 0
current = i1 - 1 for ch1 in s1
min_dist = i1 - 2 i1 += 1
i2 = 1 i1 <= k && continue
x2 = x2start left = i1 - k - 1
while x2 !== nothing current = i1 - k - 1
ch2, state2 = x2 min_dist = i1 - k - 2
i2 = 0
for ch2 in s2
i2 += 1
i2 <= k && continue
# update # update
above, current, left = current, left, v[i2] above, current, left = current, left, v[i2 - k]
if ch1 != ch2 if ch1 != ch2
current = min(current + 1, above + 1, left + 1) current = min(current + 1, above + 1, left + 1)
end end
min_dist = min(min_dist, left) min_dist = min(min_dist, left)
v[i2] = current v[i2 - k] = current
x2 = iterate(s2, state2)
i2 += 1
end end
max_dist !== nothing && min_dist > max_dist && return max_dist + 1 max_dist !== nothing && min_dist > max_dist && return max_dist + 1
x1 = iterate(s1, state1)
i1 += 1
end end
max_dist !== nothing && current > max_dist && return max_dist + 1 max_dist !== nothing && current > max_dist && return max_dist + 1
return current return current
@ -144,8 +130,8 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
len1, len2 = length(s1), length(s2) len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored # prefix common to both strings can be ignored
k, x1, x2start = common_prefix(s1, s2) k = common_prefix(s1, s2)
x1 === nothing && return len2 - k (k == length(s1)) && return len2 - k
v = collect(1:(len2-k)) v = collect(1:(len2-k))
w = similar(v) w = similar(v)
if max_dist !== nothing if max_dist !== nothing
@ -153,57 +139,55 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
i2_start = 1 i2_start = 1
i2_end = max_dist i2_end = max_dist
end end
i1 = 1 i1 = 0
current = i1 current = i1
prevch1, = x1 prevch1 = first(s1)
while x1 !== nothing prevch2 = first(s2)
ch1, state1 = x1 for ch1 in s1
left = i1 - 1 i1 += 1
current = i1 i1 <= k && continue
left = i1 - k - 1
current = i1 - k
nextTransCost = 0 nextTransCost = 0
prevch2, = x2start
if max_dist !== nothing if max_dist !== nothing
i2_start += (i1 > offset) ? 1 : 0 i2_start += (i1 > offset) ? 1 : 0
i2_end = min(i2_end + 1, len2) i2_end = min(i2_end + 1, len2)
end end
x2 = x2start i2 = 0
i2 = 1 for ch2 in s2
while x2 !== nothing i2 += 1
ch2, state2 = x2 if (i2 <= k) || ((max_dist !== nothing) && !(i2_start <= i2 <= i2_end))
if max_dist === nothing || (i2_start <= i2 <= i2_end) prevch2 = ch2
above = current continue
thisTransCost = nextTransCost end
nextTransCost = w[i2] above = current
# cost of diagonal (substitution) thisTransCost = nextTransCost
w[i2] = current = left nextTransCost = w[i2 - k]
# left now equals current cost (which will be diagonal at next iteration) # cost of diagonal (substitution)
left = v[i2] w[i2 - k] = current = left
if ch1 != ch2 # left now equals current cost (which will be diagonal at next iteration)
# insertion left = v[i2 - k]
if left < current if ch1 != ch2
current = left # insertion
end if left < current
# deletion current = left
if above < current end
current = above # deletion
end if above < current
current += 1 current = above
if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2) end
thisTransCost += 1 current += 1
if thisTransCost < current if (i1 > 1 + k) & (i2 > 1 + k) & (ch1 == prevch2) & (prevch1 == ch2)
current = thisTransCost thisTransCost += 1
end if thisTransCost < current
current = thisTransCost
end end
end end
v[i2] = current
end end
x2 = iterate(s2, state2) v[i2 - k] = current
i2 += 1
prevch2 = ch2 prevch2 = ch2
end end
max_dist !== nothing && v[i1 + len2 - len1] > max_dist && return max_dist + 1 max_dist !== nothing && v[i1 - k + len2 - len1] > max_dist && return max_dist + 1
x1 = iterate(s1, state1)
i1 += 1
prevch1 = ch1 prevch1 = ch1
end end
max_dist !== nothing && current > max_dist && return max_dist + 1 max_dist !== nothing && current > max_dist && return max_dist + 1

View File

@ -43,18 +43,12 @@ function reorder(s1, s2)
end end
function common_prefix(s1, s2) function common_prefix(s1, s2)
x1 = iterate(s1)
x2 = iterate(s2)
l = 0 l = 0
while (x1 !== nothing) & (x2 !== nothing) for (ch1, ch2) in zip(s1, s2)
ch1, state1 = x1
ch2, state2 = x2
ch1 != ch2 && break ch1 != ch2 && break
l += 1 l += 1
x1 = iterate(s1, state1)
x2 = iterate(s2, state2)
end end
return l, x1, x2 return l
end end
@ -69,7 +63,12 @@ end
function _drop(s, n::Integer) function _drop(s, n::Integer)
Base.Iterators.drop(s, n) Base.Iterators.drop(s, n)
end end
function _drop(s::AbstractString, n::Integer) function _drop(s::AbstractString, n::Integer)
SubString(s, nextind(s, 0, n + 1), lastindex(s))
end
function _drop(s::StringWithLength, n::Integer)
StringWithLength(SubString(s, nextind(s, 0, n + 1), lastindex(s)), length(s) - n) StringWithLength(SubString(s, nextind(s, 0, n + 1), lastindex(s)), length(s) - n)
end end

View File

@ -27,7 +27,7 @@ using StringDistances, Unicode, Test
@test evaluate(Levenshtein(), "saturday", "sunday") == 3 @test evaluate(Levenshtein(), "saturday", "sunday") == 3
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4 @test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6 @test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
@test evaluate(Levenshtein(), [1, 2, 3], [1,2, 4]) == 1 @test evaluate(Levenshtein(), [1, 2, 3], [1, 2, 4]) == 1
@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak") @test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak")
@test Levenshtein()("", "abc") == 3 @test Levenshtein()("", "abc") == 3
@test result_type(Levenshtein(), "hello", "world") == Int @test result_type(Levenshtein(), "hello", "world") == Int