simplify
parent
b19fd3bfb1
commit
cc688b5729
|
@ -1 +1 @@
|
|||
@time extract.(x[1:10], Ref(y), Ref(DamerauLevenshtein()))
|
||||
h(DamerauLevenshtein(), x, y)
|
||||
|
|
|
@ -4,29 +4,30 @@ Random.seed!(2)
|
|||
x = map(Random.randstring, rand(5:25,500_000))
|
||||
y = map(Random.randstring, rand(5:25,500_000))
|
||||
|
||||
function f(t, x, y; min_dist = nothing)
|
||||
[compare(x[i], y[i], t; min_dist = min_dist) for i in 1:length(x)]
|
||||
function f(t, x, y; min_score = nothing)
|
||||
[compare(x[i], y[i], t; min_score = min_score) for i in 1:length(x)]
|
||||
end
|
||||
|
||||
@time f(Hamming(), x, y)
|
||||
#0.1s
|
||||
#0.05s
|
||||
@time f(Jaro(), x, y)
|
||||
#0.3s
|
||||
@time f(Levenshtein(), x, y)
|
||||
# 0.35s. A bit faster than StringDist
|
||||
@time f(Levenshtein(), x, y, min_dist = 0.8)
|
||||
@time f(Levenshtein(), x, y, min_score = 0.8)
|
||||
@time f(DamerauLevenshtein(), x, y)
|
||||
@time f(DamerauLevenshtein(), x, y, min_dist = 0.8)
|
||||
# 0.39s. Much faster than StringDist
|
||||
# 0.45s. Much faster than StringDist
|
||||
@time f(DamerauLevenshtein(), x, y, min_score = 0.8)
|
||||
# 0.08
|
||||
|
||||
@time extract.(x[1:10], Ref(y), Ref(DamerauLevenshtein()))
|
||||
@time find_best(x[1], y, DamerauLevenshtein())
|
||||
# 0.41
|
||||
@time find_all(x[1], y, Levenshtein())
|
||||
|
||||
@time find_all(x[1], y, DamerauLevenshtein())
|
||||
# 0.09
|
||||
|
||||
|
||||
|
||||
function g(t, x, y)
|
||||
[evaluate(t, x[i], y[i]) for i in 1:length(x)]
|
||||
end
|
||||
@time g(Jaccard(2), x, y)
|
||||
# 1.6s. Slower than StringDist
|
||||
|
||||
|
||||
|
@ -34,11 +35,18 @@ end
|
|||
|
||||
|
||||
|
||||
|
||||
function h(t, x, y; max_dist = Inf)
|
||||
all(evaluate(t, x[i], y[i]; max_dist = max_dist) == min(max_dist, evaluate(t, x[i], y[i])) for i in eachindex(x))
|
||||
# check
|
||||
function h(t, x, y; min_score = 1/3)
|
||||
out = fill(false, length(x))
|
||||
for i in eachindex(x)
|
||||
if compare(x[i], y[i], t) < min_score
|
||||
out[i] = compare(x[i], y[i], t ; min_score = min_score) ≈ 0.0
|
||||
else
|
||||
out[i] = compare(x[i], y[i], t ; min_score = min_score) ≈ compare(x[i], y[i], t)
|
||||
end
|
||||
end
|
||||
all(out)
|
||||
end
|
||||
h(Jaro(), x, y)
|
||||
h(Levenshtein(), x, y)
|
||||
h(DamerauLevenshtein(), x, y)
|
||||
|
||||
|
|
|
@ -30,11 +30,8 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtei
|
|||
else
|
||||
d = evaluate(dist, s1, s2; max_dist = ceil(Int, len2 * (1 - min_score)))
|
||||
out = 1.0 - d / len2
|
||||
if d == -1 || out < min_score
|
||||
return 0.0
|
||||
else
|
||||
return out
|
||||
end
|
||||
out < min_score && return 0.0
|
||||
return out
|
||||
end
|
||||
end
|
||||
|
||||
|
|
72
src/edit.jl
72
src/edit.jl
|
@ -93,7 +93,7 @@ end
|
|||
##
|
||||
## Levenshtein
|
||||
##
|
||||
## Return -1 if distance higher than max_dist
|
||||
## Return max_dist + 1 if the distance is higher than max_dist
|
||||
## This makes it possible to differentiate a distance equal to max_dist from one strictly higher
|
||||
## This is important for find_all
|
||||
##
|
||||
|
@ -112,7 +112,7 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
|
|||
max_dist = nothing)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
max_dist !== nothing && len2 - len1 > max_dist && return -1
|
||||
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
||||
# prefix common to both strings can be ignored
|
||||
k, x1, x2start = remove_prefix(s1, s2)
|
||||
x1 == nothing && return len2 - k
|
||||
|
@ -133,7 +133,6 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
|
|||
# update
|
||||
above, current, left = current, left, v0[i2]
|
||||
if ch1 != ch2
|
||||
# substitution
|
||||
current = min(current + 1, above + 1, left + 1)
|
||||
end
|
||||
min_dist = min(min_dist, left)
|
||||
|
@ -141,11 +140,11 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
|
|||
x2 = iterate(s2, state2)
|
||||
i2 += 1
|
||||
end
|
||||
max_dist !== nothing && min_dist > max_dist && return -1
|
||||
max_dist !== nothing && min_dist > max_dist && return max_dist + 1
|
||||
x1 = iterate(s1, state1)
|
||||
i1 += 1
|
||||
end
|
||||
max_dist !== nothing && current > max_dist && return - 1
|
||||
max_dist !== nothing && current > max_dist && return max_dist + 1
|
||||
return current
|
||||
end
|
||||
|
||||
|
@ -168,12 +167,17 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
|||
max_dist = nothing)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
max_dist !== nothing && len2 - len1 > max_dist && return -1
|
||||
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
||||
# prefix common to both strings can be ignored
|
||||
k, x1, x2start = remove_prefix(s1, s2)
|
||||
(x1 == nothing) && return len2 - k
|
||||
v0 = collect(1:(len2 - k))
|
||||
v2 = similar(v0)
|
||||
if max_dist !== nothing
|
||||
offset = 1 + max_dist - (len2 - len1)
|
||||
i2_start = 1
|
||||
i2_end = max_dist
|
||||
end
|
||||
i1 = 1
|
||||
current = i1
|
||||
prevch1, = x1
|
||||
|
@ -183,51 +187,55 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
|||
current = i1
|
||||
nextTransCost = 0
|
||||
prevch2, = x2start
|
||||
if max_dist !== nothing
|
||||
i2_start += (i1 > offset) ? 1 : 0
|
||||
i2_end = min(i2_end + 1, len2)
|
||||
end
|
||||
x2 = x2start
|
||||
i2 = 1
|
||||
while x2 !== nothing
|
||||
ch2, state2 = x2
|
||||
above = current
|
||||
thisTransCost = nextTransCost
|
||||
nextTransCost = v2[i2]
|
||||
# cost of diagonal (substitution)
|
||||
v2[i2] = current = left
|
||||
# left now equals current cost (which will be diagonal at next iteration)
|
||||
left = v0[i2]
|
||||
if ch1 != ch2
|
||||
# insertion
|
||||
if left < current
|
||||
current = left
|
||||
end
|
||||
# deletion
|
||||
if above < current
|
||||
current = above
|
||||
end
|
||||
current += 1
|
||||
if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)
|
||||
thisTransCost += 1
|
||||
if thisTransCost < current
|
||||
current = thisTransCost
|
||||
if max_dist == nothing || (i2_start <= i2 <= i2_end)
|
||||
above = current
|
||||
thisTransCost = nextTransCost
|
||||
nextTransCost = v2[i2]
|
||||
# cost of diagonal (substitution)
|
||||
v2[i2] = current = left
|
||||
# left now equals current cost (which will be diagonal at next iteration)
|
||||
left = v0[i2]
|
||||
if ch1 != ch2
|
||||
# insertion
|
||||
if left < current
|
||||
current = left
|
||||
end
|
||||
# deletion
|
||||
if above < current
|
||||
current = above
|
||||
end
|
||||
current += 1
|
||||
if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)
|
||||
thisTransCost += 1
|
||||
if thisTransCost < current
|
||||
current = thisTransCost
|
||||
end
|
||||
end
|
||||
end
|
||||
v0[i2] = current
|
||||
end
|
||||
v0[i2] = current
|
||||
x2 = iterate(s2, state2)
|
||||
i2 += 1
|
||||
prevch2 = ch2
|
||||
end
|
||||
max_dist !== nothing && (v0[i1 + len2 - len1] > max_dist) && return -1
|
||||
max_dist !== nothing && v0[i1 + len2 - len1] > max_dist && return max_dist + 1
|
||||
x1 = iterate(s1, state1)
|
||||
i1 += 1
|
||||
prevch1 = ch1
|
||||
end
|
||||
max_dist !== nothing && current > max_dist && return - 1
|
||||
max_dist !== nothing && current > max_dist && return max_dist + 1
|
||||
return current
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Ratcliff/Obershelp
|
||||
|
|
17
src/find.jl
17
src/find.jl
|
@ -4,28 +4,27 @@
|
|||
`find_best` returns the element of the iterator `iter` that has the highest similarity score with `s1` according to the distance `dist`.
|
||||
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
|
||||
"""
|
||||
function find_best(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}) where T <: Union{Levenshtein, DamerauLevenshtein}
|
||||
best_score = 0.0
|
||||
function find_best(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}; min_score = 0.0) where T <: Union{Levenshtein, DamerauLevenshtein}
|
||||
best_s2 = nothing
|
||||
for s2 in iter_s2
|
||||
score = compare(s1, s2, dist; min_score = best_score)
|
||||
if score > best_score
|
||||
score = compare(s1, s2, dist; min_score = min_score)
|
||||
if score >= min_score
|
||||
score == 1.0 && return s2
|
||||
best_s2 = s2
|
||||
best_score = score
|
||||
min_score = score
|
||||
end
|
||||
end
|
||||
return best_s2
|
||||
end
|
||||
function find_best(s1::AbstractString, iter_s2, dist::PreMetric)
|
||||
best_score = 0.0
|
||||
|
||||
function find_best(s1::AbstractString, iter_s2, dist::PreMetric; min_score = 0.0)
|
||||
best_s2 = nothing
|
||||
for s2 in iter_s2
|
||||
score = compare(s1, s2, dist)
|
||||
if score > best_score
|
||||
if score >= min_score
|
||||
score == 1.0 && return s2
|
||||
best_s2 = s2
|
||||
best_score = score
|
||||
min_score = score
|
||||
end
|
||||
end
|
||||
return best_s2
|
||||
|
|
Loading…
Reference in New Issue