pull/17/head
matthieugomez 2019-08-20 15:38:14 -04:00
parent b19fd3bfb1
commit cc688b5729
5 changed files with 75 additions and 63 deletions

View File

@ -1 +1 @@
@time extract.(x[1:10], Ref(y), Ref(DamerauLevenshtein()))
h(DamerauLevenshtein(), x, y)

View File

@ -4,29 +4,30 @@ Random.seed!(2)
x = map(Random.randstring, rand(5:25,500_000))
y = map(Random.randstring, rand(5:25,500_000))
function f(t, x, y; min_dist = nothing)
[compare(x[i], y[i], t; min_dist = min_dist) for i in 1:length(x)]
# Benchmark helper: score each pair `(x[i], y[i])` with `compare` under the
# distance `t` and collect the results in a vector.
#
# `min_score` is forwarded unchanged to `compare` as a keyword argument.
# Indices are taken from `x`, so `y` must be indexable at every index of `x`.
function f(t, x, y; min_score = nothing)
    return [compare(x[i], y[i], t; min_score = min_score) for i in eachindex(x)]
end
@time f(Hamming(), x, y)
#0.1s
#0.05s
@time f(Jaro(), x, y)
#0.3s
@time f(Levenshtein(), x, y)
# 0.35s. A bit faster than StringDist
@time f(Levenshtein(), x, y, min_dist = 0.8)
@time f(Levenshtein(), x, y, min_score = 0.8)
@time f(DamerauLevenshtein(), x, y)
@time f(DamerauLevenshtein(), x, y, min_dist = 0.8)
# 0.39s. Much faster than StringDist
# 0.45s. Much faster than StringDist
@time f(DamerauLevenshtein(), x, y, min_score = 0.8)
# 0.08
@time extract.(x[1:10], Ref(y), Ref(DamerauLevenshtein()))
@time find_best(x[1], y, DamerauLevenshtein())
# 0.41
@time find_all(x[1], y, Levenshtein())
@time find_all(x[1], y, DamerauLevenshtein())
# 0.09
# Benchmark helper: call `evaluate(t, x[i], y[i])` for each index of `x` and
# collect the results in a vector.
#
# Indices are taken from `x`, so `y` must be indexable at every index of `x`.
function g(t, x, y)
    return [evaluate(t, x[i], y[i]) for i in eachindex(x)]
end
@time g(Jaccard(2), x, y)
# 1.6s slower compared to StringDist
@ -34,11 +35,18 @@ end
function h(t, x, y; max_dist = Inf)
all(evaluate(t, x[i], y[i]; max_dist = max_dist) == min(max_dist, evaluate(t, x[i], y[i])) for i in eachindex(x))
# check
# Consistency check between thresholded and unthresholded `compare`.
#
# For every pair `(x[i], y[i])`:
#   * if the unthresholded score is below `min_score`, the thresholded call
#     is expected to give 0.0;
#   * otherwise the thresholded call is expected to give the unthresholded
#     score.
# Returns `true` only when every pair satisfies this.
function h(t, x, y; min_score = 1/3)
    out = fill(false, length(x))
    for i in eachindex(x)
        # Score without any threshold; serves as the reference value.
        reference = compare(x[i], y[i], t)
        thresholded = compare(x[i], y[i], t; min_score = min_score)
        expected = reference < min_score ? 0.0 : reference
        # Approximate comparison, since the scores are floating-point values.
        out[i] = thresholded ≈ expected
    end
    return all(out)
end
h(Jaro(), x, y)
h(Levenshtein(), x, y)
h(DamerauLevenshtein(), x, y)

View File

@ -30,11 +30,8 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtei
else
d = evaluate(dist, s1, s2; max_dist = ceil(Int, len2 * (1 - min_score)))
out = 1.0 - d / len2
if d == -1 || out < min_score
return 0.0
else
return out
end
out < min_score && return 0.0
return out
end
end

View File

@ -93,7 +93,7 @@ end
##
## Levenshtein
##
## Return -1 if distance higher than max_dist
## Return max_dist + 1 if the distance is higher than max_dist.
## This makes it possible to differentiate a distance equal to max_dist from one strictly higher,
## which is important for find_all.
##
@ -112,7 +112,7 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
max_dist = nothing)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return -1
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
x1 == nothing && return len2 - k
@ -133,7 +133,6 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
# update
above, current, left = current, left, v0[i2]
if ch1 != ch2
# substitution
current = min(current + 1, above + 1, left + 1)
end
min_dist = min(min_dist, left)
@ -141,11 +140,11 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
x2 = iterate(s2, state2)
i2 += 1
end
max_dist !== nothing && min_dist > max_dist && return -1
max_dist !== nothing && min_dist > max_dist && return max_dist + 1
x1 = iterate(s1, state1)
i1 += 1
end
max_dist !== nothing && current > max_dist && return - 1
max_dist !== nothing && current > max_dist && return max_dist + 1
return current
end
@ -168,12 +167,17 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
max_dist = nothing)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return -1
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
(x1 == nothing) && return len2 - k
v0 = collect(1:(len2 - k))
v2 = similar(v0)
if max_dist !== nothing
offset = 1 + max_dist - (len2 - len1)
i2_start = 1
i2_end = max_dist
end
i1 = 1
current = i1
prevch1, = x1
@ -183,51 +187,55 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
current = i1
nextTransCost = 0
prevch2, = x2start
if max_dist !== nothing
i2_start += (i1 > offset) ? 1 : 0
i2_end = min(i2_end + 1, len2)
end
x2 = x2start
i2 = 1
while x2 !== nothing
ch2, state2 = x2
above = current
thisTransCost = nextTransCost
nextTransCost = v2[i2]
# cost of diagonal (substitution)
v2[i2] = current = left
# left now equals current cost (which will be diagonal at next iteration)
left = v0[i2]
if ch1 != ch2
# insertion
if left < current
current = left
end
# deletion
if above < current
current = above
end
current += 1
if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)
thisTransCost += 1
if thisTransCost < current
current = thisTransCost
if max_dist == nothing || (i2_start <= i2 <= i2_end)
above = current
thisTransCost = nextTransCost
nextTransCost = v2[i2]
# cost of diagonal (substitution)
v2[i2] = current = left
# left now equals current cost (which will be diagonal at next iteration)
left = v0[i2]
if ch1 != ch2
# insertion
if left < current
current = left
end
# deletion
if above < current
current = above
end
current += 1
if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)
thisTransCost += 1
if thisTransCost < current
current = thisTransCost
end
end
end
v0[i2] = current
end
v0[i2] = current
x2 = iterate(s2, state2)
i2 += 1
prevch2 = ch2
end
max_dist !== nothing && (v0[i1 + len2 - len1] > max_dist) && return -1
max_dist !== nothing && v0[i1 + len2 - len1] > max_dist && return max_dist + 1
x1 = iterate(s1, state1)
i1 += 1
prevch1 = ch1
end
max_dist !== nothing && current > max_dist && return - 1
max_dist !== nothing && current > max_dist && return max_dist + 1
return current
end
##############################################################################
##
## Ratcliff/Obershelp

View File

@ -4,28 +4,27 @@
`find_best` returns the element of the iterator `iter` that has the highest similarity score with `s1` according to the distance `dist`.
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
"""
function find_best(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}) where T <: Union{Levenshtein, DamerauLevenshtein}
best_score = 0.0
function find_best(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}; min_score = 0.0) where T <: Union{Levenshtein, DamerauLevenshtein}
    # `best_s2` stays `nothing` when no candidate reaches `min_score`.
    best_s2 = nothing
    for s2 in iter_s2
        # Forward the running threshold to `compare`.
        # NOTE(review): presumably this lets `compare` cut the distance
        # computation short for candidates that cannot reach the threshold —
        # confirm against `compare`.
        score = compare(s1, s2, dist; min_score = min_score)
        if score >= min_score
            # A score of 1.0 cannot be beaten; stop searching.
            score == 1.0 && return s2
            best_s2 = s2
            # Raise the threshold so later candidates must score at least
            # as high as the current best.
            min_score = score
        end
    end
    return best_s2
end
function find_best(s1::AbstractString, iter_s2, dist::PreMetric)
best_score = 0.0
function find_best(s1::AbstractString, iter_s2, dist::PreMetric; min_score = 0.0)
best_s2 = nothing
for s2 in iter_s2
score = compare(s1, s2, dist)
if score > best_score
if score >= min_score
score == 1.0 && return s2
best_s2 = s2
best_score = score
min_score = score
end
end
return best_s2