simplify a bit Damerau

pull/57/head
matthieugomez 2021-09-10 17:55:37 -04:00
parent 5bec23d357
commit 4c73b55825
3 changed files with 18 additions and 18 deletions

View File

@ -73,7 +73,7 @@ The function `pairwise` is particularly optimized for QGram-distances (each elem
findall(s, itr, dist::StringDistance; min_score = 0.8)
```
The functions `findnearest` and `findall` are particularly optimized for `Levenshtein`, `DamerauLevenshtein` distances (these distances stop early if the distance is higher than a certain threshold).
The functions `findnearest` and `findall` are particularly optimized for the `Levenshtein` and `OptimalStringAlignement` distances (their computation stops early once the distance exceeds a given threshold).
### distance modifiers

View File

@ -26,7 +26,7 @@ end
@time f(OptimalStringAlignement(), x, y, min_score = 0.8);
# 0.08
@time f(DamerauLevenshtein(), x, y);
# 2s
# 1.8s
@time f(RatcliffObershelp(), x, y);
# 0.65s

View File

@ -248,22 +248,16 @@ function (dist::DamerauLevenshtein)(s1, s2)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
T = promote_type(eltype(s1), eltype(s2))
da = Dict{T, Int}(x => 0 for x in Iterators.flatten((s1, s2)))
d = zeros(Int, len1 + 2, len2 + 2)
md = len1 + len2
@inbounds for i in 0:len1
d[i + 2, 1] = md
d[i + 2, 2] = i
end
@inbounds for j in 0:len2
d[1, j + 2] = md
d[2, j + 2] = j
end
da = Dict{T, Int}()
sizehint!(da, len1 + len2)
d = zeros(Int, len1 + 1, len2 + 1)
d[:, 1] = 0:len1
d[1, :] = 0:len2
# fill in the distance matrix d
for (i1, ch1) in enumerate(s1)
db = 0
for (i2, ch2) in enumerate(s2)
j1 = da[ch2]
j1 = get(da, ch2, 0)
j2 = db
if ch1 == ch2
cost = 0
@ -271,10 +265,16 @@ function (dist::DamerauLevenshtein)(s1, s2)
else
cost = 1
end
@inbounds d[i1 + 2, i2 + 2] = min(d[i1 + 1, i2 + 1] + cost,
d[i1 + 2, i2 + 1] + 1,
d[i1 + 1, i2 + 2] + 1,
d[j1 + 1, j2 + 1] + (i1 - j1 - 1) + 1 + (i2 - j2 - 1))
if j1 == 0 || j2 == 0
@inbounds d[i1 + 1, i2 + 1] = min(d[i1, i2] + cost,
d[i1 + 1, i2] + 1,
d[i1, i2 + 1] + 1)
else
@inbounds d[i1 + 1, i2 + 1] = min(d[i1, i2] + cost,
d[i1 + 1, i2] + 1,
d[i1, i2 + 1] + 1,
d[j1, j2] + (i1 - j1 - 1) + 1 + (i2 - j2 - 1))
end
end
da[ch1] = i1
end