StringDistances.jl/src/edit.jl

261 lines
8.2 KiB
Julia
Executable File

"""
    Jaro()

Creates the Jaro metric

The Jaro distance is defined as
``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``
where ``m`` is the number of matching characters and
``t`` is half the number of transpositions.

Calling the metric on two iterators returns a floating-point distance, or
`missing` as soon as either argument is `missing`.
"""
struct Jaro <: SemiMetric end

## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
## accepts any iterator, including AbstractString
function (dist::Jaro)(s1, s2)
    (s1 === missing || s2 === missing) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # Without this guard an empty s2 would fall through to the `m == 0` branch
    # below and report 1.0. The original comment describes this as the
    # "both are empty" case (reorder presumably puts the longer input second
    # -- confirm against its definition).
    if len2 == 0
        return 0.0
    end
    # Two equal elements only count as a match when their positions differ
    # by at most `window`.
    window = max(0, div(len2, 2) - 1)
    # taken[j] is true once the j-th element of s2 has been paired.
    taken = fill(false, len2)
    # Matched elements of s1, stored in s1 order; only the first m slots are used.
    matched = Vector{eltype(s1)}(undef, len1)
    # m counts the number of matching characters
    m = 0
    for (i1, ch1) in enumerate(s1)
        for (i2, ch2) in enumerate(s2)
            # past the right edge of the window: nothing further can match
            i2 > i1 + window && break
            # still left of the window
            i2 < i1 - window && continue
            if (ch1 == ch2) && !taken[i2]
                m += 1
                taken[i2] = true
                matched[m] = ch1
                break
            end
        end
    end
    if m == 0
        return 1.0
    end
    # t counts the positions at which the matched elements, read in s2 order,
    # disagree with the matched elements read in s1 order.
    t = 0
    nseen = 0
    for (i2, ch2) in enumerate(s2)
        taken[i2] || continue
        nseen += 1
        t += ch2 != matched[nseen]
    end
    return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
end
"""
    Levenshtein()

Creates the Levenshtein metric

The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions,
substitutions of a single character) required to change one string into the other.

The metric is callable as `dist(s1, s2)` or `dist(s1, s2, max_dist)`. It returns
`missing` if either argument is `missing`. When `max_dist` is given and the
distance exceeds it, `max_dist + 1` is returned instead of the exact distance.
"""
struct Levenshtein <: Metric end

## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
# Return max_dist + 1 if distance higher than max_dist
# This makes it possible to differentiate distance equal to max_dist vs strictly higher
# This is important for find_all
function (dist::Levenshtein)(s1, s2, max_dist = nothing)
    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # the length gap alone already exceeds the budget
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
    # prefix common to both strings can be ignored
    k = common_prefix(s1, s2)
    # s1 is entirely a prefix of s2: only the extra tail of s2 remains
    k == len1 && return len2 - k
    # distance initialized to first row of matrix
    # => distance between "" and s2[1:i]
    v = collect(1:(len2 - k))
    # `current` must live at function scope: it is read after the loop
    current = 0
    for (i1, ch1) in enumerate(s1)
        i1 <= k && continue
        # left: diagonal neighbour for the first column of this row
        left = current = i1 - k - 1
        # Running minimum over the previous row's entries (read through `left`
        # below), seeded at i1 - k - 2. It never exceeds the smallest previous-row
        # entry, so the early exit below is conservative.
        # NOTE(review): the seed looks one lower than it needs to be -- confirm
        # before tightening.
        min_dist = i1 - k - 2
        for (i2, ch2) in enumerate(s2)
            i2 <= k && continue
            # update: shift the three neighbouring cells one column to the right
            above, current, left = current, left, v[i2 - k]
            if ch1 != ch2
                current = min(current + 1, above + 1, left + 1)
            end
            min_dist = min(min_dist, left)
            v[i2 - k] = current
        end
        max_dist !== nothing && min_dist > max_dist && return max_dist + 1
    end
    max_dist !== nothing && current > max_dist && return max_dist + 1
    return current
end
"""
    DamerauLevenshtein()

Creates the DamerauLevenshtein metric

The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions,
deletions or substitutions of a single character, or transposition of two adjacent characters)
required to change one string into the other.

The metric is callable as `dist(s1, s2)` or `dist(s1, s2, max_dist)`. It returns
`missing` if either argument is `missing`. When `max_dist` is given and the
distance exceeds it, `max_dist + 1` is returned instead of the exact distance.
"""
struct DamerauLevenshtein <: SemiMetric end

## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
# Return max_dist + 1 if distance higher than max_dist
function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # the length gap alone already exceeds the budget
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
    # prefix common to both strings can be ignored
    k = common_prefix(s1, s2)
    k == len1 && return len2 - k
    # v: previous row of costs; w: row before that (needed for transpositions).
    # Both are indexed by column position relative to the skipped prefix.
    v = collect(1:(len2 - k))
    # zero-filled (not `similar`) so the transposition bookkeeping never reads
    # uninitialised memory
    w = fill!(similar(v), 0)
    if max_dist !== nothing
        # Band of columns worth visiting in each row. Everything here is expressed
        # relative to the skipped prefix, i.e. in the same coordinates as `v`:
        # keeping the band in absolute positions would shift it left of the
        # diagonal by `k` and could skip every live cell.
        offset = 1 + max_dist - (len2 - len1)
        j_start = 1
        j_end = max_dist
    end
    current = 0
    prevch1 = first(s1)
    prevch2 = first(s2)
    for (i1, ch1) in enumerate(s1)
        i1 <= k && continue
        # row index once the common prefix is dropped
        row = i1 - k
        left = row - 1
        current = row
        nextTransCost = 0
        if max_dist !== nothing
            j_start += (row > offset) ? 1 : 0
            j_end = min(j_end + 1, len2 - k)
        end
        for (i2, ch2) in enumerate(s2)
            # column index once the common prefix is dropped
            col = i2 - k
            if (col <= 0) || ((max_dist !== nothing) && !(j_start <= col <= j_end))
                prevch2 = ch2
                continue
            end
            above = current
            thisTransCost = nextTransCost
            nextTransCost = w[col]
            # cost of diagonal (substitution)
            w[col] = current = left
            # left now equals current cost (which will be diagonal at next iteration)
            left = v[col]
            if ch1 != ch2
                # insertion
                if left < current
                    current = left
                end
                # deletion
                if above < current
                    current = above
                end
                current += 1
                # transposition of the two adjacent characters
                if (row > 1) & (col > 1) & (ch1 == prevch2) & (prevch1 == ch2)
                    thisTransCost += 1
                    if thisTransCost < current
                        current = thisTransCost
                    end
                end
            end
            v[col] = current
            prevch2 = ch2
        end
        # stop early once the tracked cell of this row is over the budget
        max_dist !== nothing && v[row + len2 - len1] > max_dist && return max_dist + 1
        prevch1 = ch1
    end
    max_dist !== nothing && current > max_dist && return max_dist + 1
    return current
end
"""
    RatcliffObershelp()

Creates the RatcliffObershelp metric

The distance between two strings is defined as one minus twice the number of matching
characters divided by the total number of characters in the two strings. Matching characters
are those in the longest common substring (a run of consecutive characters) plus, recursively,
matching characters in the unmatched regions on either side of that substring.

Calling the metric returns `missing` if either argument is `missing`, and `0.0`
when both inputs are empty.
"""
struct RatcliffObershelp <: SemiMetric end

function (dist::RatcliffObershelp)(s1, s2)
    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    # Each block is (start in s1, start in s2, length); add up the lengths
    # directly, with an explicit zero for the no-match case.
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    len1, len2 = length(s1), length(s2)
    return len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2)
end
# Collect the matching blocks of s1 and s2 as a set of
# (start in s1, start in s2, length) tuples, with starts counted from 1.
function matching_blocks(s1, s2)
    blocks = Set{Tuple{Int, Int, Int}}()
    return matching_blocks!(blocks, s1, s2, 1, 1)
end
# Recursive worker for matching_blocks: adds to `x` the longest common run of
# s1 and s2, then the blocks found in the pieces before and after that run.
# start1 / start2 give the position of the first element of s1 / s2 in the
# original inputs, so stored positions refer to the original inputs.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)
    pos1, pos2, n = longest_common_pattern(s1, s2)
    # nothing in common: leave the set as it is
    if n == 0
        return x
    end
    # record the run, shifted back to original positions
    push!(x, (pos1 + start1 - 1, pos2 + start2 - 1, n))
    # part in front of the run
    matching_blocks!(x, _take(s1, pos1 - 1), _take(s2, pos2 - 1), start1, start2)
    # part behind the run; past1 / past2 are the last positions covered by it
    past1 = pos1 + n - 1
    past2 = pos2 + n - 1
    matching_blocks!(x, _drop(s1, past1), _drop(s2, past2), start1 + past1, start2 + past2)
    return x
end
# Find the longest run of consecutive elements shared by s1 and s2.
# Returns (start in s1, start in s2, length), with starts counted from 1, or
# (0, 0, 0) when the inputs share no element. A later run only replaces the
# current best when it is strictly longer.
function longest_common_pattern(s1, s2)
    # Keep the shorter input on the outer loop; swap the starts back on return.
    if length(s1) > length(s2)
        pos2, pos1, best = longest_common_pattern(s2, s1)
        return pos1, pos2, best
    end
    pos1, pos2, best = 0, 0, 0
    # runstart[j]: where in s2 the common run ending at column j of the previous
    # row began, or 0 if that cell was not a match
    runstart = zeros(Int, length(s2))
    for (i, a) in enumerate(s1)
        # value of the upper-left neighbour; none exists for the first column
        diag = 0
        for (j, b) in enumerate(s2)
            here = 0
            if a == b
                # extend the run coming from the diagonal, or start a new one at j
                here = diag > 0 ? diag : j
                runlen = j - here + 1
                if runlen > best
                    pos1, pos2, best = i - runlen + 1, here, runlen
                end
            end
            # remember the old entry for the next column before overwriting it
            diag = runstart[j]
            runstart[j] = here
        end
    end
    return pos1, pos2, best
end