StringDistances.jl/src/edit.jl

230 lines
8.5 KiB
Julia
Raw Normal View History

"""
    Jaro()

Creates the Jaro distance.

The Jaro distance between two sequences is defined as
``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``
where ``m`` is the number of matching characters and
``t`` is half the number of transpositions.
"""
struct Jaro <: SemiMetric
end
2020-02-02 17:47:31 +01:00
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
## accepts any iterator, including AbstractString
# Evaluate the Jaro distance between `s1` and `s2`.
# Returns `missing` when either argument is `missing`, otherwise a floating-point value.
function (dist::Jaro)(s1, s2)
    ((s1 === missing) | (s2 === missing)) && return missing
    # `reorder` is defined elsewhere; presumably it puts the shorter input first,
    # so that len1 <= len2 below -- TODO confirm
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # With no element at all, `m` below would be 0 and the function would report the
    # maximal distance 1.0; two empty inputs are treated as identical instead.
    len2 == 0 && return 0.0
    # half-width of the window in which two equal elements count as a match
    maxdist = max(0, div(len2, 2) - 1)
    # flag[i2] is true once the i2-th element of s2 has been matched
    flag = fill(false, len2)
    # matched elements of s1, in s1 order
    ch1_match = Vector{eltype(s1)}()
    for (i1, ch1) in enumerate(s1)
        for (i2, ch2) in enumerate(s2)
            # i2 only increases, so nothing beyond the window can match anymore
            i2 > i1 + maxdist && break
            # greedy alignment: take the first unused equal element inside the window
            if (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2]
                flag[i2] = true
                push!(ch1_match, ch1)
                break
            end
        end
    end
    # m counts the number of matching characters
    m = length(ch1_match)
    m == 0 && return 1.0
    # t counts the matched positions whose elements differ once both match lists
    # are read in order; it is halved in the final formula
    t = 0
    i1 = 0
    for (i2, ch2) in enumerate(s2)
        if flag[i2]
            i1 += 1
            t += ch2 != ch1_match[i1]
        end
    end
    return 1.0 - (m / len1 + m / len2 + (m - t / 2) / m) / 3.0
end
"""
    Levenshtein()

Creates the Levenshtein distance.

The Levenshtein distance is the minimum number of single-character operations
(insertions, deletions or substitutions) required to change one string into the other.
"""
struct Levenshtein <: Metric
end
2015-10-23 16:12:51 +02:00
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
# Evaluate the Levenshtein distance between `s1` and `s2`.
# When `max_value` is given, return `max_value + 1` if the distance is higher than
# `max_value`. This makes it possible to differentiate a distance equal to `max_value`
# from a strictly higher one, which is important for find_all.
function (dist::Levenshtein)(s1, s2, max_value = nothing)
    if (s1 === missing) | (s2 === missing)
        return missing
    end
    # `reorder` is defined elsewhere; presumably it puts the shorter input first -- TODO confirm
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # the length difference alone is compared against the bound before any other work
    if max_value !== nothing && len2 - len1 > max_value
        return max_value + 1
    end
    # the prefix common to both inputs can be ignored
    k = common_prefix(s1, s2)
    if k == len1
        return len2 - k
    end
    # v holds one row of the edit matrix; it starts as the first row,
    # i.e. the distance between "" and the first i elements of s2 after the prefix
    v = collect(1:(len2 - k))
    current = 0
    for (i1, ch1) in enumerate(s1)
        i1 > k || continue
        left = current = i1 - k - 1
        # value_lb tracks a lower bound used for the early exit below
        if max_value !== nothing
            value_lb = left - 1
        end
        for (i2, ch2) in enumerate(s2)
            i2 > k || continue
            col = i2 - k
            above, current, left = current, left, v[col]
            if ch1 != ch2
                current = min(current, above, left) + 1
            end
            if max_value !== nothing
                value_lb = min(value_lb, left)
            end
            v[col] = current
        end
        if max_value !== nothing && value_lb > max_value
            return max_value + 1
        end
    end
    if max_value !== nothing && current > max_value
        return max_value + 1
    end
    return current
end
"""
    DamerauLevenshtein()

Creates the restricted DamerauLevenshtein distance.

The DamerauLevenshtein distance is the minimum number of operations (insertions,
deletions or substitutions of a single character, or transpositions of two adjacent
characters) required to change one string into the other.

The restricted distance differs slightly from the classic Damerau-Levenshtein algorithm
by imposing the restriction that no substring is edited more than once. For example,
"CA" to "ABC" has an edit distance of 2 by a complete application of Damerau-Levenshtein,
but a distance of 3 by this method, which uses the optimal string alignment algorithm.
In particular, the restricted distance does not satisfy the triangle inequality.
"""
struct DamerauLevenshtein <: SemiMetric
end
2015-10-23 16:12:51 +02:00
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
# Evaluate the restricted Damerau-Levenshtein distance between `s1` and `s2`.
# When `max_value` is given, return `max_value + 1` if the distance is higher than
# `max_value`.
function (dist::DamerauLevenshtein)(s1, s2, max_value = nothing)
    ((s1 === missing) | (s2 === missing)) && return missing
    # `reorder` is defined elsewhere; presumably it puts the shorter input first -- TODO confirm
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    max_value !== nothing && len2 - len1 > max_value && return max_value + 1
    # the prefix common to both inputs can be ignored
    k = common_prefix(s1, s2)
    k == len1 && return len2 - k
    # v holds one row of the edit matrix; w holds the values needed to price a
    # transposition (it is only read after having been written for that column)
    v = collect(1:(len2 - k))
    w = similar(v)
    if max_value !== nothing
        # Only columns i2_start:i2_end of each row are evaluated. Both bounds are
        # positions in s2, so they have to be shifted by the ignored prefix k.
        i2_start = k + 1
        i2_end = k + max_value
    end
    prevch1, prevch2 = first(s1), first(s2)
    current = 0
    for (i1, ch1) in enumerate(s1)
        i1 <= k && continue
        left = current = i1 - k - 1
        nextTransCost = 0
        if max_value !== nothing
            # the row number that drives the window is counted after the prefix
            i2_start += (i1 - k > 1 + max_value - (len2 - len1)) ? 1 : 0
            i2_end += (i2_end < len2) ? 1 : 0
        end
        for (i2, ch2) in enumerate(s2)
            i2 <= k && continue
            # no need to look beyond the window: the lower right diagonal minus
            # max_value cells (lower right diagonal is i1 - (len2 - len1)) and the
            # upper left diagonal plus max_value cells (upper left diagonal is i1)
            if (max_value !== nothing) && ((i2 < i2_start) | (i2 > i2_end))
                prevch2 = ch2
            else
                above, current, left = current, left, v[i2 - k]
                w[i2 - k], nextTransCost, thisTransCost = current, w[i2 - k], nextTransCost
                # left now equals current cost (which will be diagonal at next iteration)
                if ch1 != ch2
                    current = min(left, current, above) + 1
                    # a transposition needs a previous element on both sides, so it is
                    # never considered on the first row or column after the prefix
                    if (i1 > 1 + k) & (i2 > 1 + k) && (ch1 == prevch2) && (prevch1 == ch2)
                        thisTransCost += 1
                        current = min(current, thisTransCost)
                    end
                end
                v[i2 - k] = current
                prevch2 = ch2
            end
        end
        # early exit based on the cell of this row shifted by the length difference
        max_value !== nothing && v[i1 - k + len2 - len1] > max_value && return max_value + 1
        prevch1 = ch1
    end
    max_value !== nothing && current > max_value && return max_value + 1
    return current
end
"""
    RatcliffObershelp()

Creates the RatcliffObershelp distance.

The distance between two strings is defined as one minus twice the number of matching
characters divided by the total number of characters in the two strings. Matching
characters are those in the longest common substring plus, recursively, the matching
characters in the unmatched regions on either side of that substring.
"""
struct RatcliffObershelp <: SemiMetric
end
# Evaluate the RatcliffObershelp distance between `s1` and `s2`.
# Returns `missing` when either argument is `missing`.
function (dist::RatcliffObershelp)(s1, s2)
    if (s1 === missing) | (s2 === missing)
        return missing
    end
    s1, s2 = reorder(s1, s2)
    # every block is a tuple whose last entry is the block length
    blocks = matching_blocks(s1, s2)
    n_matched = mapreduce(last, +, blocks; init = 0)
    total = length(s1) + length(s2)
    # two empty inputs are at distance zero
    return total == 0 ? 0.0 : 1.0 - 2 * n_matched / total
end
# Collect the matching blocks of `s1` and `s2` into a new set of
# (start in s1, start in s2, length) tuples, with positions counted from 1.
matching_blocks(s1, s2) = matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, 1, 1)
# Add to `x` the longest common block of `s1` and `s2`, then recurse on what lies
# before and after it. `start1` and `start2` are the positions of the first elements
# of `s1` and `s2` in the original inputs, so stored blocks use original positions.
# `_take` and `_drop` are defined elsewhere; presumably they keep / skip the first
# n elements -- TODO confirm.
function matching_blocks!(
    x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer
)
    n1, n2, len = longest_common_pattern(s1, s2)
    # nothing to record when there is no common substring
    if len > 0
        # record the common block, shifted to original positions
        push!(x, (n1 + start1 - 1, n2 + start2 - 1, len))
        # blocks located before the common block
        matching_blocks!(x, _take(s1, n1 - 1), _take(s2, n2 - 1), start1, start2)
        # blocks located after the common block
        skip1 = n1 + len - 1
        skip2 = n2 + len - 1
        matching_blocks!(x, _drop(s1, skip1), _drop(s2, skip2),
                         start1 + skip1, start2 + skip2)
    end
    return x
end
2019-12-13 02:01:47 +01:00
# Find the longest run of consecutive equal elements shared by `s1` and `s2`.
# Returns `(start1, start2, len)`: the 1-based position of the run in `s1`, its
# position in `s2`, and its length. Returns `(0, 0, 0)` when nothing is shared.
function longest_common_pattern(s1, s2)
    # keep the shorter input in the outer loop: swap, solve, swap the answer back
    if length(s1) > length(s2)
        pos2, pos1, best = longest_common_pattern(s2, s1)
        return pos1, pos2, best
    end
    pos1, pos2, best = 0, 0, 0
    # runstart[j] is the position in s2 where the run ending at the j-th element
    # of s2 (for the current element of s1) begins, or 0 if there is none
    runstart = zeros(Int, length(s2))
    for (i, a) in enumerate(s1)
        # start of the run ending just before (i, j) on the diagonal
        diag = 0
        for (j, b) in enumerate(s2)
            previous = runstart[j]
            if a == b
                # extend the diagonal run, or open a new one at j
                origin = diag > 0 ? diag : j
                runlen = j - origin + 1
                if runlen > best
                    pos1, pos2, best = i - runlen + 1, origin, runlen
                end
                runstart[j] = origin
            else
                runstart[j] = 0
            end
            diag = previous
        end
    end
    return pos1, pos2, best
end