StringDistances.jl/src/edit.jl

287 lines
9.4 KiB
Julia
Raw Normal View History

2019-08-18 18:52:37 +02:00
"""
    Jaro()

Creates the Jaro metric.

The Jaro distance between `s1` and `s2` is

``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``

where ``m`` is the number of matching characters and ``t`` is half the number of
transpositions.
"""
struct Jaro <: SemiMetric
end
2020-02-02 17:47:31 +01:00
2019-08-18 18:52:37 +02:00
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
## accepts any iterator, including AbstractString
"""
    evaluate(dist::Jaro, s1, s2)

Compute the Jaro distance between `s1` and `s2`.

Returns `missing` if either argument is `missing`, `0.0` when the second sequence
(after `reorder`) is empty, `1.0` when no character matches, and otherwise
`1 - (m / len1 + m / len2 + (m - t / 2) / m) / 3`, where `m` is the number of matched
characters and `t` the number of matched positions whose characters differ.

NOTE(review): `reorder` is defined elsewhere; this code presumably relies on it
returning the shorter sequence first -- confirm.
"""
function evaluate(dist::Jaro, s1, s2)
    if ismissing(s1) || ismissing(s2)
        return missing
    end
    s1, s2 = reorder(s1, s2)
    len1 = length(s1)
    len2 = length(s2)
    # If both are empty, m = 0, so the distance should be 1.0 according to Wikipedia.
    # This early return makes sure that is not the case.
    if len2 == 0
        return 0.0
    end
    # A character of s1 at position p may only match s2 positions in p-window:p+window.
    window = max(0, div(len2, 2) - 1)
    # taken[j] is true once position j of s2 has been matched.
    taken = fill(false, len2)
    # Iteration state that yields the current element of s1 when passed to `iterate`.
    curstate = firstindex(s1)
    # matchstates[k] is the state giving the k-th matched element of s1.
    matchstates = fill(curstate, len1)

    # nmatch counts the number of matching characters.
    nmatch = 0
    pos1 = 1
    lo = 1                  # position in s2 where the scan for the current pos1 starts
    next1 = iterate(s1)
    lonext = iterate(s2)    # iteration result at position `lo` of s2
    while next1 !== nothing
        c1, after1 = next1
        # Slide the lower edge of the window forward by one when it lags behind.
        if lo <= pos1 - window - 1
            afterlo = lonext[2]
            lo += 1
            lonext = iterate(s2, afterlo)
        end
        pos2 = lo
        scan = lonext
        while scan !== nothing && pos2 <= pos1 + window
            c2, after2 = scan
            if c1 == c2 && !taken[pos2]
                nmatch += 1
                taken[pos2] = true
                matchstates[nmatch] = curstate
                break
            end
            scan = iterate(s2, after2)
            pos2 += 1
        end
        next1 = iterate(s1, after1)
        pos1 += 1
        curstate = after1
    end
    nmatch == 0 && return 1.0

    # Walk the matched positions of s2 in order and compare each with the matched
    # element of s1 of the same rank; mismatches counts the differing pairs.
    mismatches = 0
    rank = 0
    for (pos2, c2) in enumerate(s2)
        taken[pos2] || continue
        rank += 1
        c1 = first(iterate(s1, matchstates[rank]))
        mismatches += (c2 != c1)
    end
    similarity = (nmatch / len1 + nmatch / len2 + (nmatch - mismatches / 2) / nmatch) / 3.0
    return 1.0 - similarity
end
2019-08-18 01:45:31 +02:00
"""
    Levenshtein()

Creates the Levenshtein metric.

The Levenshtein distance is the minimum number of single-character operations
(insertions, deletions or substitutions) required to change one string into the
other.
"""
struct Levenshtein <: Metric
end
2015-10-23 16:12:51 +02:00
2019-08-18 18:52:37 +02:00
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
## accepts any iterator, including AbstractString
"""
    evaluate(dist::Levenshtein, s1, s2; max_dist = nothing)

Compute the Levenshtein distance between `s1` and `s2`.

Returns `missing` if either argument is `missing`.

# Keywords
- `max_dist`: when not `nothing`, `max_dist + 1` is returned as soon as the distance
  is known to be higher than `max_dist`. This makes it possible to differentiate a
  distance equal to `max_dist` from a strictly higher one, which is important for
  `find_all`.

NOTE(review): `reorder` and `common_prefix` are defined elsewhere. This code
presumably relies on `reorder` returning the shorter sequence first, and on
`common_prefix` returning the length `k` of the shared prefix together with the
`iterate` results of both sequences just past it -- confirm.
"""
function evaluate(dist::Levenshtein, s1, s2; max_dist = nothing)
    (ismissing(s1) || ismissing(s2)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
    # prefix common to both strings can be ignored
    k, x1, x2start = common_prefix(s1, s2)
    # s1 is exhausted: only the remaining elements of s2 are left to account for
    x1 === nothing && return len2 - k
    # distance initialized to first row of matrix
    # => distance between "" and s2[1:i]
    v = collect(1:(len2 - k))
    current = 0
    i1 = 1
    while x1 !== nothing
        ch1, state1 = x1
        left = i1 - 1
        current = i1 - 1
        # lower bound used for the early exit; it is lowered further by the values of
        # the previous row read in the inner loop
        min_dist = i1 - 2
        i2 = 1
        x2 = x2start
        while x2 !== nothing
            ch2, state2 = x2
            # update: `above` is the cell just written in this row, `current` becomes
            # the diagonal cell, and `left` the previous row's value at column i2
            above, current, left = current, left, v[i2]
            if ch1 != ch2
                current = min(current + 1, above + 1, left + 1)
            end
            min_dist = min(min_dist, left)
            v[i2] = current
            x2 = iterate(s2, state2)
            i2 += 1
        end
        max_dist !== nothing && min_dist > max_dist && return max_dist + 1
        x1 = iterate(s1, state1)
        i1 += 1
    end
    max_dist !== nothing && current > max_dist && return max_dist + 1
    return current
end
2019-12-13 00:55:41 +01:00
2020-02-02 17:47:31 +01:00
2019-08-18 01:45:31 +02:00
"""
    DamerauLevenshtein()

Creates the DamerauLevenshtein metric.

The DamerauLevenshtein distance is the minimum number of operations required to
change one string into the other, where an operation is the insertion, deletion or
substitution of a single character, or the transposition of two adjacent characters.
"""
struct DamerauLevenshtein <: SemiMetric
end
2015-10-23 16:12:51 +02:00
2019-08-18 18:52:37 +02:00
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
## accepts any iterator, including AbstractString
"""
    evaluate(dist::DamerauLevenshtein, s1, s2; max_dist = nothing)

Compute the Damerau-Levenshtein distance between `s1` and `s2`.

Returns `missing` if either argument is `missing`.

# Keywords
- `max_dist`: when not `nothing`, only the cells with `i2_start <= i2 <= i2_end` are
  updated in each row, and `max_dist + 1` is returned as soon as the tracked cell of a
  row (or the final result) exceeds `max_dist`.

NOTE(review): `reorder` and `common_prefix` are defined elsewhere. This code
presumably relies on `reorder` returning the shorter sequence first, and on
`common_prefix` returning the length `k` of the shared prefix together with the
`iterate` results of both sequences just past it -- confirm.
"""
function evaluate(dist::DamerauLevenshtein, s1, s2; max_dist = nothing)
    (ismissing(s1) || ismissing(s2)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
    # prefix common to both strings can be ignored
    k, x1, x2start = common_prefix(s1, s2)
    # s1 is exhausted: only the remaining elements of s2 are left to account for
    x1 === nothing && return len2 - k
    # v holds the previous row of costs and is overwritten in place with the current one
    v = collect(1:(len2 - k))
    # w holds the costs needed for transpositions. It starts uninitialized: its values
    # are read while processing the first row, but the transposition branch that uses
    # them requires i1 != 1.
    w = similar(v)
    if max_dist !== nothing
        # NOTE(review): band bounds are computed from len1/len2, which still include
        # the common prefix of length k -- confirm this is intended.
        offset = 1 + max_dist - (len2 - len1)
        i2_start = 1
        i2_end = max_dist
    end
    i1 = 1
    current = i1
    prevch1, = x1
    while x1 !== nothing
        ch1, state1 = x1
        left = i1 - 1
        current = i1
        nextTransCost = 0
        prevch2, = x2start
        if max_dist !== nothing
            i2_start += (i1 > offset) ? 1 : 0
            i2_end = min(i2_end + 1, len2)
        end
        x2 = x2start
        i2 = 1
        while x2 !== nothing
            ch2, state2 = x2
            if max_dist === nothing || (i2_start <= i2 <= i2_end)
                above = current
                thisTransCost = nextTransCost
                nextTransCost = w[i2]
                # cost of diagonal (substitution)
                w[i2] = current = left
                # left now equals current cost (which will be diagonal at next iteration)
                left = v[i2]
                if ch1 != ch2
                    # cheapest of substitution, insertion and deletion, plus one
                    current = min(current, left, above) + 1
                    # transposition of two adjacent characters
                    if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)
                        thisTransCost += 1
                        if thisTransCost < current
                            current = thisTransCost
                        end
                    end
                end
                v[i2] = current
            end
            x2 = iterate(s2, state2)
            i2 += 1
            prevch2 = ch2
        end
        max_dist !== nothing && v[i1 + len2 - len1] > max_dist && return max_dist + 1
        x1 = iterate(s1, state1)
        i1 += 1
        prevch1 = ch1
    end
    max_dist !== nothing && current > max_dist && return max_dist + 1
    return current
end
2019-08-18 01:45:31 +02:00
"""
    RatcliffObershelp()

Creates the RatcliffObershelp metric.

The distance between two strings is defined as one minus twice the number of matching
characters divided by the total number of characters in the two strings. Matching
characters are those in the longest common substring plus, recursively, matching
characters in the unmatched regions on either side of the longest common substring.
"""
struct RatcliffObershelp <: SemiMetric end

# A missing argument on either side (or both) gives a missing distance.
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
evaluate(::RatcliffObershelp, ::Missing, ::Missing) = missing
2019-08-17 18:57:35 +02:00
2019-12-12 21:32:59 +01:00
"""
    evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)

Compute the Ratcliff/Obershelp distance between `s1` and `s2`:
`1 - 2 * n_matched / (length(s1) + length(s2))`, where `n_matched` is the sum of the
lengths of the blocks returned by `matching_blocks(s1, s2)`.

Returns `0.0` when both strings are empty.
"""
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
    total = length(s1) + length(s2)
    # Two empty strings: avoid dividing by zero.
    total == 0 && return 0.0
    # Each block is (start in s1, start in s2, length); add up the lengths without
    # materializing intermediate arrays. `init` covers the case of no block at all.
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    return 1.0 - 2 * n_matched / total
end
"""
    matching_blocks(s1::AbstractString, s2::AbstractString)

Return the set of blocks collected by `matching_blocks!` for the whole of `s1` and
`s2`, as `(start in s1, start in s2, length)` tuples.
"""
function matching_blocks(s1::AbstractString, s2::AbstractString)
    blocks = Set{Tuple{Int, Int, Int}}()
    return matching_blocks!(blocks, s1, s2, length(s1), length(s2), 1, 1)
end
2019-12-13 00:55:41 +01:00
"""
    matching_blocks!(x, s1, s2, len1, len2, start1, start2)

Add to `x` the longest common block of `s1` and `s2`, then recurse on the parts of
both strings that lie before it and on the parts that lie after it. Returns `x`.

`len1` and `len2` are passed to `longest_common_pattern` together with `s1` and `s2`.
`start1` and `start2` are the positions of the first characters of `s1` and `s2` in
the strings of the outermost call, so the stored `(start in s1, start in s2, length)`
tuples refer to those strings.
"""
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString,
                          s2::AbstractString, len1::Integer, len2::Integer,
                          start1::Integer, start2::Integer)
    pos1, pos2, n = longest_common_pattern(s1, s2, len1, len2)
    # exit if there is no common substring
    if n == 0
        return x
    end
    # record the common substring, shifted to the coordinates of the outermost strings
    push!(x, (pos1 + start1 - 1, pos2 + start2 - 1, n))

    # recurse on what comes before the common substring
    # (nextind(s, 0, c) is the index of the c-th character of s)
    head1 = SubString(s1, 1, nextind(s1, 0, pos1 - 1))
    head2 = SubString(s2, 1, nextind(s2, 0, pos2 - 1))
    matching_blocks!(x, head1, head2, pos1 - 1, pos2 - 1, start1, start2)

    # recurse on what comes after the common substring
    next1 = pos1 + n
    next2 = pos2 + n
    tail1 = SubString(s1, nextind(s1, 0, next1), lastindex(s1))
    tail2 = SubString(s2, nextind(s2, 0, next2), lastindex(s2))
    matching_blocks!(x, tail1, tail2, len1 - next1 + 1, len2 - next2 + 1,
                     start1 + next1 - 1, start2 + next2 - 1)
    return x
end
2019-12-13 02:01:47 +01:00
2020-02-08 17:38:06 +01:00
"""
    longest_common_pattern(s1, s2, len1, len2)

Find a longest run of consecutive characters common to `s1` and `s2`.

Returns `(start1, start2, len)`: the character positions at which the run starts in
`s1` and in `s2`, and its length. Returns `(0, 0, 0)` when no character is shared.
`len1` and `len2` decide which string is scanned in the outer loop, and `len2` sizes
the work buffer; callers in this file pass the number of characters of each string.
"""
function longest_common_pattern(s1::AbstractString, s2::AbstractString,
                                len1::Integer, len2::Integer)
    # Always run the scan with the shorter string first; swap the result back.
    if len1 > len2
        pos2, pos1, best = longest_common_pattern(s2, s1, len2, len1)
        return pos1, pos2, best
    end
    pos1, pos2, best = 0, 0, 0
    # runstart[col] is the position in s2 where the common run ending at (previous row,
    # col) starts, or 0 when the characters there differ.
    runstart = zeros(Int, len2)
    for (row, c1) in enumerate(s1)
        # value of runstart for the cell diagonally up-left of the current one
        diag = 0
        for (col, c2) in enumerate(s2)
            here = 0
            if c1 == c2
                # extend the run ending on the diagonal, or start a new one here
                here = diag > 0 ? diag : col
                runlen = col - here + 1
                if runlen > best
                    pos1, pos2, best = row - runlen + 1, here, runlen
                end
            end
            diag = runstart[col]
            runstart[col] = here
        end
    end
    return pos1, pos2, best
end