2019-08-18 18:52:37 +02:00
|
|
"""
    Jaro()

Construct the Jaro distance.

For two strings `s1` and `s2`, the Jaro distance is

``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``

where ``m`` is the number of matching characters and ``t`` is half the
number of transpositions.
"""
struct Jaro <: SemiMetric
end
|
2020-02-02 17:47:31 +01:00
|
|
|
|
2019-08-18 18:52:37 +02:00
|
|
|
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
# Evaluate the Jaro distance between `s1` and `s2`.
#
# Returns `missing` if either argument is `missing`, otherwise a `Float64`.
# The third positional argument only accepts `nothing` and is not used.
function (dist::Jaro)(s1, s2, nothing::Nothing = nothing)
    ((s1 === missing) | (s2 === missing)) && return missing
    # NOTE(review): `reorder` is defined elsewhere; the code below presumably
    # relies on it returning the shorter input first (len1 <= len2) -- confirm.
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # Without this guard an empty `s2` would fall through to the `m == 0`
    # branch below and yield a distance of 1.0; return 0.0 for that case.
    len2 == 0 && return 0.0
    # Half-width of the window in which two equal characters count as a match.
    d = max(0, div(len2, 2) - 1)
    # flag[i2] is true once position i2 of s2 has been matched.
    flag = fill(false, len2)
    # Matched characters of s1, in s1 order.
    ch1_match = Vector{eltype(s1)}()
    for (i1, ch1) in enumerate(s1)
        for (i2, ch2) in enumerate(s2)
            # greedy alignment: take the first free equal character in the window
            i2 < i1 - d && continue
            # i2 only increases, so nothing past the window can match anymore
            i2 > i1 + d && break
            if (ch1 == ch2) && !flag[i2]
                flag[i2] = true
                push!(ch1_match, ch1)
                break
            end
        end
    end
    # m counts the number of matching characters
    m = length(ch1_match)
    m == 0 && return 1.0
    # t counts the matched positions whose characters differ when the matches
    # are read in s2 order versus s1 order; the formula below uses t / 2.
    t = 0
    i1 = 0
    for (i2, ch2) in enumerate(s2)
        if flag[i2]
            i1 += 1
            t += ch2 != ch1_match[i1]
        end
    end
    return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
end
|
|
|
|
|
2019-08-18 01:45:31 +02:00
|
|
"""
    Levenshtein()

Construct the Levenshtein distance.

The Levenshtein distance between two strings is the smallest number of
single-character edits (insertions, deletions or substitutions) needed to
turn one string into the other.
"""
struct Levenshtein <: Metric
end
|
2015-10-23 16:12:51 +02:00
|
|
|
|
2019-08-18 18:52:37 +02:00
|
|
|
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
# When `max_dist` is given, every distance larger than `max_dist` is reported
# as `max_dist + 1`. This lets callers tell "exactly max_dist" apart from
# "more than max_dist", which matters for the find functions.
# Returns `missing` if either argument is `missing`.
function (dist::Levenshtein)(s1, s2, max_dist::Union{Integer, Nothing} = nothing)
    if (s1 === missing) | (s2 === missing)
        return missing
    end
    # NOTE(review): `reorder` is defined elsewhere; presumably it returns the
    # shorter input first so that len1 <= len2 -- confirm.
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # The length difference alone is a lower bound on the distance.
    if max_dist !== nothing && len2 - len1 > max_dist
        return max_dist + 1
    end
    # A prefix shared by both inputs does not contribute to the distance.
    # NOTE(review): `common_prefix` is defined elsewhere; it is used here as
    # the number of leading elements the two inputs have in common.
    k = common_prefix(s1, s2)
    if k == len1
        return len2 - k
    end
    # `v` starts as the first matrix row: v[j] is the distance between the
    # empty sequence and the first j elements of s2 (after the prefix).
    v = collect(1:(len2 - k))
    current = 0
    for (i1, ch1) in enumerate(s1)
        i1 > k || continue
        # Both the diagonal and the running "left neighbour" start at row - 1.
        diag = current = i1 - k - 1
        if max_dist !== nothing
            value_lb = diag - 1
        end
        for (i2, ch2) in enumerate(s2)
            i2 > k || continue
            j = i2 - k
            # west: cell just computed in this row; north: same column, previous row
            west, north = current, v[j]
            current = (ch1 == ch2) ? diag : min(diag, west, north) + 1
            if max_dist !== nothing
                # running minimum over the previous row's entries
                value_lb = min(value_lb, north)
            end
            v[j] = current
            # the cell above becomes the diagonal of the next column
            diag = north
        end
        if max_dist !== nothing && value_lb > max_dist
            return max_dist + 1
        end
    end
    if max_dist !== nothing && current > max_dist
        return max_dist + 1
    end
    return current
end
|
|
|
|
|
2019-08-18 01:45:31 +02:00
|
|
"""
    DamerauLevenshtein()

Construct the restricted DamerauLevenshtein distance.

The DamerauLevenshtein distance between two strings is the smallest number of
edits needed to turn one string into the other, where an edit is the
insertion, deletion or substitution of a single character, or the
transposition of two adjacent characters.

The restricted distance differs slightly from the classic Damerau-Levenshtein
algorithm: it adds the restriction that no substring is edited more than once.
For example, "CA" to "ABC" has an edit distance of 2 under the complete
Damerau-Levenshtein definition, but a distance of 3 with this method, which
uses the optimal string alignment algorithm. In particular, the restricted
distance does not satisfy the triangle inequality.
"""
struct DamerauLevenshtein <: SemiMetric
end
|
2015-10-23 16:12:51 +02:00
|
|
|
|
2019-08-18 18:52:37 +02:00
|
|
|
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
# When `max_dist` is given, every distance larger than `max_dist` is reported
# as `max_dist + 1`. Returns `missing` if either argument is `missing`.
function (dist::DamerauLevenshtein)(s1, s2, max_dist::Union{Integer, Nothing} = nothing)
    if (s1 === missing) | (s2 === missing)
        return missing
    end
    # NOTE(review): `reorder` is defined elsewhere; presumably it returns the
    # shorter input first so that len1 <= len2 -- confirm.
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # The length difference alone is a lower bound on the distance.
    if max_dist !== nothing && len2 - len1 > max_dist
        return max_dist + 1
    end
    # A prefix shared by both inputs does not contribute to the distance.
    # NOTE(review): `common_prefix` is defined elsewhere; it is used here as
    # the number of leading elements the two inputs have in common.
    k = common_prefix(s1, s2)
    if k == len1
        return len2 - k
    end
    # v holds the previous row of costs; w holds the diagonal values written
    # on the previous row, which the transposition case reads back.
    v = collect(1:(len2 - k))
    w = similar(v)
    if max_dist !== nothing
        # Zero-based column window [i2_start, i2_end) evaluated on each row.
        i2_start = 0
        i2_end = max_dist
    end
    prevch1, prevch2 = first(s1), first(s2)
    current = 0
    for (i1, ch1) in enumerate(s1)
        i1 > k || continue
        row = i1 - k - 1
        diag = row
        current = row + 1
        next_trans = 0
        if max_dist !== nothing
            # Slide the window one column to the right on each row: there is
            # no need to look at cells further than max_dist from the
            # diagonals running through the upper-left and lower-right corners.
            i2_start += (row > max_dist - (len2 - len1)) ? 1 : 0
            i2_end += (i2_end < len2) ? 1 : 0
        end
        for (i2, ch2) in enumerate(s2)
            col = i2 - k - 1
            # Skip the common prefix and, when bounded, cells outside the window.
            in_window = i2 > k && (max_dist === nothing || (i2_start <= col < i2_end))
            if in_window
                # west: cell just computed in this row; north: same column, previous row
                west, north = current, v[col + 1]
                this_trans, next_trans = next_trans, w[col + 1]
                w[col + 1] = diag
                current = diag
                if ch1 != ch2
                    current = min(north, diag, west) + 1
                    # Transposition of two adjacent characters; requires a
                    # previous row and a previous column to exist.
                    if (row > 0) & (col > 0) && (ch1 == prevch2) && (prevch1 == ch2)
                        current = min(current, this_trans + 1)
                    end
                end
                v[col + 1] = current
                # the cell above becomes the diagonal of the next column
                diag = north
            end
            prevch2 = ch2
        end
        if max_dist !== nothing && v[i1 - k + len2 - len1] > max_dist
            return max_dist + 1
        end
        prevch1 = ch1
    end
    if max_dist !== nothing && current > max_dist
        return max_dist + 1
    end
    return current
end
|
|
|
|
|
2019-08-18 01:45:31 +02:00
|
|
"""
    RatcliffObershelp()

Construct the RatcliffObershelp distance.

The distance between two strings is one minus twice the number of matching
characters divided by the total number of characters in the two strings.
Matching characters are those in the longest common substring plus,
recursively, the matching characters in the unmatched regions on either side
of that longest common substring.
"""
struct RatcliffObershelp <: SemiMetric
end
|
|
|
|
|
2020-07-20 16:08:27 +02:00
|
|
|
# Evaluate the RatcliffObershelp distance between `s1` and `s2`.
#
# Returns `missing` if either argument is `missing`, otherwise a `Float64`:
# 0.0 when both inputs are empty, else `1 - 2 * n_matched / (len1 + len2)`.
# The third positional argument only accepts `nothing` and is not used.
function (dist::RatcliffObershelp)(s1, s2, nothing::Nothing = nothing)
    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # Two empty inputs: avoid 0 / 0 and skip the block search entirely.
    len1 + len2 == 0 && return 0.0
    # Each block is (start in s1, start in s2, length); add up the lengths.
    # `init = 0` makes the empty case explicit and no temporary array is built.
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    return 1.0 - 2 * n_matched / (len1 + len2)
end
|
|
|
|
|
2020-02-09 19:37:37 +01:00
|
|
|
# Collect the matching blocks of `s1` and `s2` as a set of
# `(start in s1, start in s2, length)` tuples with 1-based start positions.
function matching_blocks(s1, s2)
    blocks = Set{Tuple{Int, Int, Int}}()
    return matching_blocks!(blocks, s1, s2, 1, 1)
end
|
|
|
|
|
2020-02-16 17:12:31 +01:00
|
|
|
# Recursively add the matching blocks of `s1` and `s2` to `x` and return `x`.
# `start1` / `start2` are the positions of the first elements of `s1` / `s2`
# in the original inputs, so stored blocks use original coordinates.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)
    pos1, pos2, run = longest_common_pattern(s1, s2)
    if run > 0
        # record the longest common substring in original coordinates
        push!(x, (pos1 + start1 - 1, pos2 + start2 - 1, run))
        # recurse on what comes before the common substring
        # NOTE(review): `_take(s, n)` is defined elsewhere; presumably it keeps the first n elements.
        matching_blocks!(x, _take(s1, pos1 - 1), _take(s2, pos2 - 1), start1, start2)
        # recurse on what comes after the common substring
        # NOTE(review): `_drop(s, n)` is defined elsewhere; presumably it skips the first n elements.
        used1 = pos1 + run - 1
        used2 = pos2 + run - 1
        matching_blocks!(x, _drop(s1, used1), _drop(s2, used2), start1 + used1, start2 + used2)
    end
    # when there is no common substring, x is returned unchanged
    return x
end
|
2019-12-13 02:01:47 +01:00
|
|
|
|
2020-02-16 17:12:31 +01:00
|
|
|
# Find the longest run of consecutive equal elements shared by `s1` and `s2`.
# Returns `(start1, start2, len)` with 1-based start positions in `s1` and
# `s2`, or `(0, 0, 0)` when the two inputs have no element in common.
function longest_common_pattern(s1, s2)
    # Always scan with the shorter input in the outer loop: when the arguments
    # arrive the other way round, solve the mirrored problem and swap the two
    # start positions back.
    if length(s1) > length(s2)
        pos_in_s2, pos_in_s1, run = longest_common_pattern(s2, s1)
        return pos_in_s1, pos_in_s2, run
    end
    best1, best2, best_len = 0, 0, 0
    # run_start[i2]: start (in s2) of the common run ending at the previous
    # element of s1 and element i2 of s2, or 0 if those two differ.
    run_start = zeros(Int, length(s2))
    for (i1, ch1) in enumerate(s1)
        diag = 0
        for (i2, ch2) in enumerate(s2)
            above = run_start[i2]
            if ch1 == ch2
                # extend the run coming from the diagonal, or open a new one
                start_here = diag > 0 ? diag : i2
                run_len = i2 - start_here + 1
                if run_len > best_len
                    best1, best2, best_len = i1 - run_len + 1, start_here, run_len
                end
                run_start[i2] = start_here
            else
                run_start[i2] = 0
            end
            # the previous row's entry is the diagonal of the next column
            diag = above
        end
    end
    return best1, best2, best_len
end
|