2019-08-18 18:52:37 +02:00
|
|
|
"""
    Jaro()

Create the Jaro metric.

The Jaro distance between two strings `s1` and `s2` is defined as

``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``

where ``m`` is the number of matching characters and ``t`` is half the number of
transpositions, i.e. half the number of matched characters that do not line up
in the same order in both strings.
"""
struct Jaro <: StringDistance
end
|
2019-08-18 18:52:37 +02:00
|
|
|
|
|
|
|
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
2019-12-12 21:32:59 +01:00
|
|
|
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
    # NOTE(review): `reorder` is defined elsewhere; presumably it returns the shorter
    # string first -- confirm against its definition.
    s1, s2 = reorder(s1, s2)
    len1 = length(s1)
    len2 = length(s2)
    # With an empty `s2` there is no matching character (m = 0), which Wikipedia's
    # definition maps to 1.0; this guard returns 0.0 for that case instead.
    if len2 == 0
        return 0.0
    end
    # two characters can only match when their positions differ by at most `window`
    window = max(0, div(len2, 2) - 1)
    # taken[j] becomes true once the j-th character of s2 has been matched
    taken = fill(false, len2)
    start1 = firstindex(s1)
    # matched_at[k] is the iteration state of s1 that yields its k-th matched character
    matched_at = fill(start1, len1)
    # nmatch counts the number of matching characters
    nmatch = 0
    pos1 = 1
    pos2 = 1
    next1 = iterate(s1)
    next2 = iterate(s2)
    while next1 !== nothing
        c1, after1 = next1
        # move the left edge of the search window in s2 forward by one character
        if pos2 <= pos1 - window - 1
            _, after_left = next2
            pos2 += 1
            next2 = iterate(s2, after_left)
        end
        # look inside the window for the first unmatched character of s2 equal to c1
        scanpos = pos2
        scan = next2
        while scan !== nothing && scanpos <= pos1 + window
            c2, after2 = scan
            if c1 == c2 && !taken[scanpos]
                nmatch += 1
                taken[scanpos] = true
                matched_at[nmatch] = start1
                break
            end
            scan = iterate(s2, after2)
            scanpos += 1
        end
        next1 = iterate(s1, after1)
        pos1 += 1
        start1 = after1
    end
    if nmatch == 0
        return 1.0
    end
    # ntrans counts the matched characters of s2 that differ from the matched
    # character of s1 with the same rank (number of transpositions)
    ntrans = 0
    rank = 0
    for (j, c2) in enumerate(s2)
        taken[j] || continue
        rank += 1
        partner = first(iterate(s1, matched_at[rank]))
        if c2 != partner
            ntrans += 1
        end
    end
    return 1.0 - (nmatch / len1 + nmatch / len2 + (nmatch - ntrans / 2) / nmatch) / 3.0
end
|
|
|
|
|
2019-08-18 01:45:31 +02:00
|
|
|
"""
    Levenshtein()

Create the Levenshtein metric.

The Levenshtein distance is the minimum number of single-character edits
(insertions, deletions or substitutions) required to change one string into
the other.
"""
struct Levenshtein <: StringDistance
end
|
2015-10-23 16:12:51 +02:00
|
|
|
|
2019-08-18 18:52:37 +02:00
|
|
|
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
2019-12-13 00:55:41 +01:00
|
|
|
# Return max_dist + 1 if the distance is higher than max_dist
|
|
|
|
# This makes it possible to differentiate a distance equal to max_dist from a strictly higher one
|
|
|
|
# This is important for find_all
|
2019-12-12 21:32:59 +01:00
|
|
|
"""
    evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max_dist = nothing)

Return the Levenshtein distance between `s1` and `s2`.

When `max_dist` is not `nothing`, `max_dist + 1` is returned as soon as the distance
is known to be strictly higher than `max_dist`.
"""
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max_dist = nothing)
    # NOTE(review): `reorder` is defined elsewhere; presumably it returns the shorter
    # string first -- confirm against its definition.
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # give up immediately when the length difference alone exceeds max_dist
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
    # prefix common to both strings can be ignored
    # NOTE(review): assumes `k` is the number of common leading characters and
    # `x1`/`x2start` are the `iterate` results just past it -- confirm in `remove_prefix`.
    k, x1, x2start = remove_prefix(s1, s2)
    # nothing of s1 is left after the common prefix
    x1 === nothing && return len2 - k
    # distance initialized to first row of matrix
    # => distance between "" and s2[1:i]
    v0 = collect(1:(len2 - k))
    current = 0
    i1 = 1
    while x1 !== nothing
        ch1, state1 = x1
        left = i1 - 1
        current = i1 - 1
        # lower bound on the smallest value seen in the previous row
        min_dist = i1 - 2
        i2 = 1
        x2 = x2start
        while x2 !== nothing
            ch2, state2 = x2
            # update: `current` takes the diagonal value, `left` the value stored in
            # v0 by the previous row, `above` the value just computed in this row
            above, current, left = current, left, v0[i2]
            if ch1 != ch2
                current = min(current + 1, above + 1, left + 1)
            end
            min_dist = min(min_dist, left)
            v0[i2] = current
            x2 = iterate(s2, state2)
            i2 += 1
        end
        # every value of the previous row already exceeds max_dist
        max_dist !== nothing && min_dist > max_dist && return max_dist + 1
        x1 = iterate(s1, state1)
        i1 += 1
    end
    max_dist !== nothing && current > max_dist && return max_dist + 1
    return current
end
|
|
|
|
|
2019-12-13 00:55:41 +01:00
|
|
|
|
2019-08-18 01:45:31 +02:00
|
|
|
"""
    DamerauLevenshtein()

Create the DamerauLevenshtein metric.

The DamerauLevenshtein distance is the minimum number of operations required to
change one string into the other, where an operation is the insertion, deletion
or substitution of a single character, or the transposition of two adjacent
characters.
"""
struct DamerauLevenshtein <: StringDistance
end
|
2015-10-23 16:12:51 +02:00
|
|
|
|
2019-08-18 18:52:37 +02:00
|
|
|
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
2019-12-12 21:32:59 +01:00
|
|
|
"""
    evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString; max_dist = nothing)

Return the Damerau-Levenshtein distance between `s1` and `s2`.

When `max_dist` is not `nothing`, only the cells with `i2_start <= i2 <= i2_end` of each
row are updated, and `max_dist + 1` is returned as soon as the distance is known to be
strictly higher than `max_dist`.
"""
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString; max_dist = nothing)
    # NOTE(review): `reorder` is defined elsewhere; presumably it returns the shorter
    # string first -- confirm against its definition.
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # give up immediately when the length difference alone exceeds max_dist
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
    # prefix common to both strings can be ignored
    # NOTE(review): assumes `k` is the number of common leading characters and
    # `x1`/`x2start` are the `iterate` results just past it -- confirm in `remove_prefix`.
    k, x1, x2start = remove_prefix(s1, s2)
    # nothing of s1 is left after the common prefix
    x1 === nothing && return len2 - k
    # v0 holds the previous row of costs; v2 holds the values needed for transpositions
    v0 = collect(1:(len2 - k))
    v2 = similar(v0)
    if max_dist !== nothing
        # bounds of the window of columns that is updated on each row
        offset = 1 + max_dist - (len2 - len1)
        i2_start = 1
        i2_end = max_dist
    end
    i1 = 1
    current = i1
    prevch1, = x1
    while x1 !== nothing
        ch1, state1 = x1
        left = (i1 - 1)
        current = i1
        nextTransCost = 0
        prevch2, = x2start
        if max_dist !== nothing
            i2_start += (i1 > offset) ? 1 : 0
            i2_end = min(i2_end + 1, len2)
        end
        x2 = x2start
        i2 = 1
        while x2 !== nothing
            ch2, state2 = x2
            if max_dist === nothing || (i2_start <= i2 <= i2_end)
                above = current
                thisTransCost = nextTransCost
                nextTransCost = v2[i2]
                # cost of diagonal (substitution)
                v2[i2] = current = left
                # left now equals current cost (which will be diagonal at next iteration)
                left = v0[i2]
                if ch1 != ch2
                    # insertion
                    if left < current
                        current = left
                    end
                    # deletion
                    if above < current
                        current = above
                    end
                    current += 1
                    # transposition of two adjacent characters
                    if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)
                        thisTransCost += 1
                        if thisTransCost < current
                            current = thisTransCost
                        end
                    end
                end
                v0[i2] = current
            end
            x2 = iterate(s2, state2)
            i2 += 1
            prevch2 = ch2
        end
        max_dist !== nothing && v0[i1 + len2 - len1] > max_dist && return max_dist + 1
        x1 = iterate(s1, state1)
        i1 += 1
        prevch1 = ch1
    end
    max_dist !== nothing && current > max_dist && return max_dist + 1
    return current
end
|
|
|
|
|
2019-08-18 01:45:31 +02:00
|
|
|
"""
    RatcliffObershelp()

Create the RatcliffObershelp metric.

The distance between two strings is defined as one minus twice the number of matching
characters divided by the total number of characters in the two strings. Matching
characters are those in the longest common substring plus, recursively, matching
characters in the unmatched regions on either side of the longest common substring.
"""
struct RatcliffObershelp <: StringDistance
end
|
2019-08-17 18:57:35 +02:00
|
|
|
|
2019-12-12 21:32:59 +01:00
|
|
|
"""
    evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)

Return the Ratcliff/Obershelp distance between `s1` and `s2`: one minus twice the number
of matched characters divided by `length(s1) + length(s2)`, or `0.0` when both strings
are empty.
"""
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
    # the last entry of each block is its length; summing with an explicit `init`
    # avoids materializing the set into temporary vectors and is well defined
    # when there is no block at all
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    len1, len2 = length(s1), length(s2)
    total = len1 + len2
    return total == 0 ? 0.0 : 1.0 - 2 * n_matched / total
end
|
|
|
|
|
|
|
|
# Collect the matching blocks of `s1` and `s2` into a fresh set of
# `(position in s1, position in s2, length)` tuples.
function matching_blocks(s1::AbstractString, s2::AbstractString)
    blocks = Set{Tuple{Int, Int, Int}}()
    return matching_blocks!(blocks, s1, s2, length(s1), length(s2), 1, 1)
end
|
|
|
|
|
2019-12-13 00:55:41 +01:00
|
|
|
# Add to `x` the longest common substring of `s1` and `s2`, then recurse on the parts
# of both strings before and after it. `len1`/`len2` are the lengths passed on to
# `longest_common_substring`; `start1`/`start2` are the offsets added to the positions
# it reports before they are stored in `x`.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString,
                          len1::Integer, len2::Integer, start1::Integer, start2::Integer)
    # NOTE(review): `longest_common_substring` is defined elsewhere; its result is used
    # here as (position in s1, position in s2, length), positions counted in characters.
    lcs = longest_common_substring(s1, s2, len1, len2)
    pos1, pos2, n = lcs[1], lcs[2], lcs[3]
    # nothing in common: leave the set as it is
    if n == 0
        return x
    end
    # record the block, shifted by the offsets of the current substrings
    push!(x, (pos1 + start1 - 1, pos2 + start2 - 1, n))
    # recurse on what lies before the block
    head1 = SubString(s1, 1, nextind(s1, 0, pos1 - 1))
    head2 = SubString(s2, 1, nextind(s2, 0, pos2 - 1))
    matching_blocks!(x, head1, head2, pos1 - 1, pos2 - 1, start1, start2)
    # recurse on what lies after the block
    stop1 = pos1 + n
    stop2 = pos2 + n
    tail1 = SubString(s1, nextind(s1, 0, stop1), lastindex(s1))
    tail2 = SubString(s2, nextind(s2, 0, stop2), lastindex(s2))
    matching_blocks!(x, tail1, tail2, len1 - stop1 + 1, len2 - stop2 + 1,
                     start1 + stop1 - 1, start2 + stop2 - 1)
    return x
end
|