2015-10-25 16:23:46 +01:00
|
|
|
|
2015-10-23 16:12:51 +02:00
|
|
|
##############################################################################
|
|
|
|
##
|
|
|
|
## Hamming
|
|
|
|
##
|
|
|
|
##############################################################################
|
2019-08-19 19:12:55 +02:00
|
|
|
# Hamming distance between `s1` and `s2`, capped at `max_dist`.
#
# The count starts at the absolute difference of the two lengths and grows by
# one for every position (over the zipped characters of both strings) where
# the characters differ. As soon as the running count reaches `max_dist`,
# `max_dist` itself is returned.
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString;
                  max_dist = typemax(Int))
    mismatches = abs(length(s2) - length(s1))
    if mismatches >= max_dist
        return max_dist
    end
    for (a, b) in zip(s1, s2)
        if a != b
            mismatches += 1
        end
        # stop early once the cap is reached
        if mismatches >= max_dist
            return max_dist
        end
    end
    return mismatches
end
|
|
|
|
|
2019-08-18 18:52:37 +02:00
|
|
|
##############################################################################
|
|
|
|
##
|
|
|
|
## Jaro
|
|
|
|
##
|
|
|
|
##############################################################################
|
|
|
"""
    Jaro()

Construct the Jaro metric.

For two strings `s1` and `s2`, the Jaro distance is

``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``

with ``m`` the number of matching characters and ``t`` half the number
of transpositions.
"""
struct Jaro <: SemiMetric end
|
|
|
|
|
|
|
|
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
2019-08-19 19:12:55 +02:00
|
|
|
# Jaro distance between `s1` and `s2`, capped at `max_dist`.
#
# Returns
#   * `0.0` when the length bound to `nlong` is zero,
#   * `max_dist` when a length-based lower bound already reaches it,
#   * `min(1.0, max_dist)` when no characters match,
#   * otherwise `min(1 - (m / nshort + m / nlong + (m - t / 2) / m) / 3, max_dist)`
#     where `m` is the number of matched characters and `t` the number of
#     matched positions whose characters differ once both match sequences are
#     read in order.
#
# NOTE(review): `reorder` is defined elsewhere; the names `long`/`short` assume
# it returns the longer string (and its length) first — confirm.
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString;
                  max_dist = Inf)
    long, nlong, short, nshort = reorder(s1, s2)
    # With no matching characters the formula would give 1.0; two empty
    # strings are special-cased so that their distance is 0.0 instead.
    nlong == 0 && return 0.0
    # "Time-Efficient Execution of Bounded Jaro-Winkler Distances", equation (4)
    if 1 - (2 / 3 + nshort / (3 * nlong)) >= max_dist
        return max_dist
    end
    # characters may only match if their positions differ by at most `window`
    window = max(0, div(nlong, 2) - 1)
    # used[j] is true once the j-th character of `long` has been matched
    used = fill(false, nlong)
    # iteration state from which the current character of `short` was produced
    state_before = firstindex(short)
    # matched_states[k]: iteration state yielding the k-th matched char of `short`
    matched_states = state_before * ones(Int, nshort)
    nmatch = 0
    pos_short = 1
    pos_long = 1
    it_short = iterate(short)
    it_long = iterate(long)
    while it_short !== nothing
        c_short, state_short = it_short
        # advance the start of the search window in `long` by one if needed
        if pos_long < pos_short - window
            _, state_long = it_long
            pos_long += 1
            it_long = iterate(long, state_long)
        end
        # scan the window for the first unused character equal to c_short
        probe_pos = pos_long
        probe = it_long
        while probe !== nothing && probe_pos <= pos_short + window
            c_long, probe_state = probe
            if c_short == c_long && !used[probe_pos]
                nmatch += 1
                used[probe_pos] = true
                matched_states[nmatch] = state_before
                break
            end
            probe = iterate(long, probe_state)
            probe_pos += 1
        end
        it_short = iterate(short, state_short)
        pos_short += 1
        state_before = state_short
    end
    nmatch == 0 && return min(1.0, max_dist)
    # count matched positions whose characters disagree between both strings
    mismatched = 0
    k = 0
    for (j, c_long) in enumerate(long)
        used[j] || continue
        k += 1
        if c_long != iterate(short, matched_states[k])[1]
            mismatched += 1
        end
    end
    result = 1.0 - (nmatch / nshort + nmatch / nlong + (nmatch - mismatched/2) / nmatch) / 3.0
    return min(result, max_dist)
end
|
|
|
|
|
2015-10-23 16:12:51 +02:00
|
|
|
##############################################################################
|
|
|
|
##
|
2015-11-02 18:54:47 +01:00
|
|
|
## Levenshtein
|
2015-10-23 16:12:51 +02:00
|
|
|
##
|
|
|
|
##############################################################################
|
2019-08-18 01:45:31 +02:00
|
|
"""
    Levenshtein()

Construct the Levenshtein metric.

The Levenshtein distance between two strings is the smallest number of
single-character edits (insertions, deletions or substitutions) needed to
turn one string into the other.
"""
struct Levenshtein <: SemiMetric end
|
2015-10-23 16:12:51 +02:00
|
|
|
|
2019-08-18 18:52:37 +02:00
|
|
|
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
2019-08-19 19:12:55 +02:00
|
|
|
# Levenshtein distance between `s1` and `s2`, capped at `max_dist`.
#
# Only a single row of the edit-distance matrix (`v0`) is kept in memory.
# Returns `max_dist` as soon as a lower bound on the final distance reaches
# it; otherwise returns `min(distance, max_dist)`.
#
# NOTE(review): `reorder` and `remove_prefix` are defined elsewhere. The code
# below assumes `reorder` yields `len2 >= len1`, and that `remove_prefix`
# returns the common-prefix length `k` together with the `iterate` results
# positioned at the first character after that prefix — confirm.
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
                  max_dist = typemax(Int))
    s2, len2, s1, len1 = reorder(s1, s2)
    # the length difference is a lower bound on the distance
    len2 - len1 >= max_dist && return max_dist
    # prefix common to both strings can be ignored
    k, x1, x2start = remove_prefix(s1, s2)
    # nothing left in s1 after the prefix: only the rest of s2 remains
    x1 === nothing && return len2 - k
    # distance initialized to first row of matrix
    # => distance between "" and s2[1:i]
    v0 = collect(1:(len2 - k))
    current = 0
    i1 = 1
    while x1 !== nothing
        ch1, state1 = x1
        left = i1 - 1
        current = i1 - 1
        # running minimum used as a lower bound for the early exit below
        min_dist = i1 - 2
        i2 = 1
        x2 = x2start
        while x2 !== nothing
            ch2, state2 = x2
            # shift the three neighbouring cells: `current` takes the diagonal
            # value, `left` takes the previous-row value stored in v0[i2]
            above, current, left = current, left, v0[i2]
            if ch1 != ch2
                # cheapest of substitution, and the two single-character
                # insert/delete moves
                current = min(current + 1, above + 1, left + 1)
            end
            min_dist = min(min_dist, left)
            v0[i2] = current
            x2 = iterate(s2, state2)
            i2 += 1
        end
        min_dist >= max_dist && return max_dist
        x1 = iterate(s1, state1)
        i1 += 1
    end
    return min(current, max_dist)
end
|
|
|
|
|
2015-11-02 18:54:47 +01:00
|
|
|
##############################################################################
|
|
|
|
##
|
|
|
|
## Damerau Levenshtein
|
|
|
|
##
|
|
|
|
##############################################################################
|
2019-08-18 01:45:31 +02:00
|
|
"""
    DamerauLevenshtein()

Construct the DamerauLevenshtein metric.

The DamerauLevenshtein distance between two strings is the smallest number
of edits needed to turn one string into the other, where an edit is the
insertion, deletion or substitution of a single character, or the
transposition of two adjacent characters.
"""
struct DamerauLevenshtein <: SemiMetric end
|
2015-10-23 16:12:51 +02:00
|
|
|
|
2019-08-18 18:52:37 +02:00
|
|
|
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
2019-08-19 19:12:55 +02:00
|
|
|
# Damerau-Levenshtein distance between `s1` and `s2`, capped at `max_dist`.
#
# Two work rows are kept: `v0` holds the most recent row of the edit-distance
# matrix, `v2` holds the diagonal values needed to price a transposition.
# After each row, the entry `v0[i1 + len2 - len1]` is compared with `max_dist`
# and `max_dist` is returned once it is reached.
#
# NOTE(review): `reorder` and `remove_prefix` are defined elsewhere. The code
# below assumes `reorder` yields `len2 >= len1`, and that `remove_prefix`
# returns the common-prefix length `k` together with the `iterate` results
# positioned at the first character after that prefix — confirm.
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString;
                  max_dist = typemax(Int))
    s2, len2, s1, len1 = reorder(s1, s2)
    # the length difference is a lower bound on the distance
    len2 - len1 >= max_dist && return max_dist
    # prefix common to both strings can be ignored
    k, x1, x2start = remove_prefix(s1, s2)
    # nothing left in s1 after the prefix: only the rest of s2 remains
    x1 === nothing && return len2 - k
    v0 = collect(1:(len2 - k))
    # zero-initialised so that no uninitialised memory is ever read; entries
    # are overwritten in the first row before their value is actually used
    v2 = zeros(Int, len2 - k)
    i1 = 1
    current = i1
    prevch1, = x1
    while x1 !== nothing
        ch1, state1 = x1
        left = (i1 - 1)
        current = i1
        nextTransCost = 0
        prevch2, = x2start
        x2 = x2start
        i2 = 1
        while x2 !== nothing
            ch2, state2 = x2
            above = current
            thisTransCost = nextTransCost
            nextTransCost = v2[i2]
            # cost of diagonal (substitution)
            v2[i2] = current = left
            # left now equals current cost (which will be diagonal at next iteration)
            left = v0[i2]
            if ch1 != ch2
                # cheapest of substitution, insertion and deletion, plus one edit
                current = min(current, left, above) + 1
                # transposition of two adjacent characters (needs a previous
                # character on both sides, hence the i1/i2 guards)
                if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)
                    current = min(current, thisTransCost + 1)
                end
            end
            v0[i2] = current
            x2 = iterate(s2, state2)
            i2 += 1
            prevch2 = ch2
        end
        (v0[i1 + len2 - len1] >= max_dist) && return max_dist
        x1 = iterate(s1, state1)
        i1 += 1
        prevch1 = ch1
    end
    return current
end
|
|
|
|
|
2019-08-17 18:57:35 +02:00
|
|
|
|
2019-08-19 19:12:55 +02:00
|
|
|
|
|
|
|
|
2019-08-17 18:57:35 +02:00
|
|
|
##############################################################################
|
|
|
|
##
|
|
|
|
## Ratcliff/Obershelp
|
|
|
|
##
|
|
|
|
##############################################################################
|
2019-08-18 01:45:31 +02:00
|
|
"""
    RatcliffObershelp()

Creates the RatcliffObershelp metric

The distance between two strings is defined as one minus twice the number of
matching characters divided by the total number of characters in the two
strings. Matching characters are those in the longest common substring plus,
recursively, matching characters in the unmatched regions on either side of
the longest common substring.
"""
struct RatcliffObershelp <: PreMetric end
|
|
|
|
|
2019-08-19 19:12:55 +02:00
|
|
|
# Ratcliff/Obershelp distance between `s1` and `s2`, capped at `max_dist`.
#
# Computes `1 - 2 * n_matched / (length(s1) + length(s2))`, where `n_matched`
# is the sum of the lengths (third tuple element) of all blocks returned by
# `matching_blocks`. Two empty strings have distance `0.0`.
# The result is always a floating-point value unless `max_dist` itself is
# returned by `min`.
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString; max_dist = Inf)
    len1, len2 = length(s1), length(s2)
    total = len1 + len2
    # avoid 0 / 0; return a Float64 so the return type matches the general case
    total == 0 && return 0.0
    # `init = 0` keeps the reduction well defined when there are no blocks
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    return min(1.0 - 2 * n_matched / total, max_dist)
end
|
|
|
|
|
|
|
|
# Collect the matching blocks of `s1` and `s2` into a fresh set.
#
# Each element is a `(position in s1, position in s2, length)` tuple as
# produced by `matching_blocks!`, started on the full strings with both
# offsets equal to 1.
function matching_blocks(s1::AbstractString, s2::AbstractString)
    blocks = Set{Tuple{Int, Int, Int}}()
    return matching_blocks!(blocks, s1, s2, length(s1), length(s2), 1, 1)
end
|
|
|
|
|
|
|
|
# Recursively add the matching blocks of `s1` and `s2` to `x` and return `x`.
#
# `len1`/`len2` are the lengths (in characters) passed on to
# `longest_common_substring`; `start1`/`start2` are the 1-based offsets that
# translate positions inside `s1`/`s2` back to positions in the strings the
# recursion started from.
# NOTE(review): `longest_common_substring` is defined elsewhere; it is used
# here as returning (position in s1, position in s2, length) with length 0
# when nothing is shared — confirm.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer, start1::Integer, start2::Integer)
    a = longest_common_substring(s1, s2, len1, len2)
    # exit if there is no common substring
    n = a[3]
    if n == 0
        return x
    end
    pos1, pos2 = a[1], a[2]
    # record the block, expressed in coordinates of the outermost strings
    push!(x, (pos1 + start1 - 1, pos2 + start2 - 1, n))

    # recurse on what lies before the common substring
    head1 = SubString(s1, 1, nextind(s1, 0, pos1 - 1))
    head2 = SubString(s2, 1, nextind(s2, 0, pos2 - 1))
    matching_blocks!(x, head1, head2, pos1 - 1, pos2 - 1, start1, start2)

    # recurse on what lies after the common substring
    next1 = pos1 + n
    next2 = pos2 + n
    tail1 = SubString(s1, nextind(s1, 0, next1), lastindex(s1))
    tail2 = SubString(s2, nextind(s2, 0, next2), lastindex(s2))
    matching_blocks!(x, tail1, tail2, len1 - next1 + 1, len2 - next2 + 1,
                     start1 + next1 - 1, start2 + next2 - 1)
    return x
end
|
|
|
|
|