"""
    Jaro()

Creates the Jaro distance.

The Jaro distance is defined as

``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``

where ``m`` is the number of matching characters and
``t`` is half the number of transpositions.
"""
struct Jaro <: SemiMetric
end
2020-02-02 17:47:31 +01:00
2019-08-18 18:52:37 +02:00
## Reference: http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
## Accepts any pair of iterators, including AbstractString.
# Returns `missing` if either argument is `missing`, otherwise a Float64 distance.
function (dist::Jaro)(s1, s2)
    if (s1 === missing) | (s2 === missing)
        return missing
    end
    # `reorder` is defined elsewhere; the code below assumes it puts the
    # shorter sequence first (so len1 <= len2) -- TODO confirm.
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # With no characters at all there are no matches, so the generic
    # "no match" branch below would report a distance of 1.
    # Two empty inputs are treated as identical instead.
    if len2 == 0
        return 0.0
    end
    # two equal characters only match if their positions differ by at most `window`
    window = max(0, div(len2, 2) - 1)
    # taken[j] is true once the j-th element of s2 has been matched
    taken = fill(false, len2)
    # matched elements of s1, in the order they appear in s1
    matched = Vector{eltype(s1)}()
    for (i, a) in enumerate(s1)
        for (j, b) in enumerate(s2)
            # greedy alignment: take the first free, equal element inside the window
            if abs(j - i) <= window && a == b && !taken[j]
                taken[j] = true
                push!(matched, a)
                break
            end
        end
    end
    # m counts the number of matching characters
    m = length(matched)
    if m == 0
        return 1.0
    end
    # t counts matched characters that do not line up when both
    # sequences of matches are read in order (halved in the formula below)
    t = 0
    nth = 0
    for (j, b) in enumerate(s2)
        taken[j] || continue
        nth += 1
        t += b != matched[nth]
    end
    return 1.0 - (m / len1 + m / len2 + (m - t / 2) / m) / 3.0
end
"""
    Levenshtein()

Creates the Levenshtein distance.

The Levenshtein distance is the minimum number of operations (consisting of insertions,
deletions, substitutions of a single character) required to change one string into the other.
"""
struct Levenshtein <: Metric
end
2015-10-23 16:12:51 +02:00
2019-08-18 18:52:37 +02:00
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
# Returns max_value + 1 if the distance is higher than max_value.
# This makes it possible to differentiate a distance equal to max_value
# from one that is strictly higher.
# This is important for find_all
function (dist::Levenshtein)(s1, s2, max_value = nothing)
    if (s1 === missing) | (s2 === missing)
        return missing
    end
    # `reorder` is defined elsewhere; the early exits below assume len1 <= len2 -- TODO confirm
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    bounded = max_value !== nothing
    # the length difference alone is a lower bound on the distance
    bounded && len2 - len1 > max_value && return max_value + 1
    # a prefix common to both strings can be ignored
    k = common_prefix(s1, s2)
    k == len1 && return len2 - k
    # v is initialized to the first row of the cost matrix:
    # the distance between "" and s2[1:i] (after the common prefix)
    v = collect(1:(len2 - k))
    current = 0
    for (i1, ch1) in enumerate(s1)
        i1 <= k && continue
        # diag: cost in the previous row, previous column
        diag = current = i1 - k - 1
        if bounded
            # running minimum over the previous row, seeded one below its first-column cost
            value_lb = diag - 1
        end
        for (i2, ch2) in enumerate(s2)
            i2 <= k && continue
            col = i2 - k
            west = current      # cost just computed in this row, previous column
            north = v[col]      # cost in the previous row, same column
            current = ch1 == ch2 ? diag : min(diag, west, north) + 1
            diag = north
            if bounded
                value_lb = min(value_lb, north)
            end
            v[col] = current
        end
        # stop as soon as the bound already exceeds max_value
        bounded && value_lb > max_value && return max_value + 1
    end
    bounded && current > max_value && return max_value + 1
    return current
end
"""
    DamerauLevenshtein()

Creates the restricted DamerauLevenshtein distance.

The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions,
deletions or substitutions of a single character, or transposition of two adjacent characters)
required to change one string into the other.

The restricted distance differs slightly from the classic Damerau-Levenshtein algorithm by imposing
the restriction that no substring is edited more than once. So for example, "CA" to "ABC" has an edit
distance of 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy
the triangle inequality.
"""
struct DamerauLevenshtein <: SemiMetric
end
2015-10-23 16:12:51 +02:00
2019-08-18 18:52:37 +02:00
## Source: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
# Restricted Damerau-Levenshtein (optimal string alignment) distance between s1 and s2.
# Returns `missing` if either argument is `missing`.
# If max_value is given, returns max_value + 1 when the distance is higher than max_value.
function (dist::DamerauLevenshtein)(s1, s2, max_value = nothing)
    ((s1 === missing) | (s2 === missing)) && return missing
    # `reorder` is defined elsewhere; the early exits below assume len1 <= len2 -- TODO confirm
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # the length difference alone is a lower bound on the distance
    max_value !== nothing && len2 - len1 > max_value && return max_value + 1
    # prefix common to both strings can be ignored
    k = common_prefix(s1, s2)
    k == len1 && return len2 - k
    # v holds the previous row of the cost matrix (columns after the common prefix),
    # initialized to the distance between "" and s2[k+1:k+i]
    v = collect(1:(len2 - k))
    # w caches, per column, the diagonal cost seen on the previous row;
    # it is read back as the base cost of a transposition
    w = similar(v)
    if max_value !== nothing
        # [i2_start, i2_end] is the window of positions of s2 that are evaluated on
        # each row. Both bounds are positions in s2 itself (they are compared with
        # i2 below), so they must be shifted by the ignored prefix length k.
        i2_start = k + 1
        i2_end = k + max_value
    end
    prevch1, prevch2 = first(s1), first(s2)
    current = 0
    for (i1, ch1) in enumerate(s1)
        i1 <= k && continue
        left = current = i1 - k - 1
        nextTransCost = 0
        if max_value !== nothing
            # the row number relative to the prefix (i1 - k) decides when the
            # lower edge of the window starts to move
            i2_start += (i1 - k > 1 + max_value - (len2 - len1)) ? 1 : 0
            i2_end += (i2_end < len2) ? 1 : 0
        end
        for (i2, ch2) in enumerate(s2)
            i2 <= k && continue
            # no need to look beyond the window: lower right diagonal - max_value cells
            # (lower right diagonal is i1 - (len2 - len1)) and
            # upper left diagonal + max_value cells (upper left diagonal is i1)
            if (max_value !== nothing) && ((i2 < i2_start) | (i2 > i2_end))
                prevch2 = ch2
            else
                # after this rotation: above = cost in this row, previous column;
                # current = diagonal cost; left = cost in the previous row, same column
                above, current, left = current, left, v[i2 - k]
                w[i2 - k], nextTransCost, thisTransCost = current, w[i2 - k], nextTransCost
                # left now equals current cost (which will be diagonal at next iteration)
                if ch1 != ch2
                    current = min(left, current, above) + 1
                    # a transposition needs a previous character on both sides
                    if (i1 > 1 + k) & (i2 > 1 + k) && (ch1 == prevch2) && (prevch1 == ch2)
                        thisTransCost += 1
                        current = min(current, thisTransCost)
                    end
                end
                v[i2 - k] = current
                prevch2 = ch2
            end
        end
        # the cell on the diagonal that ends in the bottom-right corner
        # already exceeds max_value: give up early
        max_value !== nothing && v[i1 - k + len2 - len1] > max_value && return max_value + 1
        prevch1 = ch1
    end
    max_value !== nothing && current > max_value && return max_value + 1
    return current
end
"""
    RatcliffObershelp()

Creates the RatcliffObershelp distance.

The distance between two strings is defined as one minus twice the number of matching characters
divided by the total number of characters in the two strings. Matching characters are those
in the longest common substring plus, recursively, matching characters in the unmatched
regions on either side of the longest common substring.
"""
struct RatcliffObershelp <: SemiMetric
end
2020-02-13 15:44:27 +01:00
# RatcliffObershelp distance between s1 and s2.
# Returns `missing` if either argument is `missing`, 0.0 if both are empty.
function (dist::RatcliffObershelp)(s1, s2)
    if (s1 === missing) | (s2 === missing)
        return missing
    end
    s1, s2 = reorder(s1, s2)
    # each block is (start in s1, start in s2, length); add up the lengths
    blocks = matching_blocks(s1, s2)
    n_matched = mapreduce(last, +, blocks; init = 0)
    total = length(s1) + length(s2)
    if total == 0
        return 0.
    end
    return 1.0 - 2 * n_matched / total
end
2020-02-09 19:37:37 +01:00
# Collect every matching block between s1 and s2 as a set of
# (start in s1, start in s2, length) tuples, with positions counted from 1.
function matching_blocks(s1, s2)
    blocks = Set{Tuple{Int, Int, Int}}()
    return matching_blocks!(blocks, s1, s2, 1, 1)
end
2020-02-16 17:12:31 +01:00
# Add to x the longest common substring of s1 and s2 and, recursively, the ones
# found on its left and on its right. start1 and start2 are the positions of the
# first elements of s1 and s2 in the sequences the recursion started from, so the
# stored positions refer to those. Returns x.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)
    pos1, pos2, runlen = longest_common_pattern(s1, s2)
    if runlen != 0
        # record the common substring itself
        push!(x, (pos1 + start1 - 1, pos2 + start2 - 1, runlen))
        # look for common substrings in what comes before it
        matching_blocks!(x, _take(s1, pos1 - 1), _take(s2, pos2 - 1), start1, start2)
        # look for common substrings in what comes after it
        skip1 = pos1 + runlen - 1
        skip2 = pos2 + runlen - 1
        matching_blocks!(x, _drop(s1, skip1), _drop(s2, skip2),
                         start1 + skip1, start2 + skip2)
    end
    # with no common substring there is nothing to add
    return x
end
2019-12-13 02:01:47 +01:00
2020-02-16 17:12:31 +01:00
# Find the longest run of consecutive equal elements shared by s1 and s2.
# Returns (start1, start2, len): the 1-based start of the run in s1 and in s2 and
# its length, or (0, 0, 0) if the two share no element.
function longest_common_pattern(s1, s2)
    # always scan with the shorter sequence in the outer loop, then swap the starts back
    if length(s1) > length(s2)
        pos2, pos1, best = longest_common_pattern(s2, s1)
        return pos1, pos2, best
    end
    pos1, pos2, best = 0, 0, 0
    # run_start[j]: start in s2 of the common run ending at the previous element
    # of s1 and the j-th element of s2 (0 if there is none)
    run_start = zeros(Int, length(s2))
    for (i, a) in enumerate(s1)
        # start of the run ending on the diagonal neighbour (previous i, previous j)
        diag = 0
        for (j, b) in enumerate(s2)
            start = 0
            if a == b
                # extend the diagonal run if there is one, otherwise begin a new run here
                start = diag > 0 ? diag : j
                runlen = j - start + 1
                if runlen > best
                    pos1, pos2, best = i - runlen + 1, start, runlen
                end
            end
            # remember the previous row's value before overwriting it
            diag = run_start[j]
            run_start[j] = start
        end
    end
    return pos1, pos2, best
end