"""
    Jaro()

Creates the Jaro metric

The Jaro distance is defined as

``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``

where ``m`` is the number of matching characters and
``t`` is half the number of transpositions.
"""
struct Jaro <: SemiMetric end

## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
## accepts any iterator, including AbstractString
## Returns `missing` if either argument is `missing`, otherwise a Float64.
function (dist::Jaro)(s1, s2)
    ((s1 === missing) | (s2 === missing)) && return missing
    # NOTE(review): `reorder` is defined elsewhere; the code below relies on it
    # returning the inputs with the shorter one first (len1 <= len2).
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # When the longer input is empty, both inputs are empty. The usual formula
    # has m == 0 there and would yield a distance of 1; return 0.0 instead.
    len2 == 0 && return 0.0
    # characters only match when their positions differ by at most `maxdist`
    maxdist = max(0, div(len2, 2) - 1)
    # flag[i2] is true once the i2-th element of s2 has been matched
    flag = fill(false, len2)
    # matched elements of s1, stored in the order they appear in s1
    ch1_match = Vector{eltype(s1)}(undef, len1)
    # m counts number matching characters
    m = 0
    for (i1, ch1) in enumerate(s1)
        for (i2, ch2) in enumerate(s2)
            i2 > i1 + maxdist && break
            if (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2]
                m += 1
                flag[i2] = true
                ch1_match[m] = ch1
                break
            end
        end
    end
    m == 0 && return 1.0
    # t counts the matched positions whose characters differ when the matches
    # are read in s2 order versus s1 order; t/2 is the number of transpositions
    t = 0
    n_seen = 0
    for (i2, ch2) in enumerate(s2)
        if flag[i2]
            n_seen += 1
            t += ch2 != ch1_match[n_seen]
        end
    end
    return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
end

"""
    Levenshtein()

Creates the Levenshtein metric

The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions,
substitutions of a single character) required to change one string into the other.
"""
struct Levenshtein <: Metric end

## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
# Return max_dist + 1 if distance higher than max_dist
# This makes it possible to differentiate distance equal to max_dist vs strictly higher
# This is important for find_all
# Returns `missing` if either argument is `missing`.
function (dist::Levenshtein)(s1, s2, max_dist = nothing)
    ((s1 === missing) | (s2 === missing)) && return missing
    # NOTE(review): `reorder` is defined elsewhere; assumed to give len1 <= len2.
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # the length difference is a lower bound for the distance
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
    # prefix common to both strings can be ignored
    k = common_prefix(s1, s2)
    k == len1 && return len2 - k
    # distance initialized to first row of matrix
    # => distance between "" and s2[1:i]
    v = collect(1:(len2 - k))
    current = 0
    i1 = 0
    for ch1 in s1
        i1 += 1
        i1 <= k && continue
        # `left` holds the diagonal cell (previous row, previous column)
        left = i1 - k - 1
        current = i1 - k - 1
        # running minimum over the previous row, used for the early exit below
        min_dist = i1 - k - 2
        i2 = 0
        for ch2 in s2
            i2 += 1
            i2 <= k && continue
            # above   <- cell just computed in this row
            # current <- diagonal cell (cost if ch1 == ch2)
            # left    <- cell in the previous row, same column
            above, current, left = current, left, v[i2 - k]
            if ch1 != ch2
                # minimum between substitution, deletion and insertion
                current = min(current + 1, above + 1, left + 1)
            end
            min_dist = min(min_dist, left)
            v[i2 - k] = current
        end
        max_dist !== nothing && min_dist > max_dist && return max_dist + 1
    end
    max_dist !== nothing && current > max_dist && return max_dist + 1
    return current
end

"""
    DamerauLevenshtein()

Creates the DamerauLevenshtein metric

The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions,
deletions or substitutions of a single character, or transposition of two adjacent characters)
required to change one string into the other.
"""
struct DamerauLevenshtein <: SemiMetric end

## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
# Return max_dist + 1 if distance higher than max_dist
# Returns `missing` if either argument is `missing`.
function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
    ((s1 === missing) | (s2 === missing)) && return missing
    # NOTE(review): `reorder` is defined elsewhere; assumed to give len1 <= len2.
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # the length difference is a lower bound for the distance
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
    # prefix common to both strings can be ignored
    k = common_prefix(s1, s2)
    k == len1 && return len2 - k
    # v: previous row of the cost matrix; w: costs two rows back, used for
    # transpositions. Both are indexed by column position after the prefix.
    v = collect(1:(len2 - k))
    w = fill(0, len2 - k)
    if max_dist !== nothing
        # Only a diagonal band of columns can hold costs <= max_dist.
        # The band is tracked in prefix-relative columns (i2 - k), the same
        # coordinates as `v` and `w`, so it stays aligned when k > 0.
        offset = 1 + max_dist - (len2 - len1)
        col_start = 1
        col_end = max_dist
    end
    i1 = 0
    current = i1
    prevch1 = first(s1)
    prevch2 = first(s2)
    for ch1 in s1
        i1 += 1
        i1 <= k && continue
        left = i1 - k - 1
        current = i1 - k
        nextTransCost = 0
        if max_dist !== nothing
            # slide the band one column to the right as rows advance
            col_start += (i1 - k > offset) ? 1 : 0
            col_end = min(col_end + 1, len2 - k)
        end
        i2 = 0
        for ch2 in s2
            i2 += 1
            # skip the common prefix and, when bounded, columns outside the band
            if (i2 <= k) || ((max_dist !== nothing) && !(col_start <= i2 - k <= col_end))
                prevch2 = ch2
                continue
            end
            above = current
            thisTransCost = nextTransCost
            nextTransCost = w[i2 - k]
            # cost of diagonal (substitution)
            w[i2 - k] = current = left
            # left now equals current cost (which will be diagonal at next iteration)
            left = v[i2 - k]
            if ch1 != ch2
                # insertion
                if left < current
                    current = left
                end
                # deletion
                if above < current
                    current = above
                end
                current += 1
                # transposition of two adjacent characters
                if (i1 > 1 + k) & (i2 > 1 + k) & (ch1 == prevch2) & (prevch1 == ch2)
                    thisTransCost += 1
                    if thisTransCost < current
                        current = thisTransCost
                    end
                end
            end
            v[i2 - k] = current
            prevch2 = ch2
        end
        # early exit on the cell lying on the diagonal that ends at the final cell
        max_dist !== nothing && v[i1 - k + len2 - len1] > max_dist && return max_dist + 1
        prevch1 = ch1
    end
    max_dist !== nothing && current > max_dist && return max_dist + 1
    return current
end

"""
    RatcliffObershelp()

Creates the RatcliffObershelp metric

The distance between two strings is defined as one minus the number of matching characters
divided by the total number of characters in the two strings. Matching characters are those
in the longest common substring plus, recursively, matching characters in the unmatched
region on either side of the longest common substring.
"""
struct RatcliffObershelp <: SemiMetric end

# Returns `missing` if either argument is `missing`, 0.0 when both inputs are
# empty, and otherwise 1 - 2 * (matched characters) / (len1 + len2).
function (dist::RatcliffObershelp)(s1, s2)
    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    # the third entry of each block is its length
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    len1, len2 = length(s1), length(s2)
    return len1 + len2 == 0 ? 0.0 : 1.0 - 2 * n_matched / (len1 + len2)
end

# Return the set of matching blocks between s1 and s2 as tuples
# (start in s1, start in s2, length).
function matching_blocks(s1, s2)
    return matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, 1, 1)
end

# Recursive worker for `matching_blocks`: add the blocks found in s1 and s2 to
# `x`. `start1` and `start2` are the positions of s1 and s2 within the
# original inputs, so that stored positions refer to the original inputs.
# NOTE(review): `_take` and `_drop` are defined elsewhere; presumably they keep
# the first n elements and drop the first n elements respectively.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)
    a = longest_common_pattern(s1, s2)
    # exit if there is no common substring
    a[3] == 0 && return x
    # add the info of the common to the existing set
    push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
    # add the longest common substring that happens before
    matching_blocks!(x, _take(s1, a[1] - 1), _take(s2, a[2] - 1), start1, start2)
    # add the longest common substring that happens after
    matching_blocks!(x, _drop(s1, a[1] + a[3] - 1), _drop(s2, a[2] + a[3] - 1),
                     start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
    return x
end

# Find the longest run of consecutive equal elements shared by s1 and s2.
# Returns (start1, start2, len): the 1-based start of the run in s1, its start
# in s2, and its length; (0, 0, 0) when nothing matches. On ties the first run
# found is kept, because only a strictly longer run replaces it.
function longest_common_pattern(s1, s2)
    if length(s1) > length(s2)
        # work with the shorter input first, then swap the positions back
        start2, start1, len = longest_common_pattern(s2, s1)
    else
        start1, start2, len = 0, 0, 0
        # p[i2]: start position in s2 of the common run that ends at the
        # previous element of s1 and the i2-th element of s2 (0 if none)
        p = zeros(Int, length(s2))
        i1 = 0
        for ch1 in s1
            i1 += 1
            # oldp carries the previous row's entry for column i2 - 1
            oldp = 0
            i2 = 0
            for ch2 in s2
                i2 += 1
                newp = 0
                if ch1 == ch2
                    # extend the run ending on the diagonal, or start a new one
                    newp = oldp > 0 ? oldp : i2
                    currentlength = i2 - newp + 1
                    if currentlength > len
                        start1, start2, len = i1 - currentlength + 1, newp, currentlength
                    end
                end
                p[i2], oldp = newp, p[i2]
            end
        end
    end
    return start1, start2, len
end