StringDistances.jl/src/distances/edit.jl

"""
    Jaro()

Construct the Jaro distance.

For two sequences `s1` and `s2`, the distance is

``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``

with ``m`` the number of matching characters and ``t`` half the number
of transpositions.
"""
struct Jaro <: SemiMetric
end

## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
# Evaluate the Jaro distance between `s1` and `s2`.
# Returns `missing` when either input is `missing`, otherwise a Float64.
function (dist::Jaro)(s1, s2, nothing::Nothing = nothing)
    if (s1 === missing) | (s2 === missing)
        return missing
    end
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # The closing formula divides by both lengths, so the empty case is answered
    # up front with a distance of 0.
    # NOTE(review): presumably `reorder` puts the longer input second, so
    # `len2 == 0` means both inputs are empty -- confirm against `reorder`.
    if len2 == 0
        return 0.0
    end
    # two equal characters only count as a match when their positions differ
    # by at most `window`
    window = max(0, div(len2, 2) - 1)
    taken = fill(false, len2)
    matched = Vector{eltype(s1)}()
    for (i1, ch1) in enumerate(s1)
        for (i2, ch2) in enumerate(s2)
            # greedy alignment: take the first free, in-window, equal character of s2
            if (i1 - window <= i2 <= i1 + window) && (ch1 == ch2) && !taken[i2]
                taken[i2] = true
                push!(matched, ch1)
                break
            end
        end
    end
    # m is the number of matching characters
    m = length(matched)
    if m == 0
        return 1.0
    end
    # t counts the matched characters that appear in a different order in s2
    t = 0
    cursor = 0
    for (i2, ch2) in enumerate(s2)
        taken[i2] || continue
        cursor += 1
        if ch2 != matched[cursor]
            t += 1
        end
    end
    similarity = (m / len1 + m / len2 + (m - t/2) / m) / 3.0
    return 1.0 - similarity
end

"""
    Levenshtein()

Construct the Levenshtein distance.

It is the smallest number of single-character edits (insertions, deletions
or substitutions) needed to turn one string into the other.
"""
struct Levenshtein <: Metric
end

## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
# Evaluate the Levenshtein distance between `s1` and `s2` (`missing` if either is `missing`).
# When `max_dist` is given and the distance exceeds it, `max_dist + 1` is returned,
# so that callers (e.g. the find functions) can tell "equal to max_dist" apart
# from "above max_dist".
function (dist::Levenshtein)(s1, s2, max_dist::Union{Integer, Nothing} = nothing)
    if (s1 === missing) | (s2 === missing)
        return missing
    end
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # the length difference alone is already a lower bound on the distance
    if max_dist !== nothing && len2 - len1 > max_dist
        return max_dist + 1
    end
    # the prefix shared by both inputs does not contribute to the distance
    k = common_prefix(s1, s2)
    if k == len1
        return len2 - k
    end
    # first row of the matrix: distance between "" and s2[1:i]
    row = collect(1:(len2 - k))
    cost = 0
    for (i1, ch1) in enumerate(s1)
        i1 > k || continue
        cost = i1 - k - 1
        diag = cost
        if max_dist !== nothing
            row_lb = diag - 1
        end
        for (i2, ch2) in enumerate(s2)
            i2 > k || continue
            j = i2 - k
            # rotate: `beside` is the cell just written in this row, `cost` starts
            # from the diagonal cell, `diag` now holds the previous row's cell at j
            beside = cost
            cost = diag
            diag = row[j]
            if ch1 != ch2
                cost = min(cost, beside, diag) + 1
            end
            if max_dist !== nothing
                row_lb = min(row_lb, diag)
            end
            row[j] = cost
        end
        # stop early once the tracked lower bound is already above the cap
        if max_dist !== nothing && row_lb > max_dist
            return max_dist + 1
        end
    end
    if max_dist !== nothing && cost > max_dist
        return max_dist + 1
    end
    return cost
end

"""
    DamerauLevenshtein()

Creates the restricted DamerauLevenshtein distance

The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions, 
deletions or substitutions of a single character, or transposition of two adjacent characters) 
required to change one string into the other.

The restricted distance differs slightly from the classic Damerau-Levenshtein algorithm by imposing 
the restriction that no substring is edited more than once. So for example, "CA" to "ABC" has an edit 
distance of 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy 
the triangle inequality.
"""
struct DamerauLevenshtein <: SemiMetric end

## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
# Evaluate the restricted Damerau-Levenshtein distance between `s1` and `s2`.
# Returns `missing` if either input is `missing`.
# `max_dist` is an optional cap: return max_dist + 1 if distance higher than max_dist.
function (dist::DamerauLevenshtein)(s1, s2, max_dist::Union{Integer, Nothing} = nothing)
    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # the length difference alone already exceeds the cap
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
    # prefix common to both strings can be ignored
    k = common_prefix(s1, s2)
    # s1 is entirely a prefix of s2: only the remaining len2 - k characters differ
    k == len1 && return len2 - k
    # v holds one row of costs, indexed by position in s2 after the common prefix;
    # it starts as 1, 2, ..., len2 - k
    v = collect(1:(len2-k))
    # w is a second vector of the same size; the values read back from it feed
    # thisTransCost, the base cost of the transposition case below
    w = similar(v)
    if max_dist !== nothing
        # [i2_start, i2_end) is the window of 0-based column offsets (i2 - k - 1)
        # that gets evaluated in each row; it is shifted at the top of every row
        i2_start = 0
        i2_end = max_dist
    end
    # previous characters of s1 / s2, only consulted by the transposition test
    prevch1, prevch2 = first(s1), first(s2)
    current = 0
    for (i1, ch1) in enumerate(s1)
        # skip the characters that belong to the common prefix
        i1 <= k && continue
        left = i1 - k - 1
        current = left + 1
        nextTransCost = 0
        if max_dist !== nothing
            # advance the lower edge once the row offset exceeds max_dist - (len2 - len1),
            # and grow the upper edge by one per row while it is below len2
            i2_start += (i1 - k - 1 > max_dist - (len2 - len1)) ? 1 : 0
            i2_end += (i2_end < len2) ? 1 : 0
        end
        for (i2, ch2) in enumerate(s2)
            if i2 <= k  # still inside the common prefix: only remember the character
                prevch2 = ch2
            elseif (max_dist !== nothing) && ((i2 - k - 1 < i2_start) | (i2 - k - 1 >= i2_end))
                # no need to look beyond the window: max_dist cells around the lower right
                # diagonal (i1 - (len2 - len1)) and around the upper left diagonal (i1);
                # the cell is skipped and v / w keep their previous contents
                prevch2 = ch2
            else
                # rotate the three neighbours: above <- cost just written in this row,
                # current <- diagonal cost, left <- cost stored in v for this column
                above, current, left = current, left, v[i2 - k]
                # store the diagonal cost in w and shift the transposition costs along
                w[i2 - k], nextTransCost, thisTransCost = current, w[i2 - k], nextTransCost
                # left now equals current cost (which will be diagonal at next iteration)
                if ch1 != ch2
                    current = min(left, current, above) + 1
                    # never happens at i2 = k + 1 because then the two previous characters were equal
                    if (i1 - k - 1 > 0) & (i2 - k - 1 > 0) && (ch1 == prevch2) && (prevch1 == ch2)
                        # the two adjacent characters are swapped: allow a transposition
                        thisTransCost += 1
                        current = min(current, thisTransCost)
                    end
                end
                v[i2 - k] = current
                prevch2 = ch2
            end
        end
        # early exit: the entry of v at index i1 - k + len2 - len1 is already above the cap
        max_dist !== nothing && v[i1 - k + len2 - len1] > max_dist && return max_dist + 1
        prevch1 = ch1
    end
    max_dist !== nothing && current > max_dist && return max_dist + 1
    return current
end

"""
    RatcliffObershelp()

Creates the RatcliffObershelp distance

The distance between two strings is defined as one minus twice the number of matching 
characters divided by the total number of characters in the two strings. Matching characters 
are those in the longest common substring plus, recursively, matching characters in the 
unmatched regions on either side of the longest common substring. The distance between two 
empty strings is 0.
"""
struct RatcliffObershelp <: SemiMetric end

# Evaluate the Ratcliff/Obershelp distance between `s1` and `s2`.
# Returns `missing` if either input is `missing`, and 0.0 when both are empty.
function (dist::RatcliffObershelp)(s1, s2, nothing::Nothing = nothing)
    if (s1 === missing) | (s2 === missing)
        return missing
    end
    s1, s2 = reorder(s1, s2)
    # every block is (start in s1, start in s2, length); add up the lengths
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    total = length(s1) + length(s2)
    if total == 0
        return 0.
    end
    return 1.0 - 2 * n_matched / total
end

# Collect the matching blocks of `s1` and `s2` as a set of
# (start in s1, start in s2, length) tuples, with 1-based starts.
function matching_blocks(s1, s2)
    blocks = Set{Tuple{Int, Int, Int}}()
    return matching_blocks!(blocks, s1, s2, 1, 1)
end

# Add to `x` the matching blocks of `s1` and `s2`, and return `x`.
# `start1` / `start2` are the positions of the first elements of `s1` / `s2`
# in the original inputs, so that stored blocks use original coordinates.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)
    n1, n2, len = longest_common_pattern(s1, s2)
    if len != 0
        # record the common substring, shifted back to original coordinates
        push!(x, (n1 + start1 - 1, n2 + start2 - 1, len))
        # recurse on what lies before the common substring
        matching_blocks!(x, _take(s1, n1 - 1), _take(s2, n2 - 1), start1, start2)
        # recurse on what lies after the common substring
        used1 = n1 + len - 1
        used2 = n2 + len - 1
        matching_blocks!(x, _drop(s1, used1), _drop(s2, used2),
                         start1 + used1, start2 + used2)
    end
    return x
end

# Find the longest run of consecutive equal elements shared by `s1` and `s2`.
# Returns `(start1, start2, len)`: the 1-based start of the run in each input
# and its length, or `(0, 0, 0)` when the inputs share no element.
function longest_common_pattern(s1, s2)
    # always scan with the shorter input in the outer loop; swap the starts back
    if length(s1) > length(s2)
        pos2, pos1, best = longest_common_pattern(s2, s1)
        return pos1, pos2, best
    end
    pos1, pos2, best = 0, 0, 0
    # run_start[i2]: where in s2 the run ending at (previous element of s1, i2)
    # begins, or 0 if those two elements differ
    run_start = zeros(Int, length(s2))
    for (i1, ch1) in enumerate(s1)
        diag = 0
        for (i2, ch2) in enumerate(s2)
            carried = run_start[i2]
            if ch1 == ch2
                # extend the run coming from the diagonal, or open a new one here
                first2 = diag > 0 ? diag : i2
                run_len = i2 - first2 + 1
                if run_len > best
                    pos1, pos2, best = i1 - run_len + 1, first2, run_len
                end
                run_start[i2] = first2
            else
                run_start[i2] = 0
            end
            diag = carried
        end
    end
    return pos1, pos2, best
end
clean 2019-08-18 18:52:37 +02:00			`"""`
			`Jaro()`

voc 2020-02-26 01:40:14 +01:00			`Creates the Jaro distance`
clean 2019-08-18 18:52:37 +02:00
			`The Jaro distance is defined as`


			``1 - (m / \|s1\| + m / \|s2\| + (m - t) / m) / 3``

			where ``m`` is the number of matching characters and
			``t`` is half the number of transpositions.
			`"""`
StringDistance is now just an union 2020-02-08 17:49:53 +01:00			`struct Jaro <: SemiMetric end`
Update edit.jl 2020-02-02 17:47:31 +01:00
clean 2019-08-18 18:52:37 +02:00			`## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html`
correct normalize Partial/TokenSort/TokenSet 2020-07-20 16:08:27 +02:00			`function (dist::Jaro)(s1, s2, nothing::Nothing = nothing)`
simplify three arguments form 2020-02-13 15:44:27 +01:00			`((s1 === missing) \| (s2 === missing)) && return missing`
refinement 2019-08-19 19:54:38 +02:00			`s1, s2 = reorder(s1, s2)`
clarify default max_dist 2019-08-19 19:33:33 +02:00			`len1, len2 = length(s1), length(s2)`
allo any iterator in Jaro + add tests 2020-02-11 13:39:15 +01:00			`# If both are empty, the formula in Wikipedia gives 0`
remove trie 2019-12-13 00:55:41 +01:00			`# Add this line so that not the case`
clean 2019-08-18 18:52:37 +02:00			`len2 == 0 && return 0.0`
return 1 if distance over maxdist 2020-07-19 21:37:49 +02:00			`d = max(0, div(len2, 2) - 1)`
clean 2019-08-18 18:52:37 +02:00			`flag = fill(false, len2)`
use enumerate 2020-02-21 16:16:52 +01:00			`ch1_match = Vector{eltype(s1)}()`
			`for (i1, ch1) in enumerate(s1)`
			`for (i2, ch2) in enumerate(s2)`
clean 2020-02-24 15:41:38 +01:00			`# greedy alignement`
return 1 if distance over maxdist 2020-07-19 21:37:49 +02:00			`if (i2 <= i1 + d) && (i2 >= i1 - d) && (ch1 == ch2) && !flag[i2]`
slower but simpler iteration 2020-02-18 14:18:45 +01:00			`flag[i2] = true`
use enumerate 2020-02-21 16:16:52 +01:00			`push!(ch1_match, ch1)`
clean 2019-08-18 18:52:37 +02:00			`break`
			`end`
			`end`
			`end`
use enumerate 2020-02-21 16:16:52 +01:00			`# m counts number matching characters`
			`m = length(ch1_match)`
refinement 2019-08-19 19:54:38 +02:00			`m == 0 && return 1.0`
use enumerate 2020-02-21 16:16:52 +01:00			`# t counts number transpositions`
clean 2019-08-18 18:52:37 +02:00			`t = 0`
			`i1 = 0`
use enumerate 2020-02-21 16:16:52 +01:00			`for (i2, ch2) in enumerate(s2)`
clean 2019-08-18 18:52:37 +02:00			`if flag[i2]`
			`i1 += 1`
allo any iterator in Jaro + add tests 2020-02-11 13:39:15 +01:00			`t += ch2 != ch1_match[i1]`
clean 2019-08-18 18:52:37 +02:00			`end`
			`end`
improve support for missings 2019-12-12 15:38:20 +01:00			`return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0`
clean 2019-08-18 18:52:37 +02:00			`end`

add comments 2019-08-18 01:45:31 +02:00			`"""`
			`Levenshtein()`

voc 2020-02-26 01:40:14 +01:00			`Creates the Levenshtein distance`
add winkler and normalized 2015-10-25 16:23:46 +01:00
remove trie 2019-12-13 00:55:41 +01:00			`The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions,`
			`substitutions of a single character) required to change one string into the other.`
add comments 2019-08-18 01:45:31 +02:00			`"""`
StringDistance is now just an union 2020-02-08 17:49:53 +01:00			`struct Levenshtein <: Metric end`
add grams 2015-10-23 16:12:51 +02:00
clean 2019-08-18 18:52:37 +02:00			`## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html`
Update edit.jl 2020-07-13 17:56:34 +02:00			`# Return max_dist + 1 if distance higher than max_dist`
			`# to differentiate distance equal to max_dist or not, which is important for find fctions.`
			`function (dist::Levenshtein)(s1, s2, max_dist::Union{Integer, Nothing} = nothing)`
simplify three arguments form 2020-02-13 15:44:27 +01:00			`((s1 === missing) \| (s2 === missing)) && return missing`
tab 2019-08-19 20:04:55 +02:00			`s1, s2 = reorder(s1, s2)`
			`len1, len2 = length(s1), length(s2)`
Update edit.jl 2020-07-13 17:56:34 +02:00			`max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1`
clean 2019-08-17 17:40:26 +02:00			`# prefix common to both strings can be ignored`
slower but simpler iteration 2020-02-18 14:18:45 +01:00			`k = common_prefix(s1, s2)`
update 2020-02-18 14:38:20 +01:00			`k == len1 && return len2 - k`
add unicode support 2015-10-24 18:45:24 +02:00			`# distance initialized to first row of matrix`
use enumerate 2020-02-21 16:16:52 +01:00			`# distance between "" and s2[1:i]`
Update edit.jl 2020-02-02 17:47:31 +01:00			`v = collect(1:(len2-k))`
update to 0.7 2018-07-04 20:02:50 +02:00			`current = 0`
use enumerate 2020-02-21 16:16:52 +01:00			`for (i1, ch1) in enumerate(s1)`
slower but simpler iteration 2020-02-18 14:18:45 +01:00			`i1 <= k && continue`
use enumerate 2020-02-21 16:16:52 +01:00			`left = current = i1 - k - 1`
Update edit.jl 2020-07-13 17:56:34 +02:00			`max_dist !== nothing && (value_lb = left - 1)`
use enumerate 2020-02-21 16:16:52 +01:00			`for (i2, ch2) in enumerate(s2)`
slower but simpler iteration 2020-02-18 14:18:45 +01:00			`i2 <= k && continue`
			`above, current, left = current, left, v[i2 - k]`
add unicode support 2015-10-24 18:45:24 +02:00			`if ch1 != ch2`
use enumerate 2020-02-21 16:16:52 +01:00			`current = min(current, above, left) + 1`
add grams 2015-10-23 16:12:51 +02:00			`end`
Update edit.jl 2020-07-13 17:56:34 +02:00			`max_dist !== nothing && (value_lb = min(value_lb, left))`
slower but simpler iteration 2020-02-18 14:18:45 +01:00			`v[i2 - k] = current`
add grams 2015-10-23 16:12:51 +02:00			`end`
Update edit.jl 2020-07-13 17:56:34 +02:00			`max_dist !== nothing && value_lb > max_dist && return max_dist + 1`
add grams 2015-10-23 16:12:51 +02:00			`end`
Update edit.jl 2020-07-13 17:56:34 +02:00			`max_dist !== nothing && current > max_dist && return max_dist + 1`
Only keep compare for Levenshtein and Damerau 2019-08-20 17:59:23 +02:00			`return current`
add grams 2015-10-23 16:12:51 +02:00			`end`

add comments 2019-08-18 01:45:31 +02:00			`"""`
			`DamerauLevenshtein()`
refractor 2015-11-02 18:54:47 +01:00
voc 2020-02-26 01:40:14 +01:00			`Creates the restricted DamerauLevenshtein distance`
add comments 2019-08-18 01:45:31 +02:00
remove trie 2019-12-13 00:55:41 +01:00			`The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions,`
			`deletions or substitutions of a single character, or transposition of two adjacent characters)`
			`required to change one string into the other.`
Update edit.jl 2020-02-19 14:35:17 +01:00
			`The restricted distance differs slightly from the classic Damerau-Levenshtein algorithm by imposing`
			`the restriction that no substring is edited more than once. So for example, "CA" to "ABC" has an edit`
			`distanceof 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that`
Update edit.jl 2020-02-19 14:39:09 +01:00			`uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy`
			`the triangle inequality.`
add comments 2019-08-18 01:45:31 +02:00			`"""`
Update edit.jl 2020-02-19 14:42:17 +01:00
StringDistance is now just an union 2020-02-08 17:49:53 +01:00			`struct DamerauLevenshtein <: SemiMetric end`
add grams 2015-10-23 16:12:51 +02:00
clean 2019-08-18 18:52:37 +02:00			`## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html`
Update edit.jl 2020-07-13 17:56:34 +02:00			`# Return max_dist + 1 if distance higher than max_dist`
			`function (dist::DamerauLevenshtein)(s1, s2, max_dist::Union{Integer, Nothing} = nothing)`
simplify three arguments form 2020-02-13 15:44:27 +01:00			`((s1 === missing) \| (s2 === missing)) && return missing`
refinement 2019-08-19 19:54:38 +02:00			`s1, s2 = reorder(s1, s2)`
clarify default max_dist 2019-08-19 19:33:33 +02:00			`len1, len2 = length(s1), length(s2)`
Update edit.jl 2020-07-13 17:56:34 +02:00			`max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1`
add winkler and normalized 2015-10-25 16:23:46 +01:00			`# prefix common to both strings can be ignored`
slower but simpler iteration 2020-02-18 14:18:45 +01:00			`k = common_prefix(s1, s2)`
update 2020-02-18 14:38:20 +01:00			`k == len1 && return len2 - k`
Update edit.jl 2020-02-02 17:47:31 +01:00			`v = collect(1:(len2-k))`
			`w = similar(v)`
Update edit.jl 2020-07-13 17:56:34 +02:00			`if max_dist !== nothing`
Update edit.jl 2020-07-13 19:44:45 +02:00			`i2_start = 0`
			`i2_end = max_dist`
simplify 2019-08-20 21:38:14 +02:00			`end`
Update edit.jl 2020-02-19 14:35:17 +01:00			`prevch1, prevch2 = first(s1), first(s2)`
use enumerate 2020-02-21 16:16:52 +01:00			`current = 0`
			`for (i1, ch1) in enumerate(s1)`
slower but simpler iteration 2020-02-18 14:18:45 +01:00			`i1 <= k && continue`
solve https://github.com/matthieugomez/StringDistances.jl/issues/30 2020-07-13 19:40:30 +02:00			`left = i1 - k - 1`
			`current = left + 1`
add grams 2015-10-23 16:12:51 +02:00			`nextTransCost = 0`
Update edit.jl 2020-07-13 17:56:34 +02:00			`if max_dist !== nothing`
solve https://github.com/matthieugomez/StringDistances.jl/issues/30 2020-07-13 19:40:30 +02:00			`i2_start += (i1 - k - 1 > max_dist - (len2 - len1)) ? 1 : 0`
Update edit.jl 2020-07-13 19:44:45 +02:00			`i2_end += (i2_end < len2) ? 1 : 0`
simplify 2019-08-20 21:38:14 +02:00			`end`
use enumerate 2020-02-21 16:16:52 +01:00			`for (i2, ch2) in enumerate(s2)`
solve https://github.com/matthieugomez/StringDistances.jl/issues/30 2020-07-13 19:40:30 +02:00			`if i2 <= k`
			`prevch2 = ch2`
Update edit.jl 2020-07-13 19:44:45 +02:00			`elseif (max_dist !== nothing) && ((i2 - k - 1 < i2_start) \| (i2 - k - 1 >= i2_end))`
return 1 if distance over maxdist 2020-07-19 21:37:49 +02:00			`# no need to look beyond window of lower right diagonal - max distance cells`
solve https://github.com/matthieugomez/StringDistances.jl/issues/30 2020-07-13 19:40:30 +02:00			`#lower right diag is i1 - (len2 - len1)) and the upper left diagonal + max_dist cells (upper left is i1)`
use enumerate 2020-02-21 16:16:52 +01:00			`prevch2 = ch2`
			`else`
			`above, current, left = current, left, v[i2 - k]`
			`w[i2 - k], nextTransCost, thisTransCost = current, w[i2 - k], nextTransCost`
Update edit.jl 2020-02-19 14:35:17 +01:00			`# left now equals current cost (which will be diagonal at next iteration)`
			`if ch1 != ch2`
use enumerate 2020-02-21 16:16:52 +01:00			`current = min(left, current, above) + 1`
Update edit.jl 2020-07-13 17:56:34 +02:00			`# never happens at i2 = k + 1 because then the two previous characters were equal`
Update edit.jl 2020-07-13 19:44:45 +02:00			`if (i1 - k - 1 > 0) & (i2 - k - 1 > 0) && (ch1 == prevch2) && (prevch1 == ch2)`
Update edit.jl 2020-02-19 14:35:17 +01:00			`thisTransCost += 1`
use enumerate 2020-02-21 16:16:52 +01:00			`current = min(current, thisTransCost)`
add grams 2015-10-23 16:12:51 +02:00			`end`
			`end`
Update edit.jl 2020-02-19 14:35:17 +01:00			`v[i2 - k] = current`
use enumerate 2020-02-21 16:16:52 +01:00			`prevch2 = ch2`
add grams 2015-10-23 16:12:51 +02:00			`end`
			`end`
Update edit.jl 2020-07-13 17:56:34 +02:00			`max_dist !== nothing && v[i1 - k + len2 - len1] > max_dist && return max_dist + 1`
simplify logic 2018-07-04 21:04:06 +02:00			`prevch1 = ch1`
add grams 2015-10-23 16:12:51 +02:00			`end`
Update edit.jl 2020-07-13 17:56:34 +02:00			`max_dist !== nothing && current > max_dist && return max_dist + 1`
add grams 2015-10-23 16:12:51 +02:00			`return current`
			`end`

add comments 2019-08-18 01:45:31 +02:00			`"""`
			`RatcliffObershelp()`

voc 2020-02-26 01:40:14 +01:00			`Creates the RatcliffObershelp distance`
simplify Radclikff 2019-08-17 18:57:35 +02:00
remove trie 2019-12-13 00:55:41 +01:00			`The distance between two strings is defined as one minus the number of matching characters`
			`divided by the total number of characters in the two strings. Matching characters are those`
			`in the longest common subsequence plus, recursively, matching characters in the unmatched`
			`region on either side of the longest common subsequence.`
add comments 2019-08-18 01:45:31 +02:00			`"""`
StringDistance is now just an union 2020-02-08 17:49:53 +01:00			`struct RatcliffObershelp <: SemiMetric end`

correct normalize Partial/TokenSort/TokenSet 2020-07-20 16:08:27 +02:00			`function (dist::RatcliffObershelp)(s1, s2, nothing::Nothing = nothing)`
simplify three arguments form 2020-02-13 15:44:27 +01:00			`((s1 === missing) \| (s2 === missing)) && return missing`
simplify Ratcliff 2020-02-16 17:12:31 +01:00			`s1, s2 = reorder(s1, s2)`
result_type for str metrics; fix type instability in RatcliffObershelp 2019-12-11 20:45:58 +01:00			`n_matched = sum(last.(matching_blocks(s1, s2)))`
add 2019-08-17 22:12:41 +02:00			`len1, len2 = length(s1), length(s2)`
result_type for str metrics; fix type instability in RatcliffObershelp 2019-12-11 20:45:58 +01:00			`len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2)`
simplify Radclikff 2019-08-17 18:57:35 +02:00			`end`

allow any iterator in. Define evaluate for modifiers. 2020-02-09 19:37:37 +01:00			`function matching_blocks(s1, s2)`
simplify Ratcliff 2020-02-16 17:12:31 +01:00			`matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, 1, 1)`
simplify Radclikff 2019-08-17 18:57:35 +02:00			`end`

simplify Ratcliff 2020-02-16 17:12:31 +01:00			`function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)`
clean 2020-02-24 15:41:38 +01:00			`n1, n2, len = longest_common_pattern(s1, s2)`
Update edit.jl 2019-08-17 19:18:13 +02:00			`# exit if there is no common substring`
clean 2020-02-24 15:41:38 +01:00			`len == 0 && return x`
Update edit.jl 2019-08-17 19:18:13 +02:00			`# add the info of the common to the existing set`
clean 2020-02-24 15:41:38 +01:00			`push!(x, (n1 + start1 - 1, n2 + start2 - 1, len))`
Update edit.jl 2019-08-17 19:18:13 +02:00			`# add the longest common substring that happens before`
clean 2020-02-24 15:41:38 +01:00			`matching_blocks!(x, _take(s1, n1 - 1), _take(s2, n2 - 1), start1, start2)`
Update edit.jl 2019-08-17 19:18:13 +02:00			`# add the longest common substring that happens after`
clean 2020-02-24 15:41:38 +01:00			`matching_blocks!(x, _drop(s1, n1 + len - 1), _drop(s2, n2 + len - 1),`
			`start1 + n1 + len - 1, start2 + n2 + len - 1)`
simplify Radclikff 2019-08-17 18:57:35 +02:00			`return x`
			`end`
clean 2019-12-13 02:01:47 +01:00
simplify Ratcliff 2020-02-16 17:12:31 +01:00			`function longest_common_pattern(s1, s2)`
			`if length(s1) > length(s2)`
			`start2, start1, len = longest_common_pattern(s2, s1)`
clean 2019-12-13 02:01:47 +01:00			`else`
			`start1, start2, len = 0, 0, 0`
simplify Ratcliff 2020-02-16 17:12:31 +01:00			`p = zeros(Int, length(s2))`
use enumerate 2020-02-21 16:16:52 +01:00			`for (i1, ch1) in enumerate(s1)`
clean 2019-12-13 02:01:47 +01:00			`oldp = 0`
use enumerate 2020-02-21 16:16:52 +01:00			`for (i2, ch2) in enumerate(s2)`
clean 2019-12-13 02:01:47 +01:00			`newp = 0`
			`if ch1 == ch2`
			`newp = oldp > 0 ? oldp : i2`
simplify Ratcliff 2020-02-16 17:12:31 +01:00			`currentlength = i2 - newp + 1`
clean 2019-12-13 02:01:47 +01:00			`if currentlength > len`
			`start1, start2, len = i1 - currentlength + 1, newp, currentlength`
			`end`
			`end`
			`p[i2], oldp = newp, p[i2]`
			`end`
			`end`
			`end`
			`return start1, start2, len`
do not normalize Partial/TokenSet/TokenSort by default 2020-07-13 20:39:21 +02:00			`end`