StringDistances.jl/src/distances/edit.jl


##############################################################################
##
## Hamming
##
##############################################################################

# Hamming distance between `s1` and `s2`: the number of positions at which the
# paired characters differ, plus the difference between the two character
# counts (each unpaired character of the longer string counts as one mismatch).
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
    excess = abs(length(s2) - length(s1))
    # `zip` stops as soon as the shorter string is exhausted
    mismatches = count(c1 != c2 for (c1, c2) in zip(s1, s2))
    return excess + mismatches
end

##############################################################################
##
## Levenshtein
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
##
##############################################################################

# Field-less marker type: passing an instance to `evaluate` selects the
# Levenshtein edit distance method.
struct Levenshtein <: SemiMetric end

# Levenshtein distance between `s1` and `s2`: the minimum number of
# single-character insertions, deletions and substitutions turning one string
# into the other. Only one row of the dynamic-programming matrix (`v0`) is kept.
#
# NOTE(review): `reorder` and `common_prefix` are defined elsewhere. From their
# use here, `reorder` presumably returns the longer string first (so that
# `len1 <= len2`), and `common_prefix` presumably returns the number `k` of
# leading characters shared by both strings plus the `iterate` results
# positioned right after that prefix — confirm against their definitions.
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
    s2, len2, s1, len1 = reorder(s1, s2)
    # prefix common to both strings can be ignored
    k, x1, x2start = common_prefix(s1, s2)
    # s1 ends with the common prefix: what is left of s2 has to be inserted
    x1 === nothing && return len2 - k
    # distance initialized to first row of matrix
    # => distance between "" and s2[1:i]
    v0 = collect(1:(len2 - k))
    current = 0
    i1 = 1
    while x1 !== nothing
        ch1, state1 = x1
        left = i1 - 1
        current = i1 - 1
        i2 = 1
        # restart the scan of s2 just after the common prefix
        x2 = x2start
        while x2 !== nothing
            ch2, state2 = x2
            # rotate the three neighbours of the cell being computed:
            #   above   <- value computed at the previous i2 of this pass
            #   current <- old `left`, i.e. the diagonal neighbour
            #   left    <- v0[i2], the value stored during the previous pass
            above, current, left = current, left, v0[i2]
            if ch1 != ch2
                # characters differ: cheapest of the three edits, plus one
                current = min(current + 1, above + 1, left + 1)
            end
            v0[i2] = current
            x2 = iterate(s2, state2)
            i2 += 1
        end
        x1 = iterate(s1, state1)
        i1 += 1
    end
    return current
end

##############################################################################
##
## Damerau Levenshtein
## Source: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
##
##############################################################################

# Field-less marker type: passing an instance to `evaluate` selects the
# Damerau-Levenshtein edit distance method.
struct DamerauLevenshtein <: SemiMetric end

# Damerau-Levenshtein distance between `s1` and `s2`: like the Levenshtein
# distance (insertions, deletions, substitutions), but a swap of two adjacent
# characters also counts as a single edit.
#
# Two work vectors are used: `v0` holds the most recently computed costs and
# `v2` holds the diagonal costs saved during the previous pass over s2, which
# are the base costs for transpositions.
#
# NOTE(review): `reorder` and `common_prefix` are defined elsewhere. From their
# use here, `reorder` presumably returns the longer string first (so that
# `len1 <= len2`), and `common_prefix` presumably returns the number `k` of
# leading characters shared by both strings plus the `iterate` results
# positioned right after that prefix — confirm against their definitions.
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
    s2, len2, s1, len1 = reorder(s1, s2)
    # prefix common to both strings can be ignored
    k, x1, x2start = common_prefix(s1, s2)
    # s1 ends with the common prefix: what is left of s2 has to be inserted
    x1 === nothing && return len2 - k
    v0 = collect(1:(len2 - k))
    # Zero-filled rather than left uninitialised: during the first pass its
    # entries are read into `nextTransCost` before they are written. Those
    # values are never used (transpositions are skipped while i1 == 1), but
    # zero-filling keeps every read well-defined.
    v2 = zero(v0)
    i1 = 1
    current = i1
    prevch1, = x1
    while x1 !== nothing
        ch1, state1 = x1
        left = (i1 - 1)
        current = i1
        nextTransCost = 0
        prevch2, = x2start
        x2 = x2start
        i2 = 1
        while x2 !== nothing
            ch2, state2 = x2
            above = current
            # v2[i2] still holds the diagonal saved during the previous pass;
            # it becomes the transposition base cost for the next i2
            thisTransCost = nextTransCost
            nextTransCost = v2[i2]
            # cost of diagonal (substitution)
            v2[i2] = current = left
            # left now equals current cost (which will be diagonal at next iteration)
            left = v0[i2]
            if ch1 != ch2
                # insertion
                if left < current
                    current = left
                end
                # deletion
                if above < current
                    current = above
                end
                current += 1
                # transposition: the current pair is the previous pair swapped
                if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)
                    thisTransCost += 1
                    if thisTransCost < current
                        current = thisTransCost
                    end
                end
            end
            v0[i2] = current
            x2 = iterate(s2, state2)
            i2 += 1
            prevch2 = ch2
        end
        x1 = iterate(s1, state1)
        i1 += 1
        prevch1 = ch1
    end
    return current
end

##############################################################################
##
## Jaro
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
##
##############################################################################

# Field-less marker type: passing an instance to `evaluate` selects the Jaro
# distance method.
struct Jaro <: SemiMetric end

# Jaro distance between `s1` and `s2`, returned as a Float64:
#   * 0.0 when both strings are empty,
#   * 1.0 when no character of one string matches a character of the other,
#   * otherwise 1 - (m / len1 + m / len2 + (m - t / 2) / m) / 3,
# where `m` is the number of matched characters and `t` is the number of
# matched characters that line up with a different character once the matches
# of both strings are read in order.
#
# NOTE(review): `reorder` is defined elsewhere; the code below presumably
# relies on it returning the longer string first (`len1 <= len2`) — confirm.
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
    s2, len2, s1, len1 = reorder(s1, s2)
    # Two empty strings have no matching character (m = 0), which would
    # otherwise yield a distance of 1.0; return 0.0 for that case instead.
    if len2 == 0
        return 0.0
    end
    # two characters can only match if their positions differ by at most `window`
    window = max(0, div(len2, 2) - 1)
    # taken[j] is true once the j-th character of s2 has been matched
    taken = fill(false, len2)
    # iteration state from which the current character of s1 is produced
    index1 = firstindex(s1)
    # matched_at[n] is that state for the n-th matched character of s1
    matched_at = fill(index1, len1)
    # m counts the number of matching characters
    m = 0
    pos1 = 1
    lo = 1
    next1 = iterate(s1)
    lo_iter = iterate(s2)
    while next1 !== nothing
        c1, after1 = next1
        # move the start of the search window one character forward when the
        # current position in s1 has left it behind
        if lo < pos1 - window
            after_lo = lo_iter[2]
            lo += 1
            lo_iter = iterate(s2, after_lo)
        end
        pos2 = lo
        cursor = lo_iter
        # look for the first unmatched, equal character of s2 inside the window
        while cursor !== nothing && pos2 <= pos1 + window
            c2, after2 = cursor
            if c1 == c2 && !taken[pos2]
                m += 1
                taken[pos2] = true
                matched_at[m] = index1
                break
            end
            cursor = iterate(s2, after2)
            pos2 += 1
        end
        next1 = iterate(s1, after1)
        pos1 += 1
        index1 = after1
    end
    if m == 0
        return 1.0
    end
    # t counts the matched characters that are out of order (two per transposition)
    t = 0
    nth = 0
    for (j, c2) in enumerate(s2)
        taken[j] || continue
        nth += 1
        t += c2 != first(iterate(s1, matched_at[nth]))
    end
    return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
end

##############################################################################
##
## Ratcliff/Obershelp
##
##############################################################################

# Field-less marker type: passing an instance to `evaluate` selects the
# Ratcliff/Obershelp distance method.
struct RatcliffObershelp <: PreMetric end

# Ratcliff/Obershelp distance between `s1` and `s2`, returned as a Float64:
# one minus twice the number of matched characters divided by the total number
# of characters in both strings. The matched characters are the summed lengths
# of the blocks returned by `matching_blocks`.
#
# Two empty strings are at distance 0.0 (the ratio would otherwise be 0 / 0,
# i.e. NaN).
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
    total = length(s1) + length(s2)
    total == 0 && return 0.0
    # the last component of every block is its length; `init` keeps the sum
    # well-defined when no block was found
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    return 1.0 - 2 * n_matched / total
end

# Return the set of blocks shared by `s1` and `s2`, as built by
# `matching_blocks!` starting from an empty set, the full strings and the
# offsets (1, 1). Each element is a 3-tuple of integers whose last component is
# the block length.
# NOTE(review): the first two components are presumably the 1-based character
# positions of the block in `s1` and `s2` — confirm against
# `longest_common_substring`.
function matching_blocks(s1::AbstractString, s2::AbstractString)
    blocks = Set{Tuple{Int, Int, Int}}()
    return matching_blocks!(blocks, s1, s2, length(s1), length(s2), 1, 1)
end

# Recursively add to `x` the blocks shared by `s1` and `s2`, and return `x`.
#
# `len1` / `len2` are the character counts passed on to
# `longest_common_substring`; `start1` / `start2` are the offsets added to the
# positions found in `s1` / `s2` so that the recorded positions refer to the
# strings of the outermost call.
# NOTE(review): `longest_common_substring` is defined elsewhere; its result is
# presumably (position in s1, position in s2, length), all in characters —
# confirm against its definition.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer, start1::Integer, start2::Integer)
    a = longest_common_substring(s1, s2, len1 , len2)
    pos1, pos2, n = a[1], a[2], a[3]
    # no common substring: nothing to record and nothing left to split
    n > 0 || return x
    # record the block, shifted by the offsets of the current substrings
    push!(x, (pos1 + start1 - 1, pos2 + start2 - 1, n))
    # recurse on the part of each string that precedes the block
    head1 = SubString(s1, 1, nextind(s1, 0, pos1 - 1))
    head2 = SubString(s2, 1, nextind(s2, 0, pos2 - 1))
    matching_blocks!(x, head1, head2, pos1 - 1, pos2 - 1, start1, start2)
    # recurse on the part of each string that follows the block
    stop1 = pos1 + n
    stop2 = pos2 + n
    tail1 = SubString(s1, nextind(s1, 0, stop1), lastindex(s1))
    tail2 = SubString(s2, nextind(s2, 0, stop2), lastindex(s2))
    matching_blocks!(x, tail1, tail2, len1 - stop1 + 1, len2 - stop2 + 1, start1 + stop1 - 1, start2 + stop2 - 1)
    return x
end
add winkler and normalized 2015-10-25 16:23:46 +01:00
add grams 2015-10-23 16:12:51 +02:00			`##############################################################################`
			`##`
			`## Hamming`
			`##`
			`##############################################################################`

simplify len + correct Jaro 2017-08-05 20:45:19 +02:00			`function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)`
clean 2019-08-17 17:40:26 +02:00			`current = abs(length(s2) - length(s1))`
store string length 2015-11-03 19:07:17 +01:00			`for (ch1, ch2) in zip(s1, s2)`
clean 2019-08-17 17:40:26 +02:00			`current += ch1 != ch2`
add grams 2015-10-23 16:12:51 +02:00			`end`
clean 2019-08-17 17:40:26 +02:00			`return current`
add grams 2015-10-23 16:12:51 +02:00			`end`

			`##############################################################################`
			`##`
refractor 2015-11-02 18:54:47 +01:00			`## Levenshtein`
			`## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html`
add grams 2015-10-23 16:12:51 +02:00			`##`
			`##############################################################################`
add winkler and normalized 2015-10-25 16:23:46 +01:00
update to 0.6 2017-05-12 23:41:56 +02:00			`struct Levenshtein <: SemiMetric end`
add grams 2015-10-23 16:12:51 +02:00
simplify len + correct Jaro 2017-08-05 20:45:19 +02:00			`function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)`
			`s2, len2, s1, len1 = reorder(s1, s2)`
clean 2019-08-17 17:40:26 +02:00			`# prefix common to both strings can be ignored`
simplify logic 2018-07-04 21:04:06 +02:00			`k, x1, x2start = common_prefix(s1, s2)`
update to 0.7 2018-07-04 20:02:50 +02:00			`(x1 == nothing) && return len2 - k`
add unicode support 2015-10-24 18:45:24 +02:00			`# distance initialized to first row of matrix`
			`# => distance between "" and s2[1:i}`
simplify 2018-07-04 21:26:24 +02:00			`v0 = collect(1:(len2 - k))`
update to 0.7 2018-07-04 20:02:50 +02:00			`current = 0`
simplify logic 2018-07-04 21:47:11 +02:00			`i1 = 1`
update with dictionary + faster 2018-07-04 23:27:40 +02:00			`while x1 !== nothing`
update to 0.7 2018-07-04 20:02:50 +02:00			`ch1, state1 = x1`
clean 2019-08-17 17:40:26 +02:00			`left = i1 - 1`
			`current = i1 - 1`
simplify logic 2018-07-04 21:47:11 +02:00			`i2 = 1`
simplify logic 2018-07-04 21:04:06 +02:00			`x2 = x2start`
update with dictionary + faster 2018-07-04 23:27:40 +02:00			`while x2 !== nothing`
update to 0.7 2018-07-04 20:02:50 +02:00			`ch2, state2 = x2`
add unicode support 2015-10-24 18:45:24 +02:00			`# update`
			`above, current, left = current, left, v0[i2]`
			`if ch1 != ch2`
			`# substitution`
clean 2019-08-17 17:40:26 +02:00			`current = min(current + 1, above + 1, left + 1)`
add grams 2015-10-23 16:12:51 +02:00			`end`
add unicode support 2015-10-24 18:45:24 +02:00			`v0[i2] = current`
update to 0.7 2018-07-04 20:02:50 +02:00			`x2 = iterate(s2, state2)`
simplify logic 2018-07-04 21:47:11 +02:00			`i2 += 1`
add grams 2015-10-23 16:12:51 +02:00			`end`
update to 0.7 2018-07-04 20:02:50 +02:00			`x1 = iterate(s1, state1)`
simplify logic 2018-07-04 21:47:11 +02:00			`i1 += 1`
add grams 2015-10-23 16:12:51 +02:00			`end`
			`return current`
			`end`

refractor 2015-11-02 18:54:47 +01:00			`##############################################################################`
			`##`
			`## Damerau Levenshtein`
			`## Source: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html`
			`##`
			`##############################################################################`

update to 0.6 2017-05-12 23:41:56 +02:00			`struct DamerauLevenshtein <: SemiMetric end`
add grams 2015-10-23 16:12:51 +02:00
simplify len + correct Jaro 2017-08-05 20:45:19 +02:00			`function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)`
update 0.7 2018-07-04 18:33:13 +02:00			`s2, len2, s1, len1 = reorder(s1, s2)`
add winkler and normalized 2015-10-25 16:23:46 +01:00			`# prefix common to both strings can be ignored`
simplify logic 2018-07-04 21:04:06 +02:00			`k, x1, x2start = common_prefix(s1, s2)`
update to 0.7 2018-07-04 20:02:50 +02:00			`(x1 == nothing) && return len2 - k`
simplify 2018-07-04 21:26:24 +02:00			`v0 = collect(1:(len2 - k))`
			`v2 = similar(v0)`
simplify logic 2018-07-04 21:47:11 +02:00			`i1 = 1`
			`current = i1`
simplify logic 2018-07-04 21:04:06 +02:00			`prevch1, = x1`
clean 2019-08-17 17:40:26 +02:00			`while x1 !== nothing`
update to 0.7 2018-07-04 20:02:50 +02:00			`ch1, state1 = x1`
add unicode support 2015-10-24 18:45:24 +02:00			`left = (i1 - 1)`
			`current = i1`
add grams 2015-10-23 16:12:51 +02:00			`nextTransCost = 0`
simplify logic 2018-07-04 21:04:06 +02:00			`prevch2, = x2start`
			`x2 = x2start`
simplify logic 2018-07-04 21:47:11 +02:00			`i2 = 1`
clean 2019-08-17 17:40:26 +02:00			`while x2 !== nothing`
update to 0.7 2018-07-04 20:02:50 +02:00			`ch2, state2 = x2`
add grams 2015-10-23 16:12:51 +02:00			`above = current`
			`thisTransCost = nextTransCost`
add unicode support 2015-10-24 18:45:24 +02:00			`nextTransCost = v2[i2]`
			`# cost of diagonal (substitution)`
			`v2[i2] = current = left`
			`# left now equals current cost (which will be diagonal at next iteration)`
			`left = v0[i2]`
add grams 2015-10-23 16:12:51 +02:00			`if ch1 != ch2`
add unicode support 2015-10-24 18:45:24 +02:00			`# insertion`
add grams 2015-10-23 16:12:51 +02:00			`if left < current`
			`current = left`
			`end`
add unicode support 2015-10-24 18:45:24 +02:00			`# deletion`
add grams 2015-10-23 16:12:51 +02:00			`if above < current`
			`current = above`
			`end`
			`current += 1`
clean 2019-08-17 17:40:26 +02:00			`if (i1 != 1) & (i2 != 1) & (ch1 == prevch2) & (prevch1 == ch2)`
add grams 2015-10-23 16:12:51 +02:00			`thisTransCost += 1`
			`if thisTransCost < current`
			`current = thisTransCost`
			`end`
			`end`
			`end`
add unicode support 2015-10-24 18:45:24 +02:00			`v0[i2] = current`
update to 0.7 2018-07-04 20:02:50 +02:00			`x2 = iterate(s2, state2)`
simplify logic 2018-07-04 21:47:11 +02:00			`i2 += 1`
simplify logic 2018-07-04 21:04:06 +02:00			`prevch2 = ch2`
add grams 2015-10-23 16:12:51 +02:00			`end`
update to 0.7 2018-07-04 20:02:50 +02:00			`x1 = iterate(s1, state1)`
simplify logic 2018-07-04 21:47:11 +02:00			`i1 += 1`
simplify logic 2018-07-04 21:04:06 +02:00			`prevch1 = ch1`
add grams 2015-10-23 16:12:51 +02:00			`end`
			`return current`
			`end`

			`##############################################################################`
			`##`
add RatcliffObershelp 2015-11-04 18:40:30 +01:00			`## Jaro`
simplify len + correct Jaro 2017-08-05 20:45:19 +02:00			`## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html`
clean 2019-08-17 17:40:26 +02:00			`##`
add grams 2015-10-23 16:12:51 +02:00			`##############################################################################`

update to 0.6 2017-05-12 23:41:56 +02:00			`struct Jaro <: SemiMetric end`
add grams 2015-10-23 16:12:51 +02:00
simplify len + correct Jaro 2017-08-05 20:45:19 +02:00			`function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)`
			`s2, len2, s1, len1 = reorder(s1, s2)`
			`# if both are empty, m = 0 so should be 1.0 according to wikipedia. Add this line so that not the case`
return 0 if empty 2015-11-03 19:09:10 +01:00			`len2 == 0 && return 0.0`
all encodings 2015-10-24 19:29:15 +02:00			`maxdist = max(0, div(len2, 2) - 1)`
			`flag = fill(false, len2)`
simplify logic 2018-07-04 21:15:07 +02:00			`prevstate1 = firstindex(s1)`
simplify 2018-07-04 21:26:24 +02:00			`i1_match = prevstate1 * ones(Int, len1)`
clean 2019-08-17 17:40:26 +02:00			`# m counts number matching characters`
simplify 2018-07-04 21:26:24 +02:00			`m = 0`
simplify logic 2018-07-04 21:47:11 +02:00			`i1 = 1`
			`i2 = 1`
update to 0.7 2018-07-04 20:02:50 +02:00			`x1 = iterate(s1)`
simplify logic 2018-07-04 21:15:07 +02:00			`x2 = iterate(s2)`
clean 2019-08-17 17:40:26 +02:00			`while x1 !== nothing`
simplify logic 2018-07-04 21:15:07 +02:00			`ch1, state1 = x1`
simplify logic 2018-07-04 21:47:11 +02:00			`if i2 <= i1 - maxdist - 1`
update to 0.7 2018-07-04 20:02:50 +02:00			`ch2, state2 = x2`
all encodings 2015-10-24 19:29:15 +02:00			`i2 += 1`
simplify logic 2018-07-04 21:15:07 +02:00			`x2 = iterate(s2, state2)`
			`end`
			`i2curr = i2`
			`x2curr = x2`
clean 2019-08-17 17:40:26 +02:00			`while x2curr !== nothing`
update with dictionary + faster 2018-07-04 23:27:40 +02:00			`(i2curr > i1 + maxdist) && break`
simplify logic 2018-07-04 21:15:07 +02:00			`ch2, state2 = x2curr`
clean 2019-08-17 17:40:26 +02:00			`if (ch1 == ch2) & !flag[i2curr]`
add grams 2015-10-23 16:12:51 +02:00			`m += 1`
simplify logic 2018-07-04 21:15:07 +02:00			`flag[i2curr] = true`
			`i1_match[m] = prevstate1`
add grams 2015-10-23 16:12:51 +02:00			`break`
			`end`
simplify logic 2018-07-04 21:15:07 +02:00			`x2curr = iterate(s2, state2)`
simplify logic 2018-07-04 21:47:11 +02:00			`i2curr += 1`
add grams 2015-10-23 16:12:51 +02:00			`end`
update to 0.7 2018-07-04 20:02:50 +02:00			`x1 = iterate(s1, state1)`
simplify logic 2018-07-04 21:47:11 +02:00			`i1 += 1`
simplify logic 2018-07-04 21:15:07 +02:00			`prevstate1 = state1`
add grams 2015-10-23 16:12:51 +02:00			`end`
clean 2019-08-17 17:40:26 +02:00			`m == 0 && return 1.0`
			`# t counts number of transpotsitions`
simplify len + correct Jaro 2017-08-05 20:45:19 +02:00			`t = 0`
			`i1 = 0`
			`i2 = 0`
			`for ch2 in s2`
			`i2 += 1`
			`if flag[i2]`
			`i1 += 1`
0.7 first 2018-07-04 18:07:26 +02:00			`t += ch2 != iterate(s1, i1_match[i1])[1]`
simplify len + correct Jaro 2017-08-05 20:45:19 +02:00			`end`
			`end`
clean 2019-08-17 17:40:26 +02:00			`return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0`
simplify Radclikff 2019-08-17 18:57:35 +02:00			`end`

			`##############################################################################`
			`##`
			`## Ratcliff/Obershelp`
			`##`
			`##############################################################################`

			`struct RatcliffObershelp <: PreMetric end`

			`function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)`
			`n_matched = sum(last.(matching_blocks(s1, s2)))`
			`1.0 - 2 * n_matched / (length(s1) + length(s2))`
			`end`

			`function matching_blocks(s1::AbstractString, s2::AbstractString)`
			`matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1)`
			`end`

			`function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer, start1::Integer, start2::Integer)`
			`a = longest_common_substring(s1, s2, len1 , len2)`
			`# if there is a common substring`
			`if a[3] > 0`
			`# add the info of the common to the existing set`
			`push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))`
			`# add the longest common substring that happens before`
			`s1before = SubString(s1, 1, nextind(s1, 0, a[1] - 1))`
			`s2before = SubString(s2, 1, nextind(s2, 0, a[2] - 1))`
			`matching_blocks!(x, s1before, s2before, a[1] - 1, a[2] - 1, start1, start2)`
			`# add the longest common substring that happens after`
			`s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))`
			`s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))`
			`matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1, len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)`
			`end`
			`return x`
			`end`