StringDistances.jl/src/compare.jl

204 lines
7.0 KiB
Julia
Raw Normal View History

2017-08-05 20:45:19 +02:00
##############################################################################
##
## compare
2018-05-17 17:33:55 +02:00
## compare always return a value between 0 and 1.
2017-08-05 20:45:19 +02:00
##
##############################################################################
2019-08-18 01:47:19 +02:00
"""
    compare(s1::AbstractString, s2::AbstractString, dist::PreMetric)

Return a similarity score between the strings `s1` and `s2` based on the
distance `dist`. The score lies between 0 and 1 (1 means identical).
"""
function compare(s1::AbstractString, s2::AbstractString, dist::PreMetric)
    # Generic fallback: assumes `dist` already evaluates to a value in [0, 1].
    return 1.0 - evaluate(dist, s1, s2)
end
2019-08-17 18:26:24 +02:00
# Edit-type distances are unbounded counts: normalize by the length of the
# longer string so the score lands in [0, 1].
function compare(s1::AbstractString, s2::AbstractString,
                 dist::Union{Hamming, Levenshtein, DamerauLevenshtein})
    longest = max(length(s1), length(s2))
    # Two empty strings are identical by convention.
    longest == 0 && return 1.0
    return 1.0 - evaluate(dist, s1, s2) / longest
end
2018-05-16 00:39:50 +02:00
2019-08-17 18:26:24 +02:00
"""
    compare(s1::AbstractString, s2::AbstractString, dist::AbstractQGramDistance)

Return a similarity score in [0, 1] between `s1` and `s2` for a q-gram based
distance `dist`.
"""
function compare(s1::AbstractString, s2::AbstractString,
                 dist::AbstractQGramDistance)
    # When either string is shorter than q, no q-gram can be formed:
    # fall back to exact equality (1.0 if s1 == s2, 0.0 otherwise).
    len1, len2 = length(s1), length(s2)
    min(len1, len2) <= (dist.q - 1) && return convert(Float64, s1 == s2)
    # Use `isa` rather than `typeof(dist) <: QGram`; same meaning, idiomatic form.
    if dist isa QGram
        # QGram returns an absolute count of differing q-grams: normalize by
        # the total number of q-grams in both strings.
        return 1.0 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
    else
        # Other q-gram distances (e.g. Cosine, Jaccard) already lie in [0, 1].
        return 1.0 - evaluate(dist, s1, s2)
    end
end
2018-05-17 17:38:55 +02:00
2019-08-17 18:26:24 +02:00
# Deprecated argument order: the distance argument now comes last.
@deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist)
2018-05-17 17:38:55 +02:00
##############################################################################
##
## Winkler
##
##############################################################################
2019-08-18 01:45:31 +02:00
"""
    Winkler(dist::PreMetric, scaling_factor::Real = 0.1, boosting_threshold::Real = 0.7)

Winkler is a `PreMetric` modifier that boosts the similarity score between
two strings by a scale `scaling_factor` when the strings share a common
prefix (the boost is only applied when the similarity score is above
`boosting_threshold`).
"""
struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
    dist::T1
    scaling_factor::T2      # scaling factor; defaults to 0.1
    boosting_threshold::T3  # boost threshold; defaults to 0.7
end
# Default parameters of the classical Jaro-Winkler adjustment; the underlying
# `dist` is expected to return scores between 0 and 1.
Winkler(x) = Winkler(x, 0.1, 0.7)
2019-08-17 18:26:24 +02:00
# Boost the base similarity score when the two strings share a common prefix.
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
    base = compare(s1, s2, dist.dist)
    # Length of the shared prefix, capped at 4 characters.
    prefix_len = common_prefix(s1, s2, 4)[1]
    if base < dist.boosting_threshold
        return base
    end
    # Move the score toward 1 in proportion to the prefix length.
    return base + prefix_len * dist.scaling_factor * (1 - base)
end
##############################################################################
##
## Partial
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
##
##############################################################################
2019-08-18 01:45:31 +02:00
"""
    Partial(dist::PreMetric)

Partial is a `PreMetric` modifier that returns the maximal similarity score
between the shorter string and substrings of the longer string.
"""
struct Partial{T <: PreMetric} <: PreMetric
    dist::T
end
# general
2019-08-17 18:26:24 +02:00
# General case: slide a window of the shorter string's length across the
# longer string and keep the best score.
function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
    # After reorder, s1 is the shorter string (length len1).
    s2, len2, s1, len1 = reorder(s1, s2)
    len1 == len2 && return compare(s1, s2, dist.dist)
    len1 == 0 && return compare("", "", dist.dist)
    best = 0.0
    for window in qgram_iterator(s2, len1)
        best = max(best, compare(s1, window, dist.dist))
    end
    return best
end
# Specialization for RatcliffObershelp distance
# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
2019-08-17 18:26:24 +02:00
# Specialization for RatcliffObershelp distance.
# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
# Instead of scanning every window, only compare s1 against the windows of s2
# aligned with each matching block.
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
    # After reorder, s1 is the shorter string (length len1).
    s2, len2, s1, len1 = reorder(s1, s2)
    len1 == len2 && return compare(s1, s2, dist.dist)
    out = 0.0
    for r in matching_blocks(s1, s2)
        # here I differ from fuzz.py by making sure the substring of s2 has length len1
        # r = (start in s1, start in s2, block length); align a len1-wide
        # window of s2 with the block, then clamp it inside 1:len2.
        s2_start = r[2] - r[1] + 1
        s2_end = s2_start + len1 - 1
        if s2_start <= 0
            # Window starts before s2: shift right to start at 1.
            s2_end += 1 - s2_start
            s2_start += 1 - s2_start
        elseif s2_end > len2
            # Window runs past s2: shift left to end at len2.
            s2_start += len2 - s2_end
            s2_end += len2 - s2_end
        end
        # Convert character positions to byte indices (Unicode-safe).
        i2_start = nextind(s2, 0, s2_start)
        i2_end = nextind(s2, 0, s2_end)
        curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp())
        out = max(out, curr)
    end
    return out
end
##############################################################################
##
## TokenSort
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
##
##############################################################################
2019-08-18 01:45:31 +02:00
"""
    TokenSort(dist::PreMetric)

TokenSort is a `PreMetric` modifier that adjusts for differences in word
orders by reordering words alphabetically.
"""
struct TokenSort{T <: PreMetric} <: PreMetric
    dist::T
end
2019-08-17 18:26:24 +02:00
# Compare with words sorted alphabetically so that word order does not matter.
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort)
    sorted1 = join(sort!(split(s1)), " ")
    sorted2 = join(sort!(split(s2)), " ")
    return compare(sorted1, sorted2, dist.dist)
end
##############################################################################
##
## TokenSet
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
##
##############################################################################
2019-08-18 01:45:31 +02:00
"""
    TokenSet(dist::PreMetric)

TokenSet is a `PreMetric` modifier that adjusts for differences in word
orders and word numbers by comparing the intersection of two strings with
each string.
"""
struct TokenSet{T <: PreMetric} <: PreMetric
    dist::T
end
2019-08-17 18:26:24 +02:00
# Compare the sorted, deduplicated word sets as well as their intersection.
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet)
    words1 = SortedSet(split(s1))
    words2 = SortedSet(split(s2))
    shared = intersect(words1, words2)
    s0 = join(shared, " ")
    t1 = join(words1, " ")
    t2 = join(words2, " ")
    # No words in common: fall back to comparing the deduplicated strings.
    isempty(s0) && return compare(t1, t2, dist.dist)
    return max(compare(s0, t1, dist.dist),
               compare(s0, t2, dist.dist),
               compare(t1, t2, dist.dist))
end
##############################################################################
##
## TokenMax
##
##############################################################################
2019-08-18 01:45:31 +02:00
"""
    TokenMax(dist::PreMetric)

TokenMax is a `PreMetric` modifier that combines similarity scores using the
base distance, its Partial, TokenSort and TokenSet modifiers, with penalty
terms depending on string lengths.
"""
struct TokenMax{T <: PreMetric} <: PreMetric
    dist::T
end
2019-08-17 18:26:24 +02:00
# Combine the base score with Partial / TokenSort / TokenSet variants,
# discounting the modified scores depending on the relative string lengths.
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
    base = compare(s1, s2, dist.dist)
    # After reorder, s1 is the shorter string (length len1).
    s2, len2, s1, len1 = reorder(s1, s2)
    unbase_scale = 0.95
    if len2 < 1.5 * len1
        # Strings have comparable lengths: no partial matching needed.
        ptsor = compare(s1, s2, TokenSort(dist.dist))
        ptser = compare(s1, s2, TokenSet(dist.dist))
        return max(base, ptsor * unbase_scale, ptser * unbase_scale)
    end
    # One string is much shorter than the other: use partial matching,
    # penalized more heavily the larger the length gap.
    partial_scale = len2 > (8 * len1) ? 0.6 : 0.9
    partial = compare(s1, s2, Partial(dist.dist))
    ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)))
    ptser = compare(s1, s2, TokenSet(Partial(dist.dist)))
    return max(base,
               partial * partial_scale,
               ptsor * unbase_scale * partial_scale,
               ptser * unbase_scale * partial_scale)
end