From 5cbbfc5bdec1deab97beabafc1c2fe0d3f92c43c Mon Sep 17 00:00:00 2001
From: matthieugomez
Date: Sun, 9 Feb 2020 13:37:37 -0500
Subject: [PATCH] allow any iterator in. Define evaluate for modifiers.

---
 README.md                       |   4 +-
 src/StringDistances.jl          |  29 ++++--
 src/edit.jl                     |  27 +++---
 src/{compare.jl => modifier.jl} | 154 ++++++++++++++++----------------
 src/qgram.jl                    |  11 ++-
 src/utils.jl                    |  29 +++++-
 test/modifiers.jl               |  14 +--
 7 files changed, 157 insertions(+), 111 deletions(-)
 rename src/{compare.jl => modifier.jl} (52%)

diff --git a/README.md b/README.md
index 3e109cc..094c1f7 100644
--- a/README.md
+++ b/README.md
@@ -10,9 +10,11 @@ The package is registered in the [`General`](https://github.com/JuliaRegistries/
 The function `compare` returns a similarity score between two strings. The function always returns a score between 0 and 1, with a value of 0 being completely different and a value of 1 being completely similar. Its syntax is:
 ```julia
-compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
+compare(s1, s2, dist::StringDistance)
 ```
+where `s1` and `s2` can be any iterator with a `length` method (e.g. `AbstractString`, `GraphemeIterator`, `AbstractVector`...).
+
 - Edit Distances
 	- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
 	- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
diff --git a/src/StringDistances.jl b/src/StringDistances.jl
index b49016f..a53a563 100755
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@@ -3,24 +3,37 @@ module StringDistances
 using Distances
 import Distances: evaluate, result_type
+isnormalized(dist::SemiMetric) = false
+
 include("utils.jl")
 include("edit.jl")
 include("qgram.jl")
-include("compare.jl")
+include("modifier.jl")
+
 const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax}
-include("find.jl")
-##############################################################################
-##
-## Distances API
-##
-##############################################################################
+"""
+    compare(s1, s2, dist)
+
+return a similarity score between 0 and 1 for the strings `s1` and
+`s2` based on the distance `dist`.
+
+### Examples
+```julia-repl
+julia> compare("martha", "marhta", Levenshtein())
+0.6666666666666667
+```
+"""
+function compare(s1, s2, dist::StringDistance; min_score = 0.0)
+    1 - evaluate(normalize(dist), s1, s2, 1 - min_score)
+end
+
+# distance API
 function result_type(dist::StringDistance, s1, s2)
     typeof(evaluate(dist, "", ""))
 end
-
+include("find.jl")
 ##############################################################################
 ##
diff --git a/src/edit.jl b/src/edit.jl
index 9f7aea6..3a6d76a 100755
--- a/src/edit.jl
+++ b/src/edit.jl
@@ -12,10 +12,11 @@ where ``m`` is the number of matching characters and
 ``t`` is half the number of transpositions.
 """
 struct Jaro <: SemiMetric end
+isnormalized(::Jaro) = true
 ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
 ## accepts any iterator, including AbstractString
-function evaluate(dist::Jaro, s1, s2)
+function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
     (ismissing(s1) | ismissing(s2)) && return missing
     s1, s2 = reorder(s1, s2)
     len1, len2 = length(s1), length(s2)
@@ -87,7 +88,7 @@ struct Levenshtein <: Metric end
 # This makes it possible to differentiate distance equalt to max_dist vs strictly higher
 # This is important for find_all
 ## accepts any iterator, including AbstractString
-function evaluate(dist::Levenshtein, s1, s2; max_dist = nothing)
+function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
     (ismissing(s1) | ismissing(s2)) && return missing
     s1, s2 = reorder(s1, s2)
     len1, len2 = length(s1), length(s2)
@@ -142,7 +143,7 @@ struct DamerauLevenshtein <: SemiMetric end
 ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
 ## accepts any iterator, including AbstractString
-function evaluate(dist::DamerauLevenshtein, s1, s2; max_dist = nothing)
+function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
     (ismissing(s1) | ismissing(s2)) && return missing
     s1, s2 = reorder(s1, s2)
     len1, len2 = length(s1), length(s2)
@@ -226,20 +227,20 @@ region on either side of the longest common subsequence.
 """
 struct RatcliffObershelp <: SemiMetric end
-evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
-evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
+isnormalized(::RatcliffObershelp) = true
-function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
+function evaluate(dist::RatcliffObershelp, s1, s2, max_dist = nothing)
+    (ismissing(s1) | ismissing(s2)) && return missing
     n_matched = sum(last.(matching_blocks(s1, s2)))
     len1, len2 = length(s1), length(s2)
     len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2)
 end
-function matching_blocks(s1::AbstractString, s2::AbstractString)
+function matching_blocks(s1, s2)
     matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1)
 end
-function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString,
+function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2,
     len1::Integer, len2::Integer, start1::Integer, start2::Integer)
     a = longest_common_pattern(s1, s2, len1 , len2)
     # exit if there is no common substring
@@ -247,18 +248,18 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
     # add the info of the common to the existing set
     push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
     # add the longest common substring that happens before
-    s1before = SubString(s1, 1, nextind(s1, 0, a[1] - 1))
-    s2before = SubString(s2, 1, nextind(s2, 0, a[2] - 1))
+    s1before = _take(s1, a[1] - 1)
+    s2before = _take(s2, a[2] - 1)
     matching_blocks!(x, s1before, s2before, a[1] - 1, a[2] - 1, start1, start2)
     # add the longest common substring that happens after
-    s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
-    s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))
+    s1after = _drop(s1, a[1] + a[3] - 1)
+    s2after = _drop(s2, a[2] + a[3] - 1)
     matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1, len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
     return x
 end
-function longest_common_pattern(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
+function longest_common_pattern(s1, s2, len1::Integer, len2::Integer)
     if len1 > len2
         start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
     else
diff --git a/src/compare.jl b/src/modifier.jl
similarity index 52%
rename from src/compare.jl
rename to src/modifier.jl
index 1ffdfa1..7b6899b 100755
--- a/src/compare.jl
+++ b/src/modifier.jl
@@ -1,42 +1,36 @@
-"""
-    compare(s1, s2, dist)
-
-return a similarity score between 0 and 1 for the strings `s1` and
-`s2` based on the distance `dist`.
-
-### Examples
-```julia-repl
-julia> compare("martha", "marhta", Levenshtein())
-0.6666666666666667
-```
-"""
-function compare(s1, s2, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
-    1.0 - evaluate(dist, s1, s2)
+struct Normalize{S <: SemiMetric} <: SemiMetric
+    dist::S
 end
+function normalize(dist::SemiMetric)
+    isnormalized(dist) ? dist : Normalize{typeof(dist)}(dist)
+end
+isnormalized(dist::Normalize) = true
-function compare(s1, s2, dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0)
+
+function evaluate(dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}}, s1, s2, max_dist = 1.0)
     (ismissing(s1) | ismissing(s2)) && return missing
     s1, s2 = reorder(s1, s2)
     len1, len2 = length(s1), length(s2)
     len2 == 0 && return 1.0
-    d = evaluate(dist, s1, s2; max_dist = ceil(Int, len2 * (1 - min_score)))
-    out = 1.0 - d / len2
-    out < min_score ? 0.0 : out
+    d = evaluate(dist.dist, s1, s2, ceil(Int, len2 * max_dist))
+    out = d / len2
+    out > max_dist ? 1.0 : out
 end
-function compare(s1, s2, dist::QGramDistance; min_score = 0.0)
+function evaluate(dist::Normalize{<: QGramDistance}, s1, s2, max_dist = 1.0)
     (ismissing(s1) | ismissing(s2)) && return missing
     # When string length < q for qgram distance, returns s1 == s2
     s1, s2 = reorder(s1, s2)
     len1, len2 = length(s1), length(s2)
-    len1 <= dist.q - 1 && return convert(Float64, s1 == s2)
-    if typeof(dist) <: QGram
-        1.0 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
+    len1 <= dist.dist.q - 1 && return convert(Float64, !(s1 == s2))
+    if typeof(dist.dist) <: QGram
+        evaluate(dist.dist, s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
     else
-        1.0 - evaluate(dist, s1, s2)
+        evaluate(dist.dist, s1, s2)
     end
 end
+
 """
     Winkler(dist; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
@@ -52,19 +46,22 @@ struct Winkler{S <: SemiMetric} <: SemiMetric
     p::Float64          # scaling factor. Default to 0.1
     threshold::Float64  # boost threshold. Default to 0.7
     maxlength::Integer  # max length of common prefix. Default to 4
+    Winkler{S}(dist::S, p, threshold, maxlength) where {S <: SemiMetric} = new(dist, p, threshold, maxlength)
 end
-function Winkler(dist; p = 0.1, threshold = 0.7, maxlength = 4)
+function Winkler(dist::SemiMetric; p = 0.1, threshold = 0.7, maxlength = 4)
     p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
-    Winkler(dist, 0.1, 0.7, 4)
+    Winkler{typeof(normalize(dist))}(normalize(dist), 0.1, 0.7, 4)
 end
+isnormalized(dist::Winkler) = true
-function compare(s1, s2, dist::Winkler; min_score = 0.0)
+
+function evaluate(dist::Winkler, s1, s2, max_dist = 1.0)
     # cannot do min_score because of boosting threshold
-    score = compare(s1, s2, dist.dist)
-    if score >= dist.threshold
+    score = evaluate(dist.dist, s1, s2)
+    if score <= 1 - dist.threshold
         l = common_prefix(s1, s2)[1]
-        score += min(l, dist.maxlength) * dist.p * (1 - score)
+        score -= min(l, dist.maxlength) * dist.p * score
     end
     return score
 end
@@ -88,27 +85,30 @@ julia> compare(s1, s2, Partial(RatcliffObershelp()))
 """
 struct Partial{S <: SemiMetric} <: SemiMetric
     dist::S
+    Partial{S}(dist::S) where {S <: SemiMetric} = new(dist)
 end
+Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
+isnormalized(dist::Partial) = true
-function compare(s1, s2, dist::Partial; min_score = 0.0)
+function evaluate(dist::Partial, s1, s2, max_dist = 1.0)
     s1, s2 = reorder(s1, s2)
     len1, len2 = length(s1), length(s2)
-    len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
-    len1 == 0 && return 1.0
-    out = 0.0
+    len1 == len2 && return evaluate(dist.dist, s1, s2, max_dist)
+    len1 == 0 && return 0.0
+    out = 1.0
     for x in qgrams(s2, len1)
-        curr = compare(s1, x, dist.dist; min_score = min_score)
-        out = max(out, curr)
-        min_score = max(out, min_score)
+        curr = evaluate(dist.dist, s1, x, max_dist)
+        out = min(out, curr)
+        max_dist = min(out, max_dist)
     end
     return out
 end
-function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; min_score = 0.0)
+function evaluate(dist::Partial{RatcliffObershelp}, s1, s2, max_dist = 1.0)
     s1, s2 = reorder(s1, s2)
     len1, len2 = length(s1), length(s2)
-    len1 == len2 && return compare(s1, s2, dist.dist)
-    out = 0.0
+    len1 == len2 && return evaluate(dist.dist, s1, s2)
+    out = 1.0
     for r in matching_blocks(s1, s2)
         # Make sure the substring of s2 has length len1
         s2_start = r[2] - r[1] + 1
@@ -120,10 +120,9 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffO
             s2_start += len2 - s2_end
             s2_end += len2 - s2_end
         end
-        i2_start = nextind(s2, 0, s2_start)
-        i2_end = nextind(s2, 0, s2_end)
-        curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp())
-        out = max(out, curr)
+        curr = evaluate(dist.dist, s1, _slice(s2, s2_start - 1, s2_end))
+
+        out = min(out, curr)
     end
     return out
 end
@@ -147,13 +146,16 @@ julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
 """
 struct TokenSort{S <: SemiMetric} <: SemiMetric
     dist::S
+    TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist)
 end
+TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist))
+isnormalized(dist::TokenSort) = true
 # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
-function compare(s1, s2, dist::TokenSort; min_score = 0.0)
+function evaluate(dist::TokenSort, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
     s1 = join(sort!(split(s1)), " ")
     s2 = join(sort!(split(s2)), " ")
-    compare(s1, s2, dist.dist; min_score = min_score)
+    evaluate(dist.dist, s1, s2, max_dist)
 end
@@ -175,23 +177,26 @@ julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
 """
 struct TokenSet{S <: SemiMetric} <: SemiMetric
     dist::S
+    TokenSet{S}(dist::S) where {S <: SemiMetric} = new(dist)
 end
+TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
+isnormalized(dist::TokenSet) = true
 # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
-function compare(s1, s2, dist::TokenSet; min_score = 0.0)
+function evaluate(dist::TokenSet, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
     v1 = unique!(sort!(split(s1)))
     v2 = unique!(sort!(split(s2)))
     v0 = intersect(v1, v2)
     s0 = join(v0, " ")
     s1 = join(v1, " ")
     s2 = join(v2, " ")
-    isempty(s0) && return compare(s1, s2, dist.dist; min_score = min_score)
-    score_01 = compare(s0, s1, dist.dist; min_score = min_score)
-    min_score = max(min_score, score_01)
-    score_02 = compare(s0, s2, dist.dist; min_score = min_score)
-    min_score = max(min_score, score_02)
-    score_12 = compare(s1, s2, dist.dist; min_score = min_score)
-    max(score_01, score_02, score_12)
+    isempty(s0) && return evaluate(dist.dist, s1, s2, max_dist)
+    score_01 = evaluate(dist.dist, s0, s1, max_dist)
+    max_dist = min(max_dist, score_01)
+    score_02 = evaluate(dist.dist, s0, s2, max_dist)
+    max_dist = min(max_dist, score_02)
+    score_12 = evaluate(dist.dist, s1, s2, max_dist)
+    min(score_01, score_02, score_12)
 end
@@ -214,36 +219,35 @@ julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
 """
 struct TokenMax{S <: SemiMetric} <: SemiMetric
     dist::S
+    TokenMax{S}(dist::S) where {S <: SemiMetric} = new(dist)
 end
-function compare(s1, s2, dist::TokenMax; min_score = 0.0)
+TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
+isnormalized(dist::TokenMax) = true
+
+function evaluate(dist::TokenMax, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
     s1, s2 = reorder(s1, s2)
     len1, len2 = length(s1), length(s2)
-    score = compare(s1, s2, dist.dist; min_score = min_score)
-    min_score = max(min_score, score)
+    score = evaluate(dist.dist, s1, s2, max_dist)
+    min_score = min(max_dist, score)
     unbase_scale = 0.95
     # if one string is much shorter than the other, use partial
     if length(s2) >= 1.5 * length(s1)
         partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
-        score_partial = partial_scale * compare(s1, s2, Partial(dist.dist);
-                        min_score = min_score / partial_scale)
-        min_score = max(min_score, score_partial)
-        score_sort = unbase_scale * partial_scale *
-                        compare(s1, s2, TokenSort(Partial(dist.dist));
-                        min_score = min_score / (unbase_scale * partial_scale))
-        min_score = max(min_score, score_sort)
-        score_set = unbase_scale * partial_scale *
-                        compare(s1, s2, TokenSet(Partial(dist.dist));
-                        min_score = min_score / (unbase_scale * partial_scale))
-        return max(score, score_partial, score_sort, score_set)
+        score_partial = 1 - partial_scale * (1 - evaluate(Partial(dist.dist), s1, s2, 1 - (1 - max_dist) / partial_scale))
+        min_score = min(max_dist, score_partial)
+        score_sort = 1 - unbase_scale * partial_scale *
+                        (1 - evaluate(TokenSort(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
+        max_dist = min(max_dist, score_sort)
+        score_set = 1 - unbase_scale * partial_scale *
+                        (1 - evaluate(TokenSet(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
+        return min(score, score_partial, score_sort, score_set)
     else
-        score_sort = unbase_scale *
-                        compare(s1, s2, TokenSort(dist.dist);
-                        min_score = min_score / unbase_scale)
-        min_score = max(min_score, score_sort)
-        score_set = unbase_scale *
-                        compare(s1, s2, TokenSet(dist.dist);
-                        min_score = min_score / unbase_scale)
-        return max(score, score_sort, score_set)
+        score_sort = 1 - unbase_scale *
+                        (1 - evaluate(TokenSort(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
+        max_dist = min(max_dist, score_sort)
+        score_set = 1 - unbase_scale *
+                        (1 - evaluate(TokenSet(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
+        return min(score, score_sort, score_set)
     end
 end
\ No newline at end of file
diff --git a/src/qgram.jl b/src/qgram.jl
index df4799e..b1265e4 100755
--- a/src/qgram.jl
+++ b/src/qgram.jl
@@ -18,12 +18,15 @@ Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
 #q-grams of AbstractVector
+# Alternatively, I could also use partition in IterTools but it creates a vector for each iteration
+# so it does not seem to be worth it.
 function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
     state + qgram.q - 1 > lastindex(qgram.s) && return nothing
     view(qgram.s, state:(state + qgram.q - 1)), state + 1
 end
 Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
+
 """
 Return an iterator on the q-gram of a string
@@ -120,7 +123,7 @@ struct Cosine <: QGramDistance
     q::Int
 end
-function evaluate(dist::Cosine, s1, s2)
+function evaluate(dist::Cosine, s1, s2, max_dist = nothing)
     (ismissing(s1) | ismissing(s2)) && return missing
     itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
     norm1, norm2, prodnorm = 0, 0, 0
@@ -147,7 +150,7 @@ struct Jaccard <: QGramDistance
     q::Int
 end
-function evaluate(dist::Jaccard, s1, s2)
+function evaluate(dist::Jaccard, s1, s2, max_dist = nothing)
     (ismissing(s1) | ismissing(s2)) && return missing
     itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
     ndistinct1, ndistinct2, nintersect = 0, 0, 0
@@ -174,7 +177,7 @@ struct SorensenDice <: QGramDistance
     q::Int
 end
-function evaluate(dist::SorensenDice, s1, s2)
+function evaluate(dist::SorensenDice, s1, s2, max_dist = nothing)
     (ismissing(s1) | ismissing(s2)) && return missing
     itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
     ndistinct1, ndistinct2, nintersect = 0, 0, 0
@@ -201,7 +204,7 @@ struct Overlap <: QGramDistance
     q::Int
 end
-function evaluate(dist::Overlap, s1, s2)
+function evaluate(dist::Overlap, s1, s2, max_dist = nothing)
     (ismissing(s1) | ismissing(s2)) && return missing
     itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
     ndistinct1, ndistinct2, nintersect = 0, 0, 0
diff --git a/src/utils.jl b/src/utils.jl
index e5e45d5..2ab98e2 100755
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -21,8 +21,6 @@ function reorder(s1, s2)
     (length(s1) <= length(s2)) ? (s1, s2) : (s2, s1)
 end
-
-
 function common_prefix(s1, s2)
     x1 = iterate(s1)
     x2 = iterate(s2)
@@ -36,4 +34,29 @@ function common_prefix(s1, s2)
         x2 = iterate(s2, state2)
     end
     return l, x1, x2
-end
\ No newline at end of file
+end
+
+
+
+function _take(s, n::Integer)
+    Base.Iterators.take(s, n)
+end
+function _take(s::AbstractString, n::Integer)
+    SubString(s, firstindex(s), nextind(s, 0, n))
+end
+
+function _drop(s, n::Integer)
+    Base.Iterators.drop(s, n)
+end
+function _drop(s::AbstractString, n::Integer)
+    SubString(s, nextind(s, 0, n + 1), lastindex(s))
+end
+
+function _slice(s, n1::Integer, n2::Integer)
+    Base.Iterators.take(Base.Iterators.drop(s, n1), n2 - n1)
+end
+function _slice(s::AbstractString, n1::Integer, n2::Integer)
+    SubString(s, nextind(s, 0, n1 + 1), nextind(s, 0, n2))
+end
+
+
diff --git a/test/modifiers.jl b/test/modifiers.jl
index fc62919..17c23b4 100644
--- a/test/modifiers.jl
+++ b/test/modifiers.jl
@@ -24,13 +24,13 @@ using StringDistances, Test
     compare("aüa", "aua", DamerauLevenshtein())

     # Winkler
-    @test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.9611 atol = 1e-4
-    @test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.84 atol = 1e-4
-    @test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.81333 atol = 1e-4
-    @test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.975 atol = 1e-4
-    @test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
-    @test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 1.0 atol = 1e-4
-    @test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
+    @test compare("martha", "marhta", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.9611 atol = 1e-4
+    @test compare("dwayne", "duane", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.84 atol = 1e-4
+    @test compare("dixon", "dicksonx", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.81333 atol = 1e-4
+    @test compare("william", "williams", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.975 atol = 1e-4
+    @test compare("", "foo", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.0 atol = 1e-4
+    @test compare("a", "a", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 1.0 atol = 1e-4
+    @test compare("abc", "xyz", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.0 atol = 1e-4

     # RatcliffObershelp
     @test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0
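
For orientation, here is a minimal usage sketch of the API after this patch, pieced together from the README note, the new `compare` docstring, and the updated tests above. The return values quoted in the comments are the ones given there; the vector call at the end is only an assumption based on the README's "any iterator with a `length` method" wording, not something exercised by this patch's tests.

```julia
using StringDistances

# `compare` still returns a similarity in [0, 1]; per the new definition it forwards to
# `evaluate(normalize(dist), s1, s2, 1 - min_score)` on a normalized distance.
compare("martha", "marhta", Levenshtein())   # 0.6666666666666667 (docstring example)

# Modifiers such as Winkler now normalize the wrapped distance themselves and are
# constructed with keyword arguments (see the updated tests).
compare("martha", "marhta", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4))   # ≈ 0.9611

# Assumption based on the README note: inputs need not be AbstractStrings,
# any iterator with a `length` method should work.
compare([1, 2, 3, 4], [1, 2, 4, 4], Levenshtein())
```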