allow any iterator in. Define evaluate for modifiers.

2020-02-09 13:37:37 -05:00 · 2020-02-09 13:37:37 -05:00 · 5cbbfc5bde
parent a949f7bd62
commit 5cbbfc5bde
7 changed files with 157 additions and 111 deletions
--- a/README.md
+++ b/README.md
@ -10,9 +10,11 @@ The package is registered in the [`General`](https://github.com/JuliaRegistries/
 The function `compare` returns a similarity score between two strings. The function always returns a score between 0 and 1, with a value of 0 being completely different and a value of 1 being completely similar. Its syntax is:

 ```julia
-compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
+compare(s1, s2, dist::StringDistance)
 ```

+where `s1` and `s2` can be any iterator with a `length` method (e.g. `AbstractString`, `GraphemeIterator`, `AbstractVector`...).
+
 - Edit Distances
 	- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
 	- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -3,24 +3,37 @@ module StringDistances
 using Distances
 import Distances: evaluate, result_type

+isnormalized(dist::SemiMetric) = false
+

 include("utils.jl")
 include("edit.jl")
 include("qgram.jl")
-include("compare.jl")
+include("modifier.jl")
+
 const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax}
-include("find.jl")

-##############################################################################
-##
-## Distances API
-##
-##############################################################################
+"""
+    compare(s1, s2, dist)

+return a similarity score between 0 and 1 for the strings `s1` and 
+`s2` based on the distance `dist`.
+
+### Examples
+```julia-repl
+julia> compare("martha", "marhta", Levenshtein())
+0.6666666666666667
+```
+"""
+function compare(s1, s2, dist::StringDistance; min_score = 0.0)
+	1 - evaluate(normalize(dist), s1, s2, 1 - min_score)
+end
+
+# distance API
 function result_type(dist::StringDistance, s1, s2)
    typeof(evaluate(dist, "", ""))
 end
-
+include("find.jl")

 ##############################################################################
 ##
--- a/src/edit.jl
+++ b/src/edit.jl
@ -12,10 +12,11 @@ where ``m`` is the number of matching characters and
 ``t`` is half the number of transpositions.
 """
 struct Jaro <: SemiMetric end
+isnormalized(::Jaro) = true

 ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
 ## accepts any iterator, including AbstractString
-function evaluate(dist::Jaro, s1, s2)
+function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
    (ismissing(s1) | ismissing(s2)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
@ -87,7 +88,7 @@ struct Levenshtein <: Metric end
 # This makes it possible to differentiate a distance equal to max_dist from one strictly higher
 # This is important for find_all
 ## accepts any iterator, including AbstractString
-function evaluate(dist::Levenshtein, s1, s2; max_dist = nothing)
+function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
    (ismissing(s1) | ismissing(s2)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
@ -142,7 +143,7 @@ struct DamerauLevenshtein <: SemiMetric end

 ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
 ## accepts any iterator, including AbstractString
-function evaluate(dist::DamerauLevenshtein, s1, s2; max_dist = nothing)
+function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
    (ismissing(s1) | ismissing(s2)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
@ -226,20 +227,20 @@ region on either side of the longest common subsequence.
 """
 struct RatcliffObershelp <: SemiMetric end

-evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
-evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
+isnormalized(::RatcliffObershelp) = true

-function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
+function evaluate(dist::RatcliffObershelp, s1, s2, max_dist = nothing)
+    (ismissing(s1) | ismissing(s2)) && return missing
    n_matched = sum(last.(matching_blocks(s1, s2)))
    len1, len2 = length(s1), length(s2)
    len1 + len2 == 0 ? 0. : 1.0 - 2 *  n_matched / (len1 + len2)
 end

-function matching_blocks(s1::AbstractString, s2::AbstractString)
+function matching_blocks(s1, s2)
    matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1)
 end

-function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, 
+function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, 
    len1::Integer, len2::Integer, start1::Integer, start2::Integer)
    a = longest_common_pattern(s1, s2, len1 , len2)
    # exit if there is no common substring
@ -247,18 +248,18 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
    # add the info of the common to the existing set
    push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
    # add the longest common substring that happens before
-    s1before = SubString(s1, 1, nextind(s1, 0, a[1] - 1))
-    s2before = SubString(s2, 1, nextind(s2, 0, a[2] - 1))
+    s1before = _take(s1, a[1] - 1)
+    s2before = _take(s2, a[2] - 1)
    matching_blocks!(x, s1before, s2before, a[1] - 1, a[2] - 1, start1, start2)
    # add the longest common substring that happens after
-    s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
-    s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))
+    s1after = _drop(s1, a[1] + a[3] - 1)
+    s2after = _drop(s2, a[2] + a[3] - 1)
    matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1, 
        len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
    return x
 end

-function longest_common_pattern(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
+function longest_common_pattern(s1, s2, len1::Integer, len2::Integer)
    if len1 > len2
        start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
    else
--- a/src/modifier.jl
+++ b/src/modifier.jl
@ -1,42 +1,36 @@
-"""
-    compare(s1, s2, dist)
-
-return a similarity score between 0 and 1 for the strings `s1` and 
-`s2` based on the distance `dist`.
-
-### Examples
-```julia-repl
-julia> compare("martha", "marhta", Levenshtein())
-0.6666666666666667
-```
-"""
-function compare(s1, s2, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
-    1.0 - evaluate(dist, s1, s2)
+struct Normalize{S <: SemiMetric} <: SemiMetric
+    dist::S
 end
+function normalize(dist::SemiMetric)
+    isnormalized(dist) ? dist : Normalize{typeof(dist)}(dist)
+end
+isnormalized(dist::Normalize) = true

-function compare(s1, s2,  dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0)
+
+# Normalized Levenshtein / Damerau-Levenshtein distance: the raw edit distance divided by
+# the length of the longer input, so the result lies in [0, 1].
+# `max_dist` is an upper bound of interest; any normalized distance above it is reported as 1.0.
+# Returns `missing` when either input is `missing`.
+function evaluate(dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}}, s1, s2, max_dist = 1.0)
    (ismissing(s1) | ismissing(s2)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
-    len2 == 0 && return 1.0
+    # after reorder, len2 is the longer length: len2 == 0 means both inputs are empty,
+    # i.e. identical, so the distance is 0 (the former `compare` returned similarity 1)
+    len2 == 0 && return 0.0
-    d = evaluate(dist, s1, s2; max_dist = ceil(Int, len2 * (1 - min_score)))
-    out = 1.0 - d / len2
-    out < min_score ? 0.0 : out
+    # convert the normalized bound back to an integer edit-distance bound for the raw metric
+    d = evaluate(dist.dist, s1, s2, ceil(Int, len2 * max_dist))
+    out = d / len2
+    out > max_dist ? 1.0 : out
 end

-function compare(s1, s2, dist::QGramDistance; min_score = 0.0)
+function evaluate(dist::Normalize{<: QGramDistance}, s1, s2, max_dist = 1.0)
    (ismissing(s1) | ismissing(s2)) && return missing
    # When string length < q for qgram distance, returns s1 == s2
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
-    len1 <= dist.q - 1 && return convert(Float64, s1 == s2)
-    if typeof(dist) <: QGram
-        1.0 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
+    len1 <= dist.dist.q - 1 && return convert(Float64, !(s1 == s2))
+    if typeof(dist.dist) <: QGram
+        evaluate(dist.dist, s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
    else
-        1.0 - evaluate(dist, s1, s2)
+        evaluate(dist.dist, s1, s2)
    end
 end

+
 """
   Winkler(dist; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)

@ -52,19 +46,22 @@ struct Winkler{S <: SemiMetric} <: SemiMetric
    p::Float64          # scaling factor. Default to 0.1
    threshold::Float64  # boost threshold. Default to 0.7
    maxlength::Integer      # max length of common prefix. Default to 4
+    Winkler{S}(dist::S, p, threshold, maxlength) where {S <: SemiMetric} = new(dist, p, threshold, maxlength)
 end

-function Winkler(dist; p = 0.1, threshold = 0.7, maxlength = 4)
+# Keyword constructor: wraps the normalized version of `dist` in a Winkler modifier.
+# `p` is the scaling factor, `threshold` the boost threshold and `maxlength` the maximum
+# length of the common prefix taken into account. Throws if `p * maxlength > 1`.
+function Winkler(dist::SemiMetric; p = 0.1, threshold = 0.7, maxlength = 4)
    p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
-    Winkler(dist, 0.1, 0.7, 4)
+    # forward the keyword values; passing the literal defaults here would silently
+    # discard whatever the caller asked for
+    Winkler{typeof(normalize(dist))}(normalize(dist), p, threshold, maxlength)
 end
+isnormalized(dist::Winkler) = true

-function compare(s1, s2, dist::Winkler; min_score = 0.0)
+
+function evaluate(dist::Winkler, s1, s2, max_dist = 1.0)
    # cannot use max_dist to prune the inner distance because of the boosting threshold
-    score = compare(s1, s2, dist.dist)
-    if score >= dist.threshold
+    score = evaluate(dist.dist, s1, s2)
+    if score <= 1 - dist.threshold
        l = common_prefix(s1, s2)[1]
-        score += min(l, dist.maxlength) * dist.p * (1 - score)
+        score -= min(l, dist.maxlength) * dist.p * score
    end
    return score
 end
@ -88,27 +85,30 @@ julia> compare(s1, s2, Partial(RatcliffObershelp()))
 """
 struct Partial{S <: SemiMetric} <: SemiMetric
    dist::S
+    Partial{S}(dist::S) where {S <: SemiMetric} = new(dist)
 end
+Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
+isnormalized(dist::Partial) = true

-function compare(s1, s2, dist::Partial; min_score = 0.0)
+function evaluate(dist::Partial, s1, s2, max_dist = 1.0)
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
-    len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
-    len1 == 0 && return 1.0
-    out = 0.0
+    len1 == len2 && return evaluate(dist.dist, s1, s2, max_dist)
+    len1 == 0 && return 0.0
+    out = 1.0
    for x in qgrams(s2, len1)
-        curr = compare(s1, x, dist.dist; min_score = min_score)
-        out = max(out, curr)
-        min_score = max(out, min_score)
+        curr = evaluate(dist.dist, s1, x, max_dist)
+        out = min(out, curr)
+        max_dist = min(out, max_dist)
    end
    return out
 end

-function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; min_score = 0.0)
+function evaluate(dist::Partial{RatcliffObershelp}, s1, s2, max_dist = 1.0)
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
-    len1 == len2 && return compare(s1, s2, dist.dist)
-    out = 0.0
+    len1 == len2 && return evaluate(dist.dist, s1, s2)
+    out = 1.0
    for r in matching_blocks(s1, s2)
        # Make sure the substring of s2 has length len1
        s2_start = r[2] - r[1] + 1
@ -120,10 +120,9 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffO
            s2_start += len2 - s2_end
            s2_end += len2 - s2_end
        end
-        i2_start = nextind(s2, 0, s2_start)
-        i2_end = nextind(s2, 0, s2_end)
-        curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp())
-        out = max(out, curr)
+        curr = evaluate(dist.dist, s1, _slice(s2, s2_start - 1, s2_end))
+
+        out = min(out, curr)
    end
    return out
 end
@ -147,13 +146,16 @@ julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
 """
 struct TokenSort{S <: SemiMetric} <: SemiMetric
    dist::S
+    TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist)
 end
+TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist))
+isnormalized(dist::TokenSort) = true

 # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
-function compare(s1, s2, dist::TokenSort; min_score = 0.0)
+function evaluate(dist::TokenSort, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
    s1 = join(sort!(split(s1)), " ")
    s2 = join(sort!(split(s2)), " ")
-    compare(s1, s2, dist.dist; min_score = min_score)
+    evaluate(dist.dist, s1, s2, max_dist)
 end


@ -175,23 +177,26 @@ julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
 """
 struct TokenSet{S <: SemiMetric} <: SemiMetric
    dist::S
+    TokenSet{S}(dist::S) where {S <: SemiMetric} = new(dist)
 end
+TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
+isnormalized(dist::TokenSet) = true

 # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
-function compare(s1, s2, dist::TokenSet; min_score = 0.0)
+function evaluate(dist::TokenSet, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
    v1 = unique!(sort!(split(s1)))
    v2 = unique!(sort!(split(s2)))
    v0 = intersect(v1, v2)
    s0 = join(v0, " ")
    s1 = join(v1, " ")
    s2 = join(v2, " ")
-    isempty(s0) && return compare(s1, s2, dist.dist; min_score = min_score)
-    score_01 = compare(s0, s1, dist.dist; min_score = min_score)
-    min_score = max(min_score, score_01)
-    score_02 = compare(s0, s2, dist.dist; min_score = min_score)
-    min_score = max(min_score, score_02)
-    score_12 = compare(s1, s2, dist.dist; min_score = min_score)
-    max(score_01, score_02, score_12)
+    isempty(s0) && return evaluate(dist.dist, s1, s2, max_dist)
+    score_01 = evaluate(dist.dist, s0, s1, max_dist)
+    max_dist = min(max_dist, score_01)
+    score_02 = evaluate(dist.dist, s0, s2, max_dist)
+    max_dist = min(max_dist, score_02)
+    score_12 = evaluate(dist.dist, s1, s2, max_dist)
+    min(score_01, score_02, score_12)
 end


@ -214,36 +219,35 @@ julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
 """
 struct TokenMax{S <: SemiMetric} <: SemiMetric
    dist::S
+    TokenMax{S}(dist::S) where {S <: SemiMetric} = new(dist)
 end

-function compare(s1, s2, dist::TokenMax; min_score = 0.0)
+TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
+isnormalized(dist::TokenMax) = true
+
+# TokenMax distance: the minimum of the base distance and scaled versions of the
+# Partial / TokenSort / TokenSet distances. `max_dist` is tightened after each candidate
+# so later evaluations can stop early once they cannot beat the best distance so far.
+function evaluate(dist::TokenMax, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
-    score = compare(s1, s2, dist.dist; min_score = min_score)
-    min_score = max(min_score, score)
+    score = evaluate(dist.dist, s1, s2, max_dist)
+    # tighten the bound itself (this used to be written to an unused `min_score` local)
+    max_dist = min(max_dist, score)
    unbase_scale = 0.95
    # if one string is much shorter than the other, use partial
    if length(s2) >= 1.5 * length(s1)
        partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
-        score_partial = partial_scale * compare(s1, s2, Partial(dist.dist); 
-                                        min_score = min_score / partial_scale) 
-        min_score = max(min_score, score_partial)
-        score_sort = unbase_scale * partial_scale * 
-                compare(s1, s2, TokenSort(Partial(dist.dist)); 
-                            min_score = min_score / (unbase_scale * partial_scale))
-        min_score = max(min_score, score_sort)
-        score_set = unbase_scale * partial_scale * 
-                compare(s1, s2, TokenSet(Partial(dist.dist)); 
-                            min_score = min_score / (unbase_scale * partial_scale)) 
-        return max(score, score_partial, score_sort, score_set)
+        score_partial = 1 - partial_scale * (1 - evaluate(Partial(dist.dist), s1, s2, 1 - (1 - max_dist) / partial_scale))
+        # same fix as above: update the bound, not a dead variable
+        max_dist = min(max_dist, score_partial)
+        score_sort = 1 - unbase_scale * partial_scale * 
+                (1 - evaluate(TokenSort(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
+        max_dist = min(max_dist, score_sort)
+        score_set = 1 - unbase_scale * partial_scale * 
+                (1 - evaluate(TokenSet(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale))) 
+        return min(score, score_partial, score_sort, score_set)
    else
-        score_sort = unbase_scale * 
-                compare(s1, s2, TokenSort(dist.dist); 
-                            min_score = min_score / unbase_scale)
-        min_score = max(min_score, score_sort)
-        score_set = unbase_scale * 
-                compare(s1, s2, TokenSet(dist.dist); 
-                            min_score = min_score / unbase_scale) 
-        return max(score, score_sort, score_set)
+        score_sort = 1 - unbase_scale * 
+                (1 - evaluate(TokenSort(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
+        max_dist = min(max_dist, score_sort)
+        score_set = 1 - unbase_scale * 
+                (1 - evaluate(TokenSet(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
+        return min(score, score_sort, score_set)
    end
 end
--- a/src/qgram.jl
+++ b/src/qgram.jl
@ -18,12 +18,15 @@ Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}


 #q-grams of AbstractVector
+# Alternatively, I could also use partition in IterTools but it creates a vector for each iteration
+# so it does not seem to be worth it.
 function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
 	state + qgram.q - 1 > lastindex(qgram.s) && return nothing
 	view(qgram.s, state:(state + qgram.q - 1)), state + 1
 end
 Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))

+
 """
 Return an iterator on the q-gram of a string

@ -120,7 +123,7 @@ struct Cosine <: QGramDistance
 	q::Int
 end

-function evaluate(dist::Cosine, s1, s2)
+function evaluate(dist::Cosine, s1, s2, max_dist = nothing)
 	(ismissing(s1) | ismissing(s2)) && return missing
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	norm1, norm2, prodnorm = 0, 0, 0
@ -147,7 +150,7 @@ struct Jaccard <: QGramDistance
 	q::Int
 end

-function evaluate(dist::Jaccard, s1, s2)
+function evaluate(dist::Jaccard, s1, s2, max_dist = nothing)
 	(ismissing(s1) | ismissing(s2)) && return missing
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
@ -174,7 +177,7 @@ struct SorensenDice <: QGramDistance
 	q::Int
 end

-function evaluate(dist::SorensenDice, s1, s2)
+function evaluate(dist::SorensenDice, s1, s2, max_dist = nothing)
 	(ismissing(s1) | ismissing(s2)) && return missing
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
@ -201,7 +204,7 @@ struct Overlap <: QGramDistance
 	q::Int
 end

-function evaluate(dist::Overlap, s1, s2)
+function evaluate(dist::Overlap, s1, s2, max_dist = nothing)
 	(ismissing(s1) | ismissing(s2)) && return missing
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
--- a/src/utils.jl
+++ b/src/utils.jl
@ -21,8 +21,6 @@ function reorder(s1, s2)
    (length(s1) <= length(s2)) ? (s1, s2) : (s2, s1)
 end

-
-
 function common_prefix(s1, s2)
    x1 = iterate(s1)
    x2 = iterate(s2)
@ -36,4 +34,29 @@ function common_prefix(s1, s2)
        x2 = iterate(s2, state2)
    end
    return l, x1, x2
-end
+end
+
+
+
+function _take(s, n::Integer)
+    Base.Iterators.take(s, n)
+end
+function _take(s::AbstractString, n::Integer)
+   SubString(s, firstindex(s), nextind(s, 0, n))
+end
+
+function _drop(s, n::Integer)
+    Base.Iterators.drop(s, n)
+end
+function _drop(s::AbstractString, n::Integer)
+   SubString(s, nextind(s, 0, n + 1), lastindex(s))
+end
+
+function _slice(s, n1::Integer, n2::Integer)
+    Base.Iterators.take(Base.Iterators.drop(s, n1), n2 - n1)
+end
+function _slice(s::AbstractString, n1::Integer, n2::Integer)
+   SubString(s, nextind(s, 0, n1 + 1),  nextind(s, 0, n2))
+end
+
+
--- a/test/modifiers.jl
+++ b/test/modifiers.jl
@ -24,13 +24,13 @@ using StringDistances, Test
 	compare("aüa", "aua", DamerauLevenshtein())

 	# Winkler
-	@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.9611 atol = 1e-4
-	@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.84 atol = 1e-4
-	@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.81333 atol = 1e-4
-	@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.975 atol = 1e-4
-	@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
-	@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 1.0 atol = 1e-4
-	@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
+	@test compare("martha", "marhta", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.9611 atol = 1e-4
+	@test compare("dwayne", "duane", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.84 atol = 1e-4
+	@test compare("dixon", "dicksonx", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.81333 atol = 1e-4
+	@test compare("william", "williams", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.975 atol = 1e-4
+	@test compare("", "foo", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.0 atol = 1e-4
+	@test compare("a", "a", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 1.0 atol = 1e-4
+	@test compare("abc", "xyz", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.0 atol = 1e-4

 	# RatcliffObershelp
 	@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp())  ≈ 0.0