Simplify the three-argument form

2020-02-13 09:44:27 -05:00 · 2020-02-13 09:44:27 -05:00 · 093c536377
parent 41ccf12e45
commit 093c536377
7 changed files with 43 additions and 40 deletions
--- a/Project.toml
+++ b/Project.toml
@ -8,7 +8,7 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

 [compat]
 julia = "1"
-Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
+Distances = "0.8.1"

 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -8,8 +8,11 @@ include("qgram.jl")
 include("modifiers.jl")

 const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
-Distances.result_type(dist::StringDistance, s1, s2) =  typeof(dist("", ""))
-Distances.evaluate(dist::StringDistance, args...) = dist(args...)
+Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
+
+
+
+
 include("find.jl")

 ##############################################################################
--- a/src/edit.jl
+++ b/src/edit.jl
@ -12,12 +12,11 @@ where ``m`` is the number of matching characters and
 ``t`` is half the number of transpositions.
 """
 struct Jaro <: SemiMetric end
-isnormalized(::Jaro) = true

 ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
 ## accepts any iterator, including AbstractString
-function (dist::Jaro)(s1, s2, max_dist = nothing)
-    (ismissing(s1) | ismissing(s2)) && return missing
+function (dist::Jaro)(s1, s2)
+    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # If both are empty, the formula in Wikipedia gives 0
@ -25,6 +24,7 @@ function (dist::Jaro)(s1, s2, max_dist = nothing)
    len2 == 0 && return 0.0
    maxdist = max(0, div(len2, 2) - 1)
    flag = fill(false, len2)
+    prevstate1 = firstindex(s1)
    ch1_match = Vector{eltype(s1)}(undef, len1)
    #  m counts number matching characters
    m = 0 
@ -55,6 +55,7 @@ function (dist::Jaro)(s1, s2, max_dist = nothing)
        end
        x1 = iterate(s1, state1)
        i1 += 1
+        prevstate1 = state1
    end
    m == 0 && return 1.0
    # t counts number of transpositions
@ -82,11 +83,11 @@ substitutions of a single character) required to change one string into the othe
 struct Levenshtein <: Metric end

 ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
-# Return max_dist +1 if distance higher than max_dist
+# Return max_dist + 1 if distance higher than max_dist
 # This makes it possible to differentiate distance equalt to max_dist vs strictly higher
 # This is important for find_all
 function (dist::Levenshtein)(s1, s2, max_dist = nothing)
-    (ismissing(s1) | ismissing(s2)) && return missing
+    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
@ -125,8 +126,6 @@ function (dist::Levenshtein)(s1, s2, max_dist = nothing)
    return current
 end

-
-
 """
    DamerauLevenshtein()

@ -139,8 +138,9 @@ required to change one string into the other.
 struct DamerauLevenshtein <: SemiMetric end

 ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
+# Return max_dist + 1 if distance higher than max_dist
 function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
-    (ismissing(s1) | ismissing(s2)) && return missing
+    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
@ -223,10 +223,8 @@ region on either side of the longest common subsequence.
 """
 struct RatcliffObershelp <: SemiMetric end

-isnormalized(::RatcliffObershelp) = true
-
-function (dist::RatcliffObershelp)(s1, s2, max_dist = nothing)
-    (ismissing(s1) | ismissing(s2)) && return missing
+function (dist::RatcliffObershelp)(s1, s2)
+    ((s1 === missing) | (s2 === missing)) && return missing
    n_matched = sum(last.(matching_blocks(s1, s2)))
    len1, len2 = length(s1), length(s2)
    len1 + len2 == 0 ? 0. : 1.0 - 2 *  n_matched / (len1 + len2)
--- a/src/modifiers.jl
+++ b/src/modifiers.jl
@ -6,16 +6,12 @@ end

   Normalize a metric, so that `evaluate` always return a Float64 between 0 and 1 (or a `missing` if one element is missing)
 """
-function normalize(dist::SemiMetric)
-    isnormalized(dist) ? dist : Normalize{typeof(dist)}(dist)
-end
-
-isnormalized(dist::SemiMetric) = false
-isnormalized(dist::Normalize) = true
+# Also, a normalized distance always accepts a third argument, max_dist.

+normalize(dist::SemiMetric) = Normalize{typeof(dist)}(dist)

 function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
-    (ismissing(s1) | ismissing(s2)) && return missing
+    ((s1 === missing) | (s2 === missing)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    len2 == 0 && return 1.0
@ -25,7 +21,7 @@ function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, ma
 end

 function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0)
-    (ismissing(s1) | ismissing(s2)) && return missing
+    ((s1 === missing) | (s2 === missing)) && return missing
    # When string length < q for qgram distance, returns s1 == s2
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
@ -37,6 +33,9 @@ function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0)
    end
 end

+function (dist::Normalize)(s1, s2, max_dist = 1.0)
+    dist.dist(s1, s2)
+end

 """
   Winkler(dist; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
@ -55,15 +54,15 @@ struct Winkler{S <: SemiMetric} <: SemiMetric
    maxlength::Integer      # max length of common prefix. Default to 4
    Winkler{S}(dist::S, p, threshold, maxlength) where {S <: SemiMetric} = new(dist, p, threshold, maxlength)
 end
-isnormalized(dist::Winkler) = true

 function Winkler(dist::SemiMetric; p = 0.1, threshold = 0.7, maxlength = 4)
    p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
    Winkler{typeof(normalize(dist))}(normalize(dist), 0.1, 0.7, 4)
 end
+normalize(dist::Winkler) = dist

 function (dist::Winkler)(s1, s2, max_dist = 1.0)
-    # cannot do min_score because of boosting threshold
+    # cannot pass max_dist to the inner distance because of the boosting threshold
    score = dist.dist(s1, s2)
    if score <= 1 - dist.threshold
        l = common_prefix(s1, s2)[1]
@ -94,13 +93,13 @@ struct Partial{S <: SemiMetric} <: SemiMetric
    Partial{S}(dist::S) where {S <: SemiMetric} = new(dist)
 end
 Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
-isnormalized(dist::Partial) = true
+normalize(dist::Partial) = dist

 function (dist::Partial)(s1, s2, max_dist = 1.0)
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    len1 == len2 && return dist.dist(s1, s2, max_dist)
-    len1 == 0 && return 0.0
+    len1 == 0 && return 1.0
    out = 1.0
    for x in qgrams(s2, len1)
        curr = dist.dist(s1, x, max_dist)
@ -110,7 +109,7 @@ function (dist::Partial)(s1, s2, max_dist = 1.0)
    return out
 end

-function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0)
+function (dist::Partial{Normalize{RatcliffObershelp}})(s1, s2, max_dist = 1.0)
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    len1 == len2 && return dist.dist(s1, s2)
@ -127,7 +126,6 @@ function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0)
            s2_end += len2 - s2_end
        end
        curr = dist.dist(s1, _slice(s2, s2_start - 1, s2_end))
-
        out = min(out, curr)
    end
    return out
@ -155,7 +153,7 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric
    TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist)
 end
 TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist))
-isnormalized(dist::TokenSort) = true
+normalize(dist::TokenSort) = dist

 # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
 function (dist::TokenSort)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
@ -187,6 +185,7 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric
 end
 TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
 isnormalized(dist::TokenSet) = true
+normalize(dist::TokenSet) = dist

 # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
 function (dist::TokenSet)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
@ -229,7 +228,7 @@ struct TokenMax{S <: SemiMetric} <: SemiMetric
 end

 TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
-isnormalized(dist::TokenMax) = true
+normalize(dist::TokenMax) = dist

 function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
    s1, s2 = reorder(s1, s2)
--- a/src/qgram.jl
+++ b/src/qgram.jl
@ -97,7 +97,7 @@ struct QGram <: QGramDistance
 end

 function (dist::QGram)(s1, s2)
-	(ismissing(s1) | ismissing(s2)) && return missing
+	((s1 === missing) | (s2 === missing)) && return missing
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	n = 0
 	for (n1, n2) in itr
@ -122,8 +122,8 @@ struct Cosine <: QGramDistance
 	q::Int
 end

-function (dist::Cosine)(s1, s2, max_dist = nothing)
-	(ismissing(s1) | ismissing(s2)) && return missing
+function (dist::Cosine)(s1, s2)
+	((s1 === missing) | (s2 === missing)) && return missing
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	norm1, norm2, prodnorm = 0, 0, 0
 	for (n1, n2) in itr
@ -149,8 +149,8 @@ struct Jaccard <: QGramDistance
 	q::Int
 end

-function (dist::Jaccard)(s1, s2, max_dist = nothing)
-	(ismissing(s1) | ismissing(s2)) && return missing
+function (dist::Jaccard)(s1, s2)
+	((s1 === missing) | (s2 === missing)) && return missing
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
 	for (n1, n2) in itr
@ -176,8 +176,8 @@ struct SorensenDice <: QGramDistance
 	q::Int
 end

-function (dist::SorensenDice)(s1, s2, max_dist = nothing)
-	(ismissing(s1) | ismissing(s2)) && return missing
+function (dist::SorensenDice)(s1, s2)
+	((s1 === missing) | (s2 === missing)) && return missing
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
 	for (n1, n2) in itr
@ -203,8 +203,8 @@ struct Overlap <: QGramDistance
 	q::Int
 end

-function (dist::Overlap)(s1, s2, max_dist = nothing)
-	(ismissing(s1) | ismissing(s2)) && return missing
+function (dist::Overlap)(s1, s2)
+	((s1 === missing) | (s2 === missing)) && return missing
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
 	for (n1, n2) in itr
--- a/src/utils.jl
+++ b/src/utils.jl
@ -23,6 +23,8 @@ struct StringWithLength{T <: AbstractString} <: AbstractString
    l::Int
 end
 string_with_length(s::AbstractString) = StringWithLength(s, length(s))
+# Not strictly needed, but avoids wrapping a StringWithLength inside another one
+string_with_length(s::StringWithLength) = s
 Base.length(s::StringWithLength) = s.l
 Base.iterate(s::StringWithLength, i::Integer = firstindex(s.s)) = iterate(s.s, i)
 Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n)
--- a/test/modifiers.jl
+++ b/test/modifiers.jl
@ -37,6 +37,7 @@ using StringDistances, Unicode, Test
 	@test round(Int, 100 * compare("为人子女者要堂堂正正做人，千万不可作奸犯科，致使父母蒙羞", "此前稍早些时候中国商务部发布消息称，中美经贸高级别磋商双方牵头人通话，中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
 	compare("aüa", "aua", TokenMax(RatcliffObershelp()))

+	@test compare("New York Yankees",  "", Partial(Jaro())) ≈ 0.0
 	@test compare("New York Yankees",  "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
 	@test compare("New York Yankees",  "", Partial(RatcliffObershelp())) ≈ 0.0
 	@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444