add functors

matthieugomez 2020-02-12 09:41:46 -05:00
parent 1ccef94f1a
commit 4806349088
7 changed files with 65 additions and 47 deletions
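
In short, the change makes each distance type callable (a functor): the computation moves into the call operator `(dist::SomeDistance)(s1, s2, ...)`, and `Distances.evaluate` becomes a one-line forwarder. A minimal, self-contained sketch of the pattern (the `MyDist` type and its Hamming-style count are illustrative only, not part of the package):

```julia
# Sketch of the functor pattern adopted in this commit (illustrative only).
struct MyDist end

# The computation lives in the call operator ...
(dist::MyDist)(s1, s2) = count(((a, b),) -> a != b, zip(s1, s2))

# ... and `evaluate` simply forwards to it, mirroring
# `Distances.evaluate(dist::StringDistance, args...) = dist(args...)` below.
evaluate(dist::MyDist, args...) = dist(args...)

MyDist()("martha", "marhta")            # 2
evaluate(MyDist(), "martha", "marhta")  # 2, via the forwarder
```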


@@ -50,6 +50,12 @@ evaluate(TokenMax(RatcliffObershelp()), "martha", "marhta")
A good distance to match strings composed of multiple words (like addresses) is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).
Alternatively, each distance struct can be used as a callable, which is equivalent to calling `evaluate` with that metric or modified metric, for example:
```julia
Jaro()("martha", "marhta")
Winkler(Jaro())("martha", "marhta")
QGram(2)("martha", "marhta")
```
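With this change, the callable form and `evaluate` are interchangeable; a quick hedged check (expected to hold, not re-run here):
```julia
# Both spellings should return the same value, since `evaluate` now forwards
# to the functor call.
evaluate(Jaro(), "martha", "marhta") == Jaro()("martha", "marhta")  # expected: true
```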
## Compare
The function `compare` is defined as 1 minus the normalized distance between two strings. It always returns a number between 0 and 1: a value of 0 means completely different and a value of 1 means completely similar.
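As a hedged illustration of that definition (the value matches the `compare` docstring further down in this diff):
```julia
# Levenshtein("martha", "marhta") is 2 edits; divided by the longer length (6 here)
# that is a normalized distance of 1/3, so compare returns 1 - 1/3.
compare("martha", "marhta", Levenshtein())  # 0.6666666666666667
```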
```julia


@@ -1,16 +1,15 @@
module StringDistances
using Distances
import Distances: evaluate, result_type
include("utils.jl")
include("edit.jl")
include("qgram.jl")
include("modifiers.jl")
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax}
result_type(dist::StringDistance, s1, s2) = typeof(evaluate(dist, "", ""))
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
Distances.evaluate(dist::StringDistance, args...) = dist(args...)
include("find.jl")
##############################################################################


@@ -16,7 +16,7 @@ isnormalized(::Jaro) = true
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
## accepts any iterator, including AbstractString
function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
function (dist::Jaro)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@@ -85,7 +85,7 @@ struct Levenshtein <: Metric end
# Return max_dist + 1 if the distance is higher than max_dist
# This makes it possible to differentiate a distance equal to max_dist from one strictly higher
# This is important for find_all
function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
function (dist::Levenshtein)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
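The comments above describe an early-exit convention; a hedged sketch of what it implies (values taken from that comment, not re-verified against this revision):
```julia
# "abcd" → "wxyz" needs 4 substitutions; with max_dist = 1 the call is expected
# to stop early and return max_dist + 1 = 2 instead of the full distance.
Levenshtein()("abcd", "wxyz", 1)  # expected: 2
Levenshtein()("abcd", "wxyz")     # expected: 4 (no cap)
```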
@@ -139,7 +139,7 @@ required to change one string into the other.
struct DamerauLevenshtein <: SemiMetric end
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@@ -225,7 +225,7 @@ struct RatcliffObershelp <: SemiMetric end
isnormalized(::RatcliffObershelp) = true
function evaluate(dist::RatcliffObershelp, s1, s2, max_dist = nothing)
function (dist::RatcliffObershelp)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
n_matched = sum(last.(matching_blocks(s1, s2)))
len1, len2 = length(s1), length(s2)


@@ -10,7 +10,7 @@ julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
compare(s1, s2, dist::StringDistance; min_score = 0.0) = 1 - evaluate(normalize(dist), s1, s2, 1 - min_score)
compare(s1, s2, dist::StringDistance; min_score = 0.0) = 1 - normalize(dist)(s1, s2, 1 - min_score)
"""


@@ -14,26 +14,26 @@ isnormalized(dist::SemiMetric) = false
isnormalized(dist::Normalize) = true
function evaluate(dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}}, s1, s2, max_dist = 1.0)
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 == 0 && return 1.0
d = evaluate(dist.dist, s1, s2, ceil(Int, len2 * max_dist))
d = dist.dist(s1, s2, ceil(Int, len2 * max_dist))
out = d / len2
out > max_dist ? 1.0 : out
end
function evaluate(dist::Normalize{<: QGramDistance}, s1, s2, max_dist = 1.0)
function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0)
(ismissing(s1) | ismissing(s2)) && return missing
# When the shorter string has length < q, the normalized q-gram distance reduces to s1 == s2 (0.0 if equal, 1.0 otherwise)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 <= dist.dist.q - 1 && return convert(Float64, s1 != s2)
if typeof(dist.dist) <: QGram
evaluate(dist.dist, s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
dist.dist(s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
else
evaluate(dist.dist, s1, s2)
dist.dist(s1, s2)
end
end
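For reference, the denominator `len1 + len2 - 2 * dist.dist.q + 2` in the `QGram` branch above equals `(len1 - q + 1) + (len2 - q + 1)`, the total number of q-grams in the two strings, which bounds the raw distance and keeps the ratio in [0, 1]. A hedged worked example:
```julia
# "martha" and "marhta" each contain 5 bigrams, so the denominator is 10.
# The raw QGram(2) distance is 6 (bigrams rt, th, ha, rh, ht, ta are unshared),
# so the normalized distance is 6/10 = 0.6 and compare returns 1 - 0.6.
compare("martha", "marhta", QGram(2))  # expected: 0.4
```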
@@ -55,17 +55,16 @@ struct Winkler{S <: SemiMetric} <: SemiMetric
maxlength::Integer # max length of common prefix. Default to 4
Winkler{S}(dist::S, p, threshold, maxlength) where {S <: SemiMetric} = new(dist, p, threshold, maxlength)
end
isnormalized(dist::Winkler) = true
function Winkler(dist::SemiMetric; p = 0.1, threshold = 0.7, maxlength = 4)
p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
Winkler{typeof(normalize(dist))}(normalize(dist), p, threshold, maxlength)
end
isnormalized(dist::Winkler) = true
function evaluate(dist::Winkler, s1, s2, max_dist = 1.0)
function (dist::Winkler)(s1, s2, max_dist = 1.0)
# cannot do min_score because of boosting threshold
score = evaluate(dist.dist, s1, s2)
score = dist.dist(s1, s2)
if score <= 1 - dist.threshold
l = common_prefix(s1, s2)[1]
score -= min(l, dist.maxlength) * dist.p * score
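The boosting step above can be traced by hand for a classic pair (hedged, worked from the formula shown rather than re-run):
```julia
# Jaro distance for "martha"/"marhta" is 1/18 ≈ 0.0556, below 1 - threshold = 0.3,
# so with common prefix "mar" (l = 3): score = 0.0556 - 3 * 0.1 * 0.0556 ≈ 0.0389,
# i.e. a Jaro-Winkler similarity of about 0.9611.
Winkler(Jaro())("martha", "marhta")  # expected: ≈ 0.0389
```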
@@ -97,24 +96,24 @@ end
Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::Partial) = true
function evaluate(dist::Partial, s1, s2, max_dist = 1.0)
function (dist::Partial)(s1, s2, max_dist = 1.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return evaluate(dist.dist, s1, s2, max_dist)
len1 == len2 && return dist.dist(s1, s2, max_dist)
len1 == 0 && return 0.0
out = 1.0
for x in qgrams(s2, len1)
curr = evaluate(dist.dist, s1, x, max_dist)
curr = dist.dist(s1, x, max_dist)
out = min(out, curr)
max_dist = min(out, max_dist)
end
return out
end
function evaluate(dist::Partial{RatcliffObershelp}, s1, s2, max_dist = 1.0)
function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return evaluate(dist.dist, s1, s2)
len1 == len2 && return dist.dist(s1, s2)
out = 1.0
for r in matching_blocks(s1, s2)
# Make sure the substring of s2 has length len1
@@ -127,7 +126,7 @@ function evaluate(dist::Partial{RatcliffObershelp}, s1, s2, max_dist = 1.0)
s2_start += len2 - s2_end
s2_end += len2 - s2_end
end
curr = evaluate(dist.dist, s1, _slice(s2, s2_start - 1, s2_end))
curr = dist.dist(s1, _slice(s2, s2_start - 1, s2_end))
out = min(out, curr)
end
@@ -159,10 +158,10 @@ TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist)
isnormalized(dist::TokenSort) = true
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function evaluate(dist::TokenSort, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
function (dist::TokenSort)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
s1 = join(sort!(split(s1)), " ")
s2 = join(sort!(split(s2)), " ")
evaluate(dist.dist, s1, s2, max_dist)
dist.dist(s1, s2, max_dist)
end
@@ -190,19 +189,19 @@ TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::TokenSet) = true
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function evaluate(dist::TokenSet, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
function (dist::TokenSet)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2)))
v0 = intersect(v1, v2)
s0 = join(v0, " ")
s1 = join(v1, " ")
s2 = join(v2, " ")
isempty(s0) && return evaluate(dist.dist, s1, s2, max_dist)
score_01 = evaluate(dist.dist, s0, s1, max_dist)
isempty(s0) && return dist.dist(s1, s2, max_dist)
score_01 = dist.dist(s0, s1, max_dist)
max_dist = min(max_dist, score_01)
score_02 = evaluate(dist.dist, s0, s2, max_dist)
score_02 = dist.dist(s0, s2, max_dist)
max_dist = min(max_dist, score_02)
score_12 = evaluate(dist.dist, s1, s2, max_dist)
score_12 = dist.dist(s1, s2, max_dist)
min(score_01, score_02, score_12)
end
@@ -232,29 +231,29 @@ end
TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::TokenMax) = true
function evaluate(dist::TokenMax, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
score = evaluate(dist.dist, s1, s2, max_dist)
score = dist.dist(s1, s2, max_dist)
min_score = min(max_dist, score)
unbase_scale = 0.95
# if one string is much shorter than the other, use partial
if length(s2) >= 1.5 * length(s1)
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
score_partial = 1 - partial_scale * (1 - evaluate(Partial(dist.dist), s1, s2, 1 - (1 - max_dist) / partial_scale))
score_partial = 1 - partial_scale * (1 - Partial(dist.dist)(s1, s2, 1 - (1 - max_dist) / partial_scale))
min_score = min(max_dist, score_partial)
score_sort = 1 - unbase_scale * partial_scale *
(1 - evaluate(TokenSort(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
(1 - TokenSort(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
max_dist = min(max_dist, score_sort)
score_set = 1 - unbase_scale * partial_scale *
(1 - evaluate(TokenSet(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
(1 - TokenSet(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
return min(score, score_partial, score_sort, score_set)
else
score_sort = 1 - unbase_scale *
(1 - evaluate(TokenSort(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
(1 - TokenSort(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
max_dist = min(max_dist, score_sort)
score_set = 1 - unbase_scale *
(1 - evaluate(TokenSet(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
(1 - TokenSet(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
return min(score, score_sort, score_set)
end
end


@@ -45,9 +45,6 @@ qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s,
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
abstract type QGramDistance <: SemiMetric end
# For two iterators x1 and x2 that define a length and eltype method,
# this returns a dictionary which, for each element in x1 or x2,
# returns a tuple with the numbers of times it appears in x1 and x2
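A hedged illustration of that comment (key and value types are assumptions; only the counting behaviour is taken from the comment):
```julia
# count_map and qgrams may not be exported, so they are qualified with the module name.
# For bigrams of "abcb" vs "abd", each q-gram should map to (count in x1, count in x2):
StringDistances.count_map(StringDistances.qgrams("abcb", 2), StringDistances.qgrams("abd", 2))
# expected shape: "ab" => (1, 1), "bc" => (1, 0), "cb" => (1, 0), "bd" => (0, 1)
```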
@@ -80,6 +77,9 @@ function count_map(s1, s2)
return d
end
abstract type QGramDistance <: SemiMetric end
"""
QGram(q::Int)
@@ -96,7 +96,7 @@ struct QGram <: QGramDistance
q::Int
end
function evaluate(dist::QGram, s1, s2)
function (dist::QGram)(s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
n = 0
@@ -122,7 +122,7 @@ struct Cosine <: QGramDistance
q::Int
end
function evaluate(dist::Cosine, s1, s2, max_dist = nothing)
function (dist::Cosine)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
norm1, norm2, prodnorm = 0, 0, 0
@@ -149,7 +149,7 @@ struct Jaccard <: QGramDistance
q::Int
end
function evaluate(dist::Jaccard, s1, s2, max_dist = nothing)
function (dist::Jaccard)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
@@ -176,7 +176,7 @@ struct SorensenDice <: QGramDistance
q::Int
end
function evaluate(dist::SorensenDice, s1, s2, max_dist = nothing)
function (dist::SorensenDice)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
@@ -203,7 +203,7 @@ struct Overlap <: QGramDistance
q::Int
end
function evaluate(dist::Overlap, s1, s2, max_dist = nothing)
function (dist::Overlap)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0


@@ -9,11 +9,15 @@ using StringDistances, Unicode, Test
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
@test evaluate(Jaro(), [1, 2, 3], [1,2, 4]) ≈ 0.2222222222222222
@test evaluate(Jaro(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Jaro(), "alborgów", "amoniak")
@test Jaro()(" vs an", "es an ") ≈ 0.2777777777777777
@test result_type(Jaro(), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(Jaro(), "", "")
@test ismissing(evaluate(Jaro(), "", missing))
end
@testset "Levenshtein" begin
@test evaluate(Levenshtein(), "", "") == 0
@test evaluate(Levenshtein(), "abc", "") == 3
@@ -25,6 +29,7 @@ using StringDistances, Unicode, Test
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
@test evaluate(Levenshtein(), [1, 2, 3], [1,2, 4]) == 1
@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak")
@test Levenshtein()("", "abc") == 3
@test result_type(Levenshtein(), "hello", "world") == Int
@inferred Int evaluate(Levenshtein(), "", "")
@test ismissing(evaluate(Levenshtein(), "", missing))
@@ -41,6 +46,7 @@ using StringDistances, Unicode, Test
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
@test evaluate(DamerauLevenshtein(), [1, 2, 3], [1,2, 4]) == 1
@test evaluate(DamerauLevenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(DamerauLevenshtein(), "alborgów", "amoniak")
@test DamerauLevenshtein()("bc", "abc") == 1
@test result_type(DamerauLevenshtein(), "hello", "world") == Int
@inferred Int evaluate(DamerauLevenshtein(), "", "")
@test ismissing(evaluate(DamerauLevenshtein(), "", missing))
@@ -56,6 +62,7 @@ using StringDistances, Unicode, Test
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
@test evaluate(RatcliffObershelp(), [1, 2, 3], [1,2, 4]) ≈ 1/3
@test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak")
@test RatcliffObershelp()("pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
@test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(RatcliffObershelp(), "", "")
@test ismissing(evaluate(RatcliffObershelp(), "", missing))
@@ -68,19 +75,23 @@ using StringDistances, Unicode, Test
@test evaluate(QGram(1), "abc", "cba") == 0
@test evaluate(QGram(1), "abc", "ccc") == 4
@test evaluate(QGram(4), "aü☃", "aüaüafs") == 4
@test evaluate( QGram(2), SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2
@test evaluate(QGram(2), SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2
@test evaluate(QGram(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(QGram(2), "alborgów", "amoniak")
@test QGram(1)("abc", "cba") == 0
@test result_type(QGram(1), "hello", "world") == Int
@test ismissing(evaluate(QGram(1), "", missing))
@inferred Int evaluate(QGram(1), "", "")
end
@testset "Cosine" begin
@test isnan(evaluate(Cosine(2), "", "abc"))
@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
@test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
@test evaluate(Cosine(2), [1, 2, 3], [1, 2, 4]) ≈ 0.5
@test evaluate(Cosine(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Cosine(2), "alborgów", "amoniak")
@test Cosine(2)("leia", "leela") ≈ 0.7113249 atol = 1e-4
@test result_type(Cosine(2), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(Cosine(2), "", "")
@test ismissing(evaluate(Cosine(2), "", missing))
@@ -92,6 +103,7 @@ using StringDistances, Unicode, Test
@test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
@test evaluate(Jaccard(2), [1, 2, 3], [1, 2, 4]) ≈ 2/3 atol = 1e-4
@test evaluate(Jaccard(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Jaccard(2), "alborgów", "amoniak")
@test Jaccard(2)("leia", "leela") ≈ 0.83333 atol = 1e-4
@test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(Jaccard(1), "", "")
@test ismissing(evaluate(Jaccard(1), "", missing))
@@ -101,6 +113,7 @@ using StringDistances, Unicode, Test
@test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4
@test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
@test evaluate(SorensenDice(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(SorensenDice(2), "alborgów", "amoniak")
@test SorensenDice(2)("night", "nacht") ≈ 0.75 atol = 1e-4
@test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(SorensenDice(1), "", "")
@test ismissing(evaluate(SorensenDice(1), "", missing))
@@ -109,6 +122,7 @@ using StringDistances, Unicode, Test
@testset "Overlap" begin
@test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
@test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4
@test Overlap(1)("context", "contact") ≈ .2 atol = 1e-4
@test result_type(Overlap(1), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(Overlap(1), "", "")
@test ismissing(evaluate(Overlap(1), "", missing))