add back normalize for Partial/TokenSort/TokenSet

compathelper/new_version/2020-10-08-17-05-17-769-1797568811
matthieugomez 2020-07-20 08:25:53 -07:00
parent e0ef0e8ec1
commit 04b1902f9e
7 changed files with 158 additions and 32 deletions

View File

@@ -1,6 +1,6 @@
name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.7.1"
version = "0.7.2"
[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

View File

@@ -22,9 +22,9 @@ The available distances are:
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
- Distance "modifiers" that can be applied to any distance:
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum distance between the shorter string and substrings of the longer string.
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by reording words alphabetically.
- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string.
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distances between the shorter string and the substrings of the longer string.
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word order by returning the normalized distance of the two strings after re-ordering their words alphabetically (illustrated in the sketch below).
- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word order and word count by returning the normalized distance between the intersection of the two strings and each string.
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) combines the normalized distance with the `Partial`, `TokenSort` and `TokenSet` modifiers, using penalty terms that depend on string lengths. This is a good distance to match strings composed of multiple words, like addresses. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/).
- [Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) diminishes the normalized distance of strings with common prefixes. The Winkler adjustment was originally defined for the Jaro similarity score but it can be defined for any string distance.
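
For instance, here is a minimal sketch of how the modifiers compose with `compare` (which returns a similarity between 0 and 1, i.e. one minus the normalized distance); the exact scores shown are indicative:

```julia
using StringDistances

s1 = "New York Mets vs Atlanta Braves"
s2 = "Atlanta Braves vs New York Mets"

# plain RatcliffObershelp is penalized by the different word order
compare(s1, s2, RatcliffObershelp())

# TokenSort re-orders words alphabetically first, so the strings match exactly
compare(s1, s2, TokenSort(RatcliffObershelp()))  # 1.0

# TokenMax mixes Partial, TokenSort and TokenSet with length-dependent penalties
compare(s1, s2, TokenMax(RatcliffObershelp()))
```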

View File

@@ -3,12 +3,11 @@ module StringDistances
using Distances
include("utils.jl")
include("distances/edit.jl")
include("distances/qgram.jl")
include("modifiers.jl")
include("edit.jl")
include("qgram.jl")
include("normalize.jl")
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalized}
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
# Distances API
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
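# Illustrative sketch: result_type follows from applying the distance to two
# empty strings, so e.g. result_type(Levenshtein(), s1, s2) is an integer type
# (edit counts), while result_type(normalize(Levenshtein()), s1, s2) is Float64.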

View File

@@ -14,7 +14,7 @@ where ``m`` is the number of matching characters and
struct Jaro <: SemiMetric end
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
function (dist::Jaro)(s1, s2, nothing::Nothing = nothing)
function (dist::Jaro)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@@ -180,7 +180,7 @@ region on either side of the longest common subsequence.
"""
struct RatcliffObershelp <: SemiMetric end
function (dist::RatcliffObershelp)(s1, s2, nothing::Nothing = nothing)
function (dist::RatcliffObershelp)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
n_matched = sum(last.(matching_blocks(s1, s2)))

View File

@@ -1,9 +1,19 @@
struct Normalized{S <: SemiMetric} <: SemiMetric
struct Normalize{S <: SemiMetric} <: SemiMetric
dist::S
end
"""
normalize(dist::SemiMetric)
Normalize a metric, so that `evaluate` always returns a Float64 between 0 and 1
"""
normalize(dist::SemiMetric) = Normalize(dist)
normalize(dist::Normalize) = dist
# A normalized distance is between 0 and 1, and accepts a third argument, max_dist.
function (dist::Normalized{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@@ -13,7 +23,7 @@ function (dist::Normalized{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, m
out > max_dist ? 1.0 : out
end
function (dist::Normalized{<: QGramDistance})(s1, s2, max_dist = 1.0)
function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0)
((s1 === missing) | (s2 === missing)) && return missing
# When a string is shorter than q, the q-gram distance is undefined, so fall back to exact equality (s1 == s2)
s1, s2 = reorder(s1, s2)
@@ -27,21 +37,141 @@ function (dist::Normalized{<: QGramDistance})(s1, s2, max_dist = 1.0)
out > max_dist ? 1.0 : out
end
function (dist::Normalized)(s1, s2, max_dist = 1.0)
function (dist::Normalize)(s1, s2, max_dist = 1.0)
out = dist.dist(s1, s2)
out > max_dist ? 1.0 : out
end
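# Worked example (the values follow from the definitions above): the Levenshtein
# distance between "martha" and "marhta" is 2 and the longer string has length 6,
# so the normalized distance is 2/6 ≈ 0.33; passing max_dist = 0.1 makes the
# result exceed the cutoff, so 1.0 is returned instead:
#   normalize(Levenshtein())("martha", "marhta")       # ≈ 0.33
#   normalize(Levenshtein())("martha", "marhta", 0.1)  # 1.0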
"""
normalize(dist::SemiMetric)
Partial(dist)
Normalize a metric, so that `evaluate` always return a Float64 between 0 and 1
Creates the `Partial{dist}` distance.
`Partial{dist}` normalizes the string distance `dist` and modifies it to return the
minimum distance between the shorter string and the substrings of the longer string.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> evaluate(Partial(RatcliffObershelp()), s1, s2)
0.5483870967741935
```
"""
normalize(dist::Partial) = Partial(normalize(dist.dist))
normalize(dist::TokenSort) = TokenSort(normalize(dist.dist))
normalize(dist::TokenSet) = TokenSet(normalize(dist.dist))
normalize(dist::SemiMetric) = Normalized(dist)
normalize(dist::Normalized) = dist
struct Partial{S <: SemiMetric} <: SemiMetric
dist::S
Partial{S}(dist::S) where {S <: SemiMetric} = new(dist)
end
Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
normalize(dist::Partial) = dist
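# The inner distance is normalized once at construction, so normalize is the
# identity on Partial (and likewise on TokenSort, TokenSet and TokenMax below).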
function (dist::Partial)(s1, s2, max_dist = 1.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
out = dist.dist(s1, s2, max_dist)
len1 == len2 && return out
len1 == 0 && return out
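# slide a window of length len1 across s2 (each length-len1 substring of s2 is
# one of its q-grams with q = len1) and keep the smallest distance found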
for x in qgrams(s2, len1)
curr = dist.dist(s1, x, max_dist)
out = min(out, curr)
max_dist = min(out, max_dist)
end
return out
end
function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return dist.dist(s1, s2)
out = 1.0
for r in matching_blocks(s1, s2)
# Make sure the substring of s2 has length len1
s2_start = r[2] - r[1] + 1
s2_end = s2_start + len1 - 1
if s2_start < 1
s2_end += 1 - s2_start
s2_start += 1 - s2_start
elseif s2_end > len2
s2_start += len2 - s2_end
s2_end += len2 - s2_end
end
curr = dist.dist(s1, _slice(s2, s2_start, s2_end))
out = min(out, curr)
end
return out
end
"""
TokenSort(dist)
Creates the `TokenSort{dist}` distance.
`TokenSort{dist}` normalizes the string distance `dist` and modifies it to adjust for differences
in word order by reordering words alphabetically.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> evaluate(TokenSort(RatcliffObershelp()), s1, s2)
0.0
```
"""
struct TokenSort{S <: SemiMetric} <: SemiMetric
dist::S
TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist)
end
TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist))
normalize(dist::TokenSort) = dist
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function (dist::TokenSort)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
s1 = join(sort!(split(s1)), " ")
s2 = join(sort!(split(s2)), " ")
out = dist.dist(s1, s2, max_dist)
end
"""
TokenSet(dist)
Creates the `TokenSet{dist}` distance.
`TokenSet{dist}` normalizes the string distance `dist` and modifies it to adjust for differences
in word order and word count by comparing the intersection of the two strings with each string.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> evaluate(TokenSet(RatcliffObershelp()), s1, s2)
0.0
```
"""
struct TokenSet{S <: SemiMetric} <: SemiMetric
dist::S
TokenSet{S}(dist::S) where {S <: SemiMetric} = new(dist)
end
TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
normalize(dist::TokenSet) = dist
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function (dist::TokenSet)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2)))
v0 = intersect(v1, v2)
s0 = join(v0, " ")
s1 = join(v1, " ")
s2 = join(v2, " ")
isempty(s0) && return dist.dist(s1, s2, max_dist)
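# each intermediate score tightens max_dist so later comparisons can exit early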
score_01 = dist.dist(s0, s1, max_dist)
max_dist = min(max_dist, score_01)
score_02 = dist.dist(s0, s2, max_dist)
max_dist = min(max_dist, score_02)
score_12 = dist.dist(s1, s2, max_dist)
min(score_01, score_02, score_12)
end
@@ -50,7 +180,7 @@ normalize(dist::Normalized) = dist
Creates the `Winkler{dist, p, threshold, maxlength}` distance.
`Winkler{dist, p, threshold, length)` modifies the string distance `normalize(dist)` to decrease the
`Winkler{dist, p, threshold, maxlength}` normalizes the string distance `dist` and modifies it to decrease the
distance between two strings, when their original distance is below some `threshold`.
The boost is equal to `min(l, maxlength) * p * dist` where `l` denotes the
length of their common prefix and `dist` denotes the original distance
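As a worked example (using the `p = 0.1`, `threshold = 0.0`, `maxlength = 4` values from the test suite below): the Jaro distance between "martha" and "marhta" is 1/18 ≈ 0.0556 and their common prefix "mar" has length 3, so the boost is `3 * 0.1 * 0.0556 ≈ 0.0167` and the Winkler distance is `0.0556 - 0.0167 ≈ 0.0389`, i.e. a similarity of about 0.9611.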
@@ -103,10 +233,7 @@ struct TokenMax{S <: SemiMetric} <: SemiMetric
TokenMax{S}(dist::S) where {S <: SemiMetric} = new(dist)
end
function TokenMax(dist::SemiMetric)
dist = normalize(dist)
TokenMax{typeof(dist)}(dist)
end
TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
normalize(dist::TokenMax) = dist
function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)

View File

@@ -98,7 +98,7 @@ struct QGram <: QGramDistance
q::Int
end
function (dist::QGram)(s1, s2, nothing::Nothing = nothing)
function (dist::QGram)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
n = 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@@ -124,7 +124,7 @@ struct Cosine <: QGramDistance
q::Int
end
function (dist::Cosine)(s1, s2, nothing::Nothing = nothing)
function (dist::Cosine)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@@ -151,7 +151,7 @@ struct Jaccard <: QGramDistance
q::Int
end
function (dist::Jaccard)(s1, s2, nothing::Nothing = nothing)
function (dist::Jaccard)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@@ -178,7 +178,7 @@ struct SorensenDice <: QGramDistance
q::Int
end
function (dist::SorensenDice)(s1, s2, nothing::Nothing = nothing)
function (dist::SorensenDice)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@@ -205,7 +205,7 @@ struct Overlap <: QGramDistance
q::Int
end
function (dist::Overlap)(s1, s2, nothing::Nothing = nothing)
function (dist::Overlap)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))

View File

@@ -23,7 +23,7 @@ using StringDistances, Unicode, Test
compare("aüa", "aua", Levenshtein())
compare("aüa", "aua", DamerauLevenshtein())
@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
@test normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
# Winkler
@test compare("martha", "marhta", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) 0.9611 atol = 1e-4
@test compare("dwayne", "duane", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) 0.84 atol = 1e-4