define StringSemiMetric/StringMetric

2021-09-12 14:33:39 -04:00 · 2021-09-12 14:33:39 -04:00 · d9f99986fb
parent 5507822aec
commit d9f99986fb
7 changed files with 57 additions and 52 deletions
--- a/README.md
+++ b/README.md
@ -13,9 +13,9 @@ The available distances are:
 - Edit Distances
 	- Hamming Distance `Hamming()`
 	- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()`
-	- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
-	- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
-	- [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
+	- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: Metric`
+	- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
+	- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: Metric`
 	- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
 - Q-gram distances compare the set of all substrings of length `q` in each string.
 	- QGram Distance `Qgram(q::Int)`
@ -59,13 +59,13 @@ pairwise(Jaccard(3), ["martha", "kitten"], ["marhta", "sitting"])
 The function `pairwise` is particularly optimized for QGram-distances (each element is processed only once).


-### distance modifiers
-The package also defines Distance "modifiers" that are defined in the Python package - [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/). These modifiers are particularly helpful to match strings composed of multiple words.
+### fuzzywuzzy
+The package also defines distance "modifiers" that mirror those of the Python package [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/). These modifiers are particularly helpful for matching strings composed of multiple words (e.g. addresses, company names).

 - [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the distance between the shorter string and substrings of the longer string.
 - [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the distance of the two strings, after re-ordering words alphabetically. 
 - [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by returning the distance between the intersection of two strings with each string.
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) normalizes the distance, and combine the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. This is a good distance to match strings composed of multiple words, like addresses.   `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)
+- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) normalizes the distance and combines the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/).


 ### find
@ -84,4 +84,7 @@ The package also adds some convience function to find the element in a list that
 The functions `findnearest` and `findall` are particularly optimized for the `Levenshtein` and `OptimalStringAlignement` distances (these distances stop early if the distance is higher than a certain threshold).


+## Notes
+- All string lookups are case sensitive.
+

--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -2,32 +2,9 @@ module StringDistances

 using Distances
 import StatsAPI: pairwise, pairwise!
-
-include("distances/utils.jl")
-include("distances/edit.jl")
-include("distances/qgram.jl")
-include("normalize.jl")
-include("fuzzywuzzy.jl")
-const StringDistance = Union{Hamming, Jaro, JaroWinkler, Levenshtein, OptimalStringAlignement, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Normalized, Partial, TokenSort, TokenSet, TokenMax}
-"""
-    compare(s1, s2, dist)
-
-return a similarity score between 0 and 1 for the strings `s1` and 
-`s2` based on the distance `dist`.
-
-### Examples
-```julia-repl
-julia> compare("martha", "marhta", Levenshtein())
-0.6666666666666667
-```
-"""
-function compare(s1, s2, dist::StringDistance; min_score = 0.0)
-    1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
-end 
-include("find.jl")
-include("pairwise.jl")
-
-# Distances API
+abstract type StringSemiMetric <: SemiMetric end
+abstract type StringMetric <: Metric end
+const StringDistance = Union{StringSemiMetric, StringMetric}
 function Distances.result_type(dist::StringDistance, s1::Type, s2::Type)
    T = typeof(dist("", ""))
    if (Missing <: s1) | (Missing <: s2)
@ -38,6 +15,15 @@ end
 Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))


+include("distances/utils.jl")
+include("distances/edit.jl")
+include("distances/qgram.jl")
+
+
+include("normalize.jl")
+include("pairwise.jl")
+include("find.jl")
+include("fuzzywuzzy.jl")



@ -47,8 +33,9 @@ Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s
 ##
 ##############################################################################

-export
-StringDistance,
+export StringDistance,
+StringSemiMetric,
+StringMetric,
 Hamming,
 Jaro,
 JaroWinkler,
--- a/src/distances/edit.jl
+++ b/src/distances/edit.jl
@ -5,7 +5,7 @@ Creates the Hamming distance

 The Hamming distance is defined as the number of characters that do not match
 """
-struct Hamming{V <: Union{Int, Nothing}} <: SemiMetric
+struct Hamming{V <: Union{Int, Nothing}} <: StringSemiMetric
   max_dist::V
 end
 Hamming() = Hamming(nothing)
@ -36,7 +36,7 @@ The Jaro distance is defined as
 where ``m`` is the number of matching characters and 
 ``t`` is half the number of transpositions.
 """
-struct Jaro <: SemiMetric end
+struct Jaro <: StringSemiMetric end

 ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
 function (dist::Jaro)(s1, s2)
@ -90,7 +90,7 @@ Creates the JaroWinkler distance
 The JaroWinkler distance is defined as the Jaro distance, which is multiplied by
 ``(1-min(l,  maxlength) * p)`` as long as it is lower than `threshold`, and where `l` denotes the length of the common prefix.
 """
-struct JaroWinkler <: SemiMetric
+struct JaroWinkler <: StringSemiMetric
    p::Float64          # scaling factor. Default to 0.1
    threshold::Float64  # boost limit. Default to 0.3
    maxlength::Integer  # max length of common prefix. Default to 4
@ -118,7 +118,7 @@ Creates the Levenshtein distance
 The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, 
 substitutions of a single character) required to change one string into the other.
 """
-struct Levenshtein{V <: Union{Int, Nothing}} <: Metric
+struct Levenshtein{V <: Union{Int, Nothing}} <: StringMetric
   max_dist::V
 end
 Levenshtein() = Levenshtein(nothing)
@ -138,8 +138,7 @@ function (dist::Levenshtein{T})(s1, s2) where {T}
    # prefix common to both strings can be ignored
    k = common_prefix(s1, s2)
    k == len1 && return len2 - k
-    # distance initialized to first row of matrix
-    # distance between "" and s2[1:i]
+    # first row of matrix set to distance between "" and s2[1:i]
    v = collect(1:(len2-k))
    current = 0
    for (i1, ch1) in enumerate(s1)
@ -184,7 +183,7 @@ end
    uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy 
    the triangle inequality.
 """
-struct OptimalStringAlignement{V <: Union{Int, Nothing}} <: SemiMetric
+struct OptimalStringAlignement{V <: Union{Int, Nothing}} <: StringSemiMetric
   max_dist::V
 end
 OptimalStringAlignement() = OptimalStringAlignement(nothing)
@ -263,7 +262,7 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
 deletions or substitutions of a single character, or transposition of two adjacent characters) 
 required to change one string into the other.
 """
-struct DamerauLevenshtein <: Metric end
+struct DamerauLevenshtein <: StringMetric end

 # https://en.wikipedia.org/wiki/Damerau–Levenshtein_distance
 # https://www.lemoda.net/text-fuzzy/damerau-levenshtein/
@ -291,7 +290,7 @@ function (dist::DamerauLevenshtein)(s1, s2)
            @inbounds pre = min(distm[i1, i2] + !match, 
                                distm[i1 + 1, i2] + 1,
                                distm[i1, i2 + 1] + 1)
-            # avoid lookup if we already know transposition won't be chosen
+            # avoid lookup if we know transposition won't be chosen
            j1 = (i1 == 1 || j2 == 0 || match) ? 0 : get(da, ch2, 0)
            @inbounds distm[i1 + 1, i2 + 1] = (j1 == 0) ? pre : min(pre, distm[j1, j2] + (i1 - j1 - 1) + 1 + (i2 - j2 - 1))
            if match
@ -313,7 +312,7 @@ divided by the total number of characters in the two strings. Matching character
 in the longest common subsequence plus, recursively, matching characters in the unmatched 
 region on either side of the longest common subsequence.
 """
-struct RatcliffObershelp <: SemiMetric end
+struct RatcliffObershelp <: StringSemiMetric end

 function (dist::RatcliffObershelp)(s1, s2)
    (s1 === missing) | (s2 === missing) && return missing
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -1,4 +1,4 @@
-abstract type AbstractQGramDistance <: SemiMetric end
+abstract type AbstractQGramDistance <: StringSemiMetric end

 """
 	QGram(q::Int)
--- a/src/fuzzywuzzy.jl
+++ b/src/fuzzywuzzy.jl
@ -15,7 +15,7 @@ julia> Partial(RatcliffObershelp())(s1, s2)
 0.5483870967741935
 ```
 """
-struct Partial{S <: SemiMetric} <: SemiMetric
+struct Partial{S <: StringDistance} <: StringSemiMetric
    dist::S
 end

@ -97,7 +97,7 @@ julia> TokenSort(RatcliffObershelp())(s1, s2)
 0.0
 ```
 """
-struct TokenSort{S <: SemiMetric} <: SemiMetric
+struct TokenSort{S <: StringDistance} <: StringSemiMetric
    dist::S
 end

@ -131,7 +131,7 @@ julia> TokenSet(RatcliffObershelp())(s1, s2)
 0.0
 ```
 """
-struct TokenSet{S <: SemiMetric} <: SemiMetric
+struct TokenSet{S <: StringDistance} <: StringSemiMetric
    dist::S
 end

@ -173,7 +173,7 @@ julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2)
 0.05
 ```
 """
-struct TokenMax{S <: SemiMetric} <: SemiMetric
+struct TokenMax{S <: StringDistance} <: StringSemiMetric
    dist::S
    max_dist::Float64
 end
--- a/src/normalize.jl
+++ b/src/normalize.jl
@ -1,4 +1,4 @@
-struct Normalized{V <: SemiMetric} <: SemiMetric
+struct Normalized{V <: StringDistance} <: StringSemiMetric
    dist::V
    max_dist::Float64
 end
@ -59,6 +59,22 @@ julia> StringDistances.normalize(Levenshtein())(s1, s2)
 0.8064 
 ```
 """
-normalize(dist::SemiMetric; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
+normalize(dist::StringDistance; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
 normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)

+
+"""
+    compare(s1, s2, dist)
+
+return a similarity score between 0 and 1 for the strings `s1` and 
+`s2` based on the distance `dist`.
+
+### Examples
+```julia-repl
+julia> compare("martha", "marhta", Levenshtein())
+0.6666666666666667
+```
+"""
+function compare(s1, s2, dist::StringDistance; min_score = 0.0)
+    1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
+end 
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@ -40,7 +40,7 @@ Set `preprocess` to false if no preprocessing should be used.
 function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
    length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
    length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
-    ((xs === ys) & (dist isa SemiMetric)) ?
+    (xs === ys) ?
        _symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
        _asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
 end