define StringSemiMetric/StringMetric

pull/57/head
matthieugomez 2021-09-12 14:33:39 -04:00
parent 5507822aec
commit d9f99986fb
7 changed files with 57 additions and 52 deletions

View File

@ -13,9 +13,9 @@ The available distances are:
- Edit Distances
- Hamming Distance `Hamming()`
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: Metric`
- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: Metric`
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
- Q-gram distances compare the set of all substrings of length `q` in each string.
- QGram Distance `Qgram(q::Int)`
@ -59,13 +59,13 @@ pairwise(Jaccard(3), ["martha", "kitten"], ["marhta", "sitting"])
The function `pairwise` is particularly optimized for QGram-distances (each element is processed only once).
### distance modifiers
The package also defines distance "modifiers" that are defined in the Python package [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/). These modifiers are particularly helpful to match strings composed of multiple words.
### fuzzywuzzy
The package also defines distance "modifiers" that are defined in the Python package [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/). These modifiers are particularly helpful to match strings composed of multiple words (e.g. addresses, company names).
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the distance between the shorter string and substrings of the longer string.
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the distance of the two strings, after re-ordering words alphabetically.
- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by returning the distance between the intersection of two strings with each string.
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) normalizes the distance and combines the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. This is a good distance to match strings composed of multiple words, like addresses. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/).
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) normalizes the distance and combines the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/).
### find
@ -84,4 +84,7 @@ The package also adds some convience function to find the element in a list that
The functions `findnearest` and `findall` are particularly optimized for the `Levenshtein` and `OptimalStringAlignement` distances (these distances stop early if the distance is higher than a certain threshold).
## Notes
- All string lookups are case sensitive.

View File

@ -2,32 +2,9 @@ module StringDistances
using Distances
import StatsAPI: pairwise, pairwise!
include("distances/utils.jl")
include("distances/edit.jl")
include("distances/qgram.jl")
include("normalize.jl")
include("fuzzywuzzy.jl")
const StringDistance = Union{Hamming, Jaro, JaroWinkler, Levenshtein, OptimalStringAlignement, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Normalized, Partial, TokenSort, TokenSet, TokenMax}
"""
compare(s1, s2, dist)
return a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the distance `dist`.
### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
end
include("find.jl")
include("pairwise.jl")
# Distances API
abstract type StringSemiMetric <: SemiMetric end
abstract type StringMetric <: Metric end
const StringDistance = Union{StringSemiMetric, StringMetric}
function Distances.result_type(dist::StringDistance, s1::Type, s2::Type)
T = typeof(dist("", ""))
if (Missing <: s1) | (Missing <: s2)
@ -38,6 +15,15 @@ end
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
include("distances/utils.jl")
include("distances/edit.jl")
include("distances/qgram.jl")
include("normalize.jl")
include("pairwise.jl")
include("find.jl")
include("fuzzywuzzy.jl")
@ -47,8 +33,9 @@ Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s
##
##############################################################################
export
StringDistance,
export StringDistance,
StringSemiMetric,
StringMetric,
Hamming,
Jaro,
JaroWinkler,

View File

@ -5,7 +5,7 @@ Creates the Hamming distance
The Hamming distance is defined as the number of characters that do not match
"""
struct Hamming{V <: Union{Int, Nothing}} <: SemiMetric
struct Hamming{V <: Union{Int, Nothing}} <: StringSemiMetric
max_dist::V
end
Hamming() = Hamming(nothing)
@ -36,7 +36,7 @@ The Jaro distance is defined as
where ``m`` is the number of matching characters and
``t`` is half the number of transpositions.
"""
struct Jaro <: SemiMetric end
struct Jaro <: StringSemiMetric end
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
function (dist::Jaro)(s1, s2)
@ -90,7 +90,7 @@ Creates the JaroWinkler distance
The JaroWinkler distance is defined as the Jaro distance, which is multiplied by
``(1-min(l, maxlength) * p)`` as long as it is lower than `threshold`, and where `l` denotes the length of the common prefix.
"""
struct JaroWinkler <: SemiMetric
struct JaroWinkler <: StringSemiMetric
p::Float64 # scaling factor. Default to 0.1
threshold::Float64 # boost limit. Default to 0.3
maxlength::Integer # max length of common prefix. Default to 4
@ -118,7 +118,7 @@ Creates the Levenshtein distance
The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions,
substitutions of a single character) required to change one string into the other.
"""
struct Levenshtein{V <: Union{Int, Nothing}} <: Metric
struct Levenshtein{V <: Union{Int, Nothing}} <: StringMetric
max_dist::V
end
Levenshtein() = Levenshtein(nothing)
@ -138,8 +138,7 @@ function (dist::Levenshtein{T})(s1, s2) where {T}
# prefix common to both strings can be ignored
k = common_prefix(s1, s2)
k == len1 && return len2 - k
# distance initialized to first row of matrix
# distance between "" and s2[1:i]
# first row of matrix set to distance between "" and s2[1:i]
v = collect(1:(len2-k))
current = 0
for (i1, ch1) in enumerate(s1)
@ -184,7 +183,7 @@ end
uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy
the triangle inequality.
"""
struct OptimalStringAlignement{V <: Union{Int, Nothing}} <: SemiMetric
struct OptimalStringAlignement{V <: Union{Int, Nothing}} <: StringSemiMetric
max_dist::V
end
OptimalStringAlignement() = OptimalStringAlignement(nothing)
@ -263,7 +262,7 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
deletions or substitutions of a single character, or transposition of two adjacent characters)
required to change one string into the other.
"""
struct DamerauLevenshtein <: Metric end
struct DamerauLevenshtein <: StringMetric end
# https://en.wikipedia.org/wiki/DamerauLevenshtein_distance
# https://www.lemoda.net/text-fuzzy/damerau-levenshtein/
@ -291,7 +290,7 @@ function (dist::DamerauLevenshtein)(s1, s2)
@inbounds pre = min(distm[i1, i2] + !match,
distm[i1 + 1, i2] + 1,
distm[i1, i2 + 1] + 1)
# avoid lookup if we already know transposition won't be chosen
# avoid lookup if we know transposition won't be chosen
j1 = (i1 == 1 || j2 == 0 || match) ? 0 : get(da, ch2, 0)
@inbounds distm[i1 + 1, i2 + 1] = (j1 == 0) ? pre : min(pre, distm[j1, j2] + (i1 - j1 - 1) + 1 + (i2 - j2 - 1))
if match
@ -313,7 +312,7 @@ divided by the total number of characters in the two strings. Matching character
in the longest common subsequence plus, recursively, matching characters in the unmatched
region on either side of the longest common subsequence.
"""
struct RatcliffObershelp <: SemiMetric end
struct RatcliffObershelp <: StringSemiMetric end
function (dist::RatcliffObershelp)(s1, s2)
(s1 === missing) | (s2 === missing) && return missing

View File

@ -1,4 +1,4 @@
abstract type AbstractQGramDistance <: SemiMetric end
abstract type AbstractQGramDistance <: StringSemiMetric end
"""
QGram(q::Int)

View File

@ -15,7 +15,7 @@ julia> Partial(RatcliffObershelp())(s1, s2)
0.5483870967741935
```
"""
struct Partial{S <: SemiMetric} <: SemiMetric
struct Partial{S <: StringDistance} <: StringSemiMetric
dist::S
end
@ -97,7 +97,7 @@ julia> TokenSort(RatcliffObershelp())(s1, s2)
0.0
```
"""
struct TokenSort{S <: SemiMetric} <: SemiMetric
struct TokenSort{S <: StringDistance} <: StringSemiMetric
dist::S
end
@ -131,7 +131,7 @@ julia> TokenSet(RatcliffObershelp())(s1, s2)
0.0
```
"""
struct TokenSet{S <: SemiMetric} <: SemiMetric
struct TokenSet{S <: StringDistance} <: StringSemiMetric
dist::S
end
@ -173,7 +173,7 @@ julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2)
0.05
```
"""
struct TokenMax{S <: SemiMetric} <: SemiMetric
struct TokenMax{S <: StringDistance} <: StringSemiMetric
dist::S
max_dist::Float64
end

View File

@ -1,4 +1,4 @@
struct Normalized{V <: SemiMetric} <: SemiMetric
struct Normalized{V <: StringDistance} <: StringSemiMetric
dist::V
max_dist::Float64
end
@ -59,6 +59,22 @@ julia> StringDistances.normalize(Levenshtein())(s1, s2)
0.8064
```
"""
normalize(dist::SemiMetric; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
normalize(dist::StringDistance; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
"""
    compare(s1, s2, dist; min_score = 0.0)

Return a similarity score between 0 and 1 for the strings `s1` and `s2`,
computed as one minus the normalized version of the distance `dist`.

The keyword `min_score` is forwarded to `normalize` as
`max_dist = 1 - min_score`.

### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
    # A minimum similarity translates into a maximum normalized distance.
    max_dist = 1 - min_score
    normalized = normalize(dist; max_dist = max_dist)
    return 1 - normalized(s1, s2)
end

View File

@ -40,7 +40,7 @@ Set `preprocess` to false if no preprocessing should be used.
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
((xs === ys) & (dist isa SemiMetric)) ?
(xs === ys) ?
_symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
end