define StringSemiMetric/StringMetric
parent
5507822aec
commit
d9f99986fb
15
README.md
15
README.md
|
@ -13,9 +13,9 @@ The available distances are:
|
|||
- Edit Distances
|
||||
- Hamming Distance `Hamming()`
|
||||
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()`
|
||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
|
||||
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
|
||||
- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
|
||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: Metric`
|
||||
- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
|
||||
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: Metric`
|
||||
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
|
||||
- Q-gram distances compare the set of all substrings of length `q` in each string.
|
||||
- QGram Distance `Qgram(q::Int)`
|
||||
|
@ -59,13 +59,13 @@ pairwise(Jaccard(3), ["martha", "kitten"], ["marhta", "sitting"])
|
|||
The function `pairwise` is particularly optimized for QGram-distances (each element is processed only once).
|
||||
|
||||
|
||||
### distance modifiers
|
||||
The package also defines Distance "modifiers" that are defined in the Python package - [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/). These modifiers are particularly helpful to match strings composed of multiple words.
|
||||
### fuzzywuzzy
|
||||
The package also defines Distance "modifiers" that are defined in the Python package - [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/). These modifiers are particularly helpful to match strings composed of multiple words (e.g. addresses, company names).
|
||||
|
||||
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the distance between the shorter string and substrings of the longer string.
|
||||
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the distance of the two strings, after re-ordering words alphabetically.
|
||||
- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by returning the distance between the intersection of two strings with each string.
|
||||
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) normalizes the distance and combines the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. This is a good distance to match strings composed of multiple words, like addresses. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)
|
||||
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) normalizes the distance and combines the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)
|
||||
|
||||
|
||||
### find
|
||||
|
@ -84,4 +84,7 @@ The package also adds some convenience functions to find the element in a list that
|
|||
The functions `findnearest` and `findall` are particularly optimized for the `Levenshtein` and `OptimalStringAlignement` distances (these distances stop early if the distance is higher than a certain threshold).
|
||||
|
||||
|
||||
## Notes
|
||||
- All string lookups are case sensitive.
|
||||
|
||||
|
||||
|
|
|
@ -2,32 +2,9 @@ module StringDistances
|
|||
|
||||
using Distances
|
||||
import StatsAPI: pairwise, pairwise!
|
||||
|
||||
include("distances/utils.jl")
|
||||
include("distances/edit.jl")
|
||||
include("distances/qgram.jl")
|
||||
include("normalize.jl")
|
||||
include("fuzzywuzzy.jl")
|
||||
const StringDistance = Union{Hamming, Jaro, JaroWinkler, Levenshtein, OptimalStringAlignement, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Normalized, Partial, TokenSort, TokenSet, TokenMax}
|
||||
"""
|
||||
compare(s1, s2, dist)
|
||||
|
||||
return a similarity score between 0 and 1 for the strings `s1` and
|
||||
`s2` based on the distance `dist`.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> compare("martha", "marhta", Levenshtein())
|
||||
0.6666666666666667
|
||||
```
|
||||
"""
|
||||
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
|
||||
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
|
||||
end
|
||||
include("find.jl")
|
||||
include("pairwise.jl")
|
||||
|
||||
# Distances API
|
||||
abstract type StringSemiMetric <: SemiMetric end
|
||||
abstract type StringMetric <: Metric end
|
||||
const StringDistance = Union{StringSemiMetric, StringMetric}
|
||||
function Distances.result_type(dist::StringDistance, s1::Type, s2::Type)
|
||||
T = typeof(dist("", ""))
|
||||
if (Missing <: s1) | (Missing <: s2)
|
||||
|
@ -38,6 +15,15 @@ end
|
|||
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
|
||||
|
||||
|
||||
include("distances/utils.jl")
|
||||
include("distances/edit.jl")
|
||||
include("distances/qgram.jl")
|
||||
|
||||
|
||||
include("normalize.jl")
|
||||
include("pairwise.jl")
|
||||
include("find.jl")
|
||||
include("fuzzywuzzy.jl")
|
||||
|
||||
|
||||
|
||||
|
@ -47,8 +33,9 @@ Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
export
|
||||
StringDistance,
|
||||
export StringDistance,
|
||||
StringSemiMetric,
|
||||
StringMetric,
|
||||
Hamming,
|
||||
Jaro,
|
||||
JaroWinkler,
|
||||
|
|
|
@ -5,7 +5,7 @@ Creates the Hamming distance
|
|||
|
||||
The Hamming distance is defined as the number of characters that do not match
|
||||
"""
|
||||
struct Hamming{V <: Union{Int, Nothing}} <: SemiMetric
|
||||
struct Hamming{V <: Union{Int, Nothing}} <: StringSemiMetric
|
||||
max_dist::V
|
||||
end
|
||||
Hamming() = Hamming(nothing)
|
||||
|
@ -36,7 +36,7 @@ The Jaro distance is defined as
|
|||
where ``m`` is the number of matching characters and
|
||||
``t`` is half the number of transpositions.
|
||||
"""
|
||||
struct Jaro <: SemiMetric end
|
||||
struct Jaro <: StringSemiMetric end
|
||||
|
||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||
function (dist::Jaro)(s1, s2)
|
||||
|
@ -90,7 +90,7 @@ Creates the JaroWinkler distance
|
|||
The JaroWinkler distance is defined as the Jaro distance, which is multiplied by
|
||||
``(1-min(l, maxlength) * p)`` as long as it is lower than `threshold`, and where `l` denotes the length of the common prefix.
|
||||
"""
|
||||
struct JaroWinkler <: SemiMetric
|
||||
struct JaroWinkler <: StringSemiMetric
|
||||
p::Float64 # scaling factor. Default to 0.1
|
||||
threshold::Float64 # boost limit. Default to 0.3
|
||||
maxlength::Integer # max length of common prefix. Default to 4
|
||||
|
@ -118,7 +118,7 @@ Creates the Levenshtein distance
|
|||
The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions,
|
||||
substitutions of a single character) required to change one string into the other.
|
||||
"""
|
||||
struct Levenshtein{V <: Union{Int, Nothing}} <: Metric
|
||||
struct Levenshtein{V <: Union{Int, Nothing}} <: StringMetric
|
||||
max_dist::V
|
||||
end
|
||||
Levenshtein() = Levenshtein(nothing)
|
||||
|
@ -138,8 +138,7 @@ function (dist::Levenshtein{T})(s1, s2) where {T}
|
|||
# prefix common to both strings can be ignored
|
||||
k = common_prefix(s1, s2)
|
||||
k == len1 && return len2 - k
|
||||
# distance initialized to first row of matrix
|
||||
# distance between "" and s2[1:i]
|
||||
# first row of matrix set to distance between "" and s2[1:i]
|
||||
v = collect(1:(len2-k))
|
||||
current = 0
|
||||
for (i1, ch1) in enumerate(s1)
|
||||
|
@ -184,7 +183,7 @@ end
|
|||
uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy
|
||||
the triangle inequality.
|
||||
"""
|
||||
struct OptimalStringAlignement{V <: Union{Int, Nothing}} <: SemiMetric
|
||||
struct OptimalStringAlignement{V <: Union{Int, Nothing}} <: StringSemiMetric
|
||||
max_dist::V
|
||||
end
|
||||
OptimalStringAlignement() = OptimalStringAlignement(nothing)
|
||||
|
@ -263,7 +262,7 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
|
|||
deletions or substitutions of a single character, or transposition of two adjacent characters)
|
||||
required to change one string into the other.
|
||||
"""
|
||||
struct DamerauLevenshtein <: Metric end
|
||||
struct DamerauLevenshtein <: StringMetric end
|
||||
|
||||
# https://en.wikipedia.org/wiki/Damerau–Levenshtein_distance
|
||||
# https://www.lemoda.net/text-fuzzy/damerau-levenshtein/
|
||||
|
@ -291,7 +290,7 @@ function (dist::DamerauLevenshtein)(s1, s2)
|
|||
@inbounds pre = min(distm[i1, i2] + !match,
|
||||
distm[i1 + 1, i2] + 1,
|
||||
distm[i1, i2 + 1] + 1)
|
||||
# avoid lookup if we already know transposition won't be chosen
|
||||
# avoid lookup if we know transposition won't be chosen
|
||||
j1 = (i1 == 1 || j2 == 0 || match) ? 0 : get(da, ch2, 0)
|
||||
@inbounds distm[i1 + 1, i2 + 1] = (j1 == 0) ? pre : min(pre, distm[j1, j2] + (i1 - j1 - 1) + 1 + (i2 - j2 - 1))
|
||||
if match
|
||||
|
@ -313,7 +312,7 @@ divided by the total number of characters in the two strings. Matching character
|
|||
in the longest common subsequence plus, recursively, matching characters in the unmatched
|
||||
region on either side of the longest common subsequence.
|
||||
"""
|
||||
struct RatcliffObershelp <: SemiMetric end
|
||||
struct RatcliffObershelp <: StringSemiMetric end
|
||||
|
||||
function (dist::RatcliffObershelp)(s1, s2)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
abstract type AbstractQGramDistance <: SemiMetric end
|
||||
abstract type AbstractQGramDistance <: StringSemiMetric end
|
||||
|
||||
"""
|
||||
QGram(q::Int)
|
||||
|
|
|
@ -15,7 +15,7 @@ julia> Partial(RatcliffObershelp())(s1, s2)
|
|||
0.5483870967741935
|
||||
```
|
||||
"""
|
||||
struct Partial{S <: SemiMetric} <: SemiMetric
|
||||
struct Partial{S <: StringDistance} <: StringSemiMetric
|
||||
dist::S
|
||||
end
|
||||
|
||||
|
@ -97,7 +97,7 @@ julia> TokenSort(RatcliffObershelp())(s1, s2)
|
|||
0.0
|
||||
```
|
||||
"""
|
||||
struct TokenSort{S <: SemiMetric} <: SemiMetric
|
||||
struct TokenSort{S <: StringDistance} <: StringSemiMetric
|
||||
dist::S
|
||||
end
|
||||
|
||||
|
@ -131,7 +131,7 @@ julia> TokenSet(RatcliffObershelp())(s1, s2)
|
|||
0.0
|
||||
```
|
||||
"""
|
||||
struct TokenSet{S <: SemiMetric} <: SemiMetric
|
||||
struct TokenSet{S <: StringDistance} <: StringSemiMetric
|
||||
dist::S
|
||||
end
|
||||
|
||||
|
@ -173,7 +173,7 @@ julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2)
|
|||
0.05
|
||||
```
|
||||
"""
|
||||
struct TokenMax{S <: SemiMetric} <: SemiMetric
|
||||
struct TokenMax{S <: StringDistance} <: StringSemiMetric
|
||||
dist::S
|
||||
max_dist::Float64
|
||||
end
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
struct Normalized{V <: SemiMetric} <: SemiMetric
|
||||
struct Normalized{V <: StringDistance} <: StringSemiMetric
|
||||
dist::V
|
||||
max_dist::Float64
|
||||
end
|
||||
|
@ -59,6 +59,22 @@ julia> StringDistances.normalize(Levenshtein())(s1, s2)
|
|||
0.8064
|
||||
```
|
||||
"""
|
||||
normalize(dist::SemiMetric; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
|
||||
normalize(dist::StringDistance; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
|
||||
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
|
||||
|
||||
|
||||
"""
|
||||
compare(s1, s2, dist)
|
||||
|
||||
return a similarity score between 0 and 1 for the strings `s1` and
|
||||
`s2` based on the distance `dist`.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> compare("martha", "marhta", Levenshtein())
|
||||
0.6666666666666667
|
||||
```
|
||||
"""
|
||||
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
|
||||
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
|
||||
end
|
||||
|
|
|
@ -40,7 +40,7 @@ Set `preprocess` to false if no preprocessing should be used.
|
|||
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
|
||||
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
|
||||
((xs === ys) & (dist isa SemiMetric)) ?
|
||||
(xs === ys) ?
|
||||
_symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
|
||||
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue