remove StringDistance type (since Distance does not exist)

pull/57/head
matthieugomez 2021-09-12 15:06:31 -04:00
parent d9f99986fb
commit 0e5cd7e4d2
8 changed files with 50 additions and 46 deletions

View File

@ -1,6 +1,6 @@
name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.10.1"
version = "0.11.0"
[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

View File

@ -5,26 +5,25 @@
The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`.
## Supported Distances
Distances are defined over iterators that define `length` (this includes `AbstractStrings`, but also `GraphemeIterators` or `AbstractVectors`)
The package defines two abstract types: `StringSemiMetric <: SemiMetric`, and `StringMetric <: Metric`.
String distances inherit from one of these two types. They act over any pair of iterators that define `length` (this includes `AbstractStrings`, but also `GraphemeIterators` or `AbstractVectors`)
The available distances are:
- Edit Distances
- Hamming Distance `Hamming()`
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: Metric`
- [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: Metric`
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
- Hamming Distance `Hamming() <: SemiStringMetric`
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler() <: SemiStringMetric`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: StringMetric`
- [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement() <: SemiStringMetric`
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: StringMetric`
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp() <: SemiStringMetric`
- Q-gram distances compare the set of all substrings of length `q` in each string.
- QGram Distance `Qgram(q::Int)`
- [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity) `Cosine(q::Int)`
- [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int)`
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)`
- [Normalized Multiset Distance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NMD(q::Int)`
- QGram Distance `Qgram(q::Int) <: SemiStringMetric`
- [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity) `Cosine(q::Int) <: SemiStringMetric`
- [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int) <: SemiStringMetric`
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int) <: SemiStringMetric`
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int) <: SemiStringMetric`
- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int) <: SemiStringMetric`
- [Normalized Multiset Distance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NMD(q::Int) <: SemiStringMetric`
## Basic Use

View File

@ -4,15 +4,14 @@ using Distances
import StatsAPI: pairwise, pairwise!
abstract type StringSemiMetric <: SemiMetric end
abstract type StringMetric <: Metric end
const StringDistance = Union{StringSemiMetric, StringMetric}
function Distances.result_type(dist::StringDistance, s1::Type, s2::Type)
function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::Type, s2::Type)
T = typeof(dist("", ""))
if (Missing <: s1) | (Missing <: s2)
T = Union{T, Missing}
end
return T
end
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
include("distances/utils.jl")
@ -33,9 +32,10 @@ include("fuzzywuzzy.jl")
##
##############################################################################
export StringDistance,
export
StringSemiMetric,
StringMetric,
# edit distances
Hamming,
Jaro,
JaroWinkler,
@ -43,6 +43,7 @@ Levenshtein,
OptimalStringAlignement,
DamerauLevenshtein,
RatcliffObershelp,
# Qgram distances
AbstractQGramDistance,
QGramDict,
QGramSortedVector,
@ -53,15 +54,19 @@ SorensenDice,
Overlap,
MorisitaOverlap,
NMD,
qgrams,
# normalize
compare,
# fuzzywuzzy
Partial,
TokenSort,
TokenSet,
TokenMax,
evaluate,
compare,
result_type,
qgrams,
# find
findnearest,
# re-rexport from Distances.jl
evaluate,
result_type,
pairwise,
pairwise!
end

View File

@ -5,7 +5,7 @@ Creates the Hamming distance
The Hamming distance is defined as the number of characters that do not match
"""
struct Hamming{V <: Union{Int, Nothing}} <: StringSemiMetric
struct Hamming{V <: Union{Int, Nothing}} <: StringMetric
max_dist::V
end
Hamming() = Hamming(nothing)

View File

@ -1,5 +1,5 @@
"""
findnearest(s, itr, dist::StringDistance) -> (x, index)
findnearest(s, itr, dist::Union{StringMetric, StringSemiMetric}) -> (x, index)
`findnearest` returns the value and index of the element of `itr` that has the
lowest distance with `s` according to the distance `dist`.
@ -18,7 +18,7 @@ julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
(nothing, nothing)
```
"""
function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
min_score_atomic = Threads.Atomic{Float64}(min_score)
scores = [0.0 for _ in 1:Threads.nthreads()]
is = [0 for _ in 1:Threads.nthreads()]
@ -37,15 +37,15 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
end
_preprocess(dist::AbstractQGramDistance, ::Missing) = missing
_preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
_preprocess(dist::StringDistance, s) = s
_preprocess(dist::Union{StringSemiMetric, StringMetric}, s) = s
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
function Base.findmax(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
findnearest(s, itr, dist; min_score = min_score)
end
"""
findall(s, itr , dist::StringDistance; min_score = 0.8)
findall(s, itr , dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8)
`findall` returns the vector of indices for elements of `itr` that have a
similarity score higher or equal than `min_score` according to the distance `dist`.
@ -66,7 +66,7 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9)
0-element Array{Int64,1}
```
"""
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
function Base.findall(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8)
out = [Int[] for _ in 1:Threads.nthreads()]
s = _preprocess(dist, s)
# need collect since @threads requires a length method

View File

@ -15,7 +15,7 @@ julia> Partial(RatcliffObershelp())(s1, s2)
0.5483870967741935
```
"""
struct Partial{S <: StringDistance} <: StringSemiMetric
struct Partial{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::S
end
@ -97,7 +97,7 @@ julia> TokenSort(RatcliffObershelp())(s1, s2)
0.0
```
"""
struct TokenSort{S <: StringDistance} <: StringSemiMetric
struct TokenSort{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::S
end
@ -131,7 +131,7 @@ julia> TokenSet(RatcliffObershelp())(s1, s2)
0.0
```
"""
struct TokenSet{S <: StringDistance} <: StringSemiMetric
struct TokenSet{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::S
end
@ -173,7 +173,7 @@ julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2)
0.05
```
"""
struct TokenMax{S <: StringDistance} <: StringSemiMetric
struct TokenMax{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::S
max_dist::Float64
end

View File

@ -1,4 +1,4 @@
struct Normalized{V <: StringDistance} <: StringSemiMetric
struct Normalized{V <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::V
max_dist::Float64
end
@ -59,7 +59,7 @@ julia> StringDistances.normalize(Levenshtein())(s1, s2)
0.8064
```
"""
normalize(dist::StringDistance; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
normalize(dist::Union{StringSemiMetric, StringMetric}; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
@ -75,6 +75,6 @@ julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
function compare(s1, s2, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
end

View File

@ -23,21 +23,21 @@ julia> pairwise(Levenshtein(), iter, iter2)
10.0
```
"""
function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
function pairwise(dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
T = result_type(dist, eltype(xs), eltype(ys))
R = Matrix{T}(undef, length(xs), length(ys))
pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
"""
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
`Union{StringSemiMetric, StringMetric}` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
Set `preprocess` to false if no preprocessing should be used.
"""
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
function pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
(xs === ys) ?
@ -45,7 +45,7 @@ function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector,
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = true)
function _symmetric_pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector; preprocess = true)
if preprocess
xs = _preprocess_list(dist, xs)
end
@ -59,7 +59,7 @@ function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abstr
return R
end
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = true)
function _asymmetric_pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector; preprocess = true)
if preprocess
objxs = _preprocess_list(dist, xs)
objys = xs === ys ? objxs : _preprocess_list(dist, ys)
@ -75,5 +75,5 @@ function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abst
return R
end
_preprocess_list(dist::StringDistance, xs) = xs
_preprocess_list(dist::Union{StringSemiMetric, StringMetric}, xs) = xs
_preprocess_list(dist::AbstractQGramDistance, xs) = fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))