remove StringDistance type (since Distance does not exist)

pull/57/head
matthieugomez 2021-09-12 15:06:31 -04:00
parent d9f99986fb
commit 0e5cd7e4d2
8 changed files with 50 additions and 46 deletions

View File

@ -1,6 +1,6 @@
name = "StringDistances" name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404" uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.10.1" version = "0.11.0"
[deps] [deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

View File

@ -5,26 +5,25 @@
The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`. The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`.
## Supported Distances ## Supported Distances
The package defines two abstract types: `StringSemiMetric <: SemiMetric`, and `StringMetric <: Metric`.
Distances are defined over iterators that define `length` (this includes `AbstractStrings`, but also `GraphemeIterators` or `AbstractVectors`) String distances inherit from one of these two types. They act over any pair of iterators that define `length` (this includes `AbstractStrings`, but also `GraphemeIterators` or `AbstractVectors`).
The available distances are: The available distances are:
- Edit Distances - Edit Distances
- Hamming Distance `Hamming()` - Hamming Distance `Hamming() <: StringMetric`
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()` - [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler() <: StringSemiMetric`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: Metric` - [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: StringMetric`
- [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()` - [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement() <: StringSemiMetric`
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: Metric` - [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: StringMetric`
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()` - [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp() <: StringSemiMetric`
- Q-gram distances compare the set of all substrings of length `q` in each string. - Q-gram distances compare the set of all substrings of length `q` in each string.
- QGram Distance `Qgram(q::Int)` - QGram Distance `Qgram(q::Int) <: StringSemiMetric`
- [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity) `Cosine(q::Int)` - [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity) `Cosine(q::Int) <: StringSemiMetric`
- [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int)` - [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int) <: StringSemiMetric`
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)` - [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int) <: StringSemiMetric`
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)` - [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int) <: StringSemiMetric`
- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)` - [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int) <: StringSemiMetric`
- [Normalized Multiset Distance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NMD(q::Int)` - [Normalized Multiset Distance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NMD(q::Int) <: StringSemiMetric`
## Basic Use ## Basic Use

View File

@ -4,15 +4,14 @@ using Distances
import StatsAPI: pairwise, pairwise! import StatsAPI: pairwise, pairwise!
abstract type StringSemiMetric <: SemiMetric end abstract type StringSemiMetric <: SemiMetric end
abstract type StringMetric <: Metric end abstract type StringMetric <: Metric end
const StringDistance = Union{StringSemiMetric, StringMetric} function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::Type, s2::Type)
function Distances.result_type(dist::StringDistance, s1::Type, s2::Type)
T = typeof(dist("", "")) T = typeof(dist("", ""))
if (Missing <: s1) | (Missing <: s2) if (Missing <: s1) | (Missing <: s2)
T = Union{T, Missing} T = Union{T, Missing}
end end
return T return T
end end
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2)) Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
include("distances/utils.jl") include("distances/utils.jl")
@ -33,9 +32,10 @@ include("fuzzywuzzy.jl")
## ##
############################################################################## ##############################################################################
export StringDistance, export
StringSemiMetric, StringSemiMetric,
StringMetric, StringMetric,
# edit distances
Hamming, Hamming,
Jaro, Jaro,
JaroWinkler, JaroWinkler,
@ -43,6 +43,7 @@ Levenshtein,
OptimalStringAlignement, OptimalStringAlignement,
DamerauLevenshtein, DamerauLevenshtein,
RatcliffObershelp, RatcliffObershelp,
# Qgram distances
AbstractQGramDistance, AbstractQGramDistance,
QGramDict, QGramDict,
QGramSortedVector, QGramSortedVector,
@ -53,15 +54,19 @@ SorensenDice,
Overlap, Overlap,
MorisitaOverlap, MorisitaOverlap,
NMD, NMD,
qgrams,
# normalize
compare,
# fuzzywuzzy
Partial, Partial,
TokenSort, TokenSort,
TokenSet, TokenSet,
TokenMax, TokenMax,
evaluate, # find
compare,
result_type,
qgrams,
findnearest, findnearest,
# re-export from Distances.jl
evaluate,
result_type,
pairwise, pairwise,
pairwise! pairwise!
end end

View File

@ -5,7 +5,7 @@ Creates the Hamming distance
The Hamming distance is defined as the number of characters that do not match The Hamming distance is defined as the number of characters that do not match
""" """
struct Hamming{V <: Union{Int, Nothing}} <: StringSemiMetric struct Hamming{V <: Union{Int, Nothing}} <: StringMetric
max_dist::V max_dist::V
end end
Hamming() = Hamming(nothing) Hamming() = Hamming(nothing)

View File

@ -1,5 +1,5 @@
""" """
findnearest(s, itr, dist::StringDistance) -> (x, index) findnearest(s, itr, dist::Union{StringMetric, StringSemiMetric}) -> (x, index)
`findnearest` returns the value and index of the element of `itr` that has the `findnearest` returns the value and index of the element of `itr` that has the
lowest distance with `s` according to the distance `dist`. lowest distance with `s` according to the distance `dist`.
@ -18,7 +18,7 @@ julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
(nothing, nothing) (nothing, nothing)
``` ```
""" """
function findnearest(s, itr, dist::StringDistance; min_score = 0.0) function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
min_score_atomic = Threads.Atomic{Float64}(min_score) min_score_atomic = Threads.Atomic{Float64}(min_score)
scores = [0.0 for _ in 1:Threads.nthreads()] scores = [0.0 for _ in 1:Threads.nthreads()]
is = [0 for _ in 1:Threads.nthreads()] is = [0 for _ in 1:Threads.nthreads()]
@ -37,15 +37,15 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
end end
_preprocess(dist::AbstractQGramDistance, ::Missing) = missing _preprocess(dist::AbstractQGramDistance, ::Missing) = missing
_preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q) _preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
_preprocess(dist::StringDistance, s) = s _preprocess(dist::Union{StringSemiMetric, StringMetric}, s) = s
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0) function Base.findmax(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)" @warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
findnearest(s, itr, dist; min_score = min_score) findnearest(s, itr, dist; min_score = min_score)
end end
""" """
findall(s, itr , dist::StringDistance; min_score = 0.8) findall(s, itr , dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8)
`findall` returns the vector of indices for elements of `itr` that have a `findall` returns the vector of indices for elements of `itr` that have a
similarity score higher or equal than `min_score` according to the distance `dist`. similarity score higher or equal than `min_score` according to the distance `dist`.
@ -66,7 +66,7 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9)
0-element Array{Int64,1} 0-element Array{Int64,1}
``` ```
""" """
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8) function Base.findall(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8)
out = [Int[] for _ in 1:Threads.nthreads()] out = [Int[] for _ in 1:Threads.nthreads()]
s = _preprocess(dist, s) s = _preprocess(dist, s)
# need collect since @threads requires a length method # need collect since @threads requires a length method

View File

@ -15,7 +15,7 @@ julia> Partial(RatcliffObershelp())(s1, s2)
0.5483870967741935 0.5483870967741935
``` ```
""" """
struct Partial{S <: StringDistance} <: StringSemiMetric struct Partial{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::S dist::S
end end
@ -97,7 +97,7 @@ julia> TokenSort(RatcliffObershelp())(s1, s2)
0.0 0.0
``` ```
""" """
struct TokenSort{S <: StringDistance} <: StringSemiMetric struct TokenSort{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::S dist::S
end end
@ -131,7 +131,7 @@ julia> TokenSet(RatcliffObershelp())(s1, s2)
0.0 0.0
``` ```
""" """
struct TokenSet{S <: StringDistance} <: StringSemiMetric struct TokenSet{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::S dist::S
end end
@ -173,7 +173,7 @@ julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2)
0.05 0.05
``` ```
""" """
struct TokenMax{S <: StringDistance} <: StringSemiMetric struct TokenMax{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::S dist::S
max_dist::Float64 max_dist::Float64
end end

View File

@ -1,4 +1,4 @@
struct Normalized{V <: StringDistance} <: StringSemiMetric struct Normalized{V <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
dist::V dist::V
max_dist::Float64 max_dist::Float64
end end
@ -59,7 +59,7 @@ julia> StringDistances.normalize(Levenshtein())(s1, s2)
0.8064 0.8064
``` ```
""" """
normalize(dist::StringDistance; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist) normalize(dist::Union{StringSemiMetric, StringMetric}; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist) normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
@ -75,6 +75,6 @@ julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667 0.6666666666666667
``` ```
""" """
function compare(s1, s2, dist::StringDistance; min_score = 0.0) function compare(s1, s2, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2) 1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
end end

View File

@ -23,21 +23,21 @@ julia> pairwise(Levenshtein(), iter, iter2)
10.0 10.0
``` ```
""" """
function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true) function pairwise(dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
T = result_type(dist, eltype(xs), eltype(ys)) T = result_type(dist, eltype(xs), eltype(ys))
R = Matrix{T}(undef, length(xs), length(ys)) R = Matrix{T}(undef, length(xs), length(ys))
pairwise!(R, dist, xs, ys; preprocess = preprocess) pairwise!(R, dist, xs, ys; preprocess = preprocess)
end end
""" """
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true) pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
Compute distances between all pairs of elements in `xs` and `ys` according to the Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`. string distance `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
Set `preprocess` to false if no preprocessing should be used. Set `preprocess` to false if no preprocessing should be used.
""" """
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true) function pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length")) length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length")) length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
(xs === ys) ? (xs === ys) ?
@ -45,7 +45,7 @@ function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector,
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess) _asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
end end
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = true) function _symmetric_pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector; preprocess = true)
if preprocess if preprocess
xs = _preprocess_list(dist, xs) xs = _preprocess_list(dist, xs)
end end
@ -59,7 +59,7 @@ function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abstr
return R return R
end end
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = true) function _asymmetric_pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector; preprocess = true)
if preprocess if preprocess
objxs = _preprocess_list(dist, xs) objxs = _preprocess_list(dist, xs)
objys = xs === ys ? objxs : _preprocess_list(dist, ys) objys = xs === ys ? objxs : _preprocess_list(dist, ys)
@ -75,5 +75,5 @@ function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abst
return R return R
end end
_preprocess_list(dist::StringDistance, xs) = xs _preprocess_list(dist::Union{StringSemiMetric, StringMetric}, xs) = xs
_preprocess_list(dist::AbstractQGramDistance, xs) = fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs)) _preprocess_list(dist::AbstractQGramDistance, xs) = fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))