remove StringDistance type (since Distance does not exist)
parent
d9f99986fb
commit
0e5cd7e4d2
|
@ -1,6 +1,6 @@
|
|||
name = "StringDistances"
|
||||
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
|
||||
version = "0.10.1"
|
||||
version = "0.11.0"
|
||||
|
||||
[deps]
|
||||
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
||||
|
|
31
README.md
31
README.md
|
@ -5,26 +5,25 @@
|
|||
The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`.
|
||||
|
||||
## Supported Distances
|
||||
|
||||
Distances are defined over iterators that define `length` (this includes `AbstractStrings`, but also `GraphemeIterators` or `AbstractVectors`)
|
||||
The package defines two abstract types: `StringSemiMetric <: SemiMetric`, and `StringMetric <: Metric`.
|
||||
String distances inherit from one of these two types. They act over any pair of iterators that define `length` (this includes `AbstractStrings`, but also `GraphemeIterators` or `AbstractVectors`).
|
||||
|
||||
The available distances are:
|
||||
|
||||
- Edit Distances
|
||||
- Hamming Distance `Hamming()`
|
||||
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()`
|
||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: Metric`
|
||||
- [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
|
||||
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: Metric`
|
||||
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
|
||||
- Hamming Distance `Hamming() <: StringMetric`
|
||||
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler() <: StringSemiMetric`
|
||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: StringMetric`
|
||||
- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement() <: StringSemiMetric`
|
||||
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: StringMetric`
|
||||
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp() <: StringSemiMetric`
|
||||
- Q-gram distances compare the set of all substrings of length `q` in each string.
|
||||
- QGram Distance `Qgram(q::Int)`
|
||||
- [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity) `Cosine(q::Int)`
|
||||
- [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int)`
|
||||
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
|
||||
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
|
||||
- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)`
|
||||
- [Normalized Multiset Distance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NMD(q::Int)`
|
||||
- QGram Distance `Qgram(q::Int) <: StringSemiMetric`
|
||||
- [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity) `Cosine(q::Int) <: StringSemiMetric`
|
||||
- [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int) <: StringSemiMetric`
|
||||
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int) <: StringSemiMetric`
|
||||
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int) <: StringSemiMetric`
|
||||
- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int) <: StringSemiMetric`
|
||||
- [Normalized Multiset Distance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NMD(q::Int) <: StringSemiMetric`
|
||||
|
||||
|
||||
## Basic Use
|
||||
|
|
|
@ -4,15 +4,14 @@ using Distances
|
|||
import StatsAPI: pairwise, pairwise!
|
||||
abstract type StringSemiMetric <: SemiMetric end
|
||||
abstract type StringMetric <: Metric end
|
||||
const StringDistance = Union{StringSemiMetric, StringMetric}
|
||||
function Distances.result_type(dist::StringDistance, s1::Type, s2::Type)
|
||||
function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::Type, s2::Type)
|
||||
T = typeof(dist("", ""))
|
||||
if (Missing <: s1) | (Missing <: s2)
|
||||
T = Union{T, Missing}
|
||||
end
|
||||
return T
|
||||
end
|
||||
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
|
||||
Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
|
||||
|
||||
|
||||
include("distances/utils.jl")
|
||||
|
@ -33,9 +32,10 @@ include("fuzzywuzzy.jl")
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
export StringDistance,
|
||||
export
|
||||
StringSemiMetric,
|
||||
StringMetric,
|
||||
# edit distances
|
||||
Hamming,
|
||||
Jaro,
|
||||
JaroWinkler,
|
||||
|
@ -43,6 +43,7 @@ Levenshtein,
|
|||
OptimalStringAlignement,
|
||||
DamerauLevenshtein,
|
||||
RatcliffObershelp,
|
||||
# Qgram distances
|
||||
AbstractQGramDistance,
|
||||
QGramDict,
|
||||
QGramSortedVector,
|
||||
|
@ -53,15 +54,19 @@ SorensenDice,
|
|||
Overlap,
|
||||
MorisitaOverlap,
|
||||
NMD,
|
||||
qgrams,
|
||||
# normalize
|
||||
compare,
|
||||
# fuzzywuzzy
|
||||
Partial,
|
||||
TokenSort,
|
||||
TokenSet,
|
||||
TokenMax,
|
||||
evaluate,
|
||||
compare,
|
||||
result_type,
|
||||
qgrams,
|
||||
# find
|
||||
findnearest,
|
||||
# re-export from Distances.jl
|
||||
evaluate,
|
||||
result_type,
|
||||
pairwise,
|
||||
pairwise!
|
||||
end
|
||||
|
|
|
@ -5,7 +5,7 @@ Creates the Hamming distance
|
|||
|
||||
The Hamming distance is defined as the number of characters that do not match
|
||||
"""
|
||||
struct Hamming{V <: Union{Int, Nothing}} <: StringSemiMetric
|
||||
struct Hamming{V <: Union{Int, Nothing}} <: StringMetric
|
||||
max_dist::V
|
||||
end
|
||||
Hamming() = Hamming(nothing)
|
||||
|
|
12
src/find.jl
12
src/find.jl
|
@ -1,5 +1,5 @@
|
|||
"""
|
||||
findnearest(s, itr, dist::StringDistance) -> (x, index)
|
||||
findnearest(s, itr, dist::Union{StringMetric, StringSemiMetric}) -> (x, index)
|
||||
|
||||
`findnearest` returns the value and index of the element of `itr` that has the
|
||||
lowest distance with `s` according to the distance `dist`.
|
||||
|
@ -18,7 +18,7 @@ julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
|
|||
(nothing, nothing)
|
||||
```
|
||||
"""
|
||||
function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
||||
function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
|
||||
min_score_atomic = Threads.Atomic{Float64}(min_score)
|
||||
scores = [0.0 for _ in 1:Threads.nthreads()]
|
||||
is = [0 for _ in 1:Threads.nthreads()]
|
||||
|
@ -37,15 +37,15 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
|||
end
|
||||
_preprocess(dist::AbstractQGramDistance, ::Missing) = missing
|
||||
_preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
|
||||
_preprocess(dist::StringDistance, s) = s
|
||||
_preprocess(dist::Union{StringSemiMetric, StringMetric}, s) = s
|
||||
|
||||
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
|
||||
function Base.findmax(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
|
||||
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
|
||||
findnearest(s, itr, dist; min_score = min_score)
|
||||
end
|
||||
|
||||
"""
|
||||
findall(s, itr , dist::StringDistance; min_score = 0.8)
|
||||
findall(s, itr , dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8)
|
||||
|
||||
`findall` returns the vector of indices for elements of `itr` that have a
|
||||
similarity score higher than or equal to `min_score` according to the distance `dist`.
|
||||
|
@ -66,7 +66,7 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9)
|
|||
0-element Array{Int64,1}
|
||||
```
|
||||
"""
|
||||
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
|
||||
function Base.findall(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8)
|
||||
out = [Int[] for _ in 1:Threads.nthreads()]
|
||||
s = _preprocess(dist, s)
|
||||
# need collect since @threads requires a length method
|
||||
|
|
|
@ -15,7 +15,7 @@ julia> Partial(RatcliffObershelp())(s1, s2)
|
|||
0.5483870967741935
|
||||
```
|
||||
"""
|
||||
struct Partial{S <: StringDistance} <: StringSemiMetric
|
||||
struct Partial{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||
dist::S
|
||||
end
|
||||
|
||||
|
@ -97,7 +97,7 @@ julia> TokenSort(RatcliffObershelp())(s1, s2)
|
|||
0.0
|
||||
```
|
||||
"""
|
||||
struct TokenSort{S <: StringDistance} <: StringSemiMetric
|
||||
struct TokenSort{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||
dist::S
|
||||
end
|
||||
|
||||
|
@ -131,7 +131,7 @@ julia> TokenSet(RatcliffObershelp())(s1, s2)
|
|||
0.0
|
||||
```
|
||||
"""
|
||||
struct TokenSet{S <: StringDistance} <: StringSemiMetric
|
||||
struct TokenSet{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||
dist::S
|
||||
end
|
||||
|
||||
|
@ -173,7 +173,7 @@ julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2)
|
|||
0.05
|
||||
```
|
||||
"""
|
||||
struct TokenMax{S <: StringDistance} <: StringSemiMetric
|
||||
struct TokenMax{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||
dist::S
|
||||
max_dist::Float64
|
||||
end
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
struct Normalized{V <: StringDistance} <: StringSemiMetric
|
||||
struct Normalized{V <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||
dist::V
|
||||
max_dist::Float64
|
||||
end
|
||||
|
@ -59,7 +59,7 @@ julia> StringDistances.normalize(Levenshtein())(s1, s2)
|
|||
0.8064
|
||||
```
|
||||
"""
|
||||
normalize(dist::StringDistance; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
|
||||
normalize(dist::Union{StringSemiMetric, StringMetric}; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
|
||||
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
|
||||
|
||||
|
||||
|
@ -75,6 +75,6 @@ julia> compare("martha", "marhta", Levenshtein())
|
|||
0.6666666666666667
|
||||
```
|
||||
"""
|
||||
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
|
||||
function compare(s1, s2, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
|
||||
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
|
||||
end
|
||||
|
|
|
@ -23,21 +23,21 @@ julia> pairwise(Levenshtein(), iter, iter2)
|
|||
10.0
|
||||
```
|
||||
"""
|
||||
function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||
function pairwise(dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||
T = result_type(dist, eltype(xs), eltype(ys))
|
||||
R = Matrix{T}(undef, length(xs), length(ys))
|
||||
pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||
end
|
||||
|
||||
"""
|
||||
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||
pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||
|
||||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
|
||||
string distance `dist` (a `StringSemiMetric` or `StringMetric`) and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
|
||||
|
||||
Set `preprocess` to false if no preprocessing should be used.
|
||||
"""
|
||||
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||
function pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
|
||||
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
|
||||
(xs === ys) ?
|
||||
|
@ -45,7 +45,7 @@ function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector,
|
|||
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||
end
|
||||
|
||||
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = true)
|
||||
function _symmetric_pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector; preprocess = true)
|
||||
if preprocess
|
||||
xs = _preprocess_list(dist, xs)
|
||||
end
|
||||
|
@ -59,7 +59,7 @@ function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abstr
|
|||
return R
|
||||
end
|
||||
|
||||
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = true)
|
||||
function _asymmetric_pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector; preprocess = true)
|
||||
if preprocess
|
||||
objxs = _preprocess_list(dist, xs)
|
||||
objys = xs === ys ? objxs : _preprocess_list(dist, ys)
|
||||
|
@ -75,5 +75,5 @@ function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abst
|
|||
return R
|
||||
end
|
||||
|
||||
_preprocess_list(dist::StringDistance, xs) = xs
|
||||
_preprocess_list(dist::Union{StringSemiMetric, StringMetric}, xs) = xs
|
||||
_preprocess_list(dist::AbstractQGramDistance, xs) = fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
|
Loading…
Reference in New Issue