remove StringDistance type (since Distance does not exist)
parent
d9f99986fb
commit
0e5cd7e4d2
|
@ -1,6 +1,6 @@
|
||||||
name = "StringDistances"
|
name = "StringDistances"
|
||||||
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
|
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
|
||||||
version = "0.10.1"
|
version = "0.11.0"
|
||||||
|
|
||||||
[deps]
|
[deps]
|
||||||
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
||||||
|
|
31
README.md
31
README.md
|
@ -5,26 +5,25 @@
|
||||||
The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`.
|
The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`.
|
||||||
|
|
||||||
## Supported Distances
|
## Supported Distances
|
||||||
|
The package defines two abstract types: `StringSemiMetric <: SemiMetric`, and `StringMetric <: Metric`.
|
||||||
Distances are defined over iterators that define `length` (this includes `AbstractStrings`, but also `GraphemeIterators` or `AbstractVectors`)
|
String distances inherit from one of these two types. They act over any pair of iterators that define `length` (this includes `AbstractStrings`, but also `GraphemeIterators` or `AbstractVectors`).
|
||||||
|
|
||||||
The available distances are:
|
The available distances are:
|
||||||
|
|
||||||
- Edit Distances
|
- Edit Distances
|
||||||
- Hamming Distance `Hamming()`
|
- Hamming Distance `Hamming() <: StringMetric`
|
||||||
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()`
|
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler() <: StringSemiMetric`
|
||||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: Metric`
|
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: StringMetric`
|
||||||
- [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
|
- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement() <: StringSemiMetric`
|
||||||
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: Metric`
|
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: StringMetric`
|
||||||
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
|
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp() <: StringSemiMetric`
|
||||||
- Q-gram distances compare the set of all substrings of length `q` in each string.
|
- Q-gram distances compare the set of all substrings of length `q` in each string.
|
||||||
- QGram Distance `Qgram(q::Int)`
|
- QGram Distance `Qgram(q::Int) <: StringSemiMetric`
|
||||||
- [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity) `Cosine(q::Int)`
|
- [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity) `Cosine(q::Int) <: StringSemiMetric`
|
||||||
- [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int)`
|
- [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int) <: StringSemiMetric`
|
||||||
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
|
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int) <: StringSemiMetric`
|
||||||
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
|
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int) <: StringSemiMetric`
|
||||||
- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)`
|
- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int) <: StringSemiMetric`
|
||||||
- [Normalized Multiset Distance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NMD(q::Int)`
|
- [Normalized Multiset Distance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NMD(q::Int) <: StringSemiMetric`
|
||||||
|
|
||||||
|
|
||||||
## Basic Use
|
## Basic Use
|
||||||
|
|
|
@ -4,15 +4,14 @@ using Distances
|
||||||
import StatsAPI: pairwise, pairwise!
|
import StatsAPI: pairwise, pairwise!
|
||||||
abstract type StringSemiMetric <: SemiMetric end
|
abstract type StringSemiMetric <: SemiMetric end
|
||||||
abstract type StringMetric <: Metric end
|
abstract type StringMetric <: Metric end
|
||||||
const StringDistance = Union{StringSemiMetric, StringMetric}
|
function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::Type, s2::Type)
|
||||||
function Distances.result_type(dist::StringDistance, s1::Type, s2::Type)
|
|
||||||
T = typeof(dist("", ""))
|
T = typeof(dist("", ""))
|
||||||
if (Missing <: s1) | (Missing <: s2)
|
if (Missing <: s1) | (Missing <: s2)
|
||||||
T = Union{T, Missing}
|
T = Union{T, Missing}
|
||||||
end
|
end
|
||||||
return T
|
return T
|
||||||
end
|
end
|
||||||
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
|
Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
|
||||||
|
|
||||||
|
|
||||||
include("distances/utils.jl")
|
include("distances/utils.jl")
|
||||||
|
@ -33,9 +32,10 @@ include("fuzzywuzzy.jl")
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
export StringDistance,
|
export
|
||||||
StringSemiMetric,
|
StringSemiMetric,
|
||||||
StringMetric,
|
StringMetric,
|
||||||
|
# edit distances
|
||||||
Hamming,
|
Hamming,
|
||||||
Jaro,
|
Jaro,
|
||||||
JaroWinkler,
|
JaroWinkler,
|
||||||
|
@ -43,6 +43,7 @@ Levenshtein,
|
||||||
OptimalStringAlignement,
|
OptimalStringAlignement,
|
||||||
DamerauLevenshtein,
|
DamerauLevenshtein,
|
||||||
RatcliffObershelp,
|
RatcliffObershelp,
|
||||||
|
# Qgram distances
|
||||||
AbstractQGramDistance,
|
AbstractQGramDistance,
|
||||||
QGramDict,
|
QGramDict,
|
||||||
QGramSortedVector,
|
QGramSortedVector,
|
||||||
|
@ -53,15 +54,19 @@ SorensenDice,
|
||||||
Overlap,
|
Overlap,
|
||||||
MorisitaOverlap,
|
MorisitaOverlap,
|
||||||
NMD,
|
NMD,
|
||||||
|
qgrams,
|
||||||
|
# normalize
|
||||||
|
compare,
|
||||||
|
# fuzzywuzzy
|
||||||
Partial,
|
Partial,
|
||||||
TokenSort,
|
TokenSort,
|
||||||
TokenSet,
|
TokenSet,
|
||||||
TokenMax,
|
TokenMax,
|
||||||
evaluate,
|
# find
|
||||||
compare,
|
|
||||||
result_type,
|
|
||||||
qgrams,
|
|
||||||
findnearest,
|
findnearest,
|
||||||
|
# re-export from Distances.jl
|
||||||
|
evaluate,
|
||||||
|
result_type,
|
||||||
pairwise,
|
pairwise,
|
||||||
pairwise!
|
pairwise!
|
||||||
end
|
end
|
||||||
|
|
|
@ -5,7 +5,7 @@ Creates the Hamming distance
|
||||||
|
|
||||||
The Hamming distance is defined as the number of characters that do not match
|
The Hamming distance is defined as the number of characters that do not match
|
||||||
"""
|
"""
|
||||||
struct Hamming{V <: Union{Int, Nothing}} <: StringSemiMetric
|
struct Hamming{V <: Union{Int, Nothing}} <: StringMetric
|
||||||
max_dist::V
|
max_dist::V
|
||||||
end
|
end
|
||||||
Hamming() = Hamming(nothing)
|
Hamming() = Hamming(nothing)
|
||||||
|
|
12
src/find.jl
12
src/find.jl
|
@ -1,5 +1,5 @@
|
||||||
"""
|
"""
|
||||||
findnearest(s, itr, dist::StringDistance) -> (x, index)
|
findnearest(s, itr, dist::Union{StringMetric, StringSemiMetric}) -> (x, index)
|
||||||
|
|
||||||
`findnearest` returns the value and index of the element of `itr` that has the
|
`findnearest` returns the value and index of the element of `itr` that has the
|
||||||
lowest distance with `s` according to the distance `dist`.
|
lowest distance with `s` according to the distance `dist`.
|
||||||
|
@ -18,7 +18,7 @@ julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
|
||||||
(nothing, nothing)
|
(nothing, nothing)
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
|
||||||
min_score_atomic = Threads.Atomic{Float64}(min_score)
|
min_score_atomic = Threads.Atomic{Float64}(min_score)
|
||||||
scores = [0.0 for _ in 1:Threads.nthreads()]
|
scores = [0.0 for _ in 1:Threads.nthreads()]
|
||||||
is = [0 for _ in 1:Threads.nthreads()]
|
is = [0 for _ in 1:Threads.nthreads()]
|
||||||
|
@ -37,15 +37,15 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
||||||
end
|
end
|
||||||
_preprocess(dist::AbstractQGramDistance, ::Missing) = missing
|
_preprocess(dist::AbstractQGramDistance, ::Missing) = missing
|
||||||
_preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
|
_preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
|
||||||
_preprocess(dist::StringDistance, s) = s
|
_preprocess(dist::Union{StringSemiMetric, StringMetric}, s) = s
|
||||||
|
|
||||||
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
|
function Base.findmax(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
|
||||||
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
|
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
|
||||||
findnearest(s, itr, dist; min_score = min_score)
|
findnearest(s, itr, dist; min_score = min_score)
|
||||||
end
|
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
findall(s, itr , dist::StringDistance; min_score = 0.8)
|
findall(s, itr , dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8)
|
||||||
|
|
||||||
`findall` returns the vector of indices for elements of `itr` that have a
|
`findall` returns the vector of indices for elements of `itr` that have a
|
||||||
similarity score higher or equal than `min_score` according to the distance `dist`.
|
similarity score greater than or equal to `min_score` according to the distance `dist`.
|
||||||
|
@ -66,7 +66,7 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9)
|
||||||
0-element Array{Int64,1}
|
0-element Array{Int64,1}
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
|
function Base.findall(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8)
|
||||||
out = [Int[] for _ in 1:Threads.nthreads()]
|
out = [Int[] for _ in 1:Threads.nthreads()]
|
||||||
s = _preprocess(dist, s)
|
s = _preprocess(dist, s)
|
||||||
# need collect since @threads requires a length method
|
# need collect since @threads requires a length method
|
||||||
|
|
|
@ -15,7 +15,7 @@ julia> Partial(RatcliffObershelp())(s1, s2)
|
||||||
0.5483870967741935
|
0.5483870967741935
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
struct Partial{S <: StringDistance} <: StringSemiMetric
|
struct Partial{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -97,7 +97,7 @@ julia> TokenSort(RatcliffObershelp())(s1, s2)
|
||||||
0.0
|
0.0
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
struct TokenSort{S <: StringDistance} <: StringSemiMetric
|
struct TokenSort{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -131,7 +131,7 @@ julia> TokenSet(RatcliffObershelp())(s1, s2)
|
||||||
0.0
|
0.0
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
struct TokenSet{S <: StringDistance} <: StringSemiMetric
|
struct TokenSet{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -173,7 +173,7 @@ julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2)
|
||||||
0.05
|
0.05
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
struct TokenMax{S <: StringDistance} <: StringSemiMetric
|
struct TokenMax{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
max_dist::Float64
|
max_dist::Float64
|
||||||
end
|
end
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
struct Normalized{V <: StringDistance} <: StringSemiMetric
|
struct Normalized{V <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
||||||
dist::V
|
dist::V
|
||||||
max_dist::Float64
|
max_dist::Float64
|
||||||
end
|
end
|
||||||
|
@ -59,7 +59,7 @@ julia> StringDistances.normalize(Levenshtein())(s1, s2)
|
||||||
0.8064
|
0.8064
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
normalize(dist::StringDistance; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
|
normalize(dist::Union{StringSemiMetric, StringMetric}; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
|
||||||
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
|
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
|
||||||
|
|
||||||
|
|
||||||
|
@ -75,6 +75,6 @@ julia> compare("martha", "marhta", Levenshtein())
|
||||||
0.6666666666666667
|
0.6666666666666667
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
|
function compare(s1, s2, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0)
|
||||||
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
|
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
|
||||||
end
|
end
|
||||||
|
|
|
@ -23,21 +23,21 @@ julia> pairwise(Levenshtein(), iter, iter2)
|
||||||
10.0
|
10.0
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
function pairwise(dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||||
T = result_type(dist, eltype(xs), eltype(ys))
|
T = result_type(dist, eltype(xs), eltype(ys))
|
||||||
R = Matrix{T}(undef, length(xs), length(ys))
|
R = Matrix{T}(undef, length(xs), length(ys))
|
||||||
pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||||
end
|
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||||
|
|
||||||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||||
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
|
`Union{StringSemiMetric, StringMetric}` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
|
||||||
|
|
||||||
Set `preprocess` to false if no preprocessing should be used.
|
Set `preprocess` to false if no preprocessing should be used.
|
||||||
"""
|
"""
|
||||||
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
function pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||||
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
|
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
|
||||||
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
|
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
|
||||||
(xs === ys) ?
|
(xs === ys) ?
|
||||||
|
@ -45,7 +45,7 @@ function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector,
|
||||||
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||||
end
|
end
|
||||||
|
|
||||||
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = true)
|
function _symmetric_pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector; preprocess = true)
|
||||||
if preprocess
|
if preprocess
|
||||||
xs = _preprocess_list(dist, xs)
|
xs = _preprocess_list(dist, xs)
|
||||||
end
|
end
|
||||||
|
@ -59,7 +59,7 @@ function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abstr
|
||||||
return R
|
return R
|
||||||
end
|
end
|
||||||
|
|
||||||
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = true)
|
function _asymmetric_pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector; preprocess = true)
|
||||||
if preprocess
|
if preprocess
|
||||||
objxs = _preprocess_list(dist, xs)
|
objxs = _preprocess_list(dist, xs)
|
||||||
objys = xs === ys ? objxs : _preprocess_list(dist, ys)
|
objys = xs === ys ? objxs : _preprocess_list(dist, ys)
|
||||||
|
@ -75,5 +75,5 @@ function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abst
|
||||||
return R
|
return R
|
||||||
end
|
end
|
||||||
|
|
||||||
_preprocess_list(dist::StringDistance, xs) = xs
|
_preprocess_list(dist::Union{StringSemiMetric, StringMetric}, xs) = xs
|
||||||
_preprocess_list(dist::AbstractQGramDistance, xs) = fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
|
_preprocess_list(dist::AbstractQGramDistance, xs) = fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
|
Loading…
Reference in New Issue