diff --git a/Project.toml b/Project.toml index d74c411..7a359c5 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "StringDistances" uuid = "88034a9c-02f8-509d-84a9-84ec65e18404" -version = "0.10.1" +version = "0.11.0" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" diff --git a/README.md b/README.md index d10dd91..7d2d250 100644 --- a/README.md +++ b/README.md @@ -5,26 +5,25 @@ The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`. ## Supported Distances - -Distances are defined over iterators that define `length` (this includes `AbstractStrings`, but also `GraphemeIterators` or `AbstractVectors`) +The package defines two abstract types: `StringSemiMetric <: SemiMetric`, and `StringMetric <: Metric`. +String distances inherit from one of these two types. They act over any pair of iterators that define `length` (this includes `AbstractStrings`, but also `GraphemeIterators` or `AbstractVectors`) The available distances are: - - Edit Distances - - Hamming Distance `Hamming()` - - [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()` - - [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: Metric` - - [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. 
restricted Damerau-Levenshtein) `OptimalStringAlignement()` - - [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: Metric` - - [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()` + - Hamming Distance `Hamming() <: StringMetric` + - [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler() <: StringSemiMetric` + - [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: StringMetric` + - [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement() <: StringSemiMetric` + - [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: StringMetric` + - [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp() <: StringSemiMetric` - Q-gram distances compare the set of all substrings of length `q` in each string.
- - QGram Distance `Qgram(q::Int)` - - [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity) `Cosine(q::Int)` - - [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int)` - - [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)` - - [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)` - - [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)` - - [Normalized Multiset Distance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NMD(q::Int)` + - QGram Distance `Qgram(q::Int) <: StringSemiMetric` + - [Cosine Distance](https://en.wikipedia.org/wiki/Cosine_similarity) `Cosine(q::Int) <: StringSemiMetric` + - [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int) <: StringSemiMetric` + - [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int) <: StringSemiMetric` + - [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int) <: StringSemiMetric` + - [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int) <: StringSemiMetric` + - [Normalized Multiset Distance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NMD(q::Int) <: StringSemiMetric` ## Basic Use diff --git a/src/StringDistances.jl b/src/StringDistances.jl index a6e3fdc..76fa7dd 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -4,15 +4,14 @@ using Distances import StatsAPI: pairwise, pairwise!
abstract type StringSemiMetric <: SemiMetric end abstract type StringMetric <: Metric end -const StringDistance = Union{StringSemiMetric, StringMetric} -function Distances.result_type(dist::StringDistance, s1::Type, s2::Type) +function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::Type, s2::Type) T = typeof(dist("", "")) if (Missing <: s1) | (Missing <: s2) T = Union{T, Missing} end return T end -Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2)) +Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1, s2) = result_type(dist, typeof(s1), typeof(s2)) include("distances/utils.jl") @@ -33,9 +32,10 @@ include("fuzzywuzzy.jl") ## ############################################################################## -export StringDistance, +export StringSemiMetric, StringMetric, +# edit distances Hamming, Jaro, JaroWinkler, @@ -43,6 +43,7 @@ Levenshtein, OptimalStringAlignement, DamerauLevenshtein, RatcliffObershelp, +# Qgram distances AbstractQGramDistance, QGramDict, QGramSortedVector, @@ -53,15 +54,19 @@ SorensenDice, Overlap, MorisitaOverlap, NMD, +qgrams, +# normalize +compare, +# fuzzywuzzy Partial, TokenSort, TokenSet, TokenMax, -evaluate, -compare, -result_type, -qgrams, +# find findnearest, +# re-export from Distances.jl +evaluate, +result_type, pairwise, pairwise!
end diff --git a/src/distances/edit.jl b/src/distances/edit.jl index 2f24eb4..cf0ffb3 100755 --- a/src/distances/edit.jl +++ b/src/distances/edit.jl @@ -5,7 +5,7 @@ Creates the Hamming distance The Hamming distance is defined as the number of characters that do not match """ -struct Hamming{V <: Union{Int, Nothing}} <: StringSemiMetric +struct Hamming{V <: Union{Int, Nothing}} <: StringMetric max_dist::V end Hamming() = Hamming(nothing) diff --git a/src/find.jl b/src/find.jl index 9a92410..fd830f6 100644 --- a/src/find.jl +++ b/src/find.jl @@ -1,5 +1,5 @@ """ - findnearest(s, itr, dist::StringDistance) -> (x, index) + findnearest(s, itr, dist::Union{StringMetric, StringSemiMetric}) -> (x, index) `findnearest` returns the value and index of the element of `itr` that has the lowest distance with `s` according to the distance `dist`. @@ -18,7 +18,7 @@ julia> findnearest(s, iter, Levenshtein(); min_score = 0.9) (nothing, nothing) ``` """ -function findnearest(s, itr, dist::StringDistance; min_score = 0.0) +function findnearest(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0) min_score_atomic = Threads.Atomic{Float64}(min_score) scores = [0.0 for _ in 1:Threads.nthreads()] is = [0 for _ in 1:Threads.nthreads()] @@ -37,15 +37,15 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0) end _preprocess(dist::AbstractQGramDistance, ::Missing) = missing _preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q) -_preprocess(dist::StringDistance, s) = s +_preprocess(dist::Union{StringSemiMetric, StringMetric}, s) = s -function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0) +function Base.findmax(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0) @warn "findmax(s, itr, dist; min_score) is deprecated. 
Use findnearest(s, itr, dist; min_score)" findnearest(s, itr, dist; min_score = min_score) end """ - findall(s, itr , dist::StringDistance; min_score = 0.8) + findall(s, itr , dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8) `findall` returns the vector of indices for elements of `itr` that have a similarity score higher or equal than `min_score` according to the distance `dist`. @@ -66,7 +66,7 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9) 0-element Array{Int64,1} ``` """ -function Base.findall(s, itr, dist::StringDistance; min_score = 0.8) +function Base.findall(s, itr, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.8) out = [Int[] for _ in 1:Threads.nthreads()] s = _preprocess(dist, s) # need collect since @threads requires a length method diff --git a/src/fuzzywuzzy.jl b/src/fuzzywuzzy.jl index fe47a7b..eb9c43f 100755 --- a/src/fuzzywuzzy.jl +++ b/src/fuzzywuzzy.jl @@ -15,7 +15,7 @@ julia> Partial(RatcliffObershelp())(s1, s2) 0.5483870967741935 ``` """ -struct Partial{S <: StringDistance} <: StringSemiMetric +struct Partial{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric dist::S end @@ -97,7 +97,7 @@ julia> TokenSort(RatcliffObershelp())(s1, s2) 0.0 ``` """ -struct TokenSort{S <: StringDistance} <: StringSemiMetric +struct TokenSort{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric dist::S end @@ -131,7 +131,7 @@ julia> TokenSet(RatcliffObershelp())(s1, s2) 0.0 ``` """ -struct TokenSet{S <: StringDistance} <: StringSemiMetric +struct TokenSet{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric dist::S end @@ -173,7 +173,7 @@ julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2) 0.05 ``` """ -struct TokenMax{S <: StringDistance} <: StringSemiMetric +struct TokenMax{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric dist::S max_dist::Float64 end diff --git a/src/normalize.jl b/src/normalize.jl index c5000d2..b12ac39 100755 --- a/src/normalize.jl +++ b/src/normalize.jl @@ 
-1,4 +1,4 @@ -struct Normalized{V <: StringDistance} <: StringSemiMetric +struct Normalized{V <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric dist::V max_dist::Float64 end @@ -59,7 +59,7 @@ julia> StringDistances.normalize(Levenshtein())(s1, s2) 0.8064 ``` """ -normalize(dist::StringDistance; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist) +normalize(dist::Union{StringSemiMetric, StringMetric}; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist) normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist) @@ -75,6 +75,6 @@ julia> compare("martha", "marhta", Levenshtein()) 0.6666666666666667 ``` """ -function compare(s1, s2, dist::StringDistance; min_score = 0.0) +function compare(s1, s2, dist::Union{StringSemiMetric, StringMetric}; min_score = 0.0) 1 - normalize(dist, max_dist = 1 - min_score)(s1, s2) end diff --git a/src/pairwise.jl b/src/pairwise.jl index 73ed280..4506c2a 100644 --- a/src/pairwise.jl +++ b/src/pairwise.jl @@ -23,21 +23,21 @@ julia> pairwise(Levenshtein(), iter, iter2) 10.0 ``` """ -function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true) +function pairwise(dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true) T = result_type(dist, eltype(xs), eltype(ys)) R = Matrix{T}(undef, length(xs), length(ys)) pairwise!(R, dist, xs, ys; preprocess = preprocess) end """ - pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true) + pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true) Compute distances between all pairs of elements in `xs` and `ys` according to the -`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`. +`Union{StringSemiMetric, StringMetric}` `dist` and write the result in `R`. 
`R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`. Set `preprocess` to false if no preprocessing should be used. """ -function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true) +function pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true) length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length")) length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length")) (xs === ys) ? @@ -45,7 +45,7 @@ function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, _asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess) end -function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = true) +function _symmetric_pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector; preprocess = true) if preprocess xs = _preprocess_list(dist, xs) end @@ -59,7 +59,7 @@ function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abstr return R end -function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = true) +function _asymmetric_pairwise!(R::AbstractMatrix, dist::Union{StringSemiMetric, StringMetric}, xs::AbstractVector, ys::AbstractVector; preprocess = true) if preprocess objxs = _preprocess_list(dist, xs) objys = xs === ys ? objxs : _preprocess_list(dist, ys) @@ -75,5 +75,5 @@ function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abst return R end -_preprocess_list(dist::StringDistance, xs) = xs +_preprocess_list(dist::Union{StringSemiMetric, StringMetric}, xs) = xs _preprocess_list(dist::AbstractQGramDistance, xs) = fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs)) \ No newline at end of file