use correct (unrestricted) DamerauLevenshtein.
The old DamerauLevenshtein distance is renamed to OptimalStringAlignement.pull/57/head
parent
0faf255f93
commit
5bec23d357
|
@ -14,6 +14,7 @@ The available distances are:
|
|||
- Hamming Distance `Hamming()`
|
||||
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()`
|
||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
|
||||
- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
|
||||
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
|
||||
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
|
||||
- Q-gram distances compare the set of all substrings of length `q` in each string.
|
||||
|
|
|
@ -21,10 +21,12 @@ end
|
|||
# 0.36s
|
||||
@time f(Levenshtein(), x, y, min_score = 0.8);
|
||||
# 0.11
|
||||
@time f(DamerauLevenshtein(), x, y);
|
||||
@time f(OptimalStringAlignement(), x, y);
|
||||
# 0.56s.
|
||||
@time f(DamerauLevenshtein(), x, y, min_score = 0.8);
|
||||
@time f(OptimalStringAlignement(), x, y, min_score = 0.8);
|
||||
# 0.08
|
||||
@time f(DamerauLevenshtein(), x, y);
|
||||
# 2s
|
||||
@time f(RatcliffObershelp(), x, y);
|
||||
# 0.65s
|
||||
|
||||
|
@ -33,7 +35,7 @@ end
|
|||
|
||||
@time findnearest(x[1], y, Levenshtein());
|
||||
# 0.1
|
||||
@time findnearest(x[1], y, DamerauLevenshtein());
|
||||
@time findnearest(x[1], y, OptimalStringAlignement());
|
||||
# 0.1
|
||||
@time findnearest(x[1], y, QGram(2));
|
||||
# 0.75
|
||||
|
@ -42,17 +44,17 @@ end
|
|||
|
||||
@time findall(x[1], y, Levenshtein());
|
||||
# 0.05
|
||||
@time findall(x[1], y, DamerauLevenshtein());
|
||||
@time findall(x[1], y, OptimalStringAlignement());
|
||||
# 0.05
|
||||
@time findall(x[1], y, Partial(DamerauLevenshtein()));
|
||||
@time findall(x[1], y, Partial(OptimalStringAlignement()));
|
||||
# 0.96
|
||||
@time findall(x[1], y, QGram(2));
|
||||
# 0.81
|
||||
@time findall(x[1], y, TokenSort(DamerauLevenshtein()));
|
||||
@time findall(x[1], y, TokenSort(OptimalStringAlignement()));
|
||||
# 0.27 (now 0.32)
|
||||
@time findall(x[1], y, TokenSet(DamerauLevenshtein()));
|
||||
@time findall(x[1], y, TokenSet(OptimalStringAlignement()));
|
||||
# 0.55
|
||||
@time findall(x[1], y, TokenMax(DamerauLevenshtein()));
|
||||
@time findall(x[1], y, TokenMax(OptimalStringAlignement()));
|
||||
# 2.25 (now 3.6)
|
||||
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ include("distances/qgram.jl")
|
|||
|
||||
include("modifiers.jl")
|
||||
include("normalize.jl")
|
||||
include("pairwise.jl")
|
||||
include("convenience.jl")
|
||||
# Distances API
|
||||
Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
|
||||
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
|
||||
|
@ -27,10 +27,11 @@ Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s
|
|||
export
|
||||
StringDistance,
|
||||
Hamming,
|
||||
Levenshtein,
|
||||
DamerauLevenshtein,
|
||||
Jaro,
|
||||
JaroWinkler,
|
||||
Levenshtein,
|
||||
OptimalStringAlignement,
|
||||
DamerauLevenshtein,
|
||||
RatcliffObershelp,
|
||||
AbstractQGramDistance,
|
||||
QGramDict,
|
||||
|
|
|
@ -0,0 +1,187 @@
|
|||
const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, OptimalStringAlignement, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
|
||||
|
||||
"""
|
||||
compare(s1, s2, dist)
|
||||
|
||||
return a similarity score between 0 and 1 for the strings `s1` and
|
||||
`s2` based on the distance `dist`.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> compare("martha", "marhta", Levenshtein())
|
||||
0.6666666666666667
|
||||
```
|
||||
"""
|
||||
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
|
||||
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
|
||||
end
|
||||
|
||||
"""
|
||||
findnearest(s, itr, dist::StringDistance) -> (x, index)
|
||||
|
||||
`findnearest` returns the value and index of the element of `itr` that has the
|
||||
lowest distance with `s` according to the distance `dist`.
|
||||
|
||||
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
|
||||
(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> using StringDistances
|
||||
julia> s = "Newark"
|
||||
julia> iter = ["New York", "Princeton", "San Francisco"]
|
||||
julia> findnearest(s, iter, Levenshtein())
|
||||
("NewYork", 1)
|
||||
julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
|
||||
(nothing, nothing)
|
||||
```
|
||||
"""
|
||||
function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
||||
min_score_atomic = Threads.Atomic{Float64}(min_score)
|
||||
scores = [0.0 for _ in 1:Threads.nthreads()]
|
||||
is = [0 for _ in 1:Threads.nthreads()]
|
||||
s = _helper(dist, s)
|
||||
# need collect since @threads requires a length method
|
||||
Threads.@threads for i in collect(eachindex(itr))
|
||||
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
|
||||
score_old = Threads.atomic_max!(min_score_atomic, score)
|
||||
if score >= score_old
|
||||
scores[Threads.threadid()] = score
|
||||
is[Threads.threadid()] = i
|
||||
end
|
||||
end
|
||||
imax = is[argmax(scores)]
|
||||
imax == 0 ? (nothing, nothing) : (itr[imax], imax)
|
||||
end
|
||||
# Prepare an element before comparing it under `dist`: q-gram distances get a
# `QGramSortedVector` built with `dist.q` (`missing` is passed through untouched);
# every other distance uses the element as is.
function _helper(dist::AbstractQGramDistance, ::Missing)
    return missing
end

function _helper(dist::AbstractQGramDistance, s)
    return QGramSortedVector(s, dist.q)
end

function _helper(dist::StringDistance, s)
    return s
end
|
||||
|
||||
# Deprecated entry point: emits a warning, then forwards to `findnearest`.
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
    @warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
    nearest = findnearest(s, itr, dist; min_score = min_score)
    return nearest
end
|
||||
|
||||
"""
|
||||
findall(s, itr , dist::StringDistance; min_score = 0.8)
|
||||
|
||||
`findall` returns the vector of indices for elements of `itr` that have a
|
||||
similarity score higher or equal than `min_score` according to the distance `dist`.
|
||||
If there are no such elements, return an empty array.
|
||||
|
||||
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
|
||||
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> using StringDistances
|
||||
julia> s = "Newark"
|
||||
julia> iter = ["Newwark", "Princeton", "San Francisco"]
|
||||
julia> findall(s, iter, Levenshtein())
|
||||
1-element Array{Int64,1}:
|
||||
1
|
||||
julia> findall(s, iter, Levenshtein(); min_score = 0.9)
|
||||
0-element Array{Int64,1}
|
||||
```
|
||||
"""
|
||||
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
|
||||
out = [Int[] for _ in 1:Threads.nthreads()]
|
||||
s = _helper(dist, s)
|
||||
# need collect since @threads requires a length method
|
||||
Threads.@threads for i in collect(eachindex(itr))
|
||||
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
|
||||
if score >= min_score
|
||||
push!(out[Threads.threadid()], i)
|
||||
end
|
||||
end
|
||||
vcat(out...)
|
||||
end
|
||||
|
||||
|
||||
"""
|
||||
pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
|
||||
|
||||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||
`StringDistance` `dist`. Returns a matrix R such that `R[i, j]` corrresponds to the distance between `xs[i]` and `ys[j]`.
|
||||
|
||||
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
|
||||
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
|
||||
false if no preprocessing should be used, regardless of length.
|
||||
|
||||
Both symmetric and asymmetric versions are available.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> using StringDistances
|
||||
julia> iter = ["New York", "Princeton"]
|
||||
julia> pairwise(Levenshtein(), iter)
|
||||
2×2 Array{Float64,2}:
|
||||
0.0 9.0
|
||||
9.0 0.0
|
||||
julia> iter2 = ["San Francisco"]
|
||||
julia> pairwise(Levenshtein(), iter, iter2)
|
||||
2×1 Array{Float64,2}:
|
||||
12.0
|
||||
10.0
|
||||
```
|
||||
"""
|
||||
function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
|
||||
T = result_type(dist, eltype(xs), eltype(ys))
|
||||
if Missing <: Union{eltype(xs), eltype(ys)}
|
||||
T = Union{T, Missing}
|
||||
end
|
||||
R = Matrix{T}(undef, length(xs), length(ys))
|
||||
pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||
end
|
||||
|
||||
"""
|
||||
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
|
||||
|
||||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
|
||||
|
||||
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
|
||||
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
|
||||
false if no preprocessing should be used, regardless of length.
|
||||
"""
|
||||
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
|
||||
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
|
||||
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
|
||||
((xs === ys) & (dist isa SemiMetric)) ?
|
||||
_symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
|
||||
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||
end
|
||||
|
||||
# Fill `R` for the case where both inputs are the same vector: each pair above the
# diagonal is evaluated once and written to both `R[i, j]` and `R[j, i]`.
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
    objs = _preprocess(xs, dist, preprocess)
    n = length(objs)
    for i in 1:n
        oi = objs[i]
        # handle missing: `oi != oi` is `missing` when `oi` is `missing`
        R[i, i] = oi != oi
        Threads.@threads for j in (i + 1):n
            val = evaluate(dist, oi, objs[j])
            R[j, i] = val
            R[i, j] = val
        end
    end
    return R
end
|
||||
|
||||
# Fill every entry `R[i, j]` with the distance between the i-th element of `xs`
# and the j-th element of `ys`, evaluating each pair separately.
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
    objsxs = _preprocess(xs, dist, preprocess)
    # reuse the preprocessed objects when both inputs are the same vector
    objsys = if xs === ys
        objsxs
    else
        _preprocess(ys, dist, preprocess)
    end
    nx, ny = length(objsxs), length(objsys)
    for i in 1:nx
        Threads.@threads for j in 1:ny
            R[i, j] = evaluate(dist, objsxs[i], objsys[j])
        end
    end
    return R
end
|
||||
|
||||
# Convert the elements of `xs` to `QGramSortedVector`s (in spawned tasks) when `dist`
# is a q-gram distance and preprocessing is enabled; otherwise return `xs` itself.
# With `preprocess === nothing`, preprocessing is enabled for 5 or more elements.
function _preprocess(xs, dist::StringDistance, preprocess)
    enabled = preprocess === nothing ? length(xs) >= 5 : preprocess
    if !((dist isa AbstractQGramDistance) && enabled)
        return xs
    end
    tasks = map(xs) do x
        # `missing` elements are passed through unchanged
        Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)
    end
    return fetch.(tasks)
end
|
|
@ -155,29 +155,29 @@ function (dist::Levenshtein)(s1, s2)
|
|||
end
|
||||
|
||||
"""
|
||||
DamerauLevenshtein()
|
||||
OptimalStringAlignement()
|
||||
|
||||
Creates the restricted DamerauLevenshtein distance
|
||||
Creates the OptimalStringAlignement distance (also known as the restricted DamerauLevenshtein distance).
|
||||
|
||||
The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions,
|
||||
deletions or substitutions of a single character, or transposition of two adjacent characters)
|
||||
required to change one string into the other.
|
||||
It is the minimum number of operations (consisting of insertions,
|
||||
deletions or substitutions of a single character, or transposition of two adjacent characters)
|
||||
required to change one string into the other.
|
||||
|
||||
The restricted distance differs slightly from the classic Damerau-Levenshtein algorithm by imposing
|
||||
the restriction that no substring is edited more than once. So for example, "CA" to "ABC" has an edit
|
||||
distanceof 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
|
||||
uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy
|
||||
the triangle inequality.
|
||||
The distance differs slightly from the Damerau-Levenshtein algorithm by imposing
|
||||
the restriction that no substring is edited more than once. So for example, "CA" to "ABC" has an edit
|
||||
distance of 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
|
||||
uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy
|
||||
the triangle inequality.
|
||||
"""
|
||||
|
||||
struct DamerauLevenshtein{V <: Union{Integer, Nothing}} <: SemiMetric
|
||||
struct OptimalStringAlignement{V <: Union{Integer, Nothing}} <: SemiMetric
|
||||
max_dist::V
|
||||
end
|
||||
DamerauLevenshtein() = DamerauLevenshtein(nothing)
|
||||
OptimalStringAlignement() = OptimalStringAlignement(nothing)
|
||||
|
||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
# Return max_dist + 1 if distance higher than max_dist
|
||||
function (dist::DamerauLevenshtein)(s1, s2)
|
||||
function (dist::OptimalStringAlignement)(s1, s2)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
@ -228,6 +228,59 @@ function (dist::DamerauLevenshtein)(s1, s2)
|
|||
return current
|
||||
end
|
||||
|
||||
|
||||
"""
|
||||
DamerauLevenshtein()
|
||||
|
||||
Creates the DamerauLevenshtein distance
|
||||
|
||||
The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions,
|
||||
deletions or substitutions of a single character, or transposition of two adjacent characters)
|
||||
required to change one string into the other.
|
||||
"""
|
||||
|
||||
struct DamerauLevenshtein <: Metric
|
||||
end
|
||||
|
||||
## https://en.wikipedia.org/wiki/Damerau–Levenshtein_distance
|
||||
function (dist::DamerauLevenshtein)(s1, s2)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
T = promote_type(eltype(s1), eltype(s2))
|
||||
da = Dict{T, Int}(x => 0 for x in Iterators.flatten((s1, s2)))
|
||||
d = zeros(Int, len1 + 2, len2 + 2)
|
||||
md = len1 + len2
|
||||
@inbounds for i in 0:len1
|
||||
d[i + 2, 1] = md
|
||||
d[i + 2, 2] = i
|
||||
end
|
||||
@inbounds for j in 0:len2
|
||||
d[1, j + 2] = md
|
||||
d[2, j + 2] = j
|
||||
end
|
||||
# fill in the distance matrix d
|
||||
for (i1, ch1) in enumerate(s1)
|
||||
db = 0
|
||||
for (i2, ch2) in enumerate(s2)
|
||||
j1 = da[ch2]
|
||||
j2 = db
|
||||
if ch1 == ch2
|
||||
cost = 0
|
||||
db = i2
|
||||
else
|
||||
cost = 1
|
||||
end
|
||||
@inbounds d[i1 + 2, i2 + 2] = min(d[i1 + 1, i2 + 1] + cost,
|
||||
d[i1 + 2, i2 + 1] + 1,
|
||||
d[i1 + 1, i2 + 2] + 1,
|
||||
d[j1 + 1, j2 + 1] + (i1 - j1 - 1) + 1 + (i2 - j2 - 1))
|
||||
end
|
||||
da[ch1] = i1
|
||||
end
|
||||
return d[end, end]
|
||||
end
|
||||
|
||||
"""
|
||||
RatcliffObershelp()
|
||||
|
||||
|
|
133
src/normalize.jl
133
src/normalize.jl
|
@ -3,24 +3,27 @@ struct Normalized{V <: SemiMetric} <: SemiMetric
|
|||
max_dist::Float64
|
||||
end
|
||||
|
||||
function (dist::Normalized{<:Hamming})(s1, s2)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len2 == 0 && return 1.0
|
||||
out = dist.dist(s1, s2) / len2
|
||||
# Normalized Jaro, JaroWinkler and RatcliffObershelp: the wrapped distance is used
# as is, and any value above `max_dist` is reported as 1.0.
function (dist::Normalized{<: Union{Jaro, JaroWinkler, RatcliffObershelp}})(s1, s2)
    # propagate `missing` like the other `Normalized` methods, instead of reaching
    # the comparison below with a `missing` value
    (s1 === missing) | (s2 === missing) && return missing
    out = dist.dist(s1, s2)
    out > dist.max_dist ? 1.0 : out
end
|
||||
|
||||
function (dist::Normalized{<:Union{Levenshtein{Nothing}, DamerauLevenshtein{Nothing}}})(s1, s2)
|
||||
# Normalized Hamming and DamerauLevenshtein: the raw distance divided by the length
# of the longer input; any value above `max_dist` is reported as 1.0.
function (dist::Normalized{<:Union{Hamming, DamerauLevenshtein}})(s1, s2)
    (s1 === missing) | (s2 === missing) && return missing
    isempty(s1) && isempty(s2) && return 0.0
    # divide by the longer length so the result does not depend on argument order
    out = dist.dist(s1, s2) / max(length(s1), length(s2))
    out > dist.max_dist ? 1.0 : out
end
|
||||
|
||||
function (dist::Normalized{<:Union{Levenshtein{Nothing}, OptimalStringAlignement{Nothing}}})(s1, s2)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
isempty(s1) && isempty(s2) && return 0.0
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len2 == 0 && return 1.0
|
||||
if dist.dist isa Levenshtein
|
||||
d = Levenshtein(ceil(Int, len2 * dist.max_dist))(s1, s2)
|
||||
else
|
||||
d = DamerauLevenshtein(ceil(Int, len2 * dist.max_dist))(s1, s2)
|
||||
d = OptimalStringAlignement(ceil(Int, len2 * dist.max_dist))(s1, s2)
|
||||
end
|
||||
out = d / len2
|
||||
out > dist.max_dist ? 1.0 : out
|
||||
|
@ -40,10 +43,6 @@ function (dist::Normalized{<:AbstractQGramDistance})(s1, s2)
|
|||
out > dist.max_dist ? 1.0 : out
|
||||
end
|
||||
|
||||
function (dist::Normalized)(s1, s2)
|
||||
out = dist.dist(s1, s2)
|
||||
out > dist.max_dist ? 1.0 : out
|
||||
end
|
||||
|
||||
"""
|
||||
normalize(dist)
|
||||
|
@ -70,13 +69,12 @@ normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
|
|||
"""
|
||||
TokenMax(dist)
|
||||
|
||||
Creates the `TokenMax{dist}` distance
|
||||
Creates the `TokenMax{dist}` distance.
|
||||
|
||||
`TokenMax{dist}` normalizes the distance `dist` and returns the minimum of the distance,
|
||||
its [`Partial`](@ref) modifier, its [`TokenSort`](@ref) modifier, and its
|
||||
[`TokenSet`](@ref) modifier, with penalty terms depending on string length.
|
||||
[`TokenSet`](@ref) modifier, with penalty terms depending on the iterator length.
|
||||
|
||||
It is only defined on AbstractStrings
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
|
@ -93,7 +91,7 @@ end
|
|||
TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist)
|
||||
normalize(dist::TokenMax; max_dist = 1.0) = TokenMax(dist.dist, max_dist)
|
||||
|
||||
function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
function (dist::TokenMax)(s1, s2)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
@ -124,104 +122,3 @@ function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{Abstract
|
|||
end
|
||||
out > max_dist ? 1.0 : out
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
|
||||
|
||||
"""
|
||||
compare(s1, s2, dist)
|
||||
|
||||
return a similarity score between 0 and 1 for the strings `s1` and
|
||||
`s2` based on the distance `dist`.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> compare("martha", "marhta", Levenshtein())
|
||||
0.6666666666666667
|
||||
```
|
||||
"""
|
||||
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
|
||||
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
|
||||
end
|
||||
|
||||
"""
|
||||
findnearest(s, itr, dist::StringDistance) -> (x, index)
|
||||
|
||||
`findnearest` returns the value and index of the element of `itr` that has the
|
||||
lowest distance with `s` according to the distance `dist`.
|
||||
|
||||
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
|
||||
(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> using StringDistances
|
||||
julia> s = "Newark"
|
||||
julia> iter = ["New York", "Princeton", "San Francisco"]
|
||||
julia> findnearest(s, iter, Levenshtein())
|
||||
("NewYork", 1)
|
||||
julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
|
||||
(nothing, nothing)
|
||||
```
|
||||
"""
|
||||
function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
||||
min_score_atomic = Threads.Atomic{Float64}(min_score)
|
||||
scores = [0.0 for _ in 1:Threads.nthreads()]
|
||||
is = [0 for _ in 1:Threads.nthreads()]
|
||||
s = _helper(dist, s)
|
||||
# need collect since @threads requires a length method
|
||||
Threads.@threads for i in collect(eachindex(itr))
|
||||
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
|
||||
score_old = Threads.atomic_max!(min_score_atomic, score)
|
||||
if score >= score_old
|
||||
scores[Threads.threadid()] = score
|
||||
is[Threads.threadid()] = i
|
||||
end
|
||||
end
|
||||
imax = is[argmax(scores)]
|
||||
imax == 0 ? (nothing, nothing) : (itr[imax], imax)
|
||||
end
|
||||
_helper(dist::AbstractQGramDistance, ::Missing) = missing
|
||||
_helper(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
|
||||
_helper(dist::StringDistance, s) = s
|
||||
|
||||
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
|
||||
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
|
||||
findnearest(s, itr, dist; min_score = min_score)
|
||||
end
|
||||
"""
|
||||
findall(s, itr , dist::StringDistance; min_score = 0.8)
|
||||
|
||||
`findall` returns the vector of indices for elements of `itr` that have a
|
||||
similarity score higher or equal than `min_score` according to the distance `dist`.
|
||||
If there are no such elements, return an empty array.
|
||||
|
||||
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
|
||||
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> using StringDistances
|
||||
julia> s = "Newark"
|
||||
julia> iter = ["Newwark", "Princeton", "San Francisco"]
|
||||
julia> findall(s, iter, Levenshtein())
|
||||
1-element Array{Int64,1}:
|
||||
1
|
||||
julia> findall(s, iter, Levenshtein(); min_score = 0.9)
|
||||
0-element Array{Int64,1}
|
||||
```
|
||||
"""
|
||||
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
|
||||
out = [Int[] for _ in 1:Threads.nthreads()]
|
||||
s = _helper(dist, s)
|
||||
# need collect since @threads requires a length method
|
||||
Threads.@threads for i in collect(eachindex(itr))
|
||||
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
|
||||
if score >= min_score
|
||||
push!(out[Threads.threadid()], i)
|
||||
end
|
||||
end
|
||||
vcat(out...)
|
||||
end
|
||||
|
|
|
@ -1,87 +0,0 @@
|
|||
@doc """
|
||||
pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
|
||||
|
||||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||
`StringDistance` `dist`. Returns a matrix R such that `R[i, j]` corrresponds to the distance between `xs[i]` and `ys[j]`.
|
||||
|
||||
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
|
||||
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
|
||||
false if no preprocessing should be used, regardless of length.
|
||||
|
||||
Both symmetric and asymmetric versions are available.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> using StringDistances
|
||||
julia> iter = ["New York", "Princeton"]
|
||||
julia> pairwise(Levenshtein(), iter)
|
||||
2×2 Array{Float64,2}:
|
||||
0.0 9.0
|
||||
9.0 0.0
|
||||
julia> iter2 = ["San Francisco"]
|
||||
julia> pairwise(Levenshtein(), iter, iter2)
|
||||
2×1 Array{Float64,2}:
|
||||
12.0
|
||||
10.0
|
||||
```
|
||||
"""
|
||||
function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
|
||||
T = result_type(dist, eltype(xs), eltype(ys))
|
||||
if Missing <: Union{eltype(xs), eltype(ys)}
|
||||
T = Union{T, Missing}
|
||||
end
|
||||
R = Matrix{T}(undef, length(xs), length(ys))
|
||||
pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||
end
|
||||
|
||||
@doc """
|
||||
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
|
||||
|
||||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
|
||||
|
||||
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
|
||||
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
|
||||
false if no preprocessing should be used, regardless of length.
|
||||
"""
|
||||
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
|
||||
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
|
||||
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
|
||||
((xs === ys) & (dist isa SemiMetric)) ?
|
||||
_symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
|
||||
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||
end
|
||||
|
||||
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
|
||||
objs = _preprocess(xs, dist, preprocess)
|
||||
for i in 1:length(objs)
|
||||
# handle missing
|
||||
R[i, i] = objs[i] != objs[i]
|
||||
Threads.@threads for j in (i+1):length(objs)
|
||||
R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
|
||||
end
|
||||
end
|
||||
return R
|
||||
end
|
||||
|
||||
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
|
||||
objsxs = _preprocess(xs, dist, preprocess)
|
||||
objsys = xs === ys ? objsxs : _preprocess(ys, dist, preprocess)
|
||||
for i in 1:length(objsxs)
|
||||
Threads.@threads for j in 1:length(objsys)
|
||||
R[i, j] = evaluate(dist, objsxs[i], objsys[j])
|
||||
end
|
||||
end
|
||||
return R
|
||||
end
|
||||
|
||||
function _preprocess(xs, dist::StringDistance, preprocess)
|
||||
if preprocess === nothing
|
||||
preprocess = length(xs) >= 5
|
||||
end
|
||||
if (dist isa AbstractQGramDistance) && preprocess
|
||||
return fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
|
||||
else
|
||||
return xs
|
||||
end
|
||||
end
|
|
@ -38,20 +38,31 @@ using StringDistances, Unicode, Test, Random
|
|||
@test ismissing(evaluate(Levenshtein(), "", missing))
|
||||
end
|
||||
|
||||
@testset "OptimalStringAlignement" begin
    # shared instance without a maximum distance
    osa = OptimalStringAlignement()

    # strings
    @test evaluate(osa, "", "") == 0
    @test evaluate(osa, "abc", "") == 3
    @test evaluate(osa, "bc", "abc") == 1
    @test evaluate(osa, "fuor", "four") == 1
    @test evaluate(osa, "abcd", "acb") == 2
    @test evaluate(osa, "cape sand recycling ", "edith ann graham") == 17
    @test evaluate(osa, "jellyifhs", "jellyfish") == 2
    @test evaluate(osa, "ifhs", "fish") == 2
    @test OptimalStringAlignement(2)("abcdef", "abcxyf") == 2

    # other iterators, call syntax, result type, inference and missing values
    @test evaluate(osa, [1, 2, 3], [1,2, 4]) == 1
    @test evaluate(osa, graphemes("alborgów"), graphemes("amoniak")) == evaluate(osa, "alborgów", "amoniak")
    @test osa("bc", "abc") == 1
    @test result_type(osa, "hello", "world") == Int
    @inferred evaluate(OptimalStringAlignement(), "", "")
    @test ismissing(evaluate(osa, "", missing))
end
|
||||
|
||||
@testset "DamerauLevenshtein" begin
|
||||
@test evaluate(DamerauLevenshtein(), "", "") == 0
|
||||
@test evaluate(DamerauLevenshtein(), "abc", "") == 3
|
||||
@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
|
||||
@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1
|
||||
@test evaluate(DamerauLevenshtein(), "abcd", "acb") == 2
|
||||
@test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
|
||||
@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
|
||||
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
|
||||
@test DamerauLevenshtein(2)("abcdef", "abcxyf") == 2
|
||||
|
||||
@test evaluate(DamerauLevenshtein(), [1, 2, 3], [1,2, 4]) == 1
|
||||
@test evaluate(DamerauLevenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(DamerauLevenshtein(), "alborgów", "amoniak")
|
||||
@test DamerauLevenshtein()("bc", "abc") == 1
|
||||
@test evaluate(DamerauLevenshtein(), "CA", "ABC") == 2
|
||||
@test evaluate(DamerauLevenshtein(), "ABCDEF", "ABDCEF") == 1
|
||||
@test evaluate(DamerauLevenshtein(), "ABCDEF", "BACDFE") == 2
|
||||
@test evaluate(DamerauLevenshtein(), "ABCDEF", "ABCDE") == 1
|
||||
@test result_type(DamerauLevenshtein(), "hello", "world") == Int
|
||||
@inferred evaluate(DamerauLevenshtein(), "", "")
|
||||
@test ismissing(evaluate(DamerauLevenshtein(), "", missing))
|
||||
|
@ -292,7 +303,7 @@ using StringDistances, Unicode, Test, Random
|
|||
]
|
||||
|
||||
solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
|
||||
(DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
|
||||
(OptimalStringAlignement(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
|
||||
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.2805556 0.2285714 0.48809524 0.3916667 0.07407407 0.16666667 0.21666667]),
|
||||
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
|
||||
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
|
||||
|
@ -320,13 +331,13 @@ using StringDistances, Unicode, Test, Random
|
|||
for i in eachindex(strings)
|
||||
d = Levenshtein()(strings[i]...)
|
||||
@test Levenshtein(d)(strings[i]...) == d
|
||||
d = DamerauLevenshtein()(strings[i]...)
|
||||
@test DamerauLevenshtein(d)(strings[i]...) == d
|
||||
d = OptimalStringAlignement()(strings[i]...)
|
||||
@test OptimalStringAlignement(d)(strings[i]...) == d
|
||||
end
|
||||
end
|
||||
|
||||
d = DamerauLevenshtein()("abcdef", "abcxyf")
|
||||
@test DamerauLevenshtein(d)("abcdef", "abcxyf") == d
|
||||
d = OptimalStringAlignement()("abcdef", "abcxyf")
|
||||
@test OptimalStringAlignement(d)("abcdef", "abcxyf") == d
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -60,9 +60,9 @@ end
|
|||
#Levenshtein
|
||||
compare("aüa", "aua", Levenshtein())
|
||||
@test compare("ok", missing, Levenshtein()) === missing
|
||||
compare("aüa", "aua", DamerauLevenshtein())
|
||||
@test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
|
||||
@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
|
||||
compare("aüa", "aua", OptimalStringAlignement())
|
||||
@test StringDistances.normalize(Partial(OptimalStringAlignement()))("ab", "cde") == 1.0
|
||||
@test compare("ab", "de", Partial(OptimalStringAlignement())) == 0
|
||||
|
||||
# RatcliffObershelp
|
||||
@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0
|
||||
|
@ -115,7 +115,7 @@ end
|
|||
("ifhs", "fish"),
|
||||
("leia", "leela"),
|
||||
]
|
||||
for dist in (Levenshtein, DamerauLevenshtein)
|
||||
for dist in (Levenshtein, OptimalStringAlignement)
|
||||
for i in eachindex(strings)
|
||||
if compare(strings[i]..., dist()) < 1 / 3
|
||||
@test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ 0.0
|
||||
|
|
|
@ -7,7 +7,7 @@ using StringDistances, Unicode, Test, Random
|
|||
TestStrings1missing = ["", "abc", "bc", missing]
|
||||
TestStrings2missing = ["mew", missing]
|
||||
|
||||
for d in [Jaro(), Levenshtein(), DamerauLevenshtein(), RatcliffObershelp(),
|
||||
for d in [Jaro(), Levenshtein(), OptimalStringAlignement(), RatcliffObershelp(),
|
||||
QGram(2), Cosine(2), Jaccard(2), SorensenDice(2), Overlap(2)]
|
||||
|
||||
R = pairwise(d, TestStrings1)
|
||||
|
|
Loading…
Reference in New Issue