use correct (unrestricted) DamerauLevenshtein.

The old DamerauLevenshtein distance is renamed to OptimalStringAlignement.
pull/57/head
matthieugomez 2021-09-10 17:14:21 -04:00
parent 0faf255f93
commit 5bec23d357
10 changed files with 316 additions and 251 deletions

View File

@ -14,6 +14,7 @@ The available distances are:
- Hamming Distance `Hamming()`
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
- Q-gram distances compare the set of all substrings of length `q` in each string.

View File

@ -21,10 +21,12 @@ end
# 0.36s
@time f(Levenshtein(), x, y, min_score = 0.8);
# 0.11
@time f(DamerauLevenshtein(), x, y);
@time f(OptimalStringAlignement(), x, y);
# 0.56s.
@time f(DamerauLevenshtein(), x, y, min_score = 0.8);
@time f(OptimalStringAlignement(), x, y, min_score = 0.8);
# 0.08
@time f(DamerauLevenshtein(), x, y);
# 2s
@time f(RatcliffObershelp(), x, y);
# 0.65s
@ -33,7 +35,7 @@ end
@time findnearest(x[1], y, Levenshtein());
# 0.1
@time findnearest(x[1], y, DamerauLevenshtein());
@time findnearest(x[1], y, OptimalStringAlignement());
# 0.1
@time findnearest(x[1], y, QGram(2));
# 0.75
@ -42,17 +44,17 @@ end
@time findall(x[1], y, Levenshtein());
# 0.05
@time findall(x[1], y, DamerauLevenshtein());
@time findall(x[1], y, OptimalStringAlignement());
# 0.05
@time findall(x[1], y, Partial(DamerauLevenshtein()));
@time findall(x[1], y, Partial(OptimalStringAlignement()));
# 0.96
@time findall(x[1], y, QGram(2));
# 0.81
@time findall(x[1], y, TokenSort(DamerauLevenshtein()));
@time findall(x[1], y, TokenSort(OptimalStringAlignement()));
# 0.27 (now 0.32)
@time findall(x[1], y, TokenSet(DamerauLevenshtein()));
@time findall(x[1], y, TokenSet(OptimalStringAlignement()));
# 0.55
@time findall(x[1], y, TokenMax(DamerauLevenshtein()));
@time findall(x[1], y, TokenMax(OptimalStringAlignement()));
# 2.25 (now 3.6)

View File

@ -9,7 +9,7 @@ include("distances/qgram.jl")
include("modifiers.jl")
include("normalize.jl")
include("pairwise.jl")
include("convenience.jl")
# Distances API
Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
@ -27,10 +27,11 @@ Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s
export
StringDistance,
Hamming,
Levenshtein,
DamerauLevenshtein,
Jaro,
JaroWinkler,
Levenshtein,
OptimalStringAlignement,
DamerauLevenshtein,
RatcliffObershelp,
AbstractQGramDistance,
QGramDict,

187
src/convenience.jl Normal file
View File

@ -0,0 +1,187 @@
# Union of every distance type accepted by the convenience functions of this file
# (`compare`, `findnearest`, `findall`, `pairwise`, `pairwise!`).
const StringDistance = Union{
    Hamming,
    Jaro,
    JaroWinkler,
    Levenshtein,
    OptimalStringAlignement,
    DamerauLevenshtein,
    RatcliffObershelp,
    AbstractQGramDistance,
    Partial,
    TokenSort,
    TokenSet,
    TokenMax,
    Normalized
}
"""
compare(s1, s2, dist)
return a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the distance `dist`.
### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
end
"""
findnearest(s, itr, dist::StringDistance) -> (x, index)
`findnearest` returns the value and index of the element of `itr` that has the
lowest distance with `s` according to the distance `dist`.
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
### Examples
```julia-repl
julia> using StringDistances
julia> s = "Newark"
julia> iter = ["New York", "Princeton", "San Francisco"]
julia> findnearest(s, iter, Levenshtein())
("NewYork", 1)
julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
(nothing, nothing)
```
"""
function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
min_score_atomic = Threads.Atomic{Float64}(min_score)
scores = [0.0 for _ in 1:Threads.nthreads()]
is = [0 for _ in 1:Threads.nthreads()]
s = _helper(dist, s)
# need collect since @threads requires a length method
Threads.@threads for i in collect(eachindex(itr))
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
score_old = Threads.atomic_max!(min_score_atomic, score)
if score >= score_old
scores[Threads.threadid()] = score
is[Threads.threadid()] = i
end
end
imax = is[argmax(scores)]
imax == 0 ? (nothing, nothing) : (itr[imax], imax)
end
# Per-element preparation applied before comparing two inputs.
# Generic case: the input is used as is.
function _helper(dist::StringDistance, s)
    return s
end

# Q-gram distances: wrap the input in a `QGramSortedVector` built with `dist.q`.
function _helper(dist::AbstractQGramDistance, s)
    return QGramSortedVector(s, dist.q)
end

# Q-gram distances: a missing input is passed through unchanged.
function _helper(dist::AbstractQGramDistance, ::Missing)
    return missing
end
# Deprecated entry point: emits a warning, then forwards to `findnearest` with the
# same arguments.
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
    @warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
    nearest = findnearest(s, itr, dist; min_score = min_score)
    return nearest
end
"""
findall(s, itr , dist::StringDistance; min_score = 0.8)
`findall` returns the vector of indices for elements of `itr` that have a
similarity score higher or equal than `min_score` according to the distance `dist`.
If there are no such elements, return an empty array.
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
### Examples
```julia-repl
julia> using StringDistances
julia> s = "Newark"
julia> iter = ["Newwark", "Princeton", "San Francisco"]
julia> findall(s, iter, Levenshtein())
1-element Array{Int64,1}:
1
julia> findall(s, iter, Levenshtein(); min_score = 0.9)
0-element Array{Int64,1}
```
"""
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
out = [Int[] for _ in 1:Threads.nthreads()]
s = _helper(dist, s)
# need collect since @threads requires a length method
Threads.@threads for i in collect(eachindex(itr))
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
if score >= min_score
push!(out[Threads.threadid()], i)
end
end
vcat(out...)
end
"""
pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist`. Returns a matrix R such that `R[i, j]` corrresponds to the distance between `xs[i]` and `ys[j]`.
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length.
Both symmetric and asymmetric versions are available.
### Examples
```julia-repl
julia> using StringDistances
julia> iter = ["New York", "Princeton"]
julia> pairwise(Levenshtein(), iter)
2×2 Array{Float64,2}:
0.0 9.0
9.0 0.0
julia> iter2 = ["San Francisco"]
julia> pairwise(Levenshtein(), iter, iter2)
2×1 Array{Float64,2}:
12.0
10.0
```
"""
function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
T = result_type(dist, eltype(xs), eltype(ys))
if Missing <: Union{eltype(xs), eltype(ys)}
T = Union{T, Missing}
end
R = Matrix{T}(undef, length(xs), length(ys))
pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
"""
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length.
"""
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
((xs === ys) & (dist isa SemiMetric)) ?
_symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
# Fill `R` for the case where both inputs are the same collection: each distance is
# evaluated once and written to both `R[i, j]` and `R[j, i]`.
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
    objs = _preprocess(xs, dist, preprocess)
    n = length(objs)
    for i in 1:n
        oi = objs[i]
        # handle missing: `oi != oi` is `false` for a regular value and `missing`
        # for a missing one
        R[i, i] = oi != oi
        Threads.@threads for j in (i + 1):n
            dij = evaluate(dist, oi, objs[j])
            R[j, i] = dij
            R[i, j] = dij
        end
    end
    return R
end
# Fill every entry `R[i, j]` with the distance between the i-th element of `xs` and
# the j-th element of `ys`, evaluating each pair separately.
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
    objsxs = _preprocess(xs, dist, preprocess)
    # reuse the preprocessed objects when both inputs are the same collection
    objsys = if xs === ys
        objsxs
    else
        _preprocess(ys, dist, preprocess)
    end
    nrows, ncols = length(objsxs), length(objsys)
    for i in 1:nrows
        xi = objsxs[i]
        Threads.@threads for j in 1:ncols
            R[i, j] = evaluate(dist, xi, objsys[j])
        end
    end
    return R
end
# Return the objects on which the pairwise distances are evaluated. For an
# `AbstractQGramDistance` with preprocessing enabled, every non-missing element is
# converted to a `QGramSortedVector` in a spawned task; otherwise `xs` is returned
# unchanged. `preprocess === nothing` enables preprocessing when `xs` has at least
# 5 elements.
function _preprocess(xs, dist::StringDistance, preprocess)
    enabled = preprocess === nothing ? length(xs) >= 5 : preprocess
    if (dist isa AbstractQGramDistance) && enabled
        tasks = map(xs) do x
            Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)
        end
        return fetch.(tasks)
    end
    return xs
end

View File

@ -155,29 +155,29 @@ function (dist::Levenshtein)(s1, s2)
end
"""
DamerauLevenshtein()
OptimalStringAlignement()
Creates the restricted DamerauLevenshtein distance
Creates the OptimalStringAlignement distance (also known as the restricted DamerauLevenshtein distance).
The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions,
deletions or substitutions of a single character, or transposition of two adjacent characters)
required to change one string into the other.
It is the minimum number of operations (consisting of insertions,
deletions or substitutions of a single character, or transposition of two adjacent characters)
required to change one string into the other.
The restricted distance differs slightly from the classic Damerau-Levenshtein algorithm by imposing
the restriction that no substring is edited more than once. So for example, "CA" to "ABC" has an edit
distanceof 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy
the triangle inequality.
The distance differs slightly from the Damerau-Levenshtein algorithm by imposing
the restriction that no substring is edited more than once. So for example, "CA" to "ABC" has an edit
distance of 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy
the triangle inequality.
"""
struct DamerauLevenshtein{V <: Union{Integer, Nothing}} <: SemiMetric
struct OptimalStringAlignement{V <: Union{Integer, Nothing}} <: SemiMetric
max_dist::V
end
DamerauLevenshtein() = DamerauLevenshtein(nothing)
OptimalStringAlignement() = OptimalStringAlignement(nothing)
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
# Return max_dist + 1 if distance higher than max_dist
function (dist::DamerauLevenshtein)(s1, s2)
function (dist::OptimalStringAlignement)(s1, s2)
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@ -228,6 +228,59 @@ function (dist::DamerauLevenshtein)(s1, s2)
return current
end
"""
DamerauLevenshtein()
Creates the DamerauLevenshtein distance
The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions,
deletions or substitutions of a single character, or transposition of two adjacent characters)
required to change one string into the other.
"""
struct DamerauLevenshtein <: Metric
end
## https://en.wikipedia.org/wiki/DamerauLevenshtein_distance
function (dist::DamerauLevenshtein)(s1, s2)
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
T = promote_type(eltype(s1), eltype(s2))
da = Dict{T, Int}(x => 0 for x in Iterators.flatten((s1, s2)))
d = zeros(Int, len1 + 2, len2 + 2)
md = len1 + len2
@inbounds for i in 0:len1
d[i + 2, 1] = md
d[i + 2, 2] = i
end
@inbounds for j in 0:len2
d[1, j + 2] = md
d[2, j + 2] = j
end
# fill in the distance matrix d
for (i1, ch1) in enumerate(s1)
db = 0
for (i2, ch2) in enumerate(s2)
j1 = da[ch2]
j2 = db
if ch1 == ch2
cost = 0
db = i2
else
cost = 1
end
@inbounds d[i1 + 2, i2 + 2] = min(d[i1 + 1, i2 + 1] + cost,
d[i1 + 2, i2 + 1] + 1,
d[i1 + 1, i2 + 2] + 1,
d[j1 + 1, j2 + 1] + (i1 - j1 - 1) + 1 + (i2 - j2 - 1))
end
da[ch1] = i1
end
return d[end, end]
end
"""
RatcliffObershelp()

View File

@ -3,24 +3,27 @@ struct Normalized{V <: SemiMetric} <: SemiMetric
max_dist::Float64
end
function (dist::Normalized{<:Hamming})(s1, s2)
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 == 0 && return 1.0
out = dist.dist(s1, s2) / len2
# Normalized Jaro / JaroWinkler / RatcliffObershelp distance: the wrapped distance is
# used as is, and any value above `dist.max_dist` is reported as 1.0.
function (dist::Normalized{<: Union{Jaro, JaroWinkler, RatcliffObershelp}})(s1, s2)
    # Propagate `missing` explicitly, like the other `Normalized` methods do:
    # comparing a missing distance with `max_dist` yields `missing`, which cannot be
    # used as the condition of the ternary below.
    (s1 === missing) | (s2 === missing) && return missing
    out = dist.dist(s1, s2)
    return out > dist.max_dist ? 1.0 : out
end
function (dist::Normalized{<:Union{Levenshtein{Nothing}, DamerauLevenshtein{Nothing}}})(s1, s2)
# Normalized Hamming / DamerauLevenshtein distance: the raw distance divided by the
# length of the longer input. Returns `missing` if either input is missing, 0.0 if
# both inputs are empty, and 1.0 whenever the ratio exceeds `dist.max_dist`.
function (dist::Normalized{<:Union{Hamming, DamerauLevenshtein}})(s1, s2)
    (s1 === missing) | (s2 === missing) && return missing
    isempty(s1) && isempty(s2) && return 0.0
    # Divide by the longer length rather than `length(s2)`: the inputs are not
    # reordered here, so `s2` may be the shorter one, which would inflate the ratio
    # (or divide by zero when `s2` is empty).
    out = dist.dist(s1, s2) / max(length(s1), length(s2))
    return out > dist.max_dist ? 1.0 : out
end
function (dist::Normalized{<:Union{Levenshtein{Nothing}, OptimalStringAlignement{Nothing}}})(s1, s2)
(s1 === missing) | (s2 === missing) && return missing
isempty(s1) && isempty(s2) && return 0.0
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 == 0 && return 1.0
if dist.dist isa Levenshtein
d = Levenshtein(ceil(Int, len2 * dist.max_dist))(s1, s2)
else
d = DamerauLevenshtein(ceil(Int, len2 * dist.max_dist))(s1, s2)
d = OptimalStringAlignement(ceil(Int, len2 * dist.max_dist))(s1, s2)
end
out = d / len2
out > dist.max_dist ? 1.0 : out
@ -40,10 +43,6 @@ function (dist::Normalized{<:AbstractQGramDistance})(s1, s2)
out > dist.max_dist ? 1.0 : out
end
# Generic fallback: evaluate the wrapped distance and report 1.0 for any value above
# `dist.max_dist`.
function (dist::Normalized)(s1, s2)
    raw = dist.dist(s1, s2)
    if raw > dist.max_dist
        return 1.0
    else
        return raw
    end
end
"""
normalize(dist)
@ -70,13 +69,12 @@ normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
"""
TokenMax(dist)
Creates the `TokenMax{dist}` distance
Creates the `TokenMax{dist}` distance.
`TokenMax{dist}` normalizes the distance `dist` and returns the minimum of the distance,
its [`Partial`](@ref) modifier, its [`TokenSort`](@ref) modifier, and its
[`TokenSet`](@ref) modifier, with penalty terms depending on string length.
[`TokenSet`](@ref) modifier, with penalty terms depending on the iterator length.
It is only defined on AbstractStrings
### Examples
```julia-repl
@ -93,7 +91,7 @@ end
TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist)
normalize(dist::TokenMax; max_dist = 1.0) = TokenMax(dist.dist, max_dist)
function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
function (dist::TokenMax)(s1, s2)
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@ -124,104 +122,3 @@ function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{Abstract
end
out > max_dist ? 1.0 : out
end
const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
"""
compare(s1, s2, dist)
return a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the distance `dist`.
### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
end
"""
findnearest(s, itr, dist::StringDistance) -> (x, index)
`findnearest` returns the value and index of the element of `itr` that has the
lowest distance with `s` according to the distance `dist`.
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
### Examples
```julia-repl
julia> using StringDistances
julia> s = "Newark"
julia> iter = ["New York", "Princeton", "San Francisco"]
julia> findnearest(s, iter, Levenshtein())
("NewYork", 1)
julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
(nothing, nothing)
```
"""
function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
min_score_atomic = Threads.Atomic{Float64}(min_score)
scores = [0.0 for _ in 1:Threads.nthreads()]
is = [0 for _ in 1:Threads.nthreads()]
s = _helper(dist, s)
# need collect since @threads requires a length method
Threads.@threads for i in collect(eachindex(itr))
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
score_old = Threads.atomic_max!(min_score_atomic, score)
if score >= score_old
scores[Threads.threadid()] = score
is[Threads.threadid()] = i
end
end
imax = is[argmax(scores)]
imax == 0 ? (nothing, nothing) : (itr[imax], imax)
end
_helper(dist::AbstractQGramDistance, ::Missing) = missing
_helper(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
_helper(dist::StringDistance, s) = s
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
findnearest(s, itr, dist; min_score = min_score)
end
"""
findall(s, itr , dist::StringDistance; min_score = 0.8)
`findall` returns the vector of indices for elements of `itr` that have a
similarity score higher or equal than `min_score` according to the distance `dist`.
If there are no such elements, return an empty array.
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
### Examples
```julia-repl
julia> using StringDistances
julia> s = "Newark"
julia> iter = ["Newwark", "Princeton", "San Francisco"]
julia> findall(s, iter, Levenshtein())
1-element Array{Int64,1}:
1
julia> findall(s, iter, Levenshtein(); min_score = 0.9)
0-element Array{Int64,1}
```
"""
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
out = [Int[] for _ in 1:Threads.nthreads()]
s = _helper(dist, s)
# need collect since @threads requires a length method
Threads.@threads for i in collect(eachindex(itr))
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
if score >= min_score
push!(out[Threads.threadid()], i)
end
end
vcat(out...)
end

View File

@ -1,87 +0,0 @@
@doc """
pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist`. Returns a matrix R such that `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length.
Both symmetric and asymmetric versions are available.
### Examples
```julia-repl
julia> using StringDistances
julia> iter = ["New York", "Princeton"]
julia> pairwise(Levenshtein(), iter)
2×2 Array{Float64,2}:
0.0 9.0
9.0 0.0
julia> iter2 = ["San Francisco"]
julia> pairwise(Levenshtein(), iter, iter2)
2×1 Array{Float64,2}:
12.0
10.0
```
"""
function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
T = result_type(dist, eltype(xs), eltype(ys))
if Missing <: Union{eltype(xs), eltype(ys)}
T = Union{T, Missing}
end
R = Matrix{T}(undef, length(xs), length(ys))
pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
@doc """
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length.
"""
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
((xs === ys) & (dist isa SemiMetric)) ?
_symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
objs = _preprocess(xs, dist, preprocess)
for i in 1:length(objs)
# handle missing
R[i, i] = objs[i] != objs[i]
Threads.@threads for j in (i+1):length(objs)
R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
end
end
return R
end
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
objsxs = _preprocess(xs, dist, preprocess)
objsys = xs === ys ? objsxs : _preprocess(ys, dist, preprocess)
for i in 1:length(objsxs)
Threads.@threads for j in 1:length(objsys)
R[i, j] = evaluate(dist, objsxs[i], objsys[j])
end
end
return R
end
function _preprocess(xs, dist::StringDistance, preprocess)
if preprocess === nothing
preprocess = length(xs) >= 5
end
if (dist isa AbstractQGramDistance) && preprocess
return fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
else
return xs
end
end

View File

@ -38,20 +38,31 @@ using StringDistances, Unicode, Test, Random
@test ismissing(evaluate(Levenshtein(), "", missing))
end
@testset "OptimalStringAlignement" begin
    osa = OptimalStringAlignement()
    # (s1, s2, expected distance)
    cases = (
        ("", "", 0),
        ("abc", "", 3),
        ("bc", "abc", 1),
        ("fuor", "four", 1),
        ("abcd", "acb", 2),
        ("cape sand recycling ", "edith ann graham", 17),
        ("jellyifhs", "jellyfish", 2),
        ("ifhs", "fish", 2),
    )
    for (s1, s2, expected) in cases
        @test evaluate(osa, s1, s2) == expected
    end
    # distance computed with a maximum distance of 2
    @test OptimalStringAlignement(2)("abcdef", "abcxyf") == 2
    # non-string inputs: vectors and grapheme iterators
    @test evaluate(osa, [1, 2, 3], [1,2, 4]) == 1
    @test evaluate(osa, graphemes("alborgów"), graphemes("amoniak")) == evaluate(osa, "alborgów", "amoniak")
    # calling the distance object directly
    @test osa("bc", "abc") == 1
    @test result_type(osa, "hello", "world") == Int
    @inferred evaluate(osa, "", "")
    @test ismissing(evaluate(osa, "", missing))
end
@testset "DamerauLevenshtein" begin
@test evaluate(DamerauLevenshtein(), "", "") == 0
@test evaluate(DamerauLevenshtein(), "abc", "") == 3
@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1
@test evaluate(DamerauLevenshtein(), "abcd", "acb") == 2
@test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
@test DamerauLevenshtein(2)("abcdef", "abcxyf") == 2
@test evaluate(DamerauLevenshtein(), [1, 2, 3], [1,2, 4]) == 1
@test evaluate(DamerauLevenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(DamerauLevenshtein(), "alborgów", "amoniak")
@test DamerauLevenshtein()("bc", "abc") == 1
@test evaluate(DamerauLevenshtein(), "CA", "ABC") == 2
@test evaluate(DamerauLevenshtein(), "ABCDEF", "ABDCEF") == 1
@test evaluate(DamerauLevenshtein(), "ABCDEF", "BACDFE") == 2
@test evaluate(DamerauLevenshtein(), "ABCDEF", "ABCDE") == 1
@test result_type(DamerauLevenshtein(), "hello", "world") == Int
@inferred evaluate(DamerauLevenshtein(), "", "")
@test ismissing(evaluate(DamerauLevenshtein(), "", missing))
@ -292,7 +303,7 @@ using StringDistances, Unicode, Test, Random
]
solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
(DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
(OptimalStringAlignement(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.2805556 0.2285714 0.48809524 0.3916667 0.07407407 0.16666667 0.21666667]),
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
@ -320,13 +331,13 @@ using StringDistances, Unicode, Test, Random
for i in eachindex(strings)
d = Levenshtein()(strings[i]...)
@test Levenshtein(d)(strings[i]...) == d
d = DamerauLevenshtein()(strings[i]...)
@test DamerauLevenshtein(d)(strings[i]...) == d
d = OptimalStringAlignement()(strings[i]...)
@test OptimalStringAlignement(d)(strings[i]...) == d
end
end
d = DamerauLevenshtein()("abcdef", "abcxyf")
@test DamerauLevenshtein(d)("abcdef", "abcxyf") == d
d = OptimalStringAlignement()("abcdef", "abcxyf")
@test OptimalStringAlignement(d)("abcdef", "abcxyf") == d

View File

@ -60,9 +60,9 @@ end
#Levenshtein
compare("aüa", "aua", Levenshtein())
@test compare("ok", missing, Levenshtein()) === missing
compare("aüa", "aua", DamerauLevenshtein())
@test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
compare("aüa", "aua", OptimalStringAlignement())
@test StringDistances.normalize(Partial(OptimalStringAlignement()))("ab", "cde") == 1.0
@test compare("ab", "de", Partial(OptimalStringAlignement())) == 0
# RatcliffObershelp
@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0
@ -115,7 +115,7 @@ end
("ifhs", "fish"),
("leia", "leela"),
]
for dist in (Levenshtein, DamerauLevenshtein)
for dist in (Levenshtein, OptimalStringAlignement)
for i in eachindex(strings)
if compare(strings[i]..., dist()) < 1 / 3
@test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ 0.0

View File

@ -7,7 +7,7 @@ using StringDistances, Unicode, Test, Random
TestStrings1missing = ["", "abc", "bc", missing]
TestStrings2missing = ["mew", missing]
for d in [Jaro(), Levenshtein(), DamerauLevenshtein(), RatcliffObershelp(),
for d in [Jaro(), Levenshtein(), OptimalStringAlignement(), RatcliffObershelp(),
QGram(2), Cosine(2), Jaccard(2), SorensenDice(2), Overlap(2)]
R = pairwise(d, TestStrings1)