
90 lines
3.4 KiB
Raw Normal View History

@doc """
    pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)

Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist`. Returns a matrix `R` such that `R[i, j]` corresponds to the
distance between `xs[i]` and `ys[j]`.

The element type of `R` is `result_type(dist, eltype(xs), eltype(ys))`, widened to also
allow `missing` when the element type of `xs` or `ys` allows `missing`.

For `AbstractQGramDistance`s, preprocessing is used if `preprocess` is `true`, or if
`preprocess` is `nothing` (the default) and the collection has at least 5 elements
(`xs` and `ys` are checked separately). Set `preprocess` to `false` if no preprocessing
should be used, regardless of length.

Both symmetric and asymmetric versions are available.

### Examples
```julia
julia> using StringDistances

julia> iter = ["New York", "Princeton"];

julia> pairwise(Levenshtein(), iter)
2×2 Array{Float64,2}:
 0.0  9.0
 9.0  0.0

julia> iter2 = ["San Francisco"];

julia> pairwise(Levenshtein(), iter, iter2)
2×1 Array{Float64,2}:
 12.0
 10.0
```
"""
function Distances.pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
    T = result_type(dist, eltype(xs), eltype(ys))
    # Widen the element type so that `missing` results can be stored in `R`.
    if Missing <: Union{eltype(xs), eltype(ys)}
        T = Union{T, Missing}
    end
    R = Matrix{T}(undef, length(xs), length(ys))
    # Every entry of `R` is written by `pairwise!`, so the `undef` storage is never read.
    return pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
@doc """
    pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)

Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the
distance between `xs[i]` and `ys[j]`.

For `AbstractQGramDistance`s, preprocessing is used if `preprocess` is `true`, or if
`preprocess` is `nothing` (the default) and the collection has at least 5 elements
(`xs` and `ys` are checked separately). Set `preprocess` to `false` if no preprocessing
should be used, regardless of length.

Throws a `DimensionMismatch` if `size(R)` is not `(length(xs), length(ys))`.
"""
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
    length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
    length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
    # The symmetric routine evaluates each unordered pair once and mirrors the value
    # across the diagonal, so it is only selected when both arguments are the very
    # same object and `dist` is a `SemiMetric`.
    if xs === ys && dist isa SemiMetric
        return _symmetric_pairwise!(R, dist, xs; preprocess = preprocess)
    end
    return _asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
# Fill `R` with the distances between all pairs of elements of `xs`, evaluating each
# off-diagonal pair once and storing the value at both `R[i, j]` and `R[j, i]`.
# The inner loop over columns is run with `Threads.@threads`. Returns `R`.
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
    items = _preprocess(xs, dist, preprocess)
    n = length(items)
    for row in 1:n
        item = items[row]
        # Diagonal entry: `item != item` evaluates to `missing` for a missing element;
        # for other elements it is presumably `false`, i.e. stored as a zero distance.
        R[row, row] = item != item
        Threads.@threads for col in (row + 1):n
            d = evaluate(dist, item, items[col])
            R[col, row] = d
            R[row, col] = d
        end
    end
    return R
end
# Fill `R[i, j]` with `evaluate(dist, xs[i], ys[j])` for every pair `(i, j)`, after
# passing both collections through `_preprocess`. The inner loop over columns is run
# with `Threads.@threads`. Returns `R`.
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
    lhs = _preprocess(xs, dist, preprocess)
    # When both arguments are the same object, reuse the already preprocessed
    # collection instead of preprocessing it a second time.
    rhs = if xs === ys
        lhs
    else
        _preprocess(ys, dist, preprocess)
    end
    ncols = length(rhs)
    for row in 1:length(lhs)
        left = lhs[row]
        Threads.@threads for col in 1:ncols
            R[row, col] = evaluate(dist, left, rhs[col])
        end
    end
    return R
end
# Optionally convert every non-missing element of `xs` to a `QGramSortedVector` built
# with `dist.q`. With `preprocess === nothing` the conversion happens only when `xs`
# has at least 5 elements; otherwise `preprocess` itself decides. Missing elements are
# passed through unchanged, and `xs` is returned as-is when no conversion is done.
function _preprocess(xs, dist::AbstractQGramDistance, preprocess)
    enabled = preprocess === nothing ? length(xs) >= 5 : preprocess
    enabled || return xs
    return map(xs) do x
        ismissing(x) ? x : QGramSortedVector(x, dist.q)
    end
end

# Fallback for `StringDistance` arguments not handled by a more specific method:
# the collection is returned untouched.
function _preprocess(xs, dist::StringDistance, preprocess)
    return xs
end