add Hamming + restrict pairwise to vectors + handle missings
parent
b407b186f0
commit
e4095682b4
|
@ -7,10 +7,11 @@ include("distances/edit.jl")
|
|||
include("distances/qgram.jl")
|
||||
include("normalize.jl")
|
||||
|
||||
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
|
||||
const StringDistance = Union{Hamming, Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
|
||||
# Distances API
|
||||
# The result type is discovered by evaluating `dist` on two empty strings; the
# `s1` and `s2` type arguments are not consulted.
Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
|
||||
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, eltype(s1), eltype(s2))
|
||||
# For arguments that are values rather than types, forward to `result_type` with
# `typeof(s1)` and `typeof(s2)`.
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
|
||||
|
||||
|
||||
include("find.jl")
|
||||
include("pairwise.jl")
|
||||
|
@ -23,10 +24,12 @@ include("pairwise.jl")
|
|||
|
||||
export
|
||||
StringDistance,
|
||||
Hamming,
|
||||
Levenshtein,
|
||||
DamerauLevenshtein,
|
||||
Jaro,
|
||||
RatcliffObershelp,
|
||||
QGramDistance,
|
||||
QGram,
|
||||
Cosine,
|
||||
Jaccard,
|
||||
|
|
|
@ -1,3 +1,27 @@
|
|||
"""
    Hamming()
    Hamming(max_dist)

Create the Hamming distance.

The Hamming distance counts the positions at which two sequences hold different
elements. When the sequences differ in length, the length difference is added to
the count.

If `max_dist` is an integer, evaluation stops as soon as the count exceeds it and
`max_dist + 1` is returned. With `max_dist = nothing` (the default) the full count
is returned. Evaluating against `missing` returns `missing`.
"""
struct Hamming{V <: Union{Integer, Nothing}} <: SemiMetric
    max_dist::V
end
Hamming() = Hamming(nothing)

function (dist::Hamming)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    limit = dist.max_dist
    # Positions that exist in only one of the two sequences count as mismatches.
    mismatches = abs(length(s2) - length(s1))
    if limit !== nothing && mismatches > limit
        return limit + 1
    end
    # `zip` stops at the end of the shorter sequence.
    for (ch1, ch2) in zip(s1, s2)
        mismatches += (ch1 != ch2)
        # Stop early once the bound is exceeded.
        if limit !== nothing && mismatches > limit
            return limit + 1
        end
    end
    return mismatches
end
|
||||
|
||||
|
||||
"""
|
||||
Jaro()
|
||||
|
||||
|
|
|
@ -11,6 +11,14 @@ end
|
|||
# Wrap `dist` in `Normalize`. `max_dist` is accepted but not used by this method.
normalize(dist::SemiMetric, max_dist = 1.0) = Normalize(dist)
|
||||
# Re-wrap the inner distance of an existing `Normalize` instead of nesting a second
# wrapper. `max_dist` is accepted but not used by this method.
normalize(dist::Normalize, max_dist = 1.0) = Normalize(dist.dist)
|
||||
|
||||
# Normalized Hamming distance: the raw Hamming count divided by the length of the
# second sequence returned by `reorder` (presumably the longer one — confirm
# against `reorder`).
#
# Returns `missing` if either input is `missing`, and 1.0 when that length is zero
# or when the ratio exceeds `max_dist`; otherwise returns the ratio itself.
function (dist::Normalize{<:Hamming})(s1, s2, max_dist = 1.0)
    (ismissing(s1) || ismissing(s2)) && return missing
    s1, s2 = reorder(s1, s2)
    # Only the second length is needed; the first was computed but never used.
    len2 = length(s2)
    # Guard the division below against a zero denominator.
    len2 == 0 && return 1.0
    out = dist.dist(s1, s2) / len2
    return out > max_dist ? 1.0 : out
end
|
||||
|
||||
# A normalized distance is between 0 and 1, and accepts a third argument, max_dist.
|
||||
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
@doc """
|
||||
pairwise(dist::StringDistance, itr; preprocess = nothing)
|
||||
pairwise(dist::StringDistance, itr1, itr2; preprocess = nothing)
|
||||
pairwise(dist::StringDistance, xs::AbstractVector; preprocess = nothing)
|
||||
pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
|
||||
|
||||
Compute distances between all pairs of elements in `itr` according to the
|
||||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||
`StringDistance` `dist`.
|
||||
|
||||
For QGramDistances preprocessing will be used either if `preprocess` is set
|
||||
to true or if there are more than 5 elements in `itr`. Set `preprocess` to
|
||||
to true or if there are at least 5 elements in `xs`. Set `preprocess` to
|
||||
false if no preprocessing should be used, regardless of length.
|
||||
|
||||
Both symmetric and asymmetric versions are available.
|
||||
|
@ -28,55 +28,66 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric
|
|||
"""
|
||||
Distances.pairwise
|
||||
|
||||
function Distances.pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
    S = result_type(dist, eltype(xs), eltype(ys))
    # Widen the element type so `missing` results can be stored whenever either
    # input may contain `missing`.
    allows_missing = Missing <: Union{eltype(xs), eltype(ys)}
    out = Matrix{allows_missing ? Union{S, Missing} : S}(undef, length(xs), length(ys))
    return pairwise!(out, dist, xs, ys; preprocess = preprocess)
end
|
||||
|
||||
function Distances.pairwise(dist::StringDistance, xs::AbstractVector; preprocess = nothing)
    S = result_type(dist, eltype(xs), eltype(xs))
    # Widen the element type so `missing` results can be stored when `xs` may
    # contain `missing`.
    allows_missing = Missing <: eltype(xs)
    n = length(xs)
    out = Matrix{allows_missing ? Union{S, Missing} : S}(undef, n, n)
    return pairwise!(out, dist, xs; preprocess = preprocess)
end
|
||||
|
||||
@doc """
|
||||
pairwise!(r::AbstractMatrix, dist::StringDistance, itr; preprocess = nothing)
|
||||
pairwise!(r::AbstractMatrix, dist::StringDistance, itr1, itr2; preprocess = nothing)
|
||||
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
|
||||
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
|
||||
|
||||
Compute distances between all pairs of elements in `itr` according to the
|
||||
`StringDistance` `dist` and write the result in `r`.
|
||||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||
`StringDistance` `dist` and write the result in `R`.
|
||||
|
||||
For QGramDistances preprocessing will be used either if `preprocess` is set
|
||||
to true or if there are more than 5 elements in `itr`. Set `preprocess` to
|
||||
to true or if there are at least 5 elements in `xs`. Set `preprocess` to
|
||||
false if no preprocessing should be used, regardless of length.
|
||||
"""
|
||||
Distances.pairwise!
|
||||
|
||||
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
    # `R` needs one row per element of `xs` and one column per element of `ys`.
    if size(R, 1) != length(xs) || size(R, 2) != length(ys)
        throw(DimensionMismatch("inconsistent length"))
    end
    return _asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
|
||||
|
||||
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
    # `R` must be square with side `length(xs)`.
    n = length(xs)
    if size(R, 1) != n || size(R, 2) != n
        throw(DimensionMismatch("inconsistent length"))
    end
    # A `SemiMetric` takes the symmetric routine; anything else is evaluated
    # over every ordered pair.
    if dist isa SemiMetric
        return _symmetric_pairwise!(R, dist, xs; preprocess = preprocess)
    else
        return _asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess)
    end
end
|
||||
|
||||
# For q-gram distances, optionally convert every non-missing element to a
# `QGramSortedVector` built with `dist.q`. The conversion happens when
# `preprocess` is `true`, or when it is `nothing` and `xs` has at least 5
# elements; otherwise `xs` is returned unchanged.
function _preprocess(xs, dist::QGramDistance, preprocess)
    wanted = preprocess === true || (preprocess === nothing && length(xs) >= 5)
    wanted || return xs
    return map(xs) do x
        # `missing` entries are passed through untouched.
        ismissing(x) ? x : QGramSortedVector(x, dist.q)
    end
end
# Every other string distance uses its inputs as they are.
_preprocess(xs, dist::StringDistance, preprocess) = xs
|
||||
|
||||
|
||||
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X; preprocess = nothing)
|
||||
objs = _preprocess(X, dist, preprocess)
|
||||
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
|
||||
objs = _preprocess(xs, dist, preprocess)
|
||||
for i in 1:length(objs)
|
||||
R[i, i] = 0
|
||||
# handle missing
|
||||
R[i, i] = objs[i] != objs[i]
|
||||
Threads.@threads for j in (i+1):length(objs)
|
||||
R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
|
||||
end
|
||||
|
@ -84,12 +95,12 @@ function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X; prepro
|
|||
return R
|
||||
end
|
||||
|
||||
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X, Y; preprocess = nothing)
|
||||
objsX = _preprocess(X, dist, preprocess)
|
||||
objsY = _preprocess(Y, dist, preprocess)
|
||||
for i in 1:length(objsX)
|
||||
Threads.@threads for j in 1:length(objsY)
|
||||
R[i, j] = evaluate(dist, objsX[i], objsY[j])
|
||||
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
|
||||
objsxs = _preprocess(xs, dist, preprocess)
|
||||
objsys = _preprocess(ys, dist, preprocess)
|
||||
for i in 1:length(objsxs)
|
||||
Threads.@threads for j in 1:length(objsys)
|
||||
R[i, j] = evaluate(dist, objsxs[i], objsys[j])
|
||||
end
|
||||
end
|
||||
return R
|
||||
|
|
|
@ -1,6 +1,13 @@
|
|||
using StringDistances, Unicode, Test, Random
|
||||
|
||||
@testset "Distances" begin
|
||||
@testset "Hamming" begin
|
||||
@test evaluate(Hamming(), "martha", "marhta") ≈ 2
|
||||
@test evaluate(Hamming(), "es an ", " vs an") ≈ 6
|
||||
@test evaluate(Hamming(), [1, 2, 3], [1,2, 4]) ≈ 1
|
||||
@inferred evaluate(Hamming(), "", "")
|
||||
@test ismissing(evaluate(Hamming(), "", missing))
|
||||
end
|
||||
|
||||
@testset "Jaro" begin
|
||||
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
using StringDistances, Unicode, Test, Random
|
||||
using StringDistances: pairwise, pairwise!, QGramDistance
|
||||
|
||||
@testset "pairwise" begin
|
||||
|
||||
TestStrings1 = ["", "abc", "bc", "kitten"]
|
||||
TestStrings2 = ["mew", "ab"]
|
||||
|
||||
TestStrings1missing = ["", "abc", "bc", missing]
|
||||
TestStrings2missing = ["mew", missing]
|
||||
|
||||
@testset "pairwise" begin
|
||||
for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
|
||||
QGram, Cosine, Jaccard, SorensenDice, Overlap]
|
||||
|
@ -79,6 +80,9 @@ TestStrings2 = ["mew", "ab"]
|
|||
end
|
||||
end
|
||||
end
|
||||
# ensures missing
|
||||
R5 = pairwise(d, TestStrings1missing; preprocess = true)
|
||||
@test eltype(R5) == Union{result_type(d, String, String), Missing}
|
||||
end
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue