add Hamming + restrict pairwise to vectors + handle missings

pull/39/head
matthieugomez 2020-11-09 19:04:35 -08:00
parent b407b186f0
commit e4095682b4
6 changed files with 97 additions and 40 deletions

View File

@ -7,10 +7,11 @@ include("distances/edit.jl")
include("distances/qgram.jl")
include("normalize.jl")
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
const StringDistance = Union{Hamming, Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
# Distances API
Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, eltype(s1), eltype(s2))
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
include("find.jl")
include("pairwise.jl")
@ -23,10 +24,12 @@ include("pairwise.jl")
export
StringDistance,
Hamming,
Levenshtein,
DamerauLevenshtein,
Jaro,
RatcliffObershelp,
QGramDistance,
QGram,
Cosine,
Jaccard,

View File

@ -1,3 +1,27 @@
"""
    Hamming()

Create the Hamming distance.

The Hamming distance is defined as the number of positions at which the two inputs do not match; any difference in length between the inputs is added to that count.
"""
# `V` is either an integer type (a cap is set) or `Nothing` (no cap).
struct Hamming{V <: Union{Integer, Nothing}} <: SemiMetric
# Optional cap on the distance: evaluation stops early and returns
# `max_dist + 1` once the running count exceeds it. `nothing` disables the cap.
max_dist::V
end
# Default constructor: Hamming distance without any cap.
Hamming() = Hamming(nothing)
# Evaluate the Hamming distance between `s1` and `s2`.
#
# Returns `missing` when either argument is `missing`. Otherwise returns the
# number of mismatching positions plus the absolute difference in length, or
# `dist.max_dist + 1` as soon as that count exceeds a non-`nothing` `dist.max_dist`.
function (dist::Hamming)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    bound = dist.max_dist
    # Every surplus element of the longer input counts as one mismatch.
    mismatches = abs(length(s2) - length(s1))
    if bound !== nothing && mismatches > bound
        return bound + 1
    end
    # `zip` stops at the end of the shorter input.
    for (a, b) in zip(s1, s2)
        mismatches += a != b
        # Early exit once the cap is exceeded.
        if bound !== nothing && mismatches > bound
            return bound + 1
        end
    end
    return mismatches
end
"""
Jaro()

View File

@ -11,6 +11,14 @@ end
normalize(dist::SemiMetric, max_dist = 1.0) = Normalize(dist)
normalize(dist::Normalize, max_dist = 1.0) = Normalize(dist.dist)
# Normalized Hamming distance: the raw Hamming distance divided by the length
# of the second input after `reorder`.
#
# Arguments
# - `s1`, `s2`: the inputs to compare; if either is `missing`, `missing` is returned.
# - `max_dist`: threshold on the normalized distance (default `1.0`); any
#   normalized value above it is reported as `1.0`.
#
# Returns `1.0` when the second input (after `reorder`) is empty.
function (dist::Normalize{<:Hamming})(s1, s2, max_dist = 1.0)
    ((s1 === missing) | (s2 === missing)) && return missing
    # NOTE(review): `reorder` presumably places the longer input second, so
    # `len2` is the maximum possible Hamming distance -- confirm against `reorder`.
    s1, s2 = reorder(s1, s2)
    # Only the second length is needed; the unused `len1` was dropped.
    len2 = length(s2)
    # Guard against division by zero.
    len2 == 0 && return 1.0
    out = dist.dist(s1, s2) / len2
    return out > max_dist ? 1.0 : out
end
# A normalized distance is between 0 and 1, and accepts a third argument, max_dist.
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)

View File

@ -1,12 +1,12 @@
@doc """
pairwise(dist::StringDistance, itr; preprocess = nothing)
pairwise(dist::StringDistance, itr1, itr2; preprocess = nothing)
pairwise(dist::StringDistance, xs::AbstractVector; preprocess = nothing)
pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
Compute distances between all pairs of elements in `itr` according to the
Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist`.
For QGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `itr`. Set `preprocess` to
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length.
Both symmetric and asymmetric versions are available.
@ -28,55 +28,66 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric
"""
Distances.pairwise
function Distances.pairwise(dist::StringDistance, X, Y; preprocess = length(X) >= 5)
T = result_type(dist, eltype(X), eltype(Y))
R = Matrix{T}(undef, length(X), length(Y))
pairwise!(R, dist, X, Y; preprocess = preprocess)
# Allocate a `length(xs)` x `length(ys)` matrix and fill it via `pairwise!`.
# The element type is the result type of `dist`, widened with `Missing` when
# the element type of either input admits `missing`.
function Distances.pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
    S = result_type(dist, eltype(xs), eltype(ys))
    allows_missing = Missing <: Union{eltype(xs), eltype(ys)}
    T = allows_missing ? Union{S, Missing} : S
    R = Matrix{T}(undef, length(xs), length(ys))
    return pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
function Distances.pairwise(dist::StringDistance, X; preprocess = nothing)
T = result_type(dist, eltype(X), eltype(X))
R = Matrix{T}(undef, length(X), length(X))
pairwise!(R, dist, X; preprocess = preprocess)
# Single-collection variant: allocate a square `length(xs)` x `length(xs)`
# matrix and fill it via `pairwise!`. The element type is the result type of
# `dist`, widened with `Missing` when the element type of `xs` admits `missing`.
function Distances.pairwise(dist::StringDistance, xs::AbstractVector; preprocess = nothing)
    S = result_type(dist, eltype(xs), eltype(xs))
    T = Missing <: eltype(xs) ? Union{S, Missing} : S
    n = length(xs)
    R = Matrix{T}(undef, n, n)
    return pairwise!(R, dist, xs; preprocess = preprocess)
end
@doc """
pairwise!(r::AbstractMatrix, dist::StringDistance, itr; preprocess = nothing)
pairwise!(r::AbstractMatrix, dist::StringDistance, itr1, itr2; preprocess = nothing)
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
Compute distances between all pairs of elements in `itr` according to the
`StringDistance` `dist` and write the result in `r`.
Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist` and write the result in `R`.
For QGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `itr`. Set `preprocess` to
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length.
"""
Distances.pairwise!
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, X, Y; preprocess = nothing)
_asymmetric_pairwise!(R, dist, X, Y; preprocess = preprocess)
# Fill `R` with the distances between every element of `xs` (rows) and every
# element of `ys` (columns). Throws `DimensionMismatch` when the size of `R`
# does not agree with the lengths of `xs` and `ys`.
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
    if length(xs) != size(R, 1)
        throw(DimensionMismatch("inconsistent length"))
    end
    if length(ys) != size(R, 2)
        throw(DimensionMismatch("inconsistent length"))
    end
    return _asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, X; preprocess = nothing)
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
length(xs) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
(dist isa SemiMetric) ?
_symmetric_pairwise!(R, dist, X; preprocess = preprocess) :
_asymmetric_pairwise!(R, dist, X, X; preprocess = preprocess)
_symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
_asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess)
end
function _preprocess(X, dist::QGramDistance, preprocess)
if (preprocess === true) || (isnothing(preprocess) && length(X) >= 5)
return map(x -> QGramSortedVector(x, dist.q), X)
function _preprocess(xs, dist::QGramDistance, preprocess)
if (preprocess === true) || (isnothing(preprocess) && length(xs) >= 5)
return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
else
return X
return xs
end
end
_preprocess(X, dist::StringDistance, preprocess) = X
_preprocess(xs, dist::StringDistance, preprocess) = xs
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X; preprocess = nothing)
objs = _preprocess(X, dist, preprocess)
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
objs = _preprocess(xs, dist, preprocess)
for i in 1:length(objs)
R[i, i] = 0
# handle missing
R[i, i] = objs[i] != objs[i]
Threads.@threads for j in (i+1):length(objs)
R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
end
@ -84,12 +95,12 @@ function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X; prepro
return R
end
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X, Y; preprocess = nothing)
objsX = _preprocess(X, dist, preprocess)
objsY = _preprocess(Y, dist, preprocess)
for i in 1:length(objsX)
Threads.@threads for j in 1:length(objsY)
R[i, j] = evaluate(dist, objsX[i], objsY[j])
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
objsxs = _preprocess(xs, dist, preprocess)
objsys = _preprocess(ys, dist, preprocess)
for i in 1:length(objsxs)
Threads.@threads for j in 1:length(objsys)
R[i, j] = evaluate(dist, objsxs[i], objsys[j])
end
end
return R

View File

@ -1,6 +1,13 @@
using StringDistances, Unicode, Test, Random
@testset "Distances" begin
@testset "Hamming" begin
    # Hamming distances are exact integer counts, so compare with `==`.
    @test evaluate(Hamming(), "martha", "marhta") == 2
    @test evaluate(Hamming(), "es an ", " vs an") == 6
    # Non-string inputs: vectors are compared element by element.
    @test evaluate(Hamming(), [1, 2, 3], [1, 2, 4]) == 1
    # The return type must be inferrable by the compiler.
    @inferred evaluate(Hamming(), "", "")
    # A `missing` argument propagates to the result.
    @test ismissing(evaluate(Hamming(), "", missing))
end
@testset "Jaro" begin
@test evaluate(Jaro(), "martha", "marhta") 0.05555555555555547

View File

@ -1,11 +1,12 @@
using StringDistances, Unicode, Test, Random
using StringDistances: pairwise, pairwise!, QGramDistance
@testset "pairwise" begin
TestStrings1 = ["", "abc", "bc", "kitten"]
TestStrings2 = ["mew", "ab"]
TestStrings1missing = ["", "abc", "bc", missing]
TestStrings2missing = ["mew", missing]
@testset "pairwise" begin
for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
QGram, Cosine, Jaccard, SorensenDice, Overlap]
@ -79,6 +80,9 @@ TestStrings2 = ["mew", "ab"]
end
end
end
# ensures missing
R5 = pairwise(d, TestStrings1missing; preprocess = true)
@test eltype(R5) == Union{result_type(d, String, String), Missing}
end
end