add Hamming + restrict pairwise to vectors + handle missings

pull/39/head
matthieugomez 2020-11-09 19:04:35 -08:00
parent b407b186f0
commit e4095682b4
6 changed files with 97 additions and 40 deletions

View File

@ -7,10 +7,11 @@ include("distances/edit.jl")
include("distances/qgram.jl") include("distances/qgram.jl")
include("normalize.jl") include("normalize.jl")
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize} const StringDistance = Union{Hamming, Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
# Distances API # Distances API
Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", "")) Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, eltype(s1), eltype(s2)) Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
include("find.jl") include("find.jl")
include("pairwise.jl") include("pairwise.jl")
@ -23,10 +24,12 @@ include("pairwise.jl")
export export
StringDistance, StringDistance,
Hamming,
Levenshtein, Levenshtein,
DamerauLevenshtein, DamerauLevenshtein,
Jaro, Jaro,
RatcliffObershelp, RatcliffObershelp,
QGramDistance,
QGram, QGram,
Cosine, Cosine,
Jaccard, Jaccard,

View File

@ -1,3 +1,27 @@
"""
    Hamming()
    Hamming(max_dist)

Creates the Hamming distance

The Hamming distance is defined as the number of characters that do not match.
The inputs are compared position by position; when they have different
lengths, the absolute difference of their lengths is added to the count.

`max_dist` is either `nothing` (the default, meaning no cut-off) or an integer
cut-off: as soon as the running count exceeds `max_dist`, the evaluation stops
and a value equal to `max_dist + 1` is returned.
"""
struct Hamming{V <: Union{Integer, Nothing}} <: SemiMetric
max_dist::V
end
# Default constructor: no cut-off.
Hamming() = Hamming(nothing)
# Evaluate the Hamming distance between `s1` and `s2`.
#
# Arguments: `s1`, `s2` are iterables supporting `length` (or `missing`).
# Returns: `missing` if either argument is `missing`. Otherwise the number of
# mismatching positions plus the absolute difference of the lengths, as an
# `Int`. If `dist.max_dist` is set and the running count exceeds it, returns
# `max_dist + 1` early.
function (dist::Hamming)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    # Convert the cut-off to `Int` once. Returning `dist.max_dist + 1` in the
    # field's own integer type could overflow for narrow types
    # (`Int8(127) + 1 == -128`) and would make the return type differ from the
    # `Int` returned on the normal path. The clamp keeps `limit + 1` from
    # overflowing.
    limit = dist.max_dist === nothing ? nothing :
        Int(min(dist.max_dist, typemax(Int) - 1))
    # Positions present in only one input all count as mismatches.
    current = abs(length(s2) - length(s1))
    limit !== nothing && current > limit && return limit + 1
    # `zip` stops at the shorter input, so only shared positions are compared.
    for (ch1, ch2) in zip(s1, s2)
        current += ch1 != ch2
        limit !== nothing && current > limit && return limit + 1
    end
    return current
end
""" """
Jaro() Jaro()

View File

@ -11,6 +11,14 @@ end
normalize(dist::SemiMetric, max_dist = 1.0) = Normalize(dist) normalize(dist::SemiMetric, max_dist = 1.0) = Normalize(dist)
normalize(dist::Normalize, max_dist = 1.0) = Normalize(dist.dist) normalize(dist::Normalize, max_dist = 1.0) = Normalize(dist.dist)
# Normalized Hamming distance.
#
# Arguments: `s1`, `s2` are the inputs to compare (or `missing`); `max_dist`
# is the threshold above which the result is reported as 1.0.
# Returns: `missing` if either argument is `missing`; 1.0 if the second
# reordered input is empty; otherwise the wrapped Hamming distance divided by
# the length of the second reordered input, replaced by 1.0 when it exceeds
# `max_dist`.
function (dist::Normalize{<:Hamming})(s1, s2, max_dist = 1.0)
    (ismissing(s1) || ismissing(s2)) && return missing
    # NOTE(review): `reorder` is defined elsewhere; presumably it puts the
    # shorter input first so that `s2` is the longer one — confirm.
    s1, s2 = reorder(s1, s2)
    len2 = length(s2)
    # Guard against dividing by zero.
    len2 == 0 && return 1.0
    out = dist.dist(s1, s2) / len2
    return out > max_dist ? 1.0 : out
end
# A normalized distance is between 0 and 1, and accept a third argument, max_dist. # A normalized distance is between 0 and 1, and accept a third argument, max_dist.
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0) function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)

View File

@ -1,12 +1,12 @@
@doc """ @doc """
pairwise(dist::StringDistance, itr; preprocess = nothing) pairwise(dist::StringDistance, xs::AbstractVector; preprocess = nothing)
pairwise(dist::StringDistance, itr1, itr2; preprocess = nothing) pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
Compute distances between all pairs of elements in `itr` according to the Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist`. `StringDistance` `dist`.
For QGramDistances preprocessing will be used either if `preprocess` is set For QGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are at least 5 elements in `itr`. Set `preprocess` to to true or if there are at least 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length. false if no preprocessing should be used, regardless of length.
Both symmetric and asymmetric versions are available. Both symmetric and asymmetric versions are available.
@ -28,55 +28,66 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric
""" """
Distances.pairwise Distances.pairwise
function Distances.pairwise(dist::StringDistance, X, Y; preprocess = length(X) >= 5) function Distances.pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
T = result_type(dist, eltype(X), eltype(Y)) T = result_type(dist, eltype(xs), eltype(ys))
R = Matrix{T}(undef, length(X), length(Y)) if Missing <: Union{eltype(xs), eltype(ys)}
pairwise!(R, dist, X, Y; preprocess = preprocess) T = Union{T, Missing}
end
R = Matrix{T}(undef, length(xs), length(ys))
pairwise!(R, dist, xs, ys; preprocess = preprocess)
end end
function Distances.pairwise(dist::StringDistance, X; preprocess = nothing) function Distances.pairwise(dist::StringDistance, xs::AbstractVector; preprocess = nothing)
T = result_type(dist, eltype(X), eltype(X)) T = result_type(dist, eltype(xs), eltype(xs))
R = Matrix{T}(undef, length(X), length(X)) if Missing <: eltype(xs)
pairwise!(R, dist, X; preprocess = preprocess) T = Union{T, Missing}
end
R = Matrix{T}(undef, length(xs), length(xs))
pairwise!(R, dist, xs; preprocess = preprocess)
end end
@doc """ @doc """
pairwise!(r::AbstractMatrix, dist::StringDistance, itr; preprocess = nothing) pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
pairwise!(r::AbstractMatrix, dist::StringDistance, itr1, itr2; preprocess = nothing) pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
Compute distances between all pairs of elements in `itr` according to the Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist` and write the result in `r`. `StringDistance` `dist` and write the result in `R`.
For QGramDistances preprocessing will be used either if `preprocess` is set For QGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are at least 5 elements in `itr`. Set `preprocess` to to true or if there are at least 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length. false if no preprocessing should be used, regardless of length.
""" """
Distances.pairwise! Distances.pairwise!
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, X, Y; preprocess = nothing) function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
_asymmetric_pairwise!(R, dist, X, Y; preprocess = preprocess) length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
end end
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, X; preprocess = nothing) function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
length(xs) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
(dist isa SemiMetric) ? (dist isa SemiMetric) ?
_symmetric_pairwise!(R, dist, X; preprocess = preprocess) : _symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
_asymmetric_pairwise!(R, dist, X, X; preprocess = preprocess) _asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess)
end end
function _preprocess(X, dist::QGramDistance, preprocess) function _preprocess(xs, dist::QGramDistance, preprocess)
if (preprocess === true) || (isnothing(preprocess) && length(X) >= 5) if (preprocess === true) || (isnothing(preprocess) && length(xs) >= 5)
return map(x -> QGramSortedVector(x, dist.q), X) return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
else else
return X return xs
end end
end end
_preprocess(X, dist::StringDistance, preprocess) = X _preprocess(xs, dist::StringDistance, preprocess) = xs
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X; preprocess = nothing) function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
objs = _preprocess(X, dist, preprocess) objs = _preprocess(xs, dist, preprocess)
for i in 1:length(objs) for i in 1:length(objs)
R[i, i] = 0 # handle missing
R[i, i] = objs[i] != objs[i]
Threads.@threads for j in (i+1):length(objs) Threads.@threads for j in (i+1):length(objs)
R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j]) R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
end end
@ -84,12 +95,12 @@ function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X; prepro
return R return R
end end
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X, Y; preprocess = nothing) function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
objsX = _preprocess(X, dist, preprocess) objsxs = _preprocess(xs, dist, preprocess)
objsY = _preprocess(Y, dist, preprocess) objsys = _preprocess(ys, dist, preprocess)
for i in 1:length(objsX) for i in 1:length(objsxs)
Threads.@threads for j in 1:length(objsY) Threads.@threads for j in 1:length(objsys)
R[i, j] = evaluate(dist, objsX[i], objsY[j]) R[i, j] = evaluate(dist, objsxs[i], objsys[j])
end end
end end
return R return R

View File

@ -1,6 +1,13 @@
using StringDistances, Unicode, Test, Random using StringDistances, Unicode, Test, Random
@testset "Distances" begin @testset "Distances" begin
@testset "Hamming" begin
    # Hamming distances are integers, so they are compared exactly.
    @test evaluate(Hamming(), "martha", "marhta") == 2
    @test evaluate(Hamming(), "es an ", " vs an") == 6
    # Non-string inputs (here, integer vectors) are compared element-wise.
    @test evaluate(Hamming(), [1, 2, 3], [1,2, 4]) == 1
    # The return type must be inferable for plain string inputs.
    @inferred evaluate(Hamming(), "", "")
    # A `missing` argument propagates to the result.
    @test ismissing(evaluate(Hamming(), "", missing))
end
@testset "Jaro" begin @testset "Jaro" begin
@test evaluate(Jaro(), "martha", "marhta") 0.05555555555555547 @test evaluate(Jaro(), "martha", "marhta") 0.05555555555555547

View File

@ -1,11 +1,12 @@
using StringDistances, Unicode, Test, Random using StringDistances, Unicode, Test, Random
using StringDistances: pairwise, pairwise!, QGramDistance
@testset "pairwise" begin @testset "pairwise" begin
TestStrings1 = ["", "abc", "bc", "kitten"] TestStrings1 = ["", "abc", "bc", "kitten"]
TestStrings2 = ["mew", "ab"] TestStrings2 = ["mew", "ab"]
TestStrings1missing = ["", "abc", "bc", missing]
TestStrings2missing = ["mew", missing]
@testset "pairwise" begin @testset "pairwise" begin
for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
QGram, Cosine, Jaccard, SorensenDice, Overlap] QGram, Cosine, Jaccard, SorensenDice, Overlap]
@ -79,6 +80,9 @@ TestStrings2 = ["mew", "ab"]
end end
end end
end end
# When the input vector contains `missing`, the element type of the result
# matrix must be the distance's result type widened with `Missing`.
# (`d` is defined by enclosing code outside this excerpt.)
R5 = pairwise(d, TestStrings1missing; preprocess = true)
@test eltype(R5) == Union{result_type(d, String, String), Missing}
end end
end end