add Hamming + restrict pairwise to vectors + handle missings
parent
b407b186f0
commit
e4095682b4
|
@ -7,10 +7,11 @@ include("distances/edit.jl")
|
||||||
include("distances/qgram.jl")
|
include("distances/qgram.jl")
|
||||||
include("normalize.jl")
|
include("normalize.jl")
|
||||||
|
|
||||||
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
|
const StringDistance = Union{Hamming, Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
|
||||||
# Distances API
|
# Distances API
|
||||||
Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
|
Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
|
||||||
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, eltype(s1), eltype(s2))
|
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
|
||||||
|
|
||||||
|
|
||||||
include("find.jl")
|
include("find.jl")
|
||||||
include("pairwise.jl")
|
include("pairwise.jl")
|
||||||
|
@ -23,10 +24,12 @@ include("pairwise.jl")
|
||||||
|
|
||||||
export
|
export
|
||||||
StringDistance,
|
StringDistance,
|
||||||
|
Hamming,
|
||||||
Levenshtein,
|
Levenshtein,
|
||||||
DamerauLevenshtein,
|
DamerauLevenshtein,
|
||||||
Jaro,
|
Jaro,
|
||||||
RatcliffObershelp,
|
RatcliffObershelp,
|
||||||
|
QGramDistance,
|
||||||
QGram,
|
QGram,
|
||||||
Cosine,
|
Cosine,
|
||||||
Jaccard,
|
Jaccard,
|
||||||
|
|
|
@ -1,3 +1,27 @@
|
||||||
"""
    Hamming()
    Hamming(max_dist)

Create the Hamming distance.

The Hamming distance counts the positions at which two sequences hold different
elements. Sequences of unequal length are accepted: the difference between their
lengths is added to the count.

`max_dist` is either `nothing` (the default, meaning no cap) or an integer cap. As soon
as the running count exceeds the cap, evaluation stops and `max_dist + 1` is returned.
"""
struct Hamming{V <: Union{Integer, Nothing}} <: SemiMetric
    max_dist::V
end

# Uncapped Hamming distance.
Hamming() = Hamming{Nothing}(nothing)
|
||||||
|
|
||||||
|
# Evaluate the Hamming distance between `s1` and `s2`.
#
# Returns `missing` when either argument is `missing`. Otherwise returns the length
# difference plus the number of mismatching positions in the common prefix, or
# `dist.max_dist + 1` as soon as the running count exceeds a non-`nothing` cap.
function (dist::Hamming)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    cap = dist.max_dist
    # Every position beyond the shorter sequence counts as a mismatch.
    mismatches = abs(length(s2) - length(s1))
    cap !== nothing && mismatches > cap && return cap + 1
    for (a, b) in zip(s1, s2)
        mismatches += a != b
        # Early exit: the exact value no longer matters once the cap is exceeded.
        cap !== nothing && mismatches > cap && return cap + 1
    end
    return mismatches
end
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Jaro()
|
Jaro()
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,14 @@ end
|
||||||
normalize(dist::SemiMetric, max_dist = 1.0) = Normalize(dist)
|
normalize(dist::SemiMetric, max_dist = 1.0) = Normalize(dist)
|
||||||
normalize(dist::Normalize, max_dist = 1.0) = Normalize(dist.dist)
|
normalize(dist::Normalize, max_dist = 1.0) = Normalize(dist.dist)
|
||||||
|
|
||||||
|
# Normalized Hamming distance: the raw distance divided by `length(s2)` after `reorder`.
#
# Returns `missing` when either argument is `missing`, and `1.0` when the sequence used
# as the denominator is empty. A ratio above `max_dist` is reported as `1.0`.
# NOTE(review): `reorder` is defined elsewhere; presumably it places the longer sequence
# second so that the denominator is the larger length. Confirm against its definition.
function (dist::Normalize{<:Hamming})(s1, s2, max_dist = 1.0)
    (ismissing(s1) || ismissing(s2)) && return missing
    s1, s2 = reorder(s1, s2)
    denom = length(s2)
    # Guard against dividing by zero.
    denom == 0 && return 1.0
    ratio = dist.dist(s1, s2) / denom
    return ratio > max_dist ? 1.0 : ratio
end
|
||||||
|
|
||||||
# A normalized distance is between 0 and 1, and accepts a third argument, max_dist.
|
# A normalized distance is between 0 and 1, and accepts a third argument, max_dist.
|
||||||
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
|
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
@doc """
|
@doc """
|
||||||
pairwise(dist::StringDistance, itr; preprocess = nothing)
|
pairwise(dist::StringDistance, xs::AbstractVector; preprocess = nothing)
|
||||||
pairwise(dist::StringDistance, itr1, itr2; preprocess = nothing)
|
pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
|
||||||
|
|
||||||
Compute distances between all pairs of elements in `itr` according to the
|
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||||
`StringDistance` `dist`.
|
`StringDistance` `dist`.
|
||||||
|
|
||||||
For QGramDistances preprocessing will be used either if `preprocess` is set
|
For QGramDistances preprocessing will be used either if `preprocess` is set
|
||||||
to true or if there are at least 5 elements in `itr`. Set `preprocess` to
|
to true or if there are at least 5 elements in `xs`. Set `preprocess` to
|
||||||
false if no preprocessing should be used, regardless of length.
|
false if no preprocessing should be used, regardless of length.
|
||||||
|
|
||||||
Both symmetric and asymmetric versions are available.
|
Both symmetric and asymmetric versions are available.
|
||||||
|
@ -28,55 +28,66 @@ julia> pairwise(Levenshtein(), iter, iter2) # asymmetric
|
||||||
"""
|
"""
|
||||||
Distances.pairwise
|
Distances.pairwise
|
||||||
|
|
||||||
function Distances.pairwise(dist::StringDistance, X, Y; preprocess = length(X) >= 5)
|
function Distances.pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
|
||||||
T = result_type(dist, eltype(X), eltype(Y))
|
T = result_type(dist, eltype(xs), eltype(ys))
|
||||||
R = Matrix{T}(undef, length(X), length(Y))
|
if Missing <: Union{eltype(xs), eltype(ys)}
|
||||||
pairwise!(R, dist, X, Y; preprocess = preprocess)
|
T = Union{T, Missing}
|
||||||
|
end
|
||||||
|
R = Matrix{T}(undef, length(xs), length(ys))
|
||||||
|
pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||||
end
|
end
|
||||||
|
|
||||||
function Distances.pairwise(dist::StringDistance, X; preprocess = nothing)
|
function Distances.pairwise(dist::StringDistance, xs::AbstractVector; preprocess = nothing)
|
||||||
T = result_type(dist, eltype(X), eltype(X))
|
T = result_type(dist, eltype(xs), eltype(xs))
|
||||||
R = Matrix{T}(undef, length(X), length(X))
|
if Missing <: eltype(xs)
|
||||||
pairwise!(R, dist, X; preprocess = preprocess)
|
T = Union{T, Missing}
|
||||||
|
end
|
||||||
|
R = Matrix{T}(undef, length(xs), length(xs))
|
||||||
|
pairwise!(R, dist, xs; preprocess = preprocess)
|
||||||
end
|
end
|
||||||
|
|
||||||
@doc """
|
@doc """
|
||||||
pairwise!(r::AbstractMatrix, dist::StringDistance, itr; preprocess = nothing)
|
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
|
||||||
pairwise!(r::AbstractMatrix, dist::StringDistance, itr1, itr2; preprocess = nothing)
|
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
|
||||||
|
|
||||||
Compute distances between all pairs of elements in `itr` according to the
|
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||||
`StringDistance` `dist` and write the result in `r`.
|
`StringDistance` `dist` and write the result in `R`.
|
||||||
|
|
||||||
For QGramDistances preprocessing will be used either if `preprocess` is set
|
For QGramDistances preprocessing will be used either if `preprocess` is set
|
||||||
to true or if there are at least 5 elements in `itr`. Set `preprocess` to
|
to true or if there are at least 5 elements in `xs`. Set `preprocess` to
|
||||||
false if no preprocessing should be used, regardless of length.
|
false if no preprocessing should be used, regardless of length.
|
||||||
"""
|
"""
|
||||||
Distances.pairwise!
|
Distances.pairwise!
|
||||||
|
|
||||||
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, X, Y; preprocess = nothing)
|
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
|
||||||
_asymmetric_pairwise!(R, dist, X, Y; preprocess = preprocess)
|
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
|
||||||
|
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
|
||||||
|
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||||
end
|
end
|
||||||
|
|
||||||
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, X; preprocess = nothing)
|
function Distances.pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
|
||||||
|
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
|
||||||
|
length(xs) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
|
||||||
(dist isa SemiMetric) ?
|
(dist isa SemiMetric) ?
|
||||||
_symmetric_pairwise!(R, dist, X; preprocess = preprocess) :
|
_symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
|
||||||
_asymmetric_pairwise!(R, dist, X, X; preprocess = preprocess)
|
_asymmetric_pairwise!(R, dist, xs, xs; preprocess = preprocess)
|
||||||
end
|
end
|
||||||
|
|
||||||
function _preprocess(X, dist::QGramDistance, preprocess)
|
function _preprocess(xs, dist::QGramDistance, preprocess)
|
||||||
if (preprocess === true) || (isnothing(preprocess) && length(X) >= 5)
|
if (preprocess === true) || (isnothing(preprocess) && length(xs) >= 5)
|
||||||
return map(x -> QGramSortedVector(x, dist.q), X)
|
return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
|
||||||
else
|
else
|
||||||
return X
|
return xs
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
_preprocess(X, dist::StringDistance, preprocess) = X
|
_preprocess(xs, dist::StringDistance, preprocess) = xs
|
||||||
|
|
||||||
|
|
||||||
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X; preprocess = nothing)
|
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
|
||||||
objs = _preprocess(X, dist, preprocess)
|
objs = _preprocess(xs, dist, preprocess)
|
||||||
for i in 1:length(objs)
|
for i in 1:length(objs)
|
||||||
R[i, i] = 0
|
# diagonal entry: `x != x` is `false` (0) for a string and `missing` when `x` is `missing`
|
||||||
|
R[i, i] = objs[i] != objs[i]
|
||||||
Threads.@threads for j in (i+1):length(objs)
|
Threads.@threads for j in (i+1):length(objs)
|
||||||
R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
|
R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
|
||||||
end
|
end
|
||||||
|
@ -84,12 +95,12 @@ function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X; prepro
|
||||||
return R
|
return R
|
||||||
end
|
end
|
||||||
|
|
||||||
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, X, Y; preprocess = nothing)
|
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
|
||||||
objsX = _preprocess(X, dist, preprocess)
|
objsxs = _preprocess(xs, dist, preprocess)
|
||||||
objsY = _preprocess(Y, dist, preprocess)
|
objsys = _preprocess(ys, dist, preprocess)
|
||||||
for i in 1:length(objsX)
|
for i in 1:length(objsxs)
|
||||||
Threads.@threads for j in 1:length(objsY)
|
Threads.@threads for j in 1:length(objsys)
|
||||||
R[i, j] = evaluate(dist, objsX[i], objsY[j])
|
R[i, j] = evaluate(dist, objsxs[i], objsys[j])
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
return R
|
return R
|
||||||
|
|
|
@ -1,6 +1,13 @@
|
||||||
using StringDistances, Unicode, Test, Random
|
using StringDistances, Unicode, Test, Random
|
||||||
|
|
||||||
@testset "Distances" begin
|
@testset "Distances" begin
|
||||||
|
# Hamming: strings, integer vectors, return-type inference and `missing` propagation.
@testset "Hamming" begin
    hamming = Hamming()
    @test evaluate(hamming, "martha", "marhta") ≈ 2
    @test evaluate(hamming, "es an ", " vs an") ≈ 6
    # Any indexable sequence is accepted, not only strings.
    @test evaluate(hamming, [1, 2, 3], [1,2, 4]) ≈ 1
    @inferred evaluate(hamming, "", "")
    @test ismissing(evaluate(hamming, "", missing))
end
|
||||||
|
|
||||||
@testset "Jaro" begin
|
@testset "Jaro" begin
|
||||||
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
using StringDistances, Unicode, Test, Random
|
using StringDistances, Unicode, Test, Random
|
||||||
using StringDistances: pairwise, pairwise!, QGramDistance
|
|
||||||
|
|
||||||
@testset "pairwise" begin
|
@testset "pairwise" begin
|
||||||
|
|
||||||
TestStrings1 = ["", "abc", "bc", "kitten"]
|
TestStrings1 = ["", "abc", "bc", "kitten"]
|
||||||
TestStrings2 = ["mew", "ab"]
|
TestStrings2 = ["mew", "ab"]
|
||||||
|
|
||||||
|
TestStrings1missing = ["", "abc", "bc", missing]
|
||||||
|
TestStrings2missing = ["mew", missing]
|
||||||
|
|
||||||
@testset "pairwise" begin
|
@testset "pairwise" begin
|
||||||
for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
|
for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
|
||||||
QGram, Cosine, Jaccard, SorensenDice, Overlap]
|
QGram, Cosine, Jaccard, SorensenDice, Overlap]
|
||||||
|
@ -79,6 +80,9 @@ TestStrings2 = ["mew", "ab"]
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
# a `missing` element in the input must widen the result eltype to include `Missing`
|
||||||
|
R5 = pairwise(d, TestStrings1missing; preprocess = true)
|
||||||
|
@test eltype(R5) == Union{result_type(d, String, String), Missing}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue