simplify a bit preprocessed qgrams (#50)
parent
633a2d85dc
commit
e9b224f03f
|
@ -1,6 +1,6 @@
|
|||
name = "StringDistances"
|
||||
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
|
||||
version = "0.10.0"
|
||||
version = "0.10.1"
|
||||
|
||||
[deps]
|
||||
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
||||
|
|
|
@ -14,47 +14,46 @@ end
|
|||
|
||||
|
||||
|
||||
@time f(Jaro(), x, y)
|
||||
@time f(Jaro(), x, y);
|
||||
#0.3s
|
||||
@time f(Levenshtein(), x, y)
|
||||
@time f(Levenshtein(), x, y);
|
||||
# 0.4s
|
||||
@time f(Levenshtein(), x, y, min_score = 0.8)
|
||||
@time f(Levenshtein(), x, y, min_score = 0.8);
|
||||
# 0.11
|
||||
@time f(DamerauLevenshtein(), x, y)
|
||||
@time f(DamerauLevenshtein(), x, y);
|
||||
# 0.58s.
|
||||
@time f(DamerauLevenshtein(), x, y, min_score = 0.8)
|
||||
@time f(DamerauLevenshtein(), x, y, min_score = 0.8);
|
||||
# 0.08 (now 0.09)
|
||||
@time f(RatcliffObershelp(), x, y)
|
||||
@time f(RatcliffObershelp(), x, y);
|
||||
# 1.35s
|
||||
|
||||
|
||||
|
||||
|
||||
@time findnearest(x[1], y, Levenshtein())
|
||||
@time findnearest(x[1], y, Levenshtein());
|
||||
# 0.02
|
||||
@time findnearest(x[1], y, DamerauLevenshtein())
|
||||
@time findnearest(x[1], y, DamerauLevenshtein());
|
||||
# 0.05
|
||||
@time findnearest(x[1], y, QGram(2))
|
||||
@time findnearest(x[1], y, QGram(2));
|
||||
# 0.75
|
||||
|
||||
|
||||
|
||||
@time findall(x[1], y, Levenshtein())
|
||||
@time findall(x[1], y, Levenshtein());
|
||||
# 0.05
|
||||
@time findall(x[1], y, DamerauLevenshtein())
|
||||
@time findall(x[1], y, DamerauLevenshtein());
|
||||
# 0.05
|
||||
@time findall(x[1], y, Partial(DamerauLevenshtein()))
|
||||
@time findall(x[1], y, Partial(DamerauLevenshtein()));
|
||||
# 0.96
|
||||
@time findall(x[1], y, QGram(2))
|
||||
@time findall(x[1], y, QGram(2));
|
||||
# 0.81
|
||||
@time findall(x[1], y, TokenSort(DamerauLevenshtein()))
|
||||
@time findall(x[1], y, TokenSort(DamerauLevenshtein()));
|
||||
# 0.27 (now 0.32)
|
||||
@time findall(x[1], y, TokenSet(DamerauLevenshtein()))
|
||||
@time findall(x[1], y, TokenSet(DamerauLevenshtein()));
|
||||
# 0.55
|
||||
@time findall(x[1], y, TokenMax(DamerauLevenshtein()))
|
||||
@time findall(x[1], y, TokenMax(DamerauLevenshtein()));
|
||||
# 2.25 (now 3.6)
|
||||
@time findnearest(x[1], y, DamerauLevenshtein())
|
||||
# 0.15
|
||||
|
||||
|
||||
x = map(Random.randstring, rand(5:25,1000))
|
||||
y = map(Random.randstring, rand(5:25,1000))
|
||||
|
|
|
@ -6,6 +6,8 @@ import StatsAPI: pairwise, pairwise!
|
|||
include("distances/utils.jl")
|
||||
include("distances/edit.jl")
|
||||
include("distances/qgram.jl")
|
||||
include("distances/qgram_preprocessed.jl")
|
||||
|
||||
include("modifiers.jl")
|
||||
include("normalize.jl")
|
||||
include("pairwise.jl")
|
||||
|
|
|
@ -1,6 +1,13 @@
|
|||
"""
    QGramIterator(s, q)

Pair a string or vector `s` with a qgram length `q`.

An `ArgumentError` is thrown unless `q > 0`. The `length` of the wrapper is
`length(s) - q + 1`, clamped below at zero.
"""
struct QGramIterator{S <: Union{AbstractString, AbstractVector}}
    s::S    # Collection
    q::Int  # Length of Qgram
    function QGramIterator{S}(s, q) where {S <: Union{AbstractString, AbstractVector}}
        if !(q > 0)
            throw(ArgumentError("The qgram length must be higher than zero"))
        end
        return new(s, q)
    end
end

# Convenience constructor: the type parameter is taken from the collection itself.
QGramIterator(s::Union{AbstractString, AbstractVector}, q::Integer) =
    QGramIterator{typeof(s)}(s, q)

function Base.length(qgram::QGramIterator)
    return max(length(qgram.s) - qgram.q + 1, 0)
end
|
||||
|
||||
|
@ -51,7 +58,7 @@ qgrams
|
|||
# for each element in s1 ∪ s2, returns (numbers of times it appears in s1, numbers of times it appears in s2)
|
||||
function _count(s1, s2)
|
||||
K = promote_type(eltype(s1), eltype(s2))
|
||||
d = Dict{K, Tuple{Int, Int}}()
|
||||
d = Dict{K, Tuple{Int32, Int32}}()
|
||||
sizehint!(d, length(s1) + length(s2))
|
||||
# I use a faster way to change a dictionary key
|
||||
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
|
||||
|
@ -78,164 +85,7 @@ function _count(s1, s2)
|
|||
return values(d)
|
||||
end
|
||||
|
||||
# Turn a sequence of qgrams to a count dict for them, i.e. map each
|
||||
# qgram to the number of times it has been seen.
|
||||
function countdict(qgrams)
|
||||
d = Dict{eltype(qgrams), Int32}()
|
||||
for qg in qgrams
|
||||
index = Base.ht_keyindex2!(d, qg)
|
||||
if index > 0
|
||||
d.age += 1
|
||||
@inbounds d.keys[index] = qg
|
||||
@inbounds d.vals[index] = d.vals[index][1] + 1
|
||||
else
|
||||
@inbounds Base._setindex!(d, 1, qg, -index)
|
||||
end
|
||||
end
|
||||
d
|
||||
end
|
||||
|
||||
abstract type AbstractQGramCounts{Q,K} end
|
||||
q(qc::AbstractQGramCounts{Q,K}) where {Q,K} = Q
|
||||
counts(qc::AbstractQGramCounts) = qc.counts
|
||||
Base.length(qc::AbstractQGramCounts{Q}) where Q = length(qc.counts) + Q - 1
|
||||
"""
|
||||
QGramDict(s, q::Integer = 2)
|
||||
|
||||
Creates a QGramDict that pre-calculates (pre-counts) the qgrams
|
||||
of a string or stream. This enables faster calculation of QGram
|
||||
distances.
|
||||
|
||||
Note that the qgram length must correspond with the q length used
|
||||
in the distance.
|
||||
|
||||
## Examples
|
||||
```julia
|
||||
str1, str2 = "my string", "another string"
|
||||
qd1 = QGramDict(str1, 2)
|
||||
qd2 = QGramDict(str2, 2)
|
||||
evaluate(Overlap(2), qd1, qd2)
|
||||
```
|
||||
"""
|
||||
struct QGramDict{Q,K} <: AbstractQGramCounts{Q,K}
|
||||
counts::Dict{K,Int}
|
||||
end
|
||||
function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
|
||||
@assert q >= 1
|
||||
qgs = qgrams(s, q)
|
||||
QGramDict{q, eltype(qgs)}(countdict(qgs))
|
||||
end
|
||||
QGramDict(s, q::Integer = 2) = QGramDict(collect(s), q)
|
||||
|
||||
"""
|
||||
QGramSortedVector(s, q::Integer = 2)
|
||||
|
||||
Creates a QGramSortedVector that pre-calculates (pre-counts) the
|
||||
qgrams of a string or stream. This enables faster calculation of
|
||||
QGram distances.
|
||||
|
||||
Since qgrams are sorted in lexicographic order QGram distances can be
|
||||
calculated even faster than when using a QGramDict. However, the
|
||||
sorting means that updating the counts after creation is less
|
||||
efficient. However, for most use cases QGramSortedVector is preferred
|
||||
over a QgramDict.
|
||||
|
||||
Note that the qgram length must correspond with the q length used
|
||||
in the distance.
|
||||
|
||||
## Examples
|
||||
```julia
|
||||
str1, str2 = "my string", "another string"
|
||||
qs1 = QGramSortedVector(str1, 2)
|
||||
qs2 = QGramSortedVector(str2, 2)
|
||||
evaluate(Jaccard(2), qs1, qs2)
|
||||
```
|
||||
"""
|
||||
struct QGramSortedVector{Q,K} <: AbstractQGramCounts{Q,K}
|
||||
counts::Vector{Pair{K,Int}}
|
||||
end
|
||||
function QGramSortedVector(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
|
||||
@assert q >= 1
|
||||
qgs = qgrams(s, q)
|
||||
countpairs = collect(countdict(qgs))
|
||||
sort!(countpairs, by = first)
|
||||
QGramSortedVector{q, eltype(qgs)}(countpairs)
|
||||
end
|
||||
QGramSortedVector(s, q::Integer = 2) = QGramSortedVector(collect(s), q)
|
||||
|
||||
# To implement the distances we will count qgram matches
|
||||
# between strings or pre-calculated AbstractQgramCounts objects.
|
||||
# The abstract type defines different fallback versions which can be
|
||||
# specialized by subtypes for best performance.
|
||||
abstract type AbstractQGramMatchCounter end
|
||||
@inline countleft!(c::AbstractQGramMatchCounter, qg, n1::Integer) = countleft!(c, n1)
|
||||
@inline countright!(c::AbstractQGramMatchCounter, qg, n2::Integer) = countright!(c, n2)
|
||||
@inline countboth!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer) =
|
||||
countboth!(c, n1, n2)
|
||||
@inline function countboth!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer)
|
||||
countleft!(c, n1)
|
||||
countright!(c, n2)
|
||||
countshared!(c, n1, n2)
|
||||
end
|
||||
@inline countshared!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer) = countshared!(c, n1, n2)
|
||||
|
||||
# Subtypes must implement these methods:
|
||||
@inline countleft!(c::AbstractQGramMatchCounter, n1::Integer) =
|
||||
error("countleft! not implemented for $(typeof(c))")
|
||||
@inline countright!(c::AbstractQGramMatchCounter, n2::Integer) =
|
||||
error("countright! not implemented for $(typeof(c))")
|
||||
|
||||
# Subtypes must either override countboth! from above (so that it does not use countshared!) or implement:
|
||||
@inline countshared!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer) =
|
||||
error("countshared! not implemented for $(typeof(c))")
|
||||
|
||||
function countmatches!(mc::AbstractQGramMatchCounter, d1::Vector{Pair{K,I}}, d2::Vector{Pair{K,I}}) where {K,I<:Integer}
|
||||
i1 = i2 = 1
|
||||
while i1 <= length(d1) || i2 <= length(d2)
|
||||
if i2 > length(d2)
|
||||
for i in i1:length(d1)
|
||||
@inbounds countleft!(mc, d1[i][1], d1[i][2])
|
||||
end
|
||||
return
|
||||
elseif i1 > length(d1)
|
||||
for i in i2:length(d2)
|
||||
@inbounds countright!(mc, d2[i][1], d2[i][2])
|
||||
end
|
||||
return
|
||||
end
|
||||
@inbounds k1, n1 = d1[i1]
|
||||
@inbounds k2, n2 = d2[i2]
|
||||
cmpval = Base.cmp(k1, k2)
|
||||
if cmpval == -1 # k1 < k2
|
||||
countleft!(mc, k1, n1)
|
||||
i1 += 1
|
||||
elseif cmpval == +1 # k2 < k1
|
||||
countright!(mc, k2, n2)
|
||||
i2 += 1
|
||||
else
|
||||
countboth!(mc, k1, n1, n2)
|
||||
i1 += 1
|
||||
i2 += 1
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K,I}) where {K,I<:Integer}
|
||||
for (k1, c1) in d1
|
||||
index = Base.ht_keyindex2!(d2, k1)
|
||||
if index > 0
|
||||
countboth!(mc, k1, c1, d2.vals[index])
|
||||
else
|
||||
countleft!(mc, k1, c1)
|
||||
end
|
||||
end
|
||||
for (k2, c2) in d2
|
||||
index = Base.ht_keyindex2!(d1, k2)
|
||||
if index <= 0
|
||||
countright!(mc, k2, c2)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
abstract type AbstractQGramDistance <: SemiMetric end
|
||||
|
||||
|
@ -243,18 +93,11 @@ function (dist::AbstractQGramDistance)(s1, s2)
|
|||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
counter = newcounter(dist)
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
countboth!(counter, n1, n2)
|
||||
count!(counter, n1, n2)
|
||||
end
|
||||
calculate(dist, counter)
|
||||
end
|
||||
|
||||
function (dist::AbstractQGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
|
||||
@assert dist.q == q(qc1)
|
||||
@assert dist.q == q(qc2)
|
||||
counter = newcounter(dist)
|
||||
countmatches!(counter, counts(qc1), counts(qc2))
|
||||
calculate(dist, counter)
|
||||
end
|
||||
|
||||
"""
|
||||
QGram(q::Int)
|
||||
|
@ -272,17 +115,15 @@ struct QGram <: AbstractQGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
mutable struct SingleCounter{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
|
||||
n::T
|
||||
mutable struct SingleCounter{QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
|
||||
shared::Int
|
||||
end
|
||||
|
||||
newcounter(d::QGram) = SingleCounter{Int, QGram}(0)
|
||||
|
||||
@inline countleft!(c::SingleCounter{Int, QGram}, n1::Integer) = c.n += n1 # n1 === abs(n1 - 0)
|
||||
@inline countright!(c::SingleCounter{Int, QGram}, n2::Integer) = c.n += n2 # n2 === abs(0 - n2)
|
||||
@inline countboth!(c::SingleCounter{Int, QGram}, n1::Integer, n2::Integer) = c.n += abs(n1 - n2)
|
||||
|
||||
calculate(dist::QGram, c::SingleCounter{Int, QGram}) = c.n
|
||||
newcounter(d::QGram) = SingleCounter{QGram}(0)
|
||||
@inline function count!(c::SingleCounter{QGram}, n1::Integer, n2::Integer)
|
||||
c.shared += abs(n1 - n2)
|
||||
end
|
||||
calculate(dist::QGram, c::SingleCounter{QGram}) = c.shared
|
||||
|
||||
"""
|
||||
Cosine(q::Int)
|
||||
|
@ -300,19 +141,19 @@ struct Cosine <: AbstractQGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
mutable struct ThreeCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
|
||||
left::T
|
||||
right::T
|
||||
shared::T
|
||||
mutable struct ThreeCounters{QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
|
||||
left::Int
|
||||
right::Int
|
||||
shared::Int
|
||||
end
|
||||
|
||||
newcounter(d::Cosine) = ThreeCounters{Int, Cosine}(0, 0, 0)
|
||||
|
||||
@inline countleft!(c::ThreeCounters{Int, Cosine}, n1::Integer) = c.left += n1^2
|
||||
@inline countright!(c::ThreeCounters{Int, Cosine}, n2::Integer) = c.right += n2^2
|
||||
@inline countshared!(c::ThreeCounters{Int, Cosine}, n1::Integer, n2::Integer) = c.shared += n1 * n2
|
||||
|
||||
calculate(d::Cosine, c::ThreeCounters{Int, Cosine}) =
|
||||
newcounter(d::Cosine) = ThreeCounters{Cosine}(0, 0, 0)
|
||||
# Add one qgram's contribution to the Cosine counters: `n1` and `n2` are the
# number of times the qgram occurs in the first and in the second sequence
# (0 when it is absent from that side).
@inline function count!(c::ThreeCounters{Cosine}, n1::Integer, n2::Integer)
    # Widen before squaring/multiplying: the counts may arrive as Int32, and
    # Int32 arithmetic wraps silently once a count exceeds ~46_000.
    m1, m2 = Int(n1), Int(n2)
    c.left += m1^2
    c.right += m2^2
    c.shared += m1 * m2
end
|
||||
calculate(d::Cosine, c::ThreeCounters{Cosine}) =
|
||||
1.0 - c.shared / (sqrt(c.left) * sqrt(c.right))
|
||||
|
||||
"""
|
||||
|
@ -329,8 +170,13 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
|
|||
struct Jaccard <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
calculate(d::Jaccard, c::ThreeCounters{Int, Jaccard}) =
|
||||
newcounter(d::Jaccard) = ThreeCounters{Jaccard}(0, 0, 0)
|
||||
@inline function count!(c::ThreeCounters{Jaccard}, n1::Integer, n2::Integer)
|
||||
c.left += (n1 > 0)
|
||||
c.right += (n2 > 0)
|
||||
c.shared += (n1 > 0) & (n2 > 0)
|
||||
end
|
||||
calculate(d::Jaccard, c::ThreeCounters{Jaccard}) =
|
||||
1.0 - c.shared / (c.left + c.right - c.shared)
|
||||
|
||||
"""
|
||||
|
@ -347,8 +193,13 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
|
|||
struct SorensenDice <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
calculate(d::SorensenDice, c::ThreeCounters{Int, SorensenDice}) =
|
||||
newcounter(d::SorensenDice) = ThreeCounters{SorensenDice}(0, 0, 0)
|
||||
@inline function count!(c::ThreeCounters{SorensenDice}, n1::Integer, n2::Integer)
|
||||
c.left += (n1 > 0)
|
||||
c.right += (n2 > 0)
|
||||
c.shared += (n1 > 0) & (n2 > 0)
|
||||
end
|
||||
calculate(d::SorensenDice, c::ThreeCounters{SorensenDice}) =
|
||||
1.0 - 2.0 * c.shared / (c.left + c.right)
|
||||
|
||||
"""
|
||||
|
@ -365,67 +216,15 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
|
|||
struct Overlap <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
const IntersectionDist = Union{Jaccard, SorensenDice, Overlap}
|
||||
newcounter(d::IntersectionDist) = ThreeCounters{Int, typeof(d)}(0, 0, 0)
|
||||
|
||||
@inline countleft!(c::ThreeCounters{Int, QD}, n1::Integer) where {QD<:IntersectionDist} =
|
||||
newcounter(d::Overlap) = ThreeCounters{Overlap}(0, 0, 0)
|
||||
@inline function count!(c::ThreeCounters{Overlap}, n1::Integer, n2::Integer)
|
||||
c.left += (n1 > 0)
|
||||
@inline countright!(c::ThreeCounters{Int, QD}, n2::Integer) where {QD<:IntersectionDist} =
|
||||
c.right += (n2 > 0)
|
||||
@inline countshared!(c::ThreeCounters{Int, QD}, n1::Integer, n2::Integer) where {QD<:IntersectionDist} =
|
||||
c.shared += (n1 > 0) & (n2 > 0)
|
||||
|
||||
calculate(d::Overlap, c::ThreeCounters{Int, Overlap}) =
|
||||
end
|
||||
calculate(d::Overlap, c::ThreeCounters{Overlap}) =
|
||||
1.0 - c.shared / min(c.left, c.right)
|
||||
|
||||
"""
|
||||
MorisitaOverlap(q::Int)
|
||||
|
||||
Creates a MorisitaOverlap distance, a general, statistical measure of
|
||||
dispersion which can also be used on dictionaries such as created
|
||||
from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index
|
||||
This is more fine-grained than many of the other QGramDistances since
|
||||
it is based on the counts per q-gram rather than only which q-grams are
|
||||
in the strings.
|
||||
|
||||
The distance corresponds to
|
||||
|
||||
``(2 * sum(m(s1) .* m(s2)) / (sum(m(s1).^2)*M(s2)/M(s1) + sum(m(s2).^2)*M(s1)/M(s2))``
|
||||
|
||||
where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
|
||||
sum of those counts.
|
||||
"""
|
||||
struct MorisitaOverlap <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
mutable struct FiveCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
|
||||
leftsum::T # sum(m(s1))
|
||||
rightsum::T # sum(m(s2))
|
||||
leftsq::T # sum(m(s1).^2)
|
||||
rightsq::T # sum(m(s2).^2)
|
||||
shared::T # sum(m(s1) .* m(s2))
|
||||
end
|
||||
|
||||
newcounter(d::MorisitaOverlap) = FiveCounters{Int, MorisitaOverlap}(0, 0, 0, 0, 0)
|
||||
|
||||
@inline function countleft!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer)
|
||||
c.leftsum += n1
|
||||
c.leftsq += (n1^2)
|
||||
end
|
||||
|
||||
@inline function countright!(c::FiveCounters{Int, MorisitaOverlap}, n2::Integer)
|
||||
c.rightsum += n2
|
||||
c.rightsq += (n2^2)
|
||||
end
|
||||
|
||||
@inline countshared!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer, n2::Integer) =
|
||||
c.shared += (n1 * n2)
|
||||
|
||||
calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap}) =
|
||||
1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum))
|
||||
|
||||
"""
|
||||
NMD(q::Int)
|
||||
NMD(q::Int)
|
||||
|
@ -449,23 +248,53 @@ struct NMD <: AbstractQGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
newcounter(d::NMD) = ThreeCounters{Int, NMD}(0, 0, 0)
|
||||
|
||||
@inline function countleft!(c::ThreeCounters{Int, NMD}, n1::Integer)
|
||||
c.left += n1
|
||||
c.shared += n1 # max(n1, 0) == n1
|
||||
end
|
||||
|
||||
@inline function countright!(c::ThreeCounters{Int, NMD}, n2::Integer)
|
||||
c.right += n2
|
||||
c.shared += n2 # max(n2, 0) == n2
|
||||
end
|
||||
|
||||
@inline function countboth!(c::ThreeCounters{Int, NMD}, n1::Integer, n2::Integer)
|
||||
newcounter(d::NMD) = ThreeCounters{NMD}(0, 0, 0)
|
||||
@inline function count!(c::ThreeCounters{NMD}, n1::Integer, n2::Integer)
|
||||
c.left += n1
|
||||
c.right += n2
|
||||
c.shared += max(n1, n2)
|
||||
end
|
||||
|
||||
calculate(d::NMD, c::ThreeCounters{Int, NMD}) =
|
||||
calculate(d::NMD, c::ThreeCounters{NMD}) =
|
||||
(c.shared - min(c.left, c.right)) / max(c.left, c.right)
|
||||
|
||||
|
||||
"""
|
||||
MorisitaOverlap(q::Int)
|
||||
|
||||
Creates a MorisitaOverlap distance, a general, statistical measure of
|
||||
dispersion which can also be used on dictionaries such as created
|
||||
from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index
|
||||
This is more fine-grained than many of the other QGramDistances since
|
||||
it is based on the counts per q-gram rather than only which q-grams are
|
||||
in the strings.
|
||||
|
||||
The distance corresponds to
|
||||
|
||||
``1 - 2 * sum(m(s1) .* m(s2)) / (sum(m(s1).^2)*M(s2)/M(s1) + sum(m(s2).^2)*M(s1)/M(s2))``
|
||||
|
||||
where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
|
||||
sum of those counts.
|
||||
"""
|
||||
struct MorisitaOverlap <: AbstractQGramDistance
|
||||
q::Int
|
||||
end
|
||||
|
||||
mutable struct FiveCounters{QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
|
||||
leftsum::Int # sum(m(s1))
|
||||
rightsum::Int # sum(m(s2))
|
||||
leftsq::Int # sum(m(s1).^2)
|
||||
rightsq::Int # sum(m(s2).^2)
|
||||
shared::Int # sum(m(s1) .* m(s2))
|
||||
end
|
||||
|
||||
newcounter(d::MorisitaOverlap) = FiveCounters{MorisitaOverlap}(0, 0, 0, 0, 0)
|
||||
# Add one qgram's contribution to the MorisitaOverlap counters: `n1` and `n2`
# are the number of times the qgram occurs in the first and in the second
# sequence (0 when it is absent from that side).
@inline function count!(c::FiveCounters{MorisitaOverlap}, n1::Integer, n2::Integer)
    # Widen before squaring/multiplying: the counts may arrive as Int32, and
    # Int32 arithmetic wraps silently once a count exceeds ~46_000.
    m1, m2 = Int(n1), Int(n2)
    c.leftsum += m1
    c.rightsum += m2
    c.leftsq += m1^2
    c.rightsq += m2^2
    c.shared += m1 * m2
end
|
||||
calculate(d::MorisitaOverlap, c::FiveCounters{MorisitaOverlap}) =
|
||||
1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum))
|
||||
|
||||
|
|
|
@ -0,0 +1,154 @@
|
|||
# Sometimes the strings have already been preprocessed into qgram counts.
# The methods below are specialized for these preprocessed string types.
|
"""
    QGramDict(s, q::Integer = 2)

An iterator with a pre-computed dictionary of its qgrams. This enables faster
calculation of QGram distances.

Iterating a `QGramDict` (and taking its `length`) forwards to the wrapped
collection `s`; the pre-computed counts are stored in the `counts` field.

Note that the qgram length must correspond with the q length used
in the distance.

If `s` already is a `QGramDict`, it is returned as is when its qgram length
equals `q`; otherwise the counts are recomputed from the collection it wraps.

## Examples
```julia
str1, str2 = "my string", "another string"
qd1 = QGramDict(str1, 2)
qd2 = QGramDict(str2, 2)
evaluate(Overlap(2), qd1, qd2)
```
"""
struct QGramDict{S, K}
    s::S                  # wrapped collection
    q::Int                # qgram length used to build `counts`
    counts::Dict{K,Int}   # qgram => number of occurrences
end
Base.length(s::QGramDict) = length(s.s)
Base.iterate(s::QGramDict) = iterate(s.s)
Base.iterate(s::QGramDict, state) = iterate(s.s, state)

function QGramDict(s, q::Integer = 2)
    qgs = qgrams(s, q)
    return QGramDict{typeof(s), eltype(qgs)}(s, q, countdict(qgs))
end

# Already preprocessed input: reuse it when `q` matches, otherwise rebuild from
# the wrapped collection so that wrappers are never nested inside each other.
QGramDict(s::QGramDict, q::Integer = 2) = s.q == q ? s : QGramDict(s.s, q)
|
||||
|
||||
# Turn a sequence of qgrams into a count dict, i.e. map each qgram to the
# number of times it has been seen.
#
# The loop goes through Base internals (`ht_keyindex2!`, `_setindex!`) instead of
# `get`/`setindex!`, presumably to avoid a second lookup per qgram.
# NOTE(review): these are not public API — re-check them against Base's dict.jl
# whenever the supported Julia versions change.
function countdict(qgrams)
    d = Dict{eltype(qgrams), Int32}()
    for qg in qgrams
        index = Base.ht_keyindex2!(d, qg)
        if index > 0
            # Key already present: overwrite it and bump its counter in place.
            d.age += 1
            @inbounds d.keys[index] = qg
            @inbounds d.vals[index] += 1
        else
            # Key absent: insert it with a count of 1 at the slot encoded by `-index`.
            @inbounds Base._setindex!(d, 1, qg, -index)
        end
    end
    return d
end
|
||||
|
||||
# Evaluate `dist` on two preprocessed `QGramDict`s using their pre-computed counts.
# Throws an `ArgumentError` unless the distance and both dicts share the same
# qgram length. The count dictionaries are only read (via `get`/`haskey`), never
# modified.
function (dist::AbstractQGramDistance)(qc1::QGramDict, qc2::QGramDict)
    dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramDict must have the same qgram length"))
    counter = newcounter(dist)
    d1, d2 = qc1.counts, qc2.counts
    # Every qgram of qc1, paired with its count in qc2 (0 when absent there).
    for (k1, c1) in d1
        count!(counter, c1, get(d2, k1, 0))
    end
    # Remaining qgrams: those that occur in qc2 only.
    for (k2, c2) in d2
        haskey(d1, k2) || count!(counter, 0, c2)
    end
    return calculate(dist, counter)
end
|
||||
|
"""
    QGramSortedVector(s, q::Integer = 2)

An iterator with a pre-computed sorted vector of its qgrams. This enables faster
calculation of QGram distances.

Since qgrams are sorted in lexicographic order, QGram distances can be
calculated even faster than when using a QGramDict. The sorting means that
updating the counts after creation is less efficient; nevertheless, for most
use cases QGramSortedVector is preferred over a QGramDict.

Iterating a `QGramSortedVector` (and taking its `length`) forwards to the
wrapped collection `s`; the sorted counts are stored in the `counts` field.

Note that the qgram length must correspond with the q length used
in the distance.

If `s` already is a `QGramSortedVector`, it is returned as is when its qgram
length equals `q`; otherwise the counts are recomputed from the collection it
wraps.

## Examples
```julia
str1, str2 = "my string", "another string"
qs1 = QGramSortedVector(str1, 2)
qs2 = QGramSortedVector(str2, 2)
evaluate(Jaccard(2), qs1, qs2)
```
"""
struct QGramSortedVector{S, K}
    s::S                          # wrapped collection
    q::Int                        # qgram length used to build `counts`
    counts::Vector{Pair{K,Int}}   # (qgram => occurrences), sorted by qgram
end
Base.length(s::QGramSortedVector) = length(s.s)
Base.iterate(s::QGramSortedVector) = iterate(s.s)
Base.iterate(s::QGramSortedVector, state) = iterate(s.s, state)

function QGramSortedVector(s, q::Integer = 2)
    qgs = qgrams(s, q)
    countpairs = collect(countdict(qgs))
    sort!(countpairs, by = first)
    return QGramSortedVector{typeof(s), eltype(qgs)}(s, q, countpairs)
end

# Already preprocessed input: reuse it when `q` matches, otherwise rebuild from
# the wrapped collection so that wrappers are never nested inside each other.
QGramSortedVector(s::QGramSortedVector, q::Integer = 2) =
    s.q == q ? s : QGramSortedVector(s.s, q)
|
||||
|
||||
|
||||
|
||||
# Evaluate `dist` on two preprocessed `QGramSortedVector`s.
# The constructor sorts `counts` by qgram, so both vectors are walked in lockstep
# (like the merge step of a merge sort): a qgram present on one side only is
# counted against 0 on the other side.
function (dist::AbstractQGramDistance)(qc1::QGramSortedVector, qc2::QGramSortedVector)
    dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramSortedVectors must have the same qgram length"))
    counter = newcounter(dist)
    left, right = qc1.counts, qc2.counts
    nleft, nright = length(left), length(right)
    i = j = 1
    # Merge phase: runs while both vectors still have entries (either may be empty).
    while i <= nleft && j <= nright
        @inbounds kl, cl = left[i]
        @inbounds kr, cr = right[j]
        order = Base.cmp(kl, kr)
        if order == -1      # qgram only on the left
            count!(counter, cl, 0)
            i += 1
        elseif order == +1  # qgram only on the right
            count!(counter, 0, cr)
            j += 1
        else                # qgram on both sides
            count!(counter, cl, cr)
            i += 1
            j += 1
        end
    end
    # Drain phase: at most one of these two loops has anything left to visit.
    for k in i:nleft
        @inbounds count!(counter, left[k][2], 0)
    end
    for k in j:nright
        @inbounds count!(counter, 0, right[k][2])
    end
    return calculate(dist, counter)
end
|
||||
|
|
@ -170,10 +170,10 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
|||
min_score_atomic = Threads.Atomic{Float64}(min_score)
|
||||
scores = [0.0 for _ in 1:Threads.nthreads()]
|
||||
is = [0 for _ in 1:Threads.nthreads()]
|
||||
s = _helper(s, dist)
|
||||
s = _helper(dist, s)
|
||||
# need collect since @threads requires a length method
|
||||
Threads.@threads for i in collect(eachindex(itr))
|
||||
score = compare(s, _helper(itr[i], dist), dist; min_score = min_score_atomic[])
|
||||
for i in collect(eachindex(itr))
|
||||
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
|
||||
score_old = Threads.atomic_max!(min_score_atomic, score)
|
||||
if score >= score_old
|
||||
scores[Threads.threadid()] = score
|
||||
|
@ -183,12 +183,9 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
|||
imax = is[argmax(scores)]
|
||||
imax == 0 ? (nothing, nothing) : (itr[imax], imax)
|
||||
end
|
||||
|
||||
function _helper(s, dist::AbstractQGramDistance)
|
||||
s !== missing ? QGramSortedVector(s, dist.q) : s
|
||||
end
|
||||
_helper(s, dist::StringDistance) = s
|
||||
|
||||
# Prepare an element for comparison under `dist`:
# - qgram distances: `missing` stays `missing`, anything else is wrapped in a
#   `QGramSortedVector` built with the distance's qgram length;
# - any other string distance: the element is passed through untouched.
function _helper(dist::AbstractQGramDistance, ::Missing)
    return missing
end
function _helper(dist::AbstractQGramDistance, s)
    return QGramSortedVector(s, dist.q)
end
function _helper(dist::StringDistance, s)
    return s
end
|
||||
|
||||
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
|
||||
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
|
||||
|
@ -218,10 +215,10 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9)
|
|||
"""
|
||||
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
|
||||
out = [Int[] for _ in 1:Threads.nthreads()]
|
||||
s = _helper(s, dist)
|
||||
s = _helper(dist, s)
|
||||
# need collect since @threads requires a length method
|
||||
Threads.@threads for i in collect(eachindex(itr))
|
||||
score = compare(s, _helper(itr[i], dist), dist; min_score = min_score)
|
||||
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
|
||||
if score >= min_score
|
||||
push!(out[Threads.threadid()], i)
|
||||
end
|
||||
|
|
|
@ -38,7 +38,7 @@ end
|
|||
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
|
||||
|
||||
Compute distances between all pairs of elements in `xs` and `ys` according to the
|
||||
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corrresponds to the distance between `xs[i]` and `ys[j]`.
|
||||
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
|
||||
|
||||
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
|
||||
to true or if there are at least 5 elements in `xs`. Set `preprocess` to
|
||||
|
@ -75,11 +75,13 @@ function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abst
|
|||
return R
|
||||
end
|
||||
|
||||
function _preprocess(xs, dist::AbstractQGramDistance, preprocess)
|
||||
if preprocess === nothing ? length(xs) >= 5 : preprocess
|
||||
return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
|
||||
# Optionally replace every non-missing element of `xs` by a `QGramSortedVector`.
# `preprocess === nothing` means "decide automatically": preprocessing is then
# enabled when `xs` has at least 5 elements. Only qgram distances are ever
# preprocessed; for any other distance `xs` is returned unchanged.
function _preprocess(xs, dist::StringDistance, preprocess)
    enabled = preprocess === nothing ? length(xs) >= 5 : preprocess
    if (dist isa AbstractQGramDistance) && enabled
        # One task per element; `fetch` collects the results in the original order.
        tasks = map(xs) do x
            Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)
        end
        return fetch.(tasks)
    end
    return xs
end
|
||||
_preprocess(xs, dist::StringDistance, preprocess) = xs
|
||||
|
|
|
@ -174,7 +174,7 @@ using StringDistances, Unicode, Test, Random
|
|||
# To get something we can more easily compare to:
|
||||
stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
|
||||
stringify(p::Pair{V, <:Integer}) where {S<:AbstractString,V<:AbstractVector{S}} = (map(string, first(p)), last(p))
|
||||
sortedcounts(qc) = sort(collect(StringDistances.counts(qc)), by = first)
|
||||
sortedcounts(qc) = sort(collect(qc.counts), by = first)
|
||||
totuples(qc) = map(stringify, sortedcounts(qc))
|
||||
|
||||
s1, s2 = "arnearne", "arnebeda"
|
||||
|
|
|
@ -133,6 +133,7 @@ end
|
|||
@test findnearest("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3)
|
||||
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
|
||||
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], QGram(2)) == ("NewYork", 1)
|
||||
@test findnearest("New York", ["Newark", "San Francisco", "NewYork"], QGram(2)) == ("NewYork", 3)
|
||||
|
||||
# findall
|
||||
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]
|
||||
|
|
Loading…
Reference in New Issue