simplify a bit preprocessed qgrams (#50)

pull/51/head
Matthieu Gomez 2021-08-08 06:58:42 +02:00 committed by GitHub
parent 633a2d85dc
commit e9b224f03f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 281 additions and 297 deletions

View File

@ -1,6 +1,6 @@
name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.10.0"
version = "0.10.1"
[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

View File

@ -14,47 +14,46 @@ end
@time f(Jaro(), x, y)
@time f(Jaro(), x, y);
#0.3s
@time f(Levenshtein(), x, y)
@time f(Levenshtein(), x, y);
# 0.4s
@time f(Levenshtein(), x, y, min_score = 0.8)
@time f(Levenshtein(), x, y, min_score = 0.8);
# 0.11
@time f(DamerauLevenshtein(), x, y)
@time f(DamerauLevenshtein(), x, y);
# 0.58s.
@time f(DamerauLevenshtein(), x, y, min_score = 0.8)
@time f(DamerauLevenshtein(), x, y, min_score = 0.8);
# 0.08 (now 0.09)
@time f(RatcliffObershelp(), x, y)
@time f(RatcliffObershelp(), x, y);
# 1.35s
@time findnearest(x[1], y, Levenshtein())
@time findnearest(x[1], y, Levenshtein());
# 0.02
@time findnearest(x[1], y, DamerauLevenshtein())
@time findnearest(x[1], y, DamerauLevenshtein());
# 0.05
@time findnearest(x[1], y, QGram(2))
@time findnearest(x[1], y, QGram(2));
# 0.75
@time findall(x[1], y, Levenshtein())
@time findall(x[1], y, Levenshtein());
# 0.05
@time findall(x[1], y, DamerauLevenshtein())
@time findall(x[1], y, DamerauLevenshtein());
# 0.05
@time findall(x[1], y, Partial(DamerauLevenshtein()))
@time findall(x[1], y, Partial(DamerauLevenshtein()));
# 0.96
@time findall(x[1], y, QGram(2))
@time findall(x[1], y, QGram(2));
# 0.81
@time findall(x[1], y, TokenSort(DamerauLevenshtein()))
@time findall(x[1], y, TokenSort(DamerauLevenshtein()));
# 0.27 (now 0.32)
@time findall(x[1], y, TokenSet(DamerauLevenshtein()))
@time findall(x[1], y, TokenSet(DamerauLevenshtein()));
# 0.55
@time findall(x[1], y, TokenMax(DamerauLevenshtein()))
@time findall(x[1], y, TokenMax(DamerauLevenshtein()));
# 2.25 (now 3.6)
@time findnearest(x[1], y, DamerauLevenshtein())
# 0.15
x = map(Random.randstring, rand(5:25,1000))
y = map(Random.randstring, rand(5:25,1000))

View File

@ -6,6 +6,8 @@ import StatsAPI: pairwise, pairwise!
include("distances/utils.jl")
include("distances/edit.jl")
include("distances/qgram.jl")
include("distances/qgram_preprocessed.jl")
include("modifiers.jl")
include("normalize.jl")
include("pairwise.jl")

View File

@ -1,6 +1,13 @@
# Pairs a collection with the q-gram length used to walk over it.
# The inner constructor rejects non-positive q-gram lengths.
struct QGramIterator{S <: Union{AbstractString, AbstractVector}}
    s::S    # Collection
    q::Int  # Length of Qgram
    function QGramIterator{S}(s, q) where {S <: Union{AbstractString, AbstractVector}}
        if !(q > 0)
            throw(ArgumentError("The qgram length must be higher than zero"))
        end
        return new(s, q)
    end
end
# Convenience constructor: the type parameter is taken from `typeof(s)`.
QGramIterator(s::Union{AbstractString, AbstractVector}, q::Integer) =
    QGramIterator{typeof(s)}(s, q)
# `length(s) - q + 1`, clamped at zero for collections shorter than `q`.
function Base.length(qgram::QGramIterator)
    windows = length(qgram.s) - qgram.q + 1
    return max(windows, 0)
end
@ -51,7 +58,7 @@ qgrams
# for each element in s1 s2, returns (numbers of times it appears in s1, numbers of times it appears in s2)
function _count(s1, s2)
K = promote_type(eltype(s1), eltype(s2))
d = Dict{K, Tuple{Int, Int}}()
d = Dict{K, Tuple{Int32, Int32}}()
sizehint!(d, length(s1) + length(s2))
# I use a faster way to change a dictionary key
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
@ -78,164 +85,7 @@ function _count(s1, s2)
return values(d)
end
# Turn a sequence of qgrams to a count dict for them, i.e. map each
# qgram to the number of times it has been seen.
function countdict(qgrams)
d = Dict{eltype(qgrams), Int32}()
for qg in qgrams
index = Base.ht_keyindex2!(d, qg)
if index > 0
d.age += 1
@inbounds d.keys[index] = qg
@inbounds d.vals[index] = d.vals[index][1] + 1
else
@inbounds Base._setindex!(d, 1, qg, -index)
end
end
d
end
abstract type AbstractQGramCounts{Q,K} end
q(qc::AbstractQGramCounts{Q,K}) where {Q,K} = Q
counts(qc::AbstractQGramCounts) = qc.counts
Base.length(qc::AbstractQGramCounts{Q}) where Q = length(qc.counts) + Q - 1
"""
QGramDict(s, q::Integer = 2)
Creates a QGramDict that pre-calculates (pre-counts) the qgrams
of a string or stream. This enables faster calculation of QGram
distances.
Note that the qgram length must correspond with the q length used
in the distance.
## Examples
```julia
str1, str2 = "my string", "another string"
qd1 = QGramDict(str1, 2)
qd2 = QGramDict(str2, 2)
evaluate(Overlap(2), qd1, qd2)
```
"""
struct QGramDict{Q,K} <: AbstractQGramCounts{Q,K}
counts::Dict{K,Int}
end
function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
@assert q >= 1
qgs = qgrams(s, q)
QGramDict{q, eltype(qgs)}(countdict(qgs))
end
QGramDict(s, q::Integer = 2) = QGramDict(collect(s), q)
"""
QGramSortedVector(s, q::Integer = 2)
Creates a QGramSortedVector that pre-calculates (pre-counts) the
qgrams of a string or stream. This enables faster calculation of
QGram distances.
Since qgrams are sorted in lexicographic order QGram distances can be
calculated even faster than when using a QGramDict. However, the
sorting means that updating the counts after creation is less
efficient. However, for most use cases QGramSortedVector is preferred
over a QgramDict.
Note that the qgram length must correspond with the q length used
in the distance.
## Examples
```julia
str1, str2 = "my string", "another string"
qs1 = QGramSortedVector(str1, 2)
qs2 = QGramSortedVector(str2, 2)
evaluate(Jaccard(2), qs1, qs2)
```
"""
struct QGramSortedVector{Q,K} <: AbstractQGramCounts{Q,K}
counts::Vector{Pair{K,Int}}
end
function QGramSortedVector(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
@assert q >= 1
qgs = qgrams(s, q)
countpairs = collect(countdict(qgs))
sort!(countpairs, by = first)
QGramSortedVector{q, eltype(qgs)}(countpairs)
end
QGramSortedVector(s, q::Integer = 2) = QGramSortedVector(collect(s), q)
# To implement the distances we will count qgram matches
# between strings or pre-calculated AbstractQgramCounts objects.
# The abstract type defines different fallback versions which can be
# specialized by subtypes for best performance.
abstract type AbstractQGramMatchCounter end
@inline countleft!(c::AbstractQGramMatchCounter, qg, n1::Integer) = countleft!(c, n1)
@inline countright!(c::AbstractQGramMatchCounter, qg, n2::Integer) = countright!(c, n2)
@inline countboth!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer) =
countboth!(c, n1, n2)
@inline function countboth!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer)
countleft!(c, n1)
countright!(c, n2)
countshared!(c, n1, n2)
end
@inline countshared!(c::AbstractQGramMatchCounter, qg, n1::Integer, n2::Integer) = countshared!(c, n1, n2)
# Subtypes must implement these methods:
@inline countleft!(c::AbstractQGramMatchCounter, n1::Integer) =
error("countleft! not implemented for $(typeof(c))")
@inline countright!(c::AbstractQGramMatchCounter, n2::Integer) =
error("countright! not implemented for $(typeof(c))")
# Subtypes must either override countboth! from above (so that it does not use countshared!) or implement:
@inline countshared!(c::AbstractQGramMatchCounter, n1::Integer, n2::Integer) =
error("countshared! not implemented for $(typeof(c))")
function countmatches!(mc::AbstractQGramMatchCounter, d1::Vector{Pair{K,I}}, d2::Vector{Pair{K,I}}) where {K,I<:Integer}
i1 = i2 = 1
while i1 <= length(d1) || i2 <= length(d2)
if i2 > length(d2)
for i in i1:length(d1)
@inbounds countleft!(mc, d1[i][1], d1[i][2])
end
return
elseif i1 > length(d1)
for i in i2:length(d2)
@inbounds countright!(mc, d2[i][1], d2[i][2])
end
return
end
@inbounds k1, n1 = d1[i1]
@inbounds k2, n2 = d2[i2]
cmpval = Base.cmp(k1, k2)
if cmpval == -1 # k1 < k2
countleft!(mc, k1, n1)
i1 += 1
elseif cmpval == +1 # k2 < k1
countright!(mc, k2, n2)
i2 += 1
else
countboth!(mc, k1, n1, n2)
i1 += 1
i2 += 1
end
end
end
function countmatches!(mc::AbstractQGramMatchCounter, d1::Dict{K,I}, d2::Dict{K,I}) where {K,I<:Integer}
for (k1, c1) in d1
index = Base.ht_keyindex2!(d2, k1)
if index > 0
countboth!(mc, k1, c1, d2.vals[index])
else
countleft!(mc, k1, c1)
end
end
for (k2, c2) in d2
index = Base.ht_keyindex2!(d1, k2)
if index <= 0
countright!(mc, k2, c2)
end
end
end
abstract type AbstractQGramDistance <: SemiMetric end
@ -243,18 +93,11 @@ function (dist::AbstractQGramDistance)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
counter = newcounter(dist)
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
countboth!(counter, n1, n2)
count!(counter, n1, n2)
end
calculate(dist, counter)
end
function (dist::AbstractQGramDistance)(qc1::QC, qc2::QC) where {QC<:AbstractQGramCounts}
@assert dist.q == q(qc1)
@assert dist.q == q(qc2)
counter = newcounter(dist)
countmatches!(counter, counts(qc1), counts(qc2))
calculate(dist, counter)
end
"""
QGram(q::Int)
@ -272,17 +115,15 @@ struct QGram <: AbstractQGramDistance
q::Int
end
mutable struct SingleCounter{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
n::T
mutable struct SingleCounter{QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
shared::Int
end
newcounter(d::QGram) = SingleCounter{Int, QGram}(0)
@inline countleft!(c::SingleCounter{Int, QGram}, n1::Integer) = c.n += n1 # n1 === abs(n1 - 0)
@inline countright!(c::SingleCounter{Int, QGram}, n2::Integer) = c.n += n2 # n2 === abs(0 - n2)
@inline countboth!(c::SingleCounter{Int, QGram}, n1::Integer, n2::Integer) = c.n += abs(n1 - n2)
calculate(dist::QGram, c::SingleCounter{Int, QGram}) = c.n
newcounter(d::QGram) = SingleCounter{QGram}(0)
# QGram distance: add the absolute difference of the two counts to the running total.
@inline count!(c::SingleCounter{QGram}, n1::Integer, n2::Integer) =
    (c.shared += abs(n1 - n2))
calculate(dist::QGram, c::SingleCounter{QGram}) = c.shared
"""
Cosine(q::Int)
@ -300,19 +141,19 @@ struct Cosine <: AbstractQGramDistance
q::Int
end
mutable struct ThreeCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
left::T
right::T
shared::T
mutable struct ThreeCounters{QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
left::Int
right::Int
shared::Int
end
newcounter(d::Cosine) = ThreeCounters{Int, Cosine}(0, 0, 0)
@inline countleft!(c::ThreeCounters{Int, Cosine}, n1::Integer) = c.left += n1^2
@inline countright!(c::ThreeCounters{Int, Cosine}, n2::Integer) = c.right += n2^2
@inline countshared!(c::ThreeCounters{Int, Cosine}, n1::Integer, n2::Integer) = c.shared += n1 * n2
calculate(d::Cosine, c::ThreeCounters{Int, Cosine}) =
newcounter(d::Cosine) = ThreeCounters{Cosine}(0, 0, 0)
# Cosine: accumulate the squared counts of each side and the product of the two counts.
@inline function count!(c::ThreeCounters{Cosine}, n1::Integer, n2::Integer)
    c.left = c.left + n1^2
    c.right = c.right + n2^2
    c.shared = c.shared + n1 * n2
end
calculate(d::Cosine, c::ThreeCounters{Cosine}) =
1.0 - c.shared / (sqrt(c.left) * sqrt(c.right))
"""
@ -329,8 +170,13 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
struct Jaccard <: AbstractQGramDistance
q::Int
end
calculate(d::Jaccard, c::ThreeCounters{Int, Jaccard}) =
newcounter(d::Jaccard) = ThreeCounters{Jaccard}(0, 0, 0)
# Jaccard: count presence (not multiplicity) of the qgram on each side and on both sides.
@inline function count!(c::ThreeCounters{Jaccard}, n1::Integer, n2::Integer)
    inleft, inright = n1 > 0, n2 > 0
    c.left += inleft
    c.right += inright
    c.shared += inleft & inright
end
calculate(d::Jaccard, c::ThreeCounters{Jaccard}) =
1.0 - c.shared / (c.left + c.right - c.shared)
"""
@ -347,8 +193,13 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
struct SorensenDice <: AbstractQGramDistance
q::Int
end
calculate(d::SorensenDice, c::ThreeCounters{Int, SorensenDice}) =
newcounter(d::SorensenDice) = ThreeCounters{SorensenDice}(0, 0, 0)
# SorensenDice: count presence (not multiplicity) of the qgram on each side and on both sides.
@inline function count!(c::ThreeCounters{SorensenDice}, n1::Integer, n2::Integer)
    inleft, inright = n1 > 0, n2 > 0
    c.left += inleft
    c.right += inright
    c.shared += inleft & inright
end
calculate(d::SorensenDice, c::ThreeCounters{SorensenDice}) =
1.0 - 2.0 * c.shared / (c.left + c.right)
"""
@ -365,67 +216,15 @@ where ``Q(s, q)`` denotes the set of q-grams of length n for the string s
struct Overlap <: AbstractQGramDistance
q::Int
end
const IntersectionDist = Union{Jaccard, SorensenDice, Overlap}
newcounter(d::IntersectionDist) = ThreeCounters{Int, typeof(d)}(0, 0, 0)
@inline countleft!(c::ThreeCounters{Int, QD}, n1::Integer) where {QD<:IntersectionDist} =
newcounter(d::Overlap) = ThreeCounters{Overlap}(0, 0, 0)
@inline function count!(c::ThreeCounters{Overlap}, n1::Integer, n2::Integer)
c.left += (n1 > 0)
@inline countright!(c::ThreeCounters{Int, QD}, n2::Integer) where {QD<:IntersectionDist} =
c.right += (n2 > 0)
@inline countshared!(c::ThreeCounters{Int, QD}, n1::Integer, n2::Integer) where {QD<:IntersectionDist} =
c.shared += (n1 > 0) & (n2 > 0)
calculate(d::Overlap, c::ThreeCounters{Int, Overlap}) =
end
calculate(d::Overlap, c::ThreeCounters{Overlap}) =
1.0 - c.shared / min(c.left, c.right)
"""
MorisitaOverlap(q::Int)
Creates a MorisitaOverlap distance, a general, statistical measure of
dispersion which can also be used on dictionaries such as created
from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index
This is more fine-grained than many of the other QGramDistances since
it is based on the counts per q-gram rather than only which q-grams are
in the strings.
The distance corresponds to
``(2 * sum(m(s1) .* m(s2)) / (sum(m(s1).^2)*M(s2)/M(s1) + sum(m(s2).^2)*M(s1)/M(s2))``
where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
sum of those counts.
"""
struct MorisitaOverlap <: AbstractQGramDistance
q::Int
end
mutable struct FiveCounters{T, QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
leftsum::T # sum(m(s1))
rightsum::T # sum(m(s2))
leftsq::T # sum(m(s1).^2)
rightsq::T # sum(m(s2).^2)
shared::T # sum(m(s1) .* m(s2))
end
newcounter(d::MorisitaOverlap) = FiveCounters{Int, MorisitaOverlap}(0, 0, 0, 0, 0)
@inline function countleft!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer)
c.leftsum += n1
c.leftsq += (n1^2)
end
@inline function countright!(c::FiveCounters{Int, MorisitaOverlap}, n2::Integer)
c.rightsum += n2
c.rightsq += (n2^2)
end
@inline countshared!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer, n2::Integer) =
c.shared += (n1 * n2)
calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap}) =
1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum))
"""
NMD(q::Int)
NMD(q::Int)
@ -449,23 +248,53 @@ struct NMD <: AbstractQGramDistance
q::Int
end
newcounter(d::NMD) = ThreeCounters{Int, NMD}(0, 0, 0)
@inline function countleft!(c::ThreeCounters{Int, NMD}, n1::Integer)
c.left += n1
c.shared += n1 # max(n1, 0) == n1
end
@inline function countright!(c::ThreeCounters{Int, NMD}, n2::Integer)
c.right += n2
c.shared += n2 # max(n2, 0) == n2
end
@inline function countboth!(c::ThreeCounters{Int, NMD}, n1::Integer, n2::Integer)
newcounter(d::NMD) = ThreeCounters{NMD}(0, 0, 0)
# NMD: `left` and `right` accumulate the raw counts of each side, while `shared`
# accumulates the larger of the two counts for the qgram.
@inline function count!(c::ThreeCounters{NMD}, n1::Integer, n2::Integer)
c.left += n1
c.right += n2
c.shared += max(n1, n2)
end
calculate(d::NMD, c::ThreeCounters{Int, NMD}) =
calculate(d::NMD, c::ThreeCounters{NMD}) =
(c.shared - min(c.left, c.right)) / max(c.left, c.right)
"""
    MorisitaOverlap(q::Int)

Creates a MorisitaOverlap distance, a general, statistical measure of
dispersion which can also be used on dictionaries such as created
from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index
This is more fine-grained than many of the other QGramDistances since
it is based on the counts per q-gram rather than only which q-grams are
in the strings.

The distance corresponds to

``1 - 2 * sum(m(s1) .* m(s2)) / (sum(m(s1).^2)*M(s2)/M(s1) + sum(m(s2).^2)*M(s1)/M(s2))``

where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
sum of those counts.
"""
struct MorisitaOverlap <: AbstractQGramDistance
q::Int
end
# Mutable accumulator of five running integer sums. The type parameter `QD` is not
# used by any field; it tags the counter with the distance it belongs to so that
# `count!` and `calculate` can dispatch on it.
mutable struct FiveCounters{QD<:AbstractQGramDistance} <: AbstractQGramMatchCounter
leftsum::Int # sum(m(s1))
rightsum::Int # sum(m(s2))
leftsq::Int # sum(m(s1).^2)
rightsq::Int # sum(m(s2).^2)
shared::Int # sum(m(s1) .* m(s2))
end
# Fresh counter for MorisitaOverlap with all five sums starting at zero.
function newcounter(d::MorisitaOverlap)
    return FiveCounters{MorisitaOverlap}(0, 0, 0, 0, 0)
end
# MorisitaOverlap: update the per-side sums, the per-side sums of squares,
# and the sum of cross products with the counts of one qgram.
@inline function count!(c::FiveCounters{MorisitaOverlap}, n1::Integer, n2::Integer)
    # left-hand side statistics
    c.leftsum = c.leftsum + n1
    c.leftsq = c.leftsq + n1^2
    # right-hand side statistics
    c.rightsum = c.rightsum + n2
    c.rightsq = c.rightsq + n2^2
    # cross term
    c.shared = c.shared + n1 * n2
end
# Turn the accumulated sums into the distance: one minus the overlap ratio.
function calculate(d::MorisitaOverlap, c::FiveCounters{MorisitaOverlap})
    numerator = 2 * c.shared
    denominator = c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum
    return 1.0 - (numerator / denominator)
end

View File

@ -0,0 +1,154 @@
# sometimes we already preprocess the strings
# We now define special methods for these special string types
"""
    QGramDict(s, q::Integer = 2)

An iterator with a pre-computed dictionary of its qgrams. This enables faster calculation of QGram
distances.

Note that the qgram length must correspond with the q length used
in the distance; evaluating a distance on `QGramDict`s with a different
qgram length throws an `ArgumentError`.

## Examples
```julia
str1, str2 = "my string", "another string"
qd1 = QGramDict(str1, 2)
qd2 = QGramDict(str2, 2)
evaluate(Overlap(2), qd1, qd2)
```
"""
struct QGramDict{S, K}
s::S # wrapped sequence; `length` and `iterate` are forwarded to it
q::Int # qgram length the counts were computed with
counts::Dict{K,Int} # qgram => number of times it was seen in `s`
end
# Length and iteration are delegated to the wrapped sequence `s.s`.
Base.length(s::QGramDict) = length(s.s)
Base.iterate(s::QGramDict) = iterate(s.s)
Base.iterate(s::QGramDict, state) = iterate(s.s, state)
# Build a QGramDict for `s`, counting its qgrams of length `q`.
# An existing QGramDict with the same `q` is returned as is.
function QGramDict(s, q::Integer = 2)
    if s isa QGramDict && s.q == q
        return s
    end
    grams = qgrams(s, q)
    counts = countdict(grams)
    return QGramDict{typeof(s), eltype(grams)}(s, q, counts)
end
# Turn a sequence of qgrams to a count dict for them, i.e. map each
# qgram to the number of times it has been seen.
#
# Returns a `Dict{eltype(qgrams), Int32}`.
# NOTE(review): the structs in this file store `Dict{K,Int}` / `Vector{Pair{K,Int}}`,
# so the Int32 values are presumably converted on construction; confirm this is intended.
#
# The loop relies on non-public `Dict` internals (`Base.ht_keyindex2!`,
# `Base._setindex!`, and the `age`/`keys`/`vals` fields) so that each qgram
# is looked up only once.
function countdict(qgrams)
d = Dict{eltype(qgrams), Int32}()
for qg in qgrams
# positive index: slot of an existing key; otherwise the negated insertion slot
index = Base.ht_keyindex2!(d, qg)
if index > 0
# key already present: bump its count in place
d.age += 1
@inbounds d.keys[index] = qg
# `[1]` on a scalar count just returns the count itself
@inbounds d.vals[index] = d.vals[index][1] + 1
else
# first occurrence: insert with count 1 at the slot found above
@inbounds Base._setindex!(d, 1, qg, -index)
end
end
d
end
# Evaluate a q-gram distance on two pre-counted `QGramDict`s.
#
# Every qgram present in either dictionary is passed to `count!` exactly once as
# `(count in qc1, count in qc2)`, with 0 standing in for a qgram that is absent on
# one side; `calculate` then turns the accumulated counter into the distance value.
#
# Throws `ArgumentError` unless `dist`, `qc1` and `qc2` all use the same qgram length.
function (dist::AbstractQGramDistance)(qc1::QGramDict, qc2::QGramDict)
    dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramDict must have the same qgram length"))
    counter = newcounter(dist)
    d1, d2 = qc1.counts, qc2.counts
    # Lookups use the public, read-only `get`/`haskey` rather than
    # `Base.ht_keyindex2!`: the latter is the insertion-side internal of `Dict` and
    # may update or rehash the table it probes, which is unsafe for dictionaries
    # that are only read here (e.g. the same QGramDict on both sides, or one shared
    # between tasks).
    for (k1, c1) in d1
        # qgrams of qc1, paired with their count in qc2 (0 when absent)
        count!(counter, c1, get(d2, k1, 0))
    end
    for (k2, c2) in d2
        # qgrams that appear only in qc2; shared ones were handled above
        haskey(d1, k2) || count!(counter, 0, c2)
    end
    return calculate(dist, counter)
end
"""
    QGramSortedVector(s, q::Integer = 2)

An iterator with a pre-computed sorted vector of its qgrams. This enables faster calculation of QGram
distances.

Since qgrams are sorted in lexicographic order QGram distances can be
calculated even faster than when using a QGramDict. The sorting means
that updating the counts after creation is less efficient; nevertheless,
for most use cases QGramSortedVector is preferred over a QGramDict.

Note that the qgram length must correspond with the q length used
in the distance.

## Examples
```julia
str1, str2 = "my string", "another string"
qs1 = QGramSortedVector(str1, 2)
qs2 = QGramSortedVector(str2, 2)
evaluate(Jaccard(2), qs1, qs2)
```
"""
struct QGramSortedVector{S, K}
s::S # wrapped sequence; `length` and `iterate` are forwarded to it
q::Int # qgram length the counts were computed with
counts::Vector{Pair{K,Int}} # (qgram => count) pairs, sorted by qgram
end
# Length and iteration are delegated to the wrapped sequence `s.s`.
Base.length(s::QGramSortedVector) = length(s.s)
Base.iterate(s::QGramSortedVector) = iterate(s.s)
Base.iterate(s::QGramSortedVector, state) = iterate(s.s, state)
# Build a QGramSortedVector for `s`: count its qgrams of length `q` and sort the
# (qgram => count) pairs by qgram. An existing QGramSortedVector with the same `q`
# is returned as is.
function QGramSortedVector(s, q::Integer = 2)
    if s isa QGramSortedVector && s.q == q
        return s
    end
    grams = qgrams(s, q)
    sortedcounts = sort!(collect(countdict(grams)), by = first)
    return QGramSortedVector{typeof(s), eltype(grams)}(s, q, sortedcounts)
end
# To implement the distances we will count qgram matches
# between strings or pre-calculated AbstractQgramCounts objects.
# The abstract type defines different fallback versions which can be
# specialized by subtypes for best performance.
# Evaluate a q-gram distance on two `QGramSortedVector`s by walking both sorted
# count vectors in lockstep. Each qgram is passed to `count!` once as
# `(count in qc1, count in qc2)`, using 0 for the side where it does not occur.
# Throws `ArgumentError` unless `dist`, `qc1` and `qc2` share the same qgram length.
function (dist::AbstractQGramDistance)(qc1::QGramSortedVector, qc2::QGramSortedVector)
    dist.q == qc1.q == qc2.q || throw(ArgumentError("The distance and the QGramSortedVectors must have the same qgram length"))
    counter = newcounter(dist)
    left, right = qc1.counts, qc2.counts
    nleft, nright = length(left), length(right)
    i = j = 1
    # Merge phase: advance whichever side holds the smaller qgram (both on a tie).
    while i <= nleft && j <= nright
        @inbounds kl, cl = left[i]
        @inbounds kr, cr = right[j]
        order = Base.cmp(kl, kr)
        if order == -1      # qgram only on the left
            count!(counter, cl, 0)
            i += 1
        elseif order == +1  # qgram only on the right
            count!(counter, 0, cr)
            j += 1
        else                # qgram on both sides
            count!(counter, cl, cr)
            i += 1
            j += 1
        end
    end
    # Tail phase: at most one of these ranges is non-empty (both are empty when
    # both inputs are exhausted, including the case of zero-length inputs).
    for k in i:nleft
        @inbounds count!(counter, left[k][2], 0)
    end
    for k in j:nright
        @inbounds count!(counter, 0, right[k][2])
    end
    return calculate(dist, counter)
end

View File

@ -170,10 +170,10 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
min_score_atomic = Threads.Atomic{Float64}(min_score)
scores = [0.0 for _ in 1:Threads.nthreads()]
is = [0 for _ in 1:Threads.nthreads()]
s = _helper(s, dist)
s = _helper(dist, s)
# need collect since @threads requires a length method
Threads.@threads for i in collect(eachindex(itr))
score = compare(s, _helper(itr[i], dist), dist; min_score = min_score_atomic[])
for i in collect(eachindex(itr))
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
score_old = Threads.atomic_max!(min_score_atomic, score)
if score >= score_old
scores[Threads.threadid()] = score
@ -183,12 +183,9 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
imax = is[argmax(scores)]
imax == 0 ? (nothing, nothing) : (itr[imax], imax)
end
function _helper(s, dist::AbstractQGramDistance)
s !== missing ? QGramSortedVector(s, dist.q) : s
end
_helper(s, dist::StringDistance) = s
# Prepare an input for `dist`: for q-gram distances a non-missing input is wrapped in a
# `QGramSortedVector` built with `dist.q`, while `missing` is passed through; for any
# other `StringDistance` the input is returned unchanged.
_helper(dist::AbstractQGramDistance, ::Missing) = missing
_helper(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
_helper(dist::StringDistance, s) = s
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
@ -218,10 +215,10 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9)
"""
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
out = [Int[] for _ in 1:Threads.nthreads()]
s = _helper(s, dist)
s = _helper(dist, s)
# need collect since @threads requires a length method
Threads.@threads for i in collect(eachindex(itr))
score = compare(s, _helper(itr[i], dist), dist; min_score = min_score)
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
if score >= min_score
push!(out[Threads.threadid()], i)
end

View File

@ -38,7 +38,7 @@ end
pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
Compute distances between all pairs of elements in `xs` and `ys` according to the
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corrresponds to the distance between `xs[i]` and `ys[j]`.
`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
For AbstractQGramDistances preprocessing will be used either if `preprocess` is set
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
@ -75,11 +75,13 @@ function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abst
return R
end
function _preprocess(xs, dist::AbstractQGramDistance, preprocess)
if preprocess === nothing ? length(xs) >= 5 : preprocess
return map(x -> x === missing ? x : QGramSortedVector(x, dist.q), xs)
# Optionally replace every non-missing element of `xs` by a `QGramSortedVector`
# built with `dist.q`. Preprocessing happens only for `AbstractQGramDistance`s and
# only when `preprocess` is true; `preprocess === nothing` selects the default.
# Otherwise `xs` is returned unchanged.
function _preprocess(xs, dist::StringDistance, preprocess)
if preprocess === nothing
# default: preprocess when there are at least 5 elements
preprocess = length(xs) >= 5
end
if (dist isa AbstractQGramDistance) && preprocess
# one spawned task per element; results are collected with `fetch`,
# and `missing` entries are passed through untouched
return fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
else
return xs
end
end
_preprocess(xs, dist::StringDistance, preprocess) = xs

View File

@ -174,7 +174,7 @@ using StringDistances, Unicode, Test, Random
# To get something we can more easily compare to:
stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
stringify(p::Pair{V, <:Integer}) where {S<:AbstractString,V<:AbstractVector{S}} = (map(string, first(p)), last(p))
sortedcounts(qc) = sort(collect(StringDistances.counts(qc)), by = first)
sortedcounts(qc) = sort(collect(qc.counts), by = first)
totuples(qc) = map(stringify, sortedcounts(qc))
s1, s2 = "arnearne", "arnebeda"

View File

@ -133,6 +133,7 @@ end
@test findnearest("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3)
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], QGram(2)) == ("NewYork", 1)
@test findnearest("New York", ["Newark", "San Francisco", "NewYork"], QGram(2)) == ("NewYork", 3)
# findall
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]