name change to QGramSortedVector, code and tests for handling unicode strings
parent
c0bedf89a6
commit
cacbbc5487
|
@ -31,7 +31,7 @@ Jaccard,
|
|||
SorensenDice,
|
||||
Overlap,
|
||||
QGramDict,
|
||||
QGramSortedArray,
|
||||
QGramSortedVector,
|
||||
Winkler,
|
||||
Partial,
|
||||
TokenSort,
|
||||
|
|
|
@ -107,18 +107,20 @@ function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
|
|||
qgs = qgrams(s, q)
|
||||
QGramDict{q, eltype(qgs)}(countdict(qgs))
|
||||
end
|
||||
"""
    QGramDict(s, q::Integer = 2)

Fallback constructor for inputs that are neither an `AbstractString` nor an
`AbstractVector` (for example an iterator): the elements of `s` are gathered
with `collect` and the resulting collection is passed on to `QGramDict` again.
"""
function QGramDict(s, q::Integer = 2)
    elements = collect(s)
    return QGramDict(elements, q)
end
|
||||
|
||||
# Faster (than QGramDict) with the qgrams presorted
|
||||
struct QGramSortedArray{Q,K} <: AbstractQGramCounts{Q,K}
|
||||
# Pre-computed q-gram counts kept as a vector of `qgram => count` pairs.
# `Q` is the q-gram length and `K` the q-gram (key) type; the constructor
# defined for this type fills both from its arguments and sorts the pairs
# by q-gram before storing them.
struct QGramSortedVector{Q,K} <: AbstractQGramCounts{Q,K}
|
||||
# One `qgram => number of occurrences` pair per distinct q-gram.
counts::Vector{Pair{K,Int}}
|
||||
end
|
||||
function QGramSortedArray(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
|
||||
"""
    QGramSortedVector(s::Union{AbstractString, AbstractVector}, q::Integer = 2)

Count the q-grams of length `q` in `s` and store the `qgram => count` pairs
sorted by q-gram. Asserts that `q >= 1`.
"""
function QGramSortedVector(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
    @assert q >= 1
    grams = qgrams(s, q)
    # Materialise the count dictionary as pairs, then order them by q-gram.
    sortedpairs = sort!(collect(countdict(grams)); by = first)
    return QGramSortedVector{q, eltype(grams)}(sortedpairs)
end
|
||||
"""
    QGramSortedVector(s, q::Integer = 2)

Fallback constructor for inputs that are neither an `AbstractString` nor an
`AbstractVector` (for example an iterator): the elements of `s` are gathered
with `collect` and the resulting collection is passed on to
`QGramSortedVector` again.
"""
function QGramSortedVector(s, q::Integer = 2)
    elements = collect(s)
    return QGramSortedVector(elements, q)
end
|
||||
|
||||
# To implement the distances we will count qgram matches
|
||||
# between strings or pre-calculated AbstractQGramCounts objects.
|
||||
|
|
|
@ -123,17 +123,85 @@ using StringDistances, Unicode, Test, Random
|
|||
@test ismissing(evaluate(Overlap(1), "", missing))
|
||||
end
|
||||
|
||||
@testset "Differential testing of String, QGramDict, and QGramSortedArray" begin
|
||||
@testset "QGramDict and QGramSortedVector counts qgrams" begin
    # Helpers that turn pre-computed counts into plain, sorted tuples so they
    # can be compared against literals regardless of the container type.
    # A q-gram is either a string-like value or a vector of string-like
    # values (when built from graphemes); both are normalised with `string`.
    stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
    stringify(p::Pair{V, <:Integer}) where {S<:AbstractString,V<:AbstractVector{S}} =
        (map(string, first(p)), last(p))
    sortedcounts(qc) = sort(collect(StringDistances.counts(qc)), by = first)
    totuples(qc) = map(stringify, sortedcounts(qc))

    # Plain ASCII input: both containers must report identical counts.
    s1, s2 = "arnearne", "arnebeda"

    qd1, qd2 = QGramDict(s1, 2), QGramDict(s2, 2)
    @test totuples(qd1) == [("ar", 2), ("ea", 1), ("ne", 2), ("rn", 2)]
    @test totuples(qd2) == [("ar", 1), ("be", 1), ("da", 1), ("eb", 1), ("ed", 1), ("ne", 1), ("rn", 1)]

    qc1, qc2 = QGramSortedVector(s1, 2), QGramSortedVector(s2, 2)
    @test totuples(qc1) == [("ar", 2), ("ea", 1), ("ne", 2), ("rn", 2)]
    @test totuples(qc2) == [("ar", 1), ("be", 1), ("da", 1), ("eb", 1), ("ed", 1), ("ne", 1), ("rn", 1)]

    # Non-ASCII input, once as a string and once as a grapheme iterator.
    s3 = "rgówów"

    qd3a = QGramDict(s3, 2)
    @test totuples(qd3a) == [("gó", 1), ("rg", 1), ("wó", 1), ("ów", 2)]

    qd3b = QGramDict(graphemes(s3), 2)
    @test totuples(qd3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)]

    qc3a = QGramSortedVector(s3, 2)
    @test totuples(qc3a) == [("gó", 1), ("rg", 1), ("wó", 1), ("ów", 2)]

    # Previously this repeated the QGramDict check above, leaving the
    # grapheme path of QGramSortedVector untested.
    qc3b = QGramSortedVector(graphemes(s3), 2)
    @test totuples(qc3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)]
end
|
||||
|
||||
# Build two random strings with the same number of characters that share a
# common segment at the same character positions.
#
# sizerange: collection the character count of the strings is drawn from.
# chars:     alphabet to draw characters from; when empty, `randstring`'s
#            default alphabet is used.
#
# Returns `(str1, str2)`. The shared segment starts at character 2 or later,
# is at least two characters long, and ends before the last character, so
# both strings also have a random, independently drawn head and tail.
# Throws an `ArgumentError` if the drawn length is shorter than 4 characters,
# since no such segment exists then.
function partlyoverlappingstrings(sizerange, chars = Char[])
    # Single source of random strings so the alphabet choice is made in one place.
    randomstring(n) = isempty(chars) ? randstring(n) : randstring(chars, n)

    str1 = randomstring(rand(sizerange))
    elems = collect(str1)
    nchars = length(elems)
    nchars >= 4 || throw(ArgumentError(
        "need at least 4 characters to build an overlapping segment, got $nchars"))

    # Work in character positions throughout: `elems` is indexed by character,
    # so byte-index helpers such as `prevind` must not be mixed in here.
    ci1 = rand(2:div(nchars, 2))
    ci2 = rand((ci1 + 1):(nchars - 1))

    str2 = randomstring(ci1 - 1) * join(elems[ci1:ci2]) * randomstring(nchars - ci2)
    return str1, str2
end
|
||||
|
||||
@testset "Precalculation on unicode strings" begin
    # Alphabet mixing multi-byte letters with ASCII letters and digits.
    alphabet = [collect("δσμΣèìòâôîêûÊÂÛ"); 'a':'z'; '0':'9']

    for _ in 1:100
        str1, str2 = partlyoverlappingstrings(10:100, alphabet)
        qlen = rand(2:5)
        d = Jaccard(qlen)

        # Each pre-computed container, fed with the raw string and with its
        # graphemes, must give the same distance as the plain strings do.
        # Order of checks: QGramDict (string, graphemes), then
        # QGramSortedVector (string, graphemes).
        for Counts in (QGramDict, QGramSortedVector), units in (identity, graphemes)
            pre1 = Counts(units(str1), qlen)
            pre2 = Counts(units(str2), qlen)
            @test evaluate(d, str1, str2) == evaluate(d, pre1, pre2)
        end
    end
end
|
||||
|
||||
@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
|
||||
for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap]
|
||||
for _ in 1:100
|
||||
qlen = rand(2:9)
|
||||
dist = D(qlen)
|
||||
|
||||
# 2 random strings with some overlap
|
||||
str1 = randstring(rand(5:10000))
|
||||
ci1 = rand(2:div(length(str1), 2))
|
||||
ci2 = rand((ci1+1):(length(str1)-1))
|
||||
str2 = randstring(ci1-1) * str1[ci1:ci2] * randstring(length(str1)-ci2)
|
||||
str1, str2 = partlyoverlappingstrings(5:10000)
|
||||
|
||||
# QGramDict gets same result as for standard string
|
||||
qd1 = QGramDict(str1, qlen)
|
||||
|
@ -141,9 +209,9 @@ using StringDistances, Unicode, Test, Random
|
|||
expected = evaluate(dist, str1, str2)
|
||||
@test expected == evaluate(dist, qd1, qd2)
|
||||
|
||||
# QGramSortedArray gets same result as for standard string
|
||||
qc1 = QGramSortedArray(str1, qlen)
|
||||
qc2 = QGramSortedArray(str2, qlen)
|
||||
# QGramSortedVector gets same result as for standard string
|
||||
qc1 = QGramSortedVector(str1, qlen)
|
||||
qc2 = QGramSortedVector(str2, qlen)
|
||||
@test expected == evaluate(dist, qc1, qc2)
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue