name change to QGramSortedVector, code and tests for handling unicode strings

pull/36/head
Robert Feldt 2020-10-24 14:32:19 +02:00
parent c0bedf89a6
commit cacbbc5487
3 changed files with 84 additions and 14 deletions

View File

@ -31,7 +31,7 @@ Jaccard,
SorensenDice,
Overlap,
QGramDict,
QGramSortedArray,
QGramSortedVector,
Winkler,
Partial,
TokenSort,

View File

@ -107,18 +107,20 @@ function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
qgs = qgrams(s, q)
QGramDict{q, eltype(qgs)}(countdict(qgs))
end
# Fallback for any other iterable (e.g. `graphemes(str)`): materialize it with `collect`
# and forward to the QGramDict method that accepts an AbstractString or AbstractVector.
QGramDict(s, q::Integer = 2) = QGramDict(collect(s), q)
# Faster (than QGramDict) with the qgrams presorted.
# (QGramSortedArray is the previous name; this change renames the type to QGramSortedVector.)
struct QGramSortedArray{Q,K} <: AbstractQGramCounts{Q,K}
struct QGramSortedVector{Q,K} <: AbstractQGramCounts{Q,K}
# qgram => count pairs; the constructor orders them by qgram (`sort!(..., by = first)`).
counts::Vector{Pair{K,Int}}
end
# Build presorted qgram counts for `s` (a string or vector) using qgrams of length `q`
# (default 2). The paired `function` lines and the paired constructor calls below are the
# old (QGramSortedArray) and new (QGramSortedVector) spellings from the rename.
function QGramSortedArray(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
function QGramSortedVector(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
# NOTE(review): `@assert` is meant for internal invariants and may be disabled; consider
# throwing an ArgumentError for this input check instead.
@assert q >= 1
qgs = qgrams(s, q)
# `countdict` presumably returns a qgram => count table (verify against its definition);
# `collect` flattens it into a vector of pairs.
countpairs = collect(countdict(qgs))
# Order the pairs by their qgram (the first element of each pair).
sort!(countpairs, by = first)
# `q` becomes the type parameter Q; the key type K is the element type of the qgram iterator.
QGramSortedArray{q, eltype(qgs)}(countpairs)
QGramSortedVector{q, eltype(qgs)}(countpairs)
end
# Fallback for any other iterable (e.g. `graphemes(str)`): materialize it with `collect`
# and forward to the QGramSortedVector method that accepts an AbstractString or AbstractVector.
QGramSortedVector(s, q::Integer = 2) = QGramSortedVector(collect(s), q)
# To implement the distances we will count qgram matches
# between strings or pre-calculated AbstractQgramCounts objects.

View File

@ -123,17 +123,85 @@ using StringDistances, Unicode, Test, Random
@test ismissing(evaluate(Overlap(1), "", missing))
end
@testset "Differential testing of String, QGramDict, and QGramSortedArray" begin
@testset "QGramDict and QGramSortedVector counts qgrams" begin
    # Normalize counted qgrams into plain (qgram, count) tuples that can be compared with
    # literals: string qgrams become `String`s, vector qgrams become vectors of `String`s.
    stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
    stringify(p::Pair{V, <:Integer}) where {S<:AbstractString,V<:AbstractVector{S}} = (map(string, first(p)), last(p))
    sortedcounts(qc) = sort(collect(StringDistances.counts(qc)), by = first)
    totuples(qc) = map(stringify, sortedcounts(qc))

    # Plain ASCII strings: both precomputed representations must report the same bigram counts.
    s1, s2 = "arnearne", "arnebeda"
    qd1, qd2 = QGramDict(s1, 2), QGramDict(s2, 2)
    @test totuples(qd1) == [("ar", 2), ("ea", 1), ("ne", 2), ("rn", 2)]
    @test totuples(qd2) == [("ar", 1), ("be", 1), ("da", 1), ("eb", 1), ("ed", 1), ("ne", 1), ("rn", 1)]

    qc1, qc2 = QGramSortedVector(s1, 2), QGramSortedVector(s2, 2)
    @test totuples(qc1) == [("ar", 2), ("ea", 1), ("ne", 2), ("rn", 2)]
    @test totuples(qc2) == [("ar", 1), ("be", 1), ("da", 1), ("eb", 1), ("ed", 1), ("ne", 1), ("rn", 1)]

    # String with a multi-byte character. Its bigrams are "rg", "gó", "ów", "wó", "ów";
    # 'ó' sorts after the ASCII letters, so "ów" comes last.
    s3 = "rgówów"
    qd3a = QGramDict(s3, 2)
    @test totuples(qd3a) == [("gó", 1), ("rg", 1), ("wó", 1), ("ów", 2)]

    qd3b = QGramDict(graphemes(s3), 2)
    @test totuples(qd3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)]

    qc3a = QGramSortedVector(s3, 2)
    @test totuples(qc3a) == [("gó", 1), ("rg", 1), ("wó", 1), ("ów", 2)]

    # Grapheme input must also work for the sorted-vector representation
    # (this check previously repeated the QGramDict one).
    qc3b = QGramSortedVector(graphemes(s3), 2)
    @test totuples(qc3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)]
end
# Return two random strings `(str1, str2)` with the same number of characters that share
# an identical run of characters at the same character positions.
#
# - `sizerange`: collection the character count of `str1` is drawn from (via `rand`).
# - `chars`: alphabet to draw characters from; when empty, `randstring`'s default
#   alphabet is used.
#
# `str2` is a random prefix, a copy of an interior slice of `str1`, and a random suffix.
# Throws an `ArgumentError` when the drawn length is below 4, since no interior slice
# of at least two characters can be chosen then.
function partlyoverlappingstrings(sizerange, chars = [])
    # One place decides which alphabet to draw from.
    randchars(n) = isempty(chars) ? randstring(n) : randstring(chars, n)

    str1 = randchars(rand(sizerange))
    elems = collect(str1)
    n = length(elems)
    n >= 4 || throw(ArgumentError("need at least 4 characters to pick an overlap, got $n"))

    # These are character positions into `elems`, not byte indices into `str1`, so they
    # stay valid when `str1` contains multi-byte characters.
    ci1 = rand(2:div(n, 2))
    ci2 = rand((ci1 + 1):(n - 1))

    str2 = randchars(ci1 - 1) * join(elems[ci1:ci2]) * randchars(n - ci2)
    return str1, str2
end
@testset "Precalculation on unicode strings" begin
    # Alphabet mixing multi-byte characters with ASCII letters and digits.
    alphabet = reduce(vcat, map(collect, ["δσμΣèìòâôîêûÊÂÛ", 'a':'z', '0':'9']))
    for _ in 1:100
        str1, str2 = partlyoverlappingstrings(10:100, alphabet)
        qlen = rand(2:5)
        d = Jaccard(qlen)
        # Each precomputed representation, built from the raw string or from its graphemes,
        # must give exactly the distance computed on the raw strings.
        for Counts in (QGramDict, QGramSortedVector), prepare in (identity, graphemes)
            pre1 = Counts(prepare(str1), qlen)
            pre2 = Counts(prepare(str2), qlen)
            @test evaluate(d, str1, str2) == evaluate(d, pre1, pre2)
        end
    end
end
@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap]
for _ in 1:100
qlen = rand(2:9)
dist = D(qlen)
# 2 random strings with some overlap
str1 = randstring(rand(5:10000))
ci1 = rand(2:div(length(str1), 2))
ci2 = rand((ci1+1):(length(str1)-1))
str2 = randstring(ci1-1) * str1[ci1:ci2] * randstring(length(str1)-ci2)
str1, str2 = partlyoverlappingstrings(5:10000)
# QGramDict gets same result as for standard string
qd1 = QGramDict(str1, qlen)
@ -141,9 +209,9 @@ using StringDistances, Unicode, Test, Random
expected = evaluate(dist, str1, str2)
@test expected == evaluate(dist, qd1, qd2)
# QGramSortedArray gets same result as for standard string
qc1 = QGramSortedArray(str1, qlen)
qc2 = QGramSortedArray(str2, qlen)
# QGramSortedVector gets same result as for standard string
qc1 = QGramSortedVector(str1, qlen)
qc2 = QGramSortedVector(str2, qlen)
@test expected == evaluate(dist, qc1, qc2)
end
end