From cacbbc54870e3749424d41bf252c55ceaa8c7491 Mon Sep 17 00:00:00 2001
From: Robert Feldt
Date: Sat, 24 Oct 2020 14:32:19 +0200
Subject: [PATCH] name change to QGramSortedVector, code and tests for handling unicode strings

---
 src/StringDistances.jl |  2 +-
 src/distances/qgram.jl |  8 ++--
 test/distances.jl      | 88 +++++++++++++++++++++++++++++++++++++-----
 3 files changed, 84 insertions(+), 14 deletions(-)

diff --git a/src/StringDistances.jl b/src/StringDistances.jl
index 45aa0f3..dc4a215 100755
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@@ -31,7 +31,7 @@ Jaccard,
 SorensenDice,
 Overlap,
 QGramDict,
-QGramSortedArray,
+QGramSortedVector,
 Winkler,
 Partial,
 TokenSort,
diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl
index 4650839..42e4178 100755
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@@ -107,18 +107,20 @@ function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
     qgs = qgrams(s, q)
     QGramDict{q, eltype(qgs)}(countdict(qgs))
 end
+QGramDict(s, q::Integer = 2) = QGramDict(vec(collect(s)), q) # other iterables (e.g. graphemes); vec avoids self-recursion on non-vector arrays
 
 # Faster (than QgramDict) with the qgrams presorted
-struct QGramSortedArray{Q,K} <: AbstractQGramCounts{Q,K}
+struct QGramSortedVector{Q,K} <: AbstractQGramCounts{Q,K}
     counts::Vector{Pair{K,Int}}
 end
-function QGramSortedArray(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
+function QGramSortedVector(s::Union{AbstractString, AbstractVector}, q::Integer = 2)
     @assert q >= 1
     qgs = qgrams(s, q)
     countpairs = collect(countdict(qgs))
     sort!(countpairs, by = first)
-    QGramSortedArray{q, eltype(qgs)}(countpairs)
+    QGramSortedVector{q, eltype(qgs)}(countpairs)
 end
+QGramSortedVector(s, q::Integer = 2) = QGramSortedVector(vec(collect(s)), q) # other iterables (e.g. graphemes); vec avoids self-recursion on non-vector arrays
 
 # To implement the distances we will count qgram matches
 # between strings or pre-calculated AbstractQgramCounts objects.
diff --git a/test/distances.jl b/test/distances.jl
index d6d7cb3..42bafae 100644
--- a/test/distances.jl
+++ b/test/distances.jl
@@ -123,17 +123,85 @@ using StringDistances, Unicode, Test, Random
         @test ismissing(evaluate(Overlap(1), "", missing))
     end
 
-    @testset "Differential testing of String, QGramDict, and QGramSortedArray" begin
+    @testset "QGramDict and QGramSortedVector counts qgrams" begin
+        # Normalize (qgram => count) pairs into plain tuples that are easy to compare:
+        stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
+        stringify(p::Pair{V, <:Integer}) where {S<:AbstractString,V<:AbstractVector{S}} = (map(string, first(p)), last(p))
+        sortedcounts(qc) = sort(collect(StringDistances.counts(qc)), by = first)
+        totuples(qc) = map(stringify, sortedcounts(qc))
+
+        s1, s2 = "arnearne", "arnebeda"
+
+        qd1, qd2 = QGramDict(s1, 2), QGramDict(s2, 2)
+        @test totuples(qd1) == [("ar", 2), ("ea", 1), ("ne", 2), ("rn", 2)]
+        @test totuples(qd2) == [("ar", 1), ("be", 1), ("da", 1), ("eb", 1), ("ed", 1), ("ne", 1), ("rn", 1)]
+
+        qc1, qc2 = QGramSortedVector(s1, 2), QGramSortedVector(s2, 2)
+        @test totuples(qc1) == [("ar", 2), ("ea", 1), ("ne", 2), ("rn", 2)]
+        @test totuples(qc2) == [("ar", 1), ("be", 1), ("da", 1), ("eb", 1), ("ed", 1), ("ne", 1), ("rn", 1)]
+
+        s3 = "rgówów"
+        qd3a = QGramDict(s3, 2)
+        @test totuples(qd3a) == [("gó", 1), ("rg", 1), ("wó", 1), ("ów", 2)]
+
+        qd3b = QGramDict(graphemes(s3), 2)
+        @test totuples(qd3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)]
+
+        qc3a = QGramSortedVector(s3, 2)
+        @test totuples(qc3a) == [("gó", 1), ("rg", 1), ("wó", 1), ("ów", 2)]
+
+        qc3b = QGramSortedVector(graphemes(s3), 2) # sorted-vector counterpart of qd3b (grapheme input)
+        @test totuples(qc3b) == [(["g", "ó"], 1), (["r", "g"], 1), (["w", "ó"], 1), (["ó", "w"], 2)]
+    end
+
+    function partlyoverlappingstrings(sizerange, chars = []) # two random strings sharing a middle run of str1; empty chars => randstring's default alphabet
+        str1 = if length(chars) < 1
+            randstring(rand(sizerange))
+        else
+            randstring(chars, rand(sizerange))
+        end
+        elems = collect(str1)
+        ci1 = prevind(str1, rand(2:div(length(elems), 2)))
+        ci2 = prevind(str1, rand((ci1+1):(length(elems)-1)))
+        str2 = if length(chars) < 1
+            randstring(ci1-1) * join(elems[ci1:ci2]) * randstring(length(str1)-ci2)
+        else
+            randstring(chars, ci1-1) * join(elems[ci1:ci2]) * randstring(chars, length(str1)-ci2)
+        end
+        return str1, str2
+    end
+
+    @testset "Precalculation on unicode strings" begin
+        Chars = vcat(map(collect, ["δσμΣèìòâôîêûÊÂÛ", 'a':'z', '0':'9'])...)
+        for _ in 1:100
+            str1, str2 = partlyoverlappingstrings(10:100, Chars)
+            qlen = rand(2:5)
+            d = Jaccard(qlen)
+
+            qd1 = QGramDict(str1, qlen)
+            qd2 = QGramDict(str2, qlen)
+            @test evaluate(d, str1, str2) == evaluate(d, qd1, qd2)
+
+            qd1b = QGramDict(graphemes(str1), qlen)
+            qd2b = QGramDict(graphemes(str2), qlen)
+            @test evaluate(d, str1, str2) == evaluate(d, qd1b, qd2b)
+
+            qc1 = QGramSortedVector(str1, qlen)
+            qc2 = QGramSortedVector(str2, qlen)
+            @test evaluate(d, str1, str2) == evaluate(d, qc1, qc2)
+
+            qc1b = QGramSortedVector(graphemes(str1), qlen)
+            qc2b = QGramSortedVector(graphemes(str2), qlen)
+            @test evaluate(d, str1, str2) == evaluate(d, qc1b, qc2b)
+        end
+    end
+
+    @testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
         for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap]
             for _ in 1:100
                 qlen = rand(2:9)
                 dist = D(qlen)
-
-                # 2 random strings with some overlap
-                str1 = randstring(rand(5:10000))
-                ci1 = rand(2:div(length(str1), 2))
-                ci2 = rand((ci1+1):(length(str1)-1))
-                str2 = randstring(ci1-1) * str1[ci1:ci2] * randstring(length(str1)-ci2)
+                str1, str2 = partlyoverlappingstrings(5:10000)
 
                 # QGramDict gets same result as for standard string
                 qd1 = QGramDict(str1, qlen)
@@ -141,9 +209,9 @@ using StringDistances, Unicode, Test, Random
                 expected = evaluate(dist, str1, str2)
                 @test expected == evaluate(dist, qd1, qd2)
 
-                # QGramSortedArray gets same result as for standard string
-                qc1 = QGramSortedArray(str1, qlen)
-                qc2 = QGramSortedArray(str2, qlen)
+                # QGramSortedVector gets same result as for standard string
+                qc1 = QGramSortedVector(str1, qlen)
+                qc2 = QGramSortedVector(str2, qlen)
                 @test expected == evaluate(dist, qc1, qc2)
             end
         end