From 9d28c36ed5420fbc4232a7d5b7e0f9343b931972 Mon Sep 17 00:00:00 2001 From: Robert Feldt Date: Sat, 24 Oct 2020 21:01:39 +0200 Subject: [PATCH] added doc strings and upped the dependency and CI to Julia 1.3 --- .travis.yml | 2 +- Project.toml | 2 +- src/distances/qgram.jl | 43 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 45353e4..e9c590b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ language: julia os: - linux julia: - - 1.0 + - 1.3 - 1.5 - nightly matrix: diff --git a/Project.toml b/Project.toml index 74a3a22..1a5d77b 100644 --- a/Project.toml +++ b/Project.toml @@ -7,7 +7,7 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" [compat] Distances = "0.8.1, 0.9, 0.10" -julia = "1" +julia = "1.3" [extras] Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index 42e4178..a420877 100755 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -99,6 +99,24 @@ abstract type AbstractQGramCounts{Q,K} end q(qc::AbstractQGramCounts{Q,K}) where {Q,K} = Q counts(qc::AbstractQGramCounts) = qc.counts +""" + QGramDict(s, q::Integer = 2) + +Creates a QGramDict that pre-calculates (pre-counts) the qgrams +of a string or stream. This enables faster calculation of QGram +distances. + +Note that the qgram length must correspond with the q length used +in the distance. 
+ +## Examples +```julia +str1, str2 = "my string", "another string" +qd1 = QGramDict(str1, 2) +qd2 = QGramDict(str2, 2) +evaluate(Overlap(2), qd1, qd2) +``` +""" struct QGramDict{Q,K} <: AbstractQGramCounts{Q,K} counts::Dict{K,Int} end @@ -109,7 +127,30 @@ function QGramDict(s::Union{AbstractString, AbstractVector}, q::Integer = 2) end QGramDict(s, q::Integer = 2) = QGramDict(collect(s), q) -# Faster (than QgramDict) with the qgrams presorted +""" + QGramSortedVector(s, q::Integer = 2) + +Creates a QGramSortedVector that pre-calculates (pre-counts) the +qgrams of a string or stream. This enables faster calculation of +QGram distances. + +Since qgrams are sorted in lexicographic order, QGram distances can be +calculated even faster than when using a QGramDict. However, the +sorting means that updating the counts after creation is less +efficient. Still, for most use cases QGramSortedVector is preferred +over a QGramDict. + +Note that the qgram length must correspond with the q length used +in the distance. + +## Examples +```julia +str1, str2 = "my string", "another string" +qs1 = QGramSortedVector(str1, 2) +qs2 = QGramSortedVector(str2, 2) +evaluate(Jaccard(2), qs1, qs2) +``` +""" struct QGramSortedVector{Q,K} <: AbstractQGramCounts{Q,K} counts::Vector{Pair{K,Int}} end