correct qgram bigger than 1

pull/17/head
matthieugomez 2019-08-14 10:30:22 -04:00
parent dbf8a11d8e
commit 222a417612
2 changed files with 19 additions and 10 deletions

View File

@ -6,12 +6,6 @@ module StringDistances
##
##############################################################################
# some memo
# length: number of characters
# ncodeunits: Return the number of code units in a string (akin to the index of a vector). Not all such indices are valid; they may not be the start of a character.
# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str multiplied by the size, in bytes, of one code unit in str.
import Base: eltype, length, iterate, ==, hash, isless, convert, show
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
import IterTools: chain
@ -47,3 +41,16 @@ include("compare.jl")
end
##############################################################################
##
## Some memo about Strings
# length: number of characters
# ncodeunits: Return the number of code units in a string (akin to the index of a vector). Not all such indices are valid; they may not be the start of a character.
# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str multiplied by the size, in bytes, of one code unit in str.
# lastindex: Return the last index of a collection
# nextind(s, i): return the index of the start of the character whose encoding starts after index i
# nextind(s, 0, N): return the index of the Nth character of s (or, if there are fewer than N characters, return ncodeunits(s) + (N - length(s)))
##############################################################################

View File

@ -9,10 +9,11 @@ struct QGramIterator{S <: AbstractString, N}
s::S # grapheme
l::Int # length of string
end
# N is the number of characters in each q-gram. It is not stored in a field: it is the
# second type parameter of the iterator, so it is recovered here through dispatch.
function param(x::QGramIterator{S, N}) where {S, N}
    return N
end
function Base.iterate(qgram::QGramIterator{S, N},
state = (1, qgram.l < N ? lastindex(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N}
state = (1, qgram.l < N ? ncodeunits(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N}
istart, iend = state
iend > ncodeunits(qgram.s) && return nothing
element = qgram.s[istart:iend]
@ -20,7 +21,6 @@ function Base.iterate(qgram::QGramIterator{S, N},
element, nextstate
end
# Number of q-grams produced by the iterator: `l - N + 1`, where `l` is the stored
# string length, clamped at zero so a string shorter than N yields no q-grams.
function Base.length(qgram::QGramIterator{S, N}) where {S, N}
    nqgrams = qgram.l - N + 1
    return max(nqgrams, 0)
end

# The iterator declares `String` as its element type.
function Base.eltype(qgram::QGramIterator)
    return String
end
##############################################################################
@ -77,11 +77,13 @@ end
# Abstract semi-metric parameterized by N; `qgram_iterator` uses N as the q-gram size.
abstract type AbstractQGram{N} <: SemiMetric end

# Recover the type parameter N from a q-gram distance.
function param(x::AbstractQGram{N}) where N
    return N
end
# Build the q-gram iterator over `s` for the distance `dist`.
# The q-gram size comes from the distance's type parameter, and the character count of
# `s` is computed once here and stored next to the string.
function qgram_iterator(dist::AbstractQGram, s::AbstractString)
    N = param(dist)
    nchars = length(s)
    return QGramIterator{typeof(s), N}(s, nchars)
end
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
evaluate(dist,
CountIteratorDictionary(QGramIterator{typeof(s1), param(dist)}(s1, length(s1)),
QGramIterator{typeof(s2), param(dist)}(s2, length(s2))))
CountIteratorDictionary(qgram_iterator(dist, s1), qgram_iterator(dist, s2)))
end
##############################################################################