correct qgram bigger than 1
parent
dbf8a11d8e
commit
222a417612
|
@ -6,12 +6,6 @@ module StringDistances
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
|
||||
# some memo
|
||||
# length: number of characters
|
||||
# ncodeunits: Return the number of code units in a string (akin to the index of a vector). Not all such indices are valid – they may not be the start of a character.
|
||||
# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str multiplied by the size, in bytes, of one code unit in str.
|
||||
|
||||
import Base: eltype, length, iterate, ==, hash, isless, convert, show
|
||||
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
|
||||
import IterTools: chain
|
||||
|
@ -47,3 +41,16 @@ include("compare.jl")
|
|||
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Some memo about Strings
|
||||
|
||||
# length: number of characters
|
||||
# ncodeunits: Return the number of code units in a string (akin to the index of a vector). Not all such indices are valid – they may not be the start of a character.
|
||||
# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str multiplied by the size, in bytes, of one code unit in str.
|
||||
|
||||
# lastindex: Return the last index of a collection
|
||||
# nextind(s, i): return the index of the start of the character whose encoding starts after index i
|
||||
# nextind(s, 0, N): return the index of the Nth character of s (or, if there are fewer than N characters, return ncodeunits(s) + (N - length(s)))
|
||||
|
||||
##############################################################################
|
|
@ -9,10 +9,11 @@ struct QGramIterator{S <: AbstractString, N}
|
|||
s::S # grapheme
|
||||
l::Int # length of string
|
||||
end
|
||||
# N is the number of characters in the QGram
|
||||
# Return the q-gram size `N` stored in the iterator's type parameters.
function param(::QGramIterator{S, N}) where {S, N}
    return N
end
|
||||
|
||||
function Base.iterate(qgram::QGramIterator{S, N},
|
||||
state = (1, qgram.l < N ? lastindex(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N}
|
||||
state = (1, qgram.l < N ? ncodeunits(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N}
|
||||
istart, iend = state
|
||||
iend > ncodeunits(qgram.s) && return nothing
|
||||
element = qgram.s[istart:iend]
|
||||
|
@ -20,7 +21,6 @@ function Base.iterate(qgram::QGramIterator{S, N},
|
|||
element, nextstate
|
||||
end
|
||||
# Number of q-grams the iterator yields: `l - N + 1`, clamped at zero so that a
# string with fewer than `N` characters produces none.
function Base.length(qgram::QGramIterator{S, N}) where {S, N}
    remaining = qgram.l - N + 1
    return max(remaining, 0)
end
|
||||
|
||||
# Declare the element type on the iterator *type*, which is what Julia's
# iteration interface queries (`eltype(typeof(itr))`). Calls on an instance keep
# working through Base's generic `eltype(x) = eltype(typeof(x))` fallback.
Base.eltype(::Type{<:QGramIterator}) = String
|
||||
|
||||
##############################################################################
|
||||
|
@ -77,11 +77,13 @@ end
|
|||
abstract type AbstractQGram{N} <: SemiMetric end
|
||||
# Return the q-gram size `N` stored in the distance's type parameter.
function param(::AbstractQGram{N}) where {N}
    return N
end
|
||||
|
||||
# Build the q-gram iterator over `s`: the q-gram size comes from `dist`, and the
# character count of `s` is computed once and stored in the iterator.
function qgram_iterator(dist::AbstractQGram, s::AbstractString)
    N = param(dist)
    nchars = length(s)
    return QGramIterator{typeof(s), N}(s, nchars)
end
|
||||
|
||||
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
|
||||
evaluate(dist,
|
||||
CountIteratorDictionary(QGramIterator{typeof(s1), param(dist)}(s1, length(s1)),
|
||||
QGramIterator{typeof(s2), param(dist)}(s2, length(s2))))
|
||||
CountIteratorDictionary(qgram_iterator(dist, s1), qgram_iterator(dist, s2)))
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
|
|
Loading…
Reference in New Issue