allow any iterator in qgram distances

pull/23/head
matthieugomez 2020-02-08 11:38:06 -05:00
parent 6ef1cfc8b2
commit e1b8aa6500
5 changed files with 35 additions and 21 deletions

View File

@ -5,6 +5,7 @@ version = "0.5.2"
[deps] [deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
[compat] [compat]
julia = "1" julia = "1"
Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8" Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"

View File

@ -17,8 +17,6 @@ include("find.jl")
## ##
############################################################################## ##############################################################################
evaluate(::QGramDistance, ::Missing, ::AbstractString) = missing
evaluate(::QGramDistance, ::AbstractString, ::Missing) = missing
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing

View File

@ -239,7 +239,7 @@ end
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString,
len1::Integer, len2::Integer, start1::Integer, start2::Integer) len1::Integer, len2::Integer, start1::Integer, start2::Integer)
a = longest_common_substring(s1, s2, len1 , len2) a = longest_common_pattern(s1, s2, len1 , len2)
# exit if there is no common substring # exit if there is no common substring
a[3] == 0 && return x a[3] == 0 && return x
# add the info of the common to the existing set # add the info of the common to the existing set
@ -256,11 +256,9 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
return x return x
end end
# Return start of common substring in s1, start of common substring in s2, and length of substring function longest_common_pattern(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
# Indexes refer to character number, not index
function longest_common_substring(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
if len1 > len2 if len1 > len2
start2, start1, len = longest_common_substring(s2, s1, len2, len1) start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
else else
start1, start2, len = 0, 0, 0 start1, start2, len = 0, 0, 0
p = zeros(Int, len2) p = zeros(Int, len2)

View File

@ -1,10 +1,11 @@
struct QGramIterator{S}
struct QGramIterator{S <: AbstractString} s::S # String or Iterator
s::S # string
q::Int # Length of Qgram q::Int # Length of Qgram
end end
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)
function Base.iterate(qgram::QGramIterator, # q-grams of AbstractString
function Base.iterate(qgram::QGramIterator{<: AbstractString},
state = (1, nextind(qgram.s, 0, qgram.q))) state = (1, nextind(qgram.s, 0, qgram.q)))
istart, iend = state istart, iend = state
iend > ncodeunits(qgram.s) && return nothing iend > ncodeunits(qgram.s) && return nothing
@ -12,15 +13,22 @@ function Base.iterate(qgram::QGramIterator,
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend) nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
element, nextstate element, nextstate
end end
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)
Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S} Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S} Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
state + qgram.q - 1 > lastindex(qgram.s) && return nothing
view(qgram.s, state:(state + qgram.q - 1)), state + 1
end
Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
""" """
Return an iterator on the q-gram of a string Return an iterator on the q-gram of a string
### Arguments ### Arguments
* `s::AbstractString` * `s` iterator
* `q::Integer`: length of q-gram * `q::Integer`: length of q-gram
## Examples ## Examples
@ -30,7 +38,9 @@ for x in qgrams("hello", 2)
end end
``` ```
""" """
qgrams(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q) qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)
qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q)
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
@ -84,7 +94,8 @@ struct QGram <: QGramDistance
q::Int q::Int
end end
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString) function evaluate(dist::QGram, s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
n = 0 n = 0
itr = itr =
@ -110,7 +121,8 @@ struct Cosine <: QGramDistance
q::Int q::Int
end end
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString) function evaluate(dist::Cosine, s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
norm1, norm2, prodnorm = 0, 0, 0 norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in itr for (n1, n2) in itr
@ -136,7 +148,8 @@ struct Jaccard <: QGramDistance
q::Int q::Int
end end
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString) function evaluate(dist::Jaccard, s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0 ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr for (n1, n2) in itr
@ -162,7 +175,8 @@ struct SorensenDice <: QGramDistance
q::Int q::Int
end end
function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString) function evaluate(dist::SorensenDice, s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0 ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr for (n1, n2) in itr
@ -188,7 +202,8 @@ struct Overlap <: QGramDistance
q::Int q::Int
end end
function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString) function evaluate(dist::Overlap, s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0 ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr for (n1, n2) in itr

View File

@ -143,10 +143,12 @@ using StringDistances, Test
end end
# allow any iterator
evaluate(Jaro(), [1,2,3], [1,2,10]) evaluate(Jaro(), [1,2,3], [1,2,10])
evaluate(Levenshtein(), [1,2,3], [1,2,10]) evaluate(Levenshtein(), [1,2,3], [1,2,10])
evaluate(DamerauLevenshtein(), [1,2,3], [1,2,10]) evaluate(DamerauLevenshtein(), [1,2,3], [1,2,10])
evaluate(QGram(2), [1,2,3], [1,2,10])
evaluate(Overlap(2), [1,2,3], [1,2,10])
#= R test #= R test