allow any iterator in qgram distances

pull/23/head
matthieugomez 2020-02-08 11:38:06 -05:00
parent 6ef1cfc8b2
commit e1b8aa6500
5 changed files with 35 additions and 21 deletions

View File

@ -5,6 +5,7 @@ version = "0.5.2"
[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
[compat]
julia = "1"
Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"

View File

@ -17,8 +17,6 @@ include("find.jl")
##
##############################################################################
evaluate(::QGramDistance, ::Missing, ::AbstractString) = missing
evaluate(::QGramDistance, ::AbstractString, ::Missing) = missing
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing

View File

@ -239,7 +239,7 @@ end
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString,
len1::Integer, len2::Integer, start1::Integer, start2::Integer)
a = longest_common_substring(s1, s2, len1 , len2)
a = longest_common_pattern(s1, s2, len1 , len2)
# exit if there is no common substring
a[3] == 0 && return x
# add the info of the common to the existing set
@ -256,11 +256,9 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
return x
end
# Return start of common substring in s1, start of common substring in s2, and length of substring
# Indexes refer to character number, not index
function longest_common_substring(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function longest_common_pattern(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
if len1 > len2
start2, start1, len = longest_common_substring(s2, s1, len2, len1)
start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
else
start1, start2, len = 0, 0, 0
p = zeros(Int, len2)

View File

@ -1,10 +1,11 @@
struct QGramIterator{S <: AbstractString}
s::S # string
struct QGramIterator{S}
s::S # String or Iterator
q::Int # Length of Qgram
end
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)
function Base.iterate(qgram::QGramIterator,
# q-grams of AbstractString
function Base.iterate(qgram::QGramIterator{<: AbstractString},
state = (1, nextind(qgram.s, 0, qgram.q)))
istart, iend = state
iend > ncodeunits(qgram.s) && return nothing
@ -12,15 +13,22 @@ function Base.iterate(qgram::QGramIterator,
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
element, nextstate
end
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)
Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
# q-grams of AbstractVector: each q-gram is a view over `q` consecutive elements of `s`.
# `state` is the index in `s` at which the next q-gram starts.
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
    stop = state + qgram.q - 1
    # Stop once a full q-gram no longer fits before the end of the vector
    if stop > lastindex(qgram.s)
        return nothing
    end
    return view(qgram.s, state:stop), state + 1
end
# Element type of the q-grams of an AbstractVector, i.e. the type of the views that
# `iterate` builds with `view(qgram.s, state:(state + qgram.q - 1))`.
# The type is taken from an empty view rather than from `first(qgram)`: `first` throws
# when the vector holds fewer than `q` elements (there is no q-gram to look at), whereas
# an empty index range is always in bounds.
function Base.eltype(qgram::QGramIterator{<: AbstractVector})
    start = firstindex(qgram.s)
    # `start:(start - 1)` is an empty range of the same index type used by `iterate`
    return typeof(view(qgram.s, start:(start - 1)))
end
"""
Return an iterator over the q-grams of a string
### Arguments
* `s::AbstractString`
* `s` iterator
* `q::Integer`: length of q-gram
## Examples
@ -30,7 +38,9 @@ for x in qgrams("hello", 2)
end
```
"""
qgrams(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)
qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q)
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
@ -84,7 +94,8 @@ struct QGram <: QGramDistance
q::Int
end
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString)
function evaluate(dist::QGram, s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
n = 0
itr =
@ -110,7 +121,8 @@ struct Cosine <: QGramDistance
q::Int
end
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
function evaluate(dist::Cosine, s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in itr
@ -136,7 +148,8 @@ struct Jaccard <: QGramDistance
q::Int
end
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
function evaluate(dist::Jaccard, s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
@ -162,7 +175,8 @@ struct SorensenDice <: QGramDistance
q::Int
end
function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString)
function evaluate(dist::SorensenDice, s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
@ -188,7 +202,8 @@ struct Overlap <: QGramDistance
q::Int
end
function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString)
function evaluate(dist::Overlap, s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr

View File

@ -143,10 +143,12 @@ using StringDistances, Test
end
# allow any iterator
evaluate(Jaro(), [1,2,3], [1,2,10])
evaluate(Levenshtein(), [1,2,3], [1,2,10])
evaluate(DamerauLevenshtein(), [1,2,3], [1,2,10])
evaluate(QGram(2), [1,2,3], [1,2,10])
evaluate(Overlap(2), [1,2,3], [1,2,10])
#= R test