allow any iterator in qgram distances
parent
6ef1cfc8b2
commit
e1b8aa6500
|
@ -5,6 +5,7 @@ version = "0.5.2"
|
||||||
[deps]
|
[deps]
|
||||||
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
||||||
|
|
||||||
|
|
||||||
[compat]
|
[compat]
|
||||||
julia = "1"
|
julia = "1"
|
||||||
Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
|
Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
|
||||||
|
|
|
@ -17,8 +17,6 @@ include("find.jl")
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
evaluate(::QGramDistance, ::Missing, ::AbstractString) = missing
|
|
||||||
evaluate(::QGramDistance, ::AbstractString, ::Missing) = missing
|
|
||||||
|
|
||||||
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
|
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
|
||||||
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
|
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
|
||||||
|
|
|
@ -239,7 +239,7 @@ end
|
||||||
|
|
||||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString,
|
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString,
|
||||||
len1::Integer, len2::Integer, start1::Integer, start2::Integer)
|
len1::Integer, len2::Integer, start1::Integer, start2::Integer)
|
||||||
a = longest_common_substring(s1, s2, len1 , len2)
|
a = longest_common_pattern(s1, s2, len1 , len2)
|
||||||
# exit if there is no common substring
|
# exit if there is no common substring
|
||||||
a[3] == 0 && return x
|
a[3] == 0 && return x
|
||||||
# add the info of the common to the existing set
|
# add the info of the common to the existing set
|
||||||
|
@ -256,11 +256,9 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
|
||||||
return x
|
return x
|
||||||
end
|
end
|
||||||
|
|
||||||
# Return start of common substring in s1, start of common substring in s2, and length of substring
|
function longest_common_pattern(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||||
# Indexes refer to character number, not index
|
|
||||||
function longest_common_substring(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
|
||||||
if len1 > len2
|
if len1 > len2
|
||||||
start2, start1, len = longest_common_substring(s2, s1, len2, len1)
|
start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
|
||||||
else
|
else
|
||||||
start1, start2, len = 0, 0, 0
|
start1, start2, len = 0, 0, 0
|
||||||
p = zeros(Int, len2)
|
p = zeros(Int, len2)
|
||||||
|
|
41
src/qgram.jl
41
src/qgram.jl
|
@ -1,10 +1,11 @@
|
||||||
|
struct QGramIterator{S}
|
||||||
struct QGramIterator{S <: AbstractString}
|
s::S # String or Iterator
|
||||||
s::S # string
|
|
||||||
q::Int # Length of Qgram
|
q::Int # Length of Qgram
|
||||||
end
|
end
|
||||||
|
# Number of q-grams produced by the iterator: `length(s) - q + 1`, clamped at
# zero so that an input with fewer than `q` elements reports no q-grams.
function Base.length(qgram::QGramIterator)
    nwindows = length(qgram.s) - qgram.q + 1
    return max(nwindows, 0)
end
|
||||||
|
|
||||||
function Base.iterate(qgram::QGramIterator,
|
# q-grams of AbstractString
|
||||||
|
function Base.iterate(qgram::QGramIterator{<: AbstractString},
|
||||||
state = (1, nextind(qgram.s, 0, qgram.q)))
|
state = (1, nextind(qgram.s, 0, qgram.q)))
|
||||||
istart, iend = state
|
istart, iend = state
|
||||||
iend > ncodeunits(qgram.s) && return nothing
|
iend > ncodeunits(qgram.s) && return nothing
|
||||||
|
@ -12,15 +13,22 @@ function Base.iterate(qgram::QGramIterator,
|
||||||
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
||||||
element, nextstate
|
element, nextstate
|
||||||
end
|
end
|
||||||
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)
|
|
||||||
Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
|
# When the wrapped string is already a `SubString{S}`, the element type is
# reported as `SubString{S}` (not a nested `SubString{SubString{S}}`).
function Base.eltype(qgram::QGramIterator{SubString{S}}) where {S}
    return SubString{S}
end
|
||||||
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
|
# For any other string type `S`, the element type is reported as `SubString{S}`.
function Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString}
    return SubString{S}
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
|
||||||
|
state + qgram.q - 1 > lastindex(qgram.s) && return nothing
|
||||||
|
view(qgram.s, state:(state + qgram.q - 1)), state + 1
|
||||||
|
end
|
||||||
|
# Element type of the q-grams taken from a vector, i.e. the type of the `view`
# returned by `iterate`.
#
# The type is derived from an *empty* view rather than from `first(qgram)`:
# `first` throws an `ArgumentError` when the iterator is empty, which happens
# whenever the vector holds fewer than `q` elements. An empty range is always
# in bounds, so this also works for an empty vector.
function Base.eltype(qgram::QGramIterator{<: AbstractVector})
    start = firstindex(qgram.s)
    # Same index arithmetic as in `iterate` (index + Int - 1), so the range
    # type, and therefore the view type, matches what iteration yields.
    empty_range = start:(start + zero(qgram.q) - 1)
    return typeof(view(qgram.s, empty_range))
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Return an iterator on the q-gram of a string
|
Return an iterator over the q-grams of `s`
|
||||||
|
|
||||||
### Arguments
|
### Arguments
|
||||||
* `s::AbstractString`
|
* `s` iterator
|
||||||
* `q::Integer`: length of q-gram
|
* `q::Integer`: length of q-gram
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
@ -30,7 +38,9 @@ for x in qgrams("hello", 2)
|
||||||
end
|
end
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
qgrams(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
|
qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)
|
||||||
|
# Vectors are wrapped directly in a `QGramIterator`, without copying.
function qgrams(s::AbstractVector, q::Integer)
    return QGramIterator(s, q)
end
|
||||||
|
# Fallback for any other input: materialize `s` with `collect`, then wrap the
# collected elements in a `QGramIterator`.
function qgrams(s, q::Integer)
    elements = collect(s)
    return QGramIterator(elements, q)
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -84,7 +94,8 @@ struct QGram <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::QGram, s1, s2)
|
||||||
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
n = 0
|
n = 0
|
||||||
itr =
|
itr =
|
||||||
|
@ -110,7 +121,8 @@ struct Cosine <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::Cosine, s1, s2)
|
||||||
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
norm1, norm2, prodnorm = 0, 0, 0
|
norm1, norm2, prodnorm = 0, 0, 0
|
||||||
for (n1, n2) in itr
|
for (n1, n2) in itr
|
||||||
|
@ -136,7 +148,8 @@ struct Jaccard <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::Jaccard, s1, s2)
|
||||||
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in itr
|
for (n1, n2) in itr
|
||||||
|
@ -162,7 +175,8 @@ struct SorensenDice <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::SorensenDice, s1, s2)
|
||||||
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in itr
|
for (n1, n2) in itr
|
||||||
|
@ -188,7 +202,8 @@ struct Overlap <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::Overlap, s1, s2)
|
||||||
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in itr
|
for (n1, n2) in itr
|
||||||
|
|
|
@ -143,10 +143,12 @@ using StringDistances, Test
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# allow any iterator
|
||||||
evaluate(Jaro(), [1,2,3], [1,2,10])
|
evaluate(Jaro(), [1,2,3], [1,2,10])
|
||||||
evaluate(Levenshtein(), [1,2,3], [1,2,10])
|
evaluate(Levenshtein(), [1,2,3], [1,2,10])
|
||||||
evaluate(DamerauLevenshtein(), [1,2,3], [1,2,10])
|
evaluate(DamerauLevenshtein(), [1,2,3], [1,2,10])
|
||||||
|
evaluate(QGram(2), [1,2,3], [1,2,10])
|
||||||
|
evaluate(Overlap(2), [1,2,3], [1,2,10])
|
||||||
|
|
||||||
|
|
||||||
#= R test
|
#= R test
|
||||||
|
|
Loading…
Reference in New Issue