allow any iterator in qgram distances
parent
6ef1cfc8b2
commit
e1b8aa6500
|
@ -5,6 +5,7 @@ version = "0.5.2"
|
|||
[deps]
|
||||
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
||||
|
||||
|
||||
[compat]
|
||||
julia = "1"
|
||||
Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
|
||||
|
|
|
@ -17,8 +17,6 @@ include("find.jl")
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
evaluate(::QGramDistance, ::Missing, ::AbstractString) = missing
|
||||
evaluate(::QGramDistance, ::AbstractString, ::Missing) = missing
|
||||
|
||||
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
|
||||
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
|
||||
|
|
|
@ -239,7 +239,7 @@ end
|
|||
|
||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString,
|
||||
len1::Integer, len2::Integer, start1::Integer, start2::Integer)
|
||||
a = longest_common_substring(s1, s2, len1 , len2)
|
||||
a = longest_common_pattern(s1, s2, len1 , len2)
|
||||
# exit if there is no common substring
|
||||
a[3] == 0 && return x
|
||||
# add the info of the common to the existing set
|
||||
|
@ -256,11 +256,9 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
|
|||
return x
|
||||
end
|
||||
|
||||
# Return start of common substring in s1, start of common substring in s2, and length of substring
|
||||
# Indexes refer to character number, not index
|
||||
function longest_common_substring(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function longest_common_pattern(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
if len1 > len2
|
||||
start2, start1, len = longest_common_substring(s2, s1, len2, len1)
|
||||
start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
|
||||
else
|
||||
start1, start2, len = 0, 0, 0
|
||||
p = zeros(Int, len2)
|
||||
|
|
41
src/qgram.jl
41
src/qgram.jl
|
@ -1,10 +1,11 @@
|
|||
|
||||
struct QGramIterator{S <: AbstractString}
|
||||
s::S # string
|
||||
struct QGramIterator{S}
|
||||
s::S # String or Iterator
|
||||
q::Int # Length of Qgram
|
||||
end
|
||||
Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0)
|
||||
|
||||
function Base.iterate(qgram::QGramIterator,
|
||||
# q-grams of AbstractString
|
||||
function Base.iterate(qgram::QGramIterator{<: AbstractString},
|
||||
state = (1, nextind(qgram.s, 0, qgram.q)))
|
||||
istart, iend = state
|
||||
iend > ncodeunits(qgram.s) && return nothing
|
||||
|
@ -12,15 +13,22 @@ function Base.iterate(qgram::QGramIterator,
|
|||
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
||||
element, nextstate
|
||||
end
|
||||
# Number of q-grams produced: zero when the input holds fewer than `q` elements.
function Base.length(qgram::QGramIterator)
    n = length(qgram.s) - qgram.q + 1
    return max(n, 0)
end
|
||||
Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
|
||||
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
|
||||
# Element type reported for q-grams taken from a string of type `S`.
function Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString}
    return SubString{S}
end
|
||||
|
||||
|
||||
|
||||
# q-grams of an AbstractVector: each element is a view over `q` consecutive entries,
# and the state is the index at which the next q-gram starts.
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
    stop = state + qgram.q - 1
    if stop > lastindex(qgram.s)
        return nothing
    end
    return (view(qgram.s, state:stop), state + 1)
end
|
||||
# Element type of the q-grams of an AbstractVector, i.e. the type of the views built by
# `iterate`. The type is taken from an empty view rather than from `first(qgram)`, because
# `first` throws an ArgumentError when the iterator is empty (fewer than `q` elements).
function Base.eltype(qgram::QGramIterator{<: AbstractVector})
    i = firstindex(qgram.s)
    # An empty range is always in bounds, so this also works for an empty vector.
    return typeof(view(qgram.s, i:(i - 1)))
end
|
||||
|
||||
"""
|
||||
Return an iterator over the q-grams of a string or of any other iterator
|
||||
|
||||
### Arguments
|
||||
* `s::AbstractString`
|
||||
* `s` iterator
|
||||
* `q::Integer`: length of q-gram
|
||||
|
||||
## Examples
|
||||
|
@ -30,7 +38,9 @@ for x in qgrams("hello", 2)
|
|||
end
|
||||
```
|
||||
"""
|
||||
qgrams(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
|
||||
# `Int(q)` keeps every `Integer` accepted: the default constructor requires `q::Int` exactly.
qgrams(s::AbstractString, q::Integer) = QGramIterator(s, Int(q))
|
||||
# `Int(q)` keeps every `Integer` accepted: the default constructor requires `q::Int` exactly.
qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, Int(q))
|
||||
# Generic iterators are materialized with `collect` first.
# `Int(q)` keeps every `Integer` accepted: the default constructor requires `q::Int` exactly.
qgrams(s, q::Integer) = QGramIterator(collect(s), Int(q))
|
||||
|
||||
|
||||
|
||||
|
@ -84,7 +94,8 @@ struct QGram <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString)
|
||||
function evaluate(dist::QGram, s1, s2)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
n = 0
|
||||
itr =
|
||||
|
@ -110,7 +121,8 @@ struct Cosine <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
|
||||
function evaluate(dist::Cosine, s1, s2)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
norm1, norm2, prodnorm = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
|
@ -136,7 +148,8 @@ struct Jaccard <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
|
||||
function evaluate(dist::Jaccard, s1, s2)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
|
@ -162,7 +175,8 @@ struct SorensenDice <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString)
|
||||
function evaluate(dist::SorensenDice, s1, s2)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
|
@ -188,7 +202,8 @@ struct Overlap <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString)
|
||||
function evaluate(dist::Overlap, s1, s2)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
|
|
|
@ -143,10 +143,12 @@ using StringDistances, Test
|
|||
end
|
||||
|
||||
|
||||
|
||||
# allow any iterator
|
||||
evaluate(Jaro(), [1,2,3], [1,2,10])
|
||||
evaluate(Levenshtein(), [1,2,3], [1,2,10])
|
||||
evaluate(DamerauLevenshtein(), [1,2,3], [1,2,10])
|
||||
evaluate(QGram(2), [1,2,3], [1,2,10])
|
||||
evaluate(Overlap(2), [1,2,3], [1,2,10])
|
||||
|
||||
|
||||
#= R test
|
||||
|
|
Loading…
Reference in New Issue