diff --git a/Project.toml b/Project.toml index f620db1..b5c159f 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.5.2" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" + [compat] julia = "1" Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8" diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 882f9b4..7939d30 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -17,8 +17,6 @@ include("find.jl") ## ############################################################################## -evaluate(::QGramDistance, ::Missing, ::AbstractString) = missing -evaluate(::QGramDistance, ::AbstractString, ::Missing) = missing evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing diff --git a/src/edit.jl b/src/edit.jl index eea55c2..116af1e 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -239,7 +239,7 @@ end function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer, start1::Integer, start2::Integer) - a = longest_common_substring(s1, s2, len1 , len2) + a = longest_common_pattern(s1, s2, len1 , len2) # exit if there is no common substring a[3] == 0 && return x # add the info of the common to the existing set @@ -256,11 +256,9 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2:: return x end -# Return start of commn substring in s1, start of common substring in s2, and length of substring -# Indexes refer to character number, not index -function longest_common_substring(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function longest_common_pattern(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) if len1 > len2 - start2, start1, len = longest_common_substring(s2, s1, len2, len1) + start2, start1, len = longest_common_pattern(s2, s1, len2, len1) else start1, start2, len = 0, 0, 0 p = zeros(Int, len2) diff --git a/src/qgram.jl b/src/qgram.jl index 010163c..7c1e659 100755 --- a/src/qgram.jl +++ b/src/qgram.jl @@ -1,10 +1,11 @@ - -struct QGramIterator{S <: AbstractString} - s::S # string +struct QGramIterator{S} + s::S # String or Iterator q::Int # Length of Qgram end +Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0) -function Base.iterate(qgram::QGramIterator, +# q-grams of AbstractString +function Base.iterate(qgram::QGramIterator{<: AbstractString}, state = (1, nextind(qgram.s, 0, qgram.q))) istart, iend = state iend > ncodeunits(qgram.s) && return nothing @@ -12,15 +13,22 @@ function Base.iterate(qgram::QGramIterator, nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend) element, nextstate end -Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0) Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S} -Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S} +Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S} + + + +function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s)) + state + qgram.q - 1 > lastindex(qgram.s) && return nothing + view(qgram.s, state:(state + qgram.q - 1)), state + 1 +end +Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram)) """ Return an iterator on the q-gram of a string ### Arguments -* `s::AbstractString` +* `s` iterator * `q::Integer`: length of q-gram ## Examples @@ -30,7 +38,9 @@ for x in qgrams("hello", 2) end ``` """ -qgrams(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q) +qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q) +qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q) +qgrams(s, q::Integer) = QGramIterator(collect(s), q) @@ -84,7 +94,8 @@ struct QGram <: QGramDistance q::Int end -function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString) +function evaluate(dist::QGram, s1, s2) + (ismissing(s1) | ismissing(s2)) && return missing itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) n = 0 itr = @@ -110,7 +121,8 @@ struct Cosine <: QGramDistance q::Int end -function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString) +function evaluate(dist::Cosine, s1, s2) + (ismissing(s1) | ismissing(s2)) && return missing itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) norm1, norm2, prodnorm = 0, 0, 0 for (n1, n2) in itr @@ -136,7 +148,8 @@ struct Jaccard <: QGramDistance q::Int end -function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString) +function evaluate(dist::Jaccard, s1, s2) + (ismissing(s1) | ismissing(s2)) && return missing itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) ndistinct1, ndistinct2, nintersect = 0, 0, 0 for (n1, n2) in itr @@ -162,7 +175,8 @@ struct SorensenDice <: QGramDistance q::Int end -function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString) +function evaluate(dist::SorensenDice, s1, s2) + (ismissing(s1) | ismissing(s2)) && return missing itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) ndistinct1, ndistinct2, nintersect = 0, 0, 0 for (n1, n2) in itr @@ -188,7 +202,8 @@ struct Overlap <: QGramDistance q::Int end -function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString) +function evaluate(dist::Overlap, s1, s2) + (ismissing(s1) | ismissing(s2)) && return missing itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) ndistinct1, ndistinct2, nintersect = 0, 0, 0 for (n1, n2) in itr diff --git a/test/distances.jl b/test/distances.jl index 34b1dbc..329d6da 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -143,10 +143,12 @@ using StringDistances, Test end - +# allow any iterator evaluate(Jaro(), [1,2,3], [1,2,10]) evaluate(Levenshtein(), [1,2,3], [1,2,10]) evaluate(DamerauLevenshtein(), [1,2,3], [1,2,10]) +evaluate(QGram(2), [1,2,3], [1,2,10]) +evaluate(Overlap(2), [1,2,3], [1,2,10]) #= R test