From 6f22f2c9f5e07c145883637f206837c4498cbb6e Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Mon, 24 Feb 2020 09:41:38 -0500 Subject: [PATCH] clean --- src/edit.jl | 13 +++++++------ src/normalize.jl | 7 ++++--- src/qgram.jl | 35 ++++++++++++++++------------------- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/edit.jl b/src/edit.jl index c03c9ed..8bf98dc 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -27,6 +27,7 @@ function (dist::Jaro)(s1, s2) ch1_match = Vector{eltype(s1)}() for (i1, ch1) in enumerate(s1) for (i2, ch2) in enumerate(s2) + # greedy alignment if (i2 <= i1 + maxdist) && (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2] flag[i2] = true push!(ch1_match, ch1) @@ -191,16 +192,16 @@ function matching_blocks(s1, s2) end function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer) - a = longest_common_pattern(s1, s2) + n1, n2, len = longest_common_pattern(s1, s2) # exit if there is no common substring - a[3] == 0 && return x + len == 0 && return x # add the info of the common to the existing set - push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3])) + push!(x, (n1 + start1 - 1, n2 + start2 - 1, len)) # add the longest common substring that happens before - matching_blocks!(x, _take(s1, a[1] - 1), _take(s2, a[2] - 1), start1, start2) + matching_blocks!(x, _take(s1, n1 - 1), _take(s2, n2 - 1), start1, start2) # add the longest common substring that happens after - matching_blocks!(x, _drop(s1, a[1] + a[3] - 1), _drop(s2, a[2] + a[3] - 1), - start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1) + matching_blocks!(x, _drop(s1, n1 + len - 1), _drop(s2, n2 + len - 1), + start1 + n1 + len - 1, start2 + n2 + len - 1) return x end diff --git a/src/normalize.jl b/src/normalize.jl index 8af841f..6fe927f 100755 --- a/src/normalize.jl +++ b/src/normalize.jl @@ -237,14 +237,15 @@ function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0 unbase_scale = 0.95 # if one string is much 
shorter than the other, use partial if length(s2) >= 1.5 * length(s1) + partial_dist = Partial(dist.dist) partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9 - score_partial = 1 - partial_scale * (1 - Partial(dist.dist)(s1, s2, 1 - (1 - max_dist) / partial_scale)) + score_partial = 1 - partial_scale * (1 - partial_dist(s1, s2, 1 - (1 - max_dist) / partial_scale)) min_score = min(max_dist, score_partial) score_sort = 1 - unbase_scale * partial_scale * - (1 - TokenSort(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale))) + (1 - TokenSort(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale))) max_dist = min(max_dist, score_sort) score_set = 1 - unbase_scale * partial_scale * - (1 - TokenSet(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale))) + (1 - TokenSet(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale))) return min(score, score_partial, score_sort, score_set) else score_sort = 1 - unbase_scale * diff --git a/src/qgram.jl b/src/qgram.jl index f3587a7..804e2e8 100755 --- a/src/qgram.jl +++ b/src/qgram.jl @@ -15,6 +15,7 @@ function Base.iterate(qgram::QGramIterator{<: AbstractString}, end Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S} Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S} +qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q) #q-grams of AbstractVector @@ -25,9 +26,11 @@ function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstinde view(qgram.s, state:(state + qgram.q - 1)), state + 1 end Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram)) +qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q) +qgrams(s, q::Integer) = QGramIterator(collect(s), q) -""" +@doc """ Return an iterator on the q-gram of a string ### Arguments @@ -40,15 +43,14 @@ for x in qgrams("hello", 2) println(x) end ``` -""" -qgrams(s::Union{AbstractString, AbstractVector}, 
q::Integer) = QGramIterator(s, q) -qgrams(s, q::Integer) = QGramIterator(collect(s), q) +""" +qgrams -# For two iterators x1 and x2, that define a length and eltype method, -# this returns a dictionary which, for each element in x1 or x2, -# returns a tuple with the numbers of times it appears in x1 and x2 -function count_map(s1, s2) +# For two iterators s1 and s2, that define a length and eltype method, +# this returns an iterator that, +# for each element in s1 ∪ s2, returns (number of times it appears in s1, number of times it appears in s2) +function _count(s1, s2) K = promote_type(eltype(s1), eltype(s2)) d = Dict{K, Tuple{Int, Int}}() sizehint!(d, length(s1) + length(s2)) @@ -74,7 +76,7 @@ function count_map(s1, s2) @inbounds Base._setindex!(d, (0, 1), x2, -index) end end - return d + return values(d) end @@ -98,9 +100,8 @@ end function (dist::QGram)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing - itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) n = 0 - for (n1, n2) in itr + for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) n += abs(n1 - n2) end n @@ -124,9 +125,8 @@ end function (dist::Cosine)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing - itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) norm1, norm2, prodnorm = 0, 0, 0 - for (n1, n2) in itr + for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) norm1 += n1^2 norm2 += n2^2 prodnorm += n1 * n2 @@ -151,9 +151,8 @@ end function (dist::Jaccard)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing - itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) ndistinct1, ndistinct2, nintersect = 0, 0, 0 - for (n1, n2) in itr + for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) ndistinct1 += n1 > 0 ndistinct2 += n2 > 0 nintersect += (n1 > 0) & (n2 > 0) @@ -178,9 +177,8 @@ end function (dist::SorensenDice)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing - itr = 
values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) ndistinct1, ndistinct2, nintersect = 0, 0, 0 - for (n1, n2) in itr + for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) ndistinct1 += n1 > 0 ndistinct2 += n2 > 0 nintersect += (n1 > 0) & (n2 > 0) @@ -205,9 +203,8 @@ end function (dist::Overlap)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing - itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q))) ndistinct1, ndistinct2, nintersect = 0, 0, 0 - for (n1, n2) in itr + for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) ndistinct1 += n1 > 0 ndistinct2 += n2 > 0 nintersect += (n1 > 0) & (n2 > 0)