clean

2020-02-24 09:41:38 -05:00 · 2020-02-24 09:41:38 -05:00 · 6f22f2c9f5
parent afafe93bf6
commit 6f22f2c9f5
3 changed files with 27 additions and 28 deletions
--- a/src/edit.jl
+++ b/src/edit.jl
@ -27,6 +27,7 @@ function (dist::Jaro)(s1, s2)
    ch1_match = Vector{eltype(s1)}()
    for (i1, ch1) in enumerate(s1)
        for (i2, ch2) in enumerate(s2)
+            # greedy alignment
            if (i2 <= i1 + maxdist) && (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2] 
                flag[i2] = true
                push!(ch1_match, ch1)
@ -191,16 +192,16 @@ function matching_blocks(s1, s2)
 end

 function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)
-    a = longest_common_pattern(s1, s2)
+    n1, n2, len = longest_common_pattern(s1, s2)
    # exit if there is no common substring
-    a[3] == 0 && return x
+    len == 0 && return x
    # add the info of the common to the existing set
-    push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
+    push!(x, (n1 + start1 - 1, n2 + start2 - 1, len))
    # add the longest common substring that happens before
-    matching_blocks!(x, _take(s1, a[1] - 1), _take(s2, a[2] - 1), start1, start2)
+    matching_blocks!(x, _take(s1, n1 - 1), _take(s2, n2 - 1), start1, start2)
    # add the longest common substring that happens after
-    matching_blocks!(x, _drop(s1, a[1] + a[3] - 1), _drop(s2, a[2] + a[3] - 1), 
-                    start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
+    matching_blocks!(x, _drop(s1, n1 + len - 1), _drop(s2, n2 + len - 1), 
+                    start1 + n1 + len - 1, start2 + n2 + len - 1)
    return x
 end

--- a/src/normalize.jl
+++ b/src/normalize.jl
@ -237,14 +237,15 @@ function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0
    unbase_scale = 0.95
    # if one string is much shorter than the other, use partial
    if length(s2) >= 1.5 * length(s1)
+        partial_dist = Partial(dist.dist)
        partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
-        score_partial = 1 - partial_scale * (1 - Partial(dist.dist)(s1, s2, 1 - (1 - max_dist) / partial_scale))
+        score_partial = 1 - partial_scale * (1 - partial_dist(s1, s2, 1 - (1 - max_dist) / partial_scale))
        min_score = min(max_dist, score_partial)
        score_sort = 1 - unbase_scale * partial_scale * 
-                (1 - TokenSort(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
+                (1 - TokenSort(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
        max_dist = min(max_dist, score_sort)
        score_set = 1 - unbase_scale * partial_scale * 
-                (1 - TokenSet(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale))) 
+                (1 - TokenSet(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale))) 
        return min(score, score_partial, score_sort, score_set)
    else
        score_sort = 1 - unbase_scale * 
--- a/src/qgram.jl
+++ b/src/qgram.jl
@ -15,6 +15,7 @@ function Base.iterate(qgram::QGramIterator{<: AbstractString},
 end
 Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
 Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
+qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)


 #q-grams of AbstractVector
@ -25,9 +26,11 @@ function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstinde
 	view(qgram.s, state:(state + qgram.q - 1)), state + 1
 end
 Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
+qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q)
+qgrams(s, q::Integer) = QGramIterator(collect(s), q)


-"""
+@doc """
 Return an iterator on the q-gram of a string

 ### Arguments
@ -40,15 +43,14 @@ for x in qgrams("hello", 2)
 	println(x)
 end
 ```
-"""
-qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s, q)
-qgrams(s, q::Integer) = QGramIterator(collect(s), q)
+""" 
+qgrams


-# For two iterators x1 and x2, that define a length and eltype method,
-# this returns a dictionary which, for each element in x1 or x2, 
-# returns a tuple with the numbers of times it appears in x1 and x2
-function count_map(s1, s2)
+# For two iterators s1 and s2 that define length and eltype methods,
+# this returns an iterator that, for each element in s1 ∪ s2, yields the tuple
+# (number of times it appears in s1, number of times it appears in s2)
+function _count(s1, s2)
 	K = promote_type(eltype(s1), eltype(s2))
 	d = Dict{K, Tuple{Int, Int}}()
 	sizehint!(d, length(s1) + length(s2))
@ -74,7 +76,7 @@ function count_map(s1, s2)
 			@inbounds Base._setindex!(d, (0, 1), x2, -index)
 		end
 	end
-	return d
+	return values(d)
 end


@ -98,9 +100,8 @@ end

 function (dist::QGram)(s1, s2)
 	((s1 === missing) | (s2 === missing)) && return missing
-	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	n = 0
-	for (n1, n2) in itr
+	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
 		n += abs(n1 - n2)
 	end
 	n
@ -124,9 +125,8 @@ end

 function (dist::Cosine)(s1, s2)
 	((s1 === missing) | (s2 === missing)) && return missing
-	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	norm1, norm2, prodnorm = 0, 0, 0
-	for (n1, n2) in itr
+	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
 		norm1 += n1^2
 		norm2 += n2^2
 		prodnorm += n1 * n2
@ -151,9 +151,8 @@ end

 function (dist::Jaccard)(s1, s2)
 	((s1 === missing) | (s2 === missing)) && return missing
-	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
-	for (n1, n2) in itr
+	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
 		ndistinct1 += n1 > 0
 		ndistinct2 += n2 > 0
 		nintersect += (n1 > 0) & (n2 > 0)
@ -178,9 +177,8 @@ end

 function (dist::SorensenDice)(s1, s2)
 	((s1 === missing) | (s2 === missing)) && return missing
-	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
-	for (n1, n2) in itr
+	for (n1, n2) in  _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
 		ndistinct1 += n1 > 0
 		ndistinct2 += n2 > 0
 		nintersect += (n1 > 0) & (n2 > 0)
@ -205,9 +203,8 @@ end

 function (dist::Overlap)(s1, s2)
 	((s1 === missing) | (s2 === missing)) && return missing
-	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
-	for (n1, n2) in itr
+	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
 		ndistinct1 += n1 > 0
 		ndistinct2 += n2 > 0
 		nintersect += (n1 > 0) & (n2 > 0)