From 6f22f2c9f5e07c145883637f206837c4498cbb6e Mon Sep 17 00:00:00 2001
From: matthieugomez <gomez.matthieu@gmail.com>
Date: Mon, 24 Feb 2020 09:41:38 -0500
Subject: [PATCH] clean

---
 src/edit.jl      | 13 +++++++------
 src/normalize.jl |  7 ++++---
 src/qgram.jl     | 35 ++++++++++++++++-------------------
 3 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/src/edit.jl b/src/edit.jl
index c03c9ed..8bf98dc 100755
--- a/src/edit.jl
+++ b/src/edit.jl
@@ -27,6 +27,7 @@ function (dist::Jaro)(s1, s2)
     ch1_match = Vector{eltype(s1)}()
     for (i1, ch1) in enumerate(s1)
         for (i2, ch2) in enumerate(s2)
+            # greedy alignment
             if (i2 <= i1 + maxdist) && (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2] 
                 flag[i2] = true
                 push!(ch1_match, ch1)
@@ -191,16 +192,16 @@ function matching_blocks(s1, s2)
 end
 
 function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)
-    a = longest_common_pattern(s1, s2)
+    n1, n2, len = longest_common_pattern(s1, s2)
     # exit if there is no common substring
-    a[3] == 0 && return x
+    len == 0 && return x
     # add the info of the common to the existing set
-    push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
+    push!(x, (n1 + start1 - 1, n2 + start2 - 1, len))
     # add the longest common substring that happens before
-    matching_blocks!(x, _take(s1, a[1] - 1), _take(s2, a[2] - 1), start1, start2)
+    matching_blocks!(x, _take(s1, n1 - 1), _take(s2, n2 - 1), start1, start2)
     # add the longest common substring that happens after
-    matching_blocks!(x, _drop(s1, a[1] + a[3] - 1), _drop(s2, a[2] + a[3] - 1), 
-                    start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
+    matching_blocks!(x, _drop(s1, n1 + len - 1), _drop(s2, n2 + len - 1), 
+                    start1 + n1 + len - 1, start2 + n2 + len - 1)
     return x
 end
 
diff --git a/src/normalize.jl b/src/normalize.jl
index 8af841f..6fe927f 100755
--- a/src/normalize.jl
+++ b/src/normalize.jl
@@ -237,14 +237,15 @@ function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0
     unbase_scale = 0.95
     # if one string is much shorter than the other, use partial
     if length(s2) >= 1.5 * length(s1)
+        partial_dist = Partial(dist.dist)
         partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
-        score_partial = 1 - partial_scale * (1 - Partial(dist.dist)(s1, s2, 1 - (1 - max_dist) / partial_scale))
+        score_partial = 1 - partial_scale * (1 - partial_dist(s1, s2, 1 - (1 - max_dist) / partial_scale))
         min_score = min(max_dist, score_partial)
         score_sort = 1 - unbase_scale * partial_scale * 
-                (1 - TokenSort(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
+                (1 - TokenSort(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
         max_dist = min(max_dist, score_sort)
         score_set = 1 - unbase_scale * partial_scale * 
-                (1 - TokenSet(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale))) 
+                (1 - TokenSet(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale))) 
         return min(score, score_partial, score_sort, score_set)
     else
         score_sort = 1 - unbase_scale * 
diff --git a/src/qgram.jl b/src/qgram.jl
index f3587a7..804e2e8 100755
--- a/src/qgram.jl
+++ b/src/qgram.jl
@@ -15,6 +15,7 @@ function Base.iterate(qgram::QGramIterator{<: AbstractString},
 end
 Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
 Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
+qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)
 
 
 #q-grams of AbstractVector
@@ -25,9 +26,11 @@ function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstinde
 	view(qgram.s, state:(state + qgram.q - 1)), state + 1
 end
 Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
+qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q)
+qgrams(s, q::Integer) = QGramIterator(collect(s), q)
 
 
-"""
+@doc """
 Return an iterator on the q-gram of a string
 
 ### Arguments
@@ -40,15 +43,14 @@ for x in qgrams("hello", 2)
 	println(x)
 end
 ```
-"""
-qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s, q)
-qgrams(s, q::Integer) = QGramIterator(collect(s), q)
+""" 
+qgrams
 
 
-# For two iterators x1 and x2, that define a length and eltype method,
-# this returns a dictionary which, for each element in x1 or x2, 
-# returns a tuple with the numbers of times it appears in x1 and x2
-function count_map(s1, s2)
+# For two iterators s1 and s2 that each define length and eltype methods,
+# this returns an iterator that, for each element in s1 ∪ s2, yields the tuple
+# (number of times it appears in s1, number of times it appears in s2)
+function _count(s1, s2)
 	K = promote_type(eltype(s1), eltype(s2))
 	d = Dict{K, Tuple{Int, Int}}()
 	sizehint!(d, length(s1) + length(s2))
@@ -74,7 +76,7 @@ function count_map(s1, s2)
 			@inbounds Base._setindex!(d, (0, 1), x2, -index)
 		end
 	end
-	return d
+	return values(d)
 end
 
 
@@ -98,9 +100,8 @@ end
 
 function (dist::QGram)(s1, s2)
 	((s1 === missing) | (s2 === missing)) && return missing
-	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	n = 0
-	for (n1, n2) in itr
+	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
 		n += abs(n1 - n2)
 	end
 	n
@@ -124,9 +125,8 @@ end
 
 function (dist::Cosine)(s1, s2)
 	((s1 === missing) | (s2 === missing)) && return missing
-	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	norm1, norm2, prodnorm = 0, 0, 0
-	for (n1, n2) in itr
+	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
 		norm1 += n1^2
 		norm2 += n2^2
 		prodnorm += n1 * n2
@@ -151,9 +151,8 @@ end
 
 function (dist::Jaccard)(s1, s2)
 	((s1 === missing) | (s2 === missing)) && return missing
-	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
-	for (n1, n2) in itr
+	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
 		ndistinct1 += n1 > 0
 		ndistinct2 += n2 > 0
 		nintersect += (n1 > 0) & (n2 > 0)
@@ -178,9 +177,8 @@ end
 
 function (dist::SorensenDice)(s1, s2)
 	((s1 === missing) | (s2 === missing)) && return missing
-	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
-	for (n1, n2) in itr
+	for (n1, n2) in  _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
 		ndistinct1 += n1 > 0
 		ndistinct2 += n2 > 0
 		nintersect += (n1 > 0) & (n2 > 0)
@@ -205,9 +203,8 @@ end
 
 function (dist::Overlap)(s1, s2)
 	((s1 === missing) | (s2 === missing)) && return missing
-	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
-	for (n1, n2) in itr
+	for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
 		ndistinct1 += n1 > 0
 		ndistinct2 += n2 > 0
 		nintersect += (n1 > 0) & (n2 > 0)