correct partialsort

2019-12-18 10:17:08 -05:00 · 2019-12-18 10:17:08 -05:00 · 3b9493f8a9
parent 3cb9576ab4
commit 3b9493f8a9
3 changed files with 47 additions and 64 deletions
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -3,11 +3,7 @@ module StringDistances
 using Distances
 import Distances: evaluate, result_type
-##############################################################################
+
 ##
 ## include
 ##
 ##############################################################################
 abstract type StringDistance <: SemiMetric end
 include("utils.jl")
 include("edit.jl")
@ -58,12 +54,12 @@ TokenMax,
 evaluate,
 compare,
 result_type,
-qgram
+qgrams
 end
 ##############################################################################
 ##
-## Some memo about Strings
+## Some things about Strings
 # length: number of characters
 # ncodeunits: Return the number of code units in a string (akin to the index of a vector).
--- a/src/compare.jl
+++ b/src/compare.jl
@ -2,7 +2,7 @@
    compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
 return a similarity score between 0 and 1 for the strings `s1` and 
-`s2` based on the `StringDistance` `dist`
+`s2` based on the string distance `dist`.
 ### Examples
 ```julia-repl
@ -20,14 +20,9 @@ function compare(s1::AbstractString, s2::AbstractString,
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    len2 == 0 && return 1.0
    if min_score == 0.0
        return 1.0 - evaluate(dist, s1, s2) / len2
    else
    d = evaluate(dist, s1, s2; max_dist = ceil(Int, len2 * (1 - min_score)))
    out = 1.0 - d / len2
-        out < min_score && return 0.0
+    out < min_score ? 0.0 : out
        return out
    end
 end
 function compare(s1::AbstractString, s2::AbstractString, 
@ -102,7 +97,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_scor
    len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
    len1 == 0 && return 1.0
    out = 0.0
-    for x in qgram(s2, len1)
+    for x in qgrams(s2, len1)
        curr = compare(s1, x, dist.dist; min_score = min_score)
        out = max(out, curr)
        min_score = max(out, min_score)
@ -169,7 +164,7 @@ end
 Creates the `TokenSet{dist}` distance
 `TokenSet{dist}` modifies the string distance `dist` to adjust for differences 
-in  word orders and word numbers, by comparing the intersection of two strings with each string.
+in word orders and word numbers by comparing the intersection of two strings with each string.
 ### Examples
 ```julia-repl
@ -192,12 +187,12 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_sco
    s1 = join(v1, " ")
    s2 = join(v2, " ")
    isempty(s0) && return compare(s1, s2, dist.dist; min_score = min_score)
-    dist0 = compare(s0, s1, dist.dist; min_score = min_score)
+    score_01 = compare(s0, s1, dist.dist; min_score = min_score)
-    min_score = max(min_score, dist0)
+    min_score = max(min_score, score_01)
-    dist1 = compare(s0, s2, dist.dist; min_score = min_score)
+    score_02 = compare(s0, s2, dist.dist; min_score = min_score)
-    min_score = max(min_score, dist1)
+    min_score = max(min_score, score_02)
-    dist2 = compare(s0, s2, dist.dist; min_score = min_score)
+    score_12 = compare(s1, s2, dist.dist; min_score = min_score)
-    max(dist0, dist1, dist2)
+    max(score_01, score_02, score_12)
 end
@ -225,31 +220,31 @@ end
 function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0)
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
-    dist0 = compare(s1, s2, dist.dist; min_score = min_score)
+    score = compare(s1, s2, dist.dist; min_score = min_score)
-    min_score = max(min_score, dist0)
+    min_score = max(min_score, score)
    unbase_scale = 0.95
    # if one string is much shorter than the other, use partial
    if length(s2) >= 1.5 * length(s1)
        partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
-        dist1 = partial_scale * compare(s1, s2, Partial(dist.dist); 
+        score_partial = partial_scale * compare(s1, s2, Partial(dist.dist); 
                                        min_score = min_score / partial_scale) 
-        min_score = max(min_score, dist1)
+        min_score = max(min_score, score_partial)
-        dist2 = unbase_scale * partial_scale * 
+        score_sort = unbase_scale * partial_scale * 
                compare(s1, s2, TokenSort(Partial(dist.dist)); 
                            min_score = min_score / (unbase_scale * partial_scale))
-        min_score = max(min_score, dist2)
+        min_score = max(min_score, score_sort)
-        dist3 = unbase_scale * partial_scale * 
+        score_set = unbase_scale * partial_scale * 
                compare(s1, s2, TokenSet(Partial(dist.dist)); 
                            min_score = min_score / (unbase_scale * partial_scale)) 
-        return max(dist0, dist1, dist2, dist3)
+        return max(score, score_partial, score_sort, score_set)
    else
-        dist1 = unbase_scale * 
+        score_sort = unbase_scale * 
                compare(s1, s2, TokenSort(dist.dist); 
                            min_score = min_score / unbase_scale)
-        min_score = max(min_score, dist1)
+        min_score = max(min_score, score_sort)
-        dist2 = unbase_scale * 
+        score_set = unbase_scale * 
                compare(s1, s2, TokenSet(dist.dist); 
                            min_score = min_score / unbase_scale) 
-        return max(dist0, dist1, dist2)
+        return max(score, score_sort, score_set)
    end
 end
--- a/src/qgram.jl
+++ b/src/qgram.jl
@ -1,9 +1,4 @@
 ##############################################################################
 ##
 ## Define a type that iterates through q-grams of a string
 ##
 ############################################################################
 struct QGramIterator{S <: AbstractString}
 	s::S   # string
 	q::Int # Length of Qgram
@ -22,43 +17,34 @@ Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
 Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
 """
-Return an iterator that iterates on the QGram of the string
+Return an iterator over the q-grams of a string
 ### Arguments
 * `s::AbstractString`
-* `q::Integer`: length of qgram
+* `q::Integer`: length of q-gram
 ## Examples
 ```julia
-using StringDistances
+for x in qgrams("hello", 2)
 for x in qgram("hello", 2)
 	println(x)
 end
 ```
 """
-qgram(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
+qgrams(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
 ##############################################################################
 ##
 ## Distance on strings is computed by set distance on qgram sets
 ##
 ##############################################################################
 abstract type QGramDistance <: StringDistance end
-function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
+# For two iterators x1 and x2, that define a length and eltype method,
-	x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
+# this returns a dictionary which, for each element in x1 or x2, 
 	evaluate(dist, values(x))
 end
 # For two iterators x1 and x2, this returns a dictionary which, for each element in x1 or x2, 
 # returns a tuple with the numbers of times it appears in x1 and x2
 function count_map(s1, s2)
 	K = promote_type(eltype(s1), eltype(s2))
 	d = Dict{K, Tuple{Int, Int}}()
 	sizehint!(d, length(s1) + length(s2))
 	# I use a faster way to change a dictionary key
 	# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
 	sizehint!(d, length(s1) + length(s2))
 	for x1 in s1
 		index = Base.ht_keyindex2!(d, x1)
 		if index > 0
@ -98,8 +84,10 @@ struct QGram <: QGramDistance
 	q::Int
 end
-function evaluate(dist::QGram, itr)
+function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString)
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	n = 0
 	itr = 
 	for (n1, n2) in itr
 		n += abs(n1 - n2)
 	end
@ -122,7 +110,8 @@ struct Cosine <: QGramDistance
 	q::Int
 end
-function evaluate(dist::Cosine, itr)
+function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	norm1, norm2, prodnorm = 0, 0, 0
 	for (n1, n2) in itr
 		norm1 += n1^2
@ -147,7 +136,8 @@ struct Jaccard <: QGramDistance
 	q::Int
 end
-function evaluate(dist::Jaccard, itr)
+function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
 	for (n1, n2) in itr
 		ndistinct1 += n1 > 0
@ -172,7 +162,8 @@ struct SorensenDice <: QGramDistance
 	q::Int
 end
-function evaluate(dist::SorensenDice, itr)
+function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString)
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
 	for (n1, n2) in itr
 		ndistinct1 += n1 > 0
@ -197,7 +188,8 @@ struct Overlap <: QGramDistance
 	q::Int
 end
-function evaluate(dist::Overlap, itr)
+function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString)
 	itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
 	for (n1, n2) in itr
 		ndistinct1 += n1 > 0