remove trie

2019-12-12 18:55:41 -05:00 · 2019-12-12 18:55:41 -05:00 · 8be5a00e3d
parent f44ab13fef
commit 8be5a00e3d
5 changed files with 79 additions and 185 deletions
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -1,7 +1,5 @@
 module StringDistances

-
-
 using Distances
 import Distances: evaluate, result_type
 using DataStructures  # for SortedSet in TokenSort
@ -69,11 +67,14 @@ end
 ## Some memo about Strings

 # length: number of characters
-# ncodeunits: Return the number of code units in a string (aking to index of vector). Not all such indices  are valid – they may not be the start of a character,.
-# sizeof:  Size, in bytes, of the string str. Equal to the number of code units in str  multiplied by the size, in bytes, of one code unit in str.
+# ncodeunits: Return the number of code units in a string (akin to an index into a vector).
+# Not all such indices are valid – they may not be the start of a character.
+# sizeof:  Size, in bytes, of the string str. Equal to the number of code units in str  
+# multiplied by the size, in bytes, of one code unit in str.

 # lastindex: Return the last index of a collection
 # nextind(s, i): return the index of the start of the character whose encoding starts after index i
-# nextind(s, 0, N): return the index of the Nth character of s (or, if there are less than N characters, return ncodeunits(str) + (N - length(s))
+# nextind(s, 0, N): return the index of the Nth character of s (or, if there are
+# fewer than N characters, return ncodeunits(s) + (N - length(s)))

 ##############################################################################
--- a/src/compare.jl
+++ b/src/compare.jl
@ -1,21 +1,16 @@
-
-##############################################################################
-##
-## compare
-## compare always return a value between 0 and 1.
-##
-##############################################################################
 """
    compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)

-compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
+compare returns a similarity score between 0 and 1 for the strings `s1` and 
+`s2` based on the distance `dist`
 """
-
-function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
+function compare(s1::AbstractString, s2::AbstractString, 
+    dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
    1.0 - evaluate(dist, s1, s2)
 end

-function compare(s1::AbstractString, s2::AbstractString,  dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0)
+function compare(s1::AbstractString, s2::AbstractString,  
+    dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0)
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    len2 == 0 && return 1.0
@ -29,7 +24,8 @@ function compare(s1::AbstractString, s2::AbstractString,  dist::Union{Levenshtei
    end
 end

-function compare(s1::AbstractString, s2::AbstractString, dist::QGramDistance; min_score = 0.0)
+function compare(s1::AbstractString, s2::AbstractString, 
+    dist::QGramDistance; min_score = 0.0)
    # When string length < q for qgram distance, returns s1 == s2
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
@ -41,15 +37,12 @@ function compare(s1::AbstractString, s2::AbstractString, dist::QGramDistance; mi
    end
 end

-##############################################################################
-##
-## Winkler
-##
-##############################################################################
 """
   Winkler(dist::StringDistance, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4)

-Winkler is a `StringDistance` modifier that boosts the similarity score between two strings by a scale `p` when the strings share a common prefix with lenth lower than `l` (the boost is only applied the similarity score above `boosting_threshold`)
+Winkler is a `StringDistance` modifier that boosts the similarity score between
+two strings by a scale `p` when the strings share a common prefix with length lower
+than `l` (the boost is only applied when the similarity score is above `boosting_threshold`)
 """
 struct Winkler{T1 <: StringDistance, T2 <: Real, T3 <: Real, T4 <: Integer} <: StringDistance
    dist::T1
@ -76,16 +69,12 @@ end

 JaroWinkler() = Winkler(Jaro(), 0.1, 0.7)

-##############################################################################
-##
-## Partial
-## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
-##
-##############################################################################
+
 """
   Partial(dist::StringDistance)

-Partial is a `StringDistance` modifier that returns the maximal similarity score between the shorter string and substrings of the longer string
+Partial is a `StringDistance` modifier that returns the maximal similarity score 
+between the shorter string and substrings of the longer string
 """
 struct Partial{T <: StringDistance} <: StringDistance
    dist::T
@ -129,42 +118,35 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffO
    return out
 end

-##############################################################################
-##
-## TokenSort
-## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
-##
-##############################################################################
 """
   TokenSort(dist::StringDistance)

-TokenSort is a `StringDistance` modifier that adjusts for differences in word orders by reording words alphabetically.
+TokenSort is a `StringDistance` modifier that adjusts for differences in word orders
+by reordering words alphabetically.
 """
 struct TokenSort{T <: StringDistance} <: StringDistance
    dist::T
 end

+# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
 function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort; min_score = 0.0)
    s1 = join(sort!(split(s1)), " ")
    s2 = join(sort!(split(s2)), " ")
    compare(s1, s2, dist.dist; min_score = min_score)
 end

-##############################################################################
-##
-## TokenSet
-## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
-##
-##############################################################################
+
 """
   TokenSet(dist::StringDistance)

-TokenSort is a `StringDistance` modifier that adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string.
+TokenSet is a `StringDistance` modifier that adjusts for differences in word orders
+and word numbers by comparing the intersection of two strings with each string.
 """
 struct TokenSet{T <: StringDistance} <: StringDistance
    dist::T
 end

+# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
 function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = 0.0)
    v1 = SortedSet(split(s1))
    v2 = SortedSet(split(s2))
@ -182,15 +164,12 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_sco
 end


-##############################################################################
-##
-## TokenMax
-##
-##############################################################################
 """
   TokenMax(dist::StringDistance)

-TokenSort is a `StringDistance` modifier that combines similarlity scores using the base distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on string lengths.
+TokenMax is a `StringDistance` modifier that combines similarity scores using the base
+distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on
+string lengths.
 """
 struct TokenMax{T <: StringDistance} <: StringDistance
    dist::T
--- a/src/edit.jl
+++ b/src/edit.jl
@ -1,9 +1,3 @@
-
-##############################################################################
-##
-## Jaro
-##
-##############################################################################
 """
    Jaro()

@ -23,7 +17,8 @@ struct Jaro <: StringDistance end
 function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
-    # if both are empty, m = 0 so should be 1.0 according to wikipedia. Add this line so that not the case
+    # If both strings are empty, m = 0, so the distance should be 1.0 according to Wikipedia.
+    # The line below returns 0.0 instead in that case.
    len2 == 0 && return 0.0
    maxdist = max(0, div(len2, 2) - 1)
    flag = fill(false, len2)
@ -75,25 +70,20 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
    return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
 end

-##############################################################################
-##
-## Levenshtein
-##
-## Return max_dist +1 if distance higher than max_dist
-## This makes it possible to differentiate distance equalt to max_dist vs strictly higher
-## This is important for find_all
-##
-##############################################################################
 """
    Levenshtein()

 Creates the Levenshtein metric

-The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, substitutions of a single character) required to change one string into the other.
+The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, 
+substitutions of a single character) required to change one string into the other.
 """
 struct Levenshtein <: StringDistance end

 ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
+# Return max_dist + 1 if the distance is higher than max_dist.
+# This makes it possible to differentiate a distance equal to max_dist from a strictly higher one.
+# This is important for findall
 function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max_dist = nothing)
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
@ -133,17 +123,15 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max
    return current
 end

-##############################################################################
-##
-## Damerau Levenshtein
-##
-##############################################################################
+
 """
    DamerauLevenshtein()

 Creates the DamerauLevenshtein metric

-The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions, deletions or substitutions of a single character, or transposition of two adjacent characters) required to change one string into the other.
+The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions, 
+deletions or substitutions of a single character, or transposition of two adjacent characters) 
+required to change one string into the other.
 """
 struct DamerauLevenshtein <: StringDistance end

@ -219,18 +207,15 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
    return current
 end

-
-##############################################################################
-##
-## Ratcliff/Obershelp
-##
-##############################################################################
 """
    RatcliffObershelp()

 Creates the RatcliffObershelp metric

-The distance between two strings is defined as one minus  the number of matching characters divided by the total number of characters in the two strings. Matching characters are those in the longest common subsequence plus, recursively, matching characters in the unmatched region on either side of the longest common subsequence.
+The distance between two strings is defined as one minus the number of matching characters
+divided by the total number of characters in the two strings. Matching characters are those
+in the longest common substring plus, recursively, matching characters in the unmatched
+region on either side of the longest common substring.
 """
 struct RatcliffObershelp <: StringDistance end

@ -244,7 +229,8 @@ function matching_blocks(s1::AbstractString, s2::AbstractString)
    matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1)
 end

-function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer, start1::Integer, start2::Integer)
+function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, 
+    len1::Integer, len2::Integer, start1::Integer, start2::Integer)
    a = longest_common_substring(s1, s2, len1 , len2)
    # exit if there is no common substring
    a[3] == 0 && return x
@ -257,6 +243,7 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
    # add the longest common substring that happens after
    s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
    s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))
-    matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1, len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
+    matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1, 
+        len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
    return x
 end
--- a/src/find.jl
+++ b/src/find.jl
@ -1,9 +1,12 @@
 """
    findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0)

-`findmax` returns the value and index of the element of `itr` that has the highest similarity score with `s` according to the distance `dist`. 
-It returns `(nothing, nothing)` if none of the elements has a similarity score higher or equal to `min_score` (default to 0.0)
-The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
+`findmax` returns the value and index of the element of `itr` that has the 
+highest similarity score with `s` according to the distance `dist`. 
+It returns `(nothing, nothing)` if none of the elements has a similarity score
+higher than or equal to `min_score` (defaults to 0.0).
+The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances 
+(potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
 """
 function Base.findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0)
    vmin = Threads.Atomic{typeof(min_score)}(min_score)
@ -26,8 +29,11 @@ end

 """
    findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8)
-`findall` returns the vector of indices for elements of `itr` that have a similarity score higher or equal than `min_score` according to the distance `dist`. 
-The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
+    
+`findall` returns the vector of indices for elements of `itr` that have a
+similarity score higher than or equal to `min_score` according to the distance `dist`.
+The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances 
+(potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
 """
 function Base.findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8)
    out = [Int[] for _ in 1:Threads.nthreads()]
--- a/src/qgram.jl
+++ b/src/qgram.jl
@ -36,24 +36,23 @@ for x in qgram("hello", 2)
 end
 ```
 """
-function qgram(s::AbstractString, q::Integer)
-	QGramIterator{typeof(s)}(s, q)
+qgram(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
+
+##############################################################################
+##
+## Distance on strings is computed by set distance on qgram sets
+##
+##############################################################################
+
+abstract type QGramDistance <: StringDistance end
+
+function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
+	x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
+	evaluate(dist, x)
 end

-##############################################################################
-##
-## 
-## 
-## 
-##
-##############################################################################
-"""
-   count_map(x1, x2)
-
-For two iterators `x1` and `x2`, `count_map(x1, x2)`  returns an dictionary 
-that returns,  for each element in `x1` or `x2`, a tuple with the numbers of 
-times it appears in `x1` and the number of times it appears in `x2`
-"""
+# For two iterators s1 and s2, this returns a dictionary which, for each element in s1 or s2,
+# gives a tuple with the number of times it appears in s1 and in s2
 function count_map(s1, s2)
 	K = promote_type(eltype(s1), eltype(s2))
 	d = Dict{K, Tuple{Int, Int}}()
@ -83,64 +82,6 @@ function count_map(s1, s2)
 	return d
 end

-#= Trie
-function count_map(s1, s2)
-	d = Trie{Tuple{Int, Int}}()
-	for ch1 in s1
-		node = d
-		for char in ch1
-			if !haskey(node.children, char)
-	            node.children[char] = Trie{Tuple{Int, Int}}()
-	        end
-	        node = node.children[char]
-	    end
-	    node.value = node.is_key ? (node.value[1] + 1, 0) : (1, 0)
-	    node.is_key = true
-	end
-	for ch2 in s2
-		node = d
-		for char in ch2
-			if !haskey(node.children, char)
-	            node.children[char] = Trie{Tuple{Int, Int}}()
-	        end
-	        node = node.children[char]
-	    end
-	    node.value = node.is_key ? (node.value[1], node.value[2]+ 1) : (0, 1)
-	    node.is_key = true
-	end
-	return iterator(d)
-end
-function iterator(t::Trie, found = Tuple{Int, Int}[])
-    if t.is_key
-    	t.is_key = false
-    	push!(found, t.value)
-    else
-    	for k in values(t.children)
-    		iterator(k, found) 
-    	end
-    end
-    return found
-end
-=#
-
-
-##############################################################################
-##
-## Distance on strings is computed by set distance on qgram sets
-##
-##############################################################################
-abstract type QGramDistance <: StringDistance end
-
-function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
-	x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
-	evaluate(dist, x)
-end
-
-##############################################################################
-##
-## q-gram 
-##
-##############################################################################
 """
 	QGram(q::Int)

@ -150,7 +91,8 @@ The distance corresponds to

 ``||v(s1, q) - v(s2, q)||``

-where ``v(s, q)`` denotes the vector on the space of q-grams of length q, that contains the number of times a q-gram appears for the string s
+where ``v(s, q)`` denotes the vector on the space of q-grams of length q, 
+that contains the number of times a q-gram appears for the string s
 """
 struct QGram <: QGramDistance
 	q::Int
@ -164,12 +106,6 @@ function evaluate(dist::QGram, count_dict)
 	n
 end

-##############################################################################
-##
-## cosine 
-##
-## 
-##############################################################################
 """
 	Cosine(q::Int)

@ -179,7 +115,8 @@ The distance corresponds to

 `` 1 - v(s1, q).v(s2, q)  / ||v(s1, q)|| * ||v(s2, q)||``

-where ``v(s, q)`` denotes the vector on the space of q-grams of length q, that contains the number of times a q-gram appears for the string s
+where ``v(s, q)`` denotes the vector on the space of q-grams of length q, 
+that contains the  number of times a q-gram appears for the string s
 """
 struct Cosine <: QGramDistance
 	q::Int
@ -195,11 +132,6 @@ function evaluate(dist::Cosine, count_dict)
 	1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
 end

-##############################################################################
-##
-## Jaccard
-##
-##############################################################################
 """
 	Jaccard(q::Int)

@ -225,11 +157,6 @@ function evaluate(dist::Jaccard, count_dict)
 	1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
 end

-##############################################################################
-##
-## SorensenDice
-##
-##############################################################################
 """
 	SorensenDice(q::Int)

@ -255,12 +182,6 @@ function evaluate(dist::SorensenDice, count_dict)
 	1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)
 end

-##############################################################################
-##
-## overlap
-##
-## 1 -  |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q)))
-##############################################################################
 """
 	Overlap(q::Int)