From 8be5a00e3dc72b7eb45124aeee73fac067fc77c8 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Thu, 12 Dec 2019 18:55:41 -0500 Subject: [PATCH] remove trie --- src/StringDistances.jl | 11 ++-- src/compare.jl | 69 +++++++++--------------- src/edit.jl | 51 +++++++----------- src/find.jl | 16 ++++-- src/qgram.jl | 117 +++++++---------------------------------- 5 files changed, 79 insertions(+), 185 deletions(-) diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 748d3d8..cc128fd 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -1,7 +1,5 @@ module StringDistances - - using Distances import Distances: evaluate, result_type using DataStructures # for SortedSet in TokenSort @@ -69,11 +67,14 @@ end ## Some memo about Strings # length: number of characters -# ncodeunits: Return the number of code units in a string (aking to index of vector). Not all such indices are valid – they may not be the start of a character,. -# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str multiplied by the size, in bytes, of one code unit in str. +# ncodeunits: Return the number of code units in a string (akin to index of vector). +# Not all such indices are valid – they may not be the start of a character. +# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str +# multiplied by the size, in bytes, of one code unit in str. 
# lastindex: Return the last index of a collection # nextinds(s, i): return the index of the start of the character whose encoding starts after index i -# nextind(s, 0, N): return the index of the Nth character of s (or, if there are less than N characters, return ncodeunits(str) + (N - length(s)) +# nextind(s, 0, N): return the index of the Nth character of s (or, if there are +# less than N characters, return ncodeunits(str) + (N - length(s)) ############################################################################## diff --git a/src/compare.jl b/src/compare.jl index 847399f..298b6ef 100755 --- a/src/compare.jl +++ b/src/compare.jl @@ -1,21 +1,16 @@ - -############################################################################## -## -## compare -## compare always return a value between 0 and 1. -## -############################################################################## """ compare(s1::AbstractString, s2::AbstractString, dist::StringDistance) -compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist` +compare returns a similarity score between 0 and 1 for the strings `s1` and +`s2` based on the distance `dist` """ - -function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0) +function compare(s1::AbstractString, s2::AbstractString, + dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0) 1.0 - evaluate(dist, s1, s2) end -function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0) +function compare(s1::AbstractString, s2::AbstractString, + dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len2 == 0 && return 1.0 @@ -29,7 +24,8 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtei end end -function compare(s1::AbstractString, s2::AbstractString, dist::QGramDistance; min_score = 0.0) +function 
compare(s1::AbstractString, s2::AbstractString, + dist::QGramDistance; min_score = 0.0) # When string length < q for qgram distance, returns s1 == s2 s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) @@ -41,15 +37,12 @@ function compare(s1::AbstractString, s2::AbstractString, dist::QGramDistance; mi end end -############################################################################## -## -## Winkler -## -############################################################################## """ Winkler(dist::StringDistance, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4) -Winkler is a `StringDistance` modifier that boosts the similarity score between two strings by a scale `p` when the strings share a common prefix with lenth lower than `l` (the boost is only applied the similarity score above `boosting_threshold`) +Winkler is a `StringDistance` modifier that boosts the similarity score between +two strings by a scale `p` when the strings share a common prefix with length lower +than `l` (the boost is only applied when the similarity score is above `boosting_threshold`) """ struct Winkler{T1 <: StringDistance, T2 <: Real, T3 <: Real, T4 <: Integer} <: StringDistance dist::T1 @@ -76,16 +69,12 @@ end JaroWinkler() = Winkler(Jaro(), 0.1, 0.7) -############################################################################## -## -## Partial -## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ -## -############################################################################## + """ Partial(dist::StringDistance) -Partial is a `StringDistance` modifier that returns the maximal similarity score between the shorter string and substrings of the longer string +Partial is a `StringDistance` modifier that returns the maximal similarity score +between the shorter string and substrings of the longer string """ struct Partial{T <: StringDistance} <: StringDistance dist::T @@ -129,42 +118,35 @@ function compare(s1::AbstractString, s2::AbstractString, 
dist::Partial{RatcliffO return out end -############################################################################## -## -## TokenSort -## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ -## -############################################################################## """ TokenSort(dist::StringDistance) -TokenSort is a `StringDistance` modifier that adjusts for differences in word orders by reording words alphabetically. +TokenSort is a `StringDistance` modifier that adjusts for differences in word orders +by reordering words alphabetically. """ struct TokenSort{T <: StringDistance} <: StringDistance dist::T end +# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort; min_score = 0.0) s1 = join(sort!(split(s1)), " ") s2 = join(sort!(split(s2)), " ") compare(s1, s2, dist.dist; min_score = min_score) end -############################################################################## -## -## TokenSet -## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ -## -############################################################################## + """ TokenSet(dist::StringDistance) -TokenSort is a `StringDistance` modifier that adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string. +TokenSet is a `StringDistance` modifier that adjusts for differences in word orders +and word numbers by comparing the intersection of two strings with each string. 
""" struct TokenSet{T <: StringDistance} <: StringDistance dist::T end +# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = 0.0) v1 = SortedSet(split(s1)) v2 = SortedSet(split(s2)) @@ -182,15 +164,12 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_sco end -############################################################################## -## -## TokenMax -## -############################################################################## """ TokenMax(dist::StringDistance) -TokenSort is a `StringDistance` modifier that combines similarlity scores using the base distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on string lengths. +TokenMax is a `StringDistance` modifier that combines similarity scores using the base +distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on +string lengths. """ struct TokenMax{T <: StringDistance} <: StringDistance dist::T diff --git a/src/edit.jl b/src/edit.jl index 83652a4..c127117 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -1,9 +1,3 @@ - -############################################################################## -## -## Jaro -## -############################################################################## """ Jaro() @@ -23,7 +17,8 @@ struct Jaro <: StringDistance end function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) - # if both are empty, m = 0 so should be 1.0 according to wikipedia. Add this line so that not the case + # if both are empty, m = 0 so should be 1.0 according to wikipedia. 
+ # Add this line so that this is not the case len2 == 0 && return 0.0 maxdist = max(0, div(len2, 2) - 1) flag = fill(false, len2) @@ -75,25 +70,20 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0 end -############################################################################## -## -## Levenshtein -## -## Return max_dist +1 if distance higher than max_dist -## This makes it possible to differentiate distance equalt to max_dist vs strictly higher -## This is important for find_all -## -############################################################################## """ Levenshtein() Creates the Levenshtein metric -The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, substitutions of a single character) required to change one string into the other. +The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, +substitutions of a single character) required to change one string into the other. 
""" struct Levenshtein <: StringDistance end ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html +# Return max_dist + 1 if the distance is higher than max_dist. +# This makes it possible to differentiate a distance equal to max_dist from a strictly higher one. +# This is important for findall. function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max_dist = nothing) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) @@ -133,17 +123,15 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max return current end -############################################################################## -## -## Damerau Levenshtein -## -############################################################################## + """ DamerauLevenshtein() Creates the DamerauLevenshtein metric -The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions, deletions or substitutions of a single character, or transposition of two adjacent characters) required to change one string into the other. +The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions, +deletions or substitutions of a single character, or transposition of two adjacent characters) +required to change one string into the other. """ struct DamerauLevenshtein <: StringDistance end @@ -219,18 +207,15 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri return current end - -############################################################################## -## -## Ratcliff/Obershelp -## -############################################################################## """ RatcliffObershelp() Creates the RatcliffObershelp metric -The distance between two strings is defined as one minus the number of matching characters divided by the total number of characters in the two strings. 
Matching characters are those in the longest common subsequence plus, recursively, matching characters in the unmatched region on either side of the longest common subsequence. +The distance between two strings is defined as one minus the number of matching characters +divided by the total number of characters in the two strings. Matching characters are those +in the longest common substring plus, recursively, matching characters in the unmatched +region on either side of the longest common substring. """ struct RatcliffObershelp <: StringDistance end @@ -244,7 +229,8 @@ function matching_blocks(s1::AbstractString, s2::AbstractString) matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1) end -function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer, start1::Integer, start2::Integer) +function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, + len1::Integer, len2::Integer, start1::Integer, start2::Integer) a = longest_common_substring(s1, s2, len1 , len2) # exit if there is no common substring a[3] == 0 && return x @@ -257,6 +243,7 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2:: # add the longest common substring that happens after s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1)) s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2)) - matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1, len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1) + matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1, + len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1) return x end diff --git a/src/find.jl b/src/find.jl index 3cb1efd..c82cff2 100755 --- a/src/find.jl +++ b/src/find.jl @@ -1,9 +1,12 @@ """ findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0) -`findmax` returns the value and index 
of the element of `itr` that has the highest similarity score with `s` according to the distance `dist`. -It returns `(nothing, nothing)` if none of the elements has a similarity score higher or equal to `min_score` (default to 0.0) -The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`) +`findmax` returns the value and index of the element of `itr` that has the +highest similarity score with `s` according to the distance `dist`. +It returns `(nothing, nothing)` if none of the elements has a similarity score +higher than or equal to `min_score` (defaults to 0.0). +The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances +(potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`) """ function Base.findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0) vmin = Threads.Atomic{typeof(min_score)}(min_score) @@ -26,8 +29,11 @@ end """ findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8) -`findall` returns the vector of indices for elements of `itr` that have a similarity score higher or equal than `min_score` according to the distance `dist`. -The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`) + +`findall` returns the vector of indices for elements of `itr` that have a +similarity score higher than or equal to `min_score` according to the distance `dist`. 
+The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances +(potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`) """ function Base.findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8) out = [Int[] for _ in 1:Threads.nthreads()] diff --git a/src/qgram.jl b/src/qgram.jl index 1ad6cee..71dd72f 100755 --- a/src/qgram.jl +++ b/src/qgram.jl @@ -36,24 +36,23 @@ for x in qgram("hello", 2) end ``` """ -function qgram(s::AbstractString, q::Integer) - QGramIterator{typeof(s)}(s, q) +qgram(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q) + +############################################################################## +## +## Distance on strings is computed by set distance on qgram sets +## +############################################################################## + +abstract type QGramDistance <: StringDistance end + +function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString) + x = count_map(qgram(s1, dist.q), qgram(s2, dist.q)) + evaluate(dist, x) end -############################################################################## -## -## -## -## -## -############################################################################## -""" - count_map(x1, x2) - -For two iterators `x1` and `x2`, `count_map(x1, x2)` returns an dictionary -that returns, for each element in `x1` or `x2`, a tuple with the numbers of -times it appears in `x1` and the number of times it appears in `x2` -""" +# For two iterators x1 and x2, this returns a dictionary which, for each element in x1 or x2, +# returns a tuple with the numbers of times it appears in x1 and x2 function count_map(s1, s2) K = promote_type(eltype(s1), eltype(s2)) d = Dict{K, Tuple{Int, Int}}() @@ -83,64 +82,6 @@ function count_map(s1, s2) return d end -#= Trie -function count_map(s1, s2) - d = Trie{Tuple{Int, Int}}() - for ch1 in s1 - node = d - for char in ch1 - if !haskey(node.children, char) - node.children[char] = Trie{Tuple{Int, 
Int}}() - end - node = node.children[char] - end - node.value = node.is_key ? (node.value[1] + 1, 0) : (1, 0) - node.is_key = true - end - for ch2 in s2 - node = d - for char in ch2 - if !haskey(node.children, char) - node.children[char] = Trie{Tuple{Int, Int}}() - end - node = node.children[char] - end - node.value = node.is_key ? (node.value[1], node.value[2]+ 1) : (0, 1) - node.is_key = true - end - return iterator(d) -end -function iterator(t::Trie, found = Tuple{Int, Int}[]) - if t.is_key - t.is_key = false - push!(found, t.value) - else - for k in values(t.children) - iterator(k, found) - end - end - return found -end -=# - - -############################################################################## -## -## Distance on strings is computed by set distance on qgram sets -## -############################################################################## -abstract type QGramDistance <: StringDistance end - -function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString) - x = count_map(qgram(s1, dist.q), qgram(s2, dist.q)) - evaluate(dist, x) -end - -############################################################################## -## -## q-gram -## -############################################################################## """ QGram(q::Int) @@ -150,7 +91,8 @@ The distance corresponds to ``||v(s1, q) - v(s2, q)||`` -where ``v(s, q)`` denotes the vector on the space of q-grams of length q, that contains the number of times a q-gram appears for the string s +where ``v(s, q)`` denotes the vector on the space of q-grams of length q, +that contains the number of times a q-gram appears for the string s """ struct QGram <: QGramDistance q::Int @@ -164,12 +106,6 @@ function evaluate(dist::QGram, count_dict) n end -############################################################################## -## -## cosine -## -## -############################################################################## """ Cosine(q::Int) @@ -179,7 +115,8 @@ The distance 
corresponds to `` 1 - v(s1, q).v(s2, q) / ||v(s1, q)|| * ||v(s2, q)||`` -where ``v(s, q)`` denotes the vector on the space of q-grams of length q, that contains the number of times a q-gram appears for the string s +where ``v(s, q)`` denotes the vector on the space of q-grams of length q, +that contains the number of times a q-gram appears for the string s """ struct Cosine <: QGramDistance q::Int @@ -195,11 +132,6 @@ function evaluate(dist::Cosine, count_dict) 1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2)) end -############################################################################## -## -## Jaccard -## -############################################################################## """ Jaccard(q::Int) @@ -225,11 +157,6 @@ function evaluate(dist::Jaccard, count_dict) 1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect) end -############################################################################## -## -## SorensenDice -## -############################################################################## """ SorensenDice(q::Int) @@ -255,12 +182,6 @@ function evaluate(dist::SorensenDice, count_dict) 1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2) end -############################################################################## -## -## overlap -## -## 1 - |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q))) -############################################################################## """ Overlap(q::Int)