remove trie

pull/22/head
matthieugomez 2019-12-12 18:55:41 -05:00
parent f44ab13fef
commit 8be5a00e3d
5 changed files with 79 additions and 185 deletions

View File

@ -1,7 +1,5 @@
module StringDistances
using Distances
import Distances: evaluate, result_type
using DataStructures # for SortedSet in TokenSort
@ -69,11 +67,14 @@ end
## Some memo about Strings
# length: number of characters
# ncodeunits: Return the number of code units in a string (akin to the index of a vector). Not all such indices are valid: they may not be the start of a character.
# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str multiplied by the size, in bytes, of one code unit in str.
# ncodeunits: Return the number of code units in a string (akin to the index of a vector).
# Not all such indices are valid: they may not be the start of a character.
# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str
# multiplied by the size, in bytes, of one code unit in str.
# lastindex: Return the last index of a collection
# nextind(s, i): return the index of the start of the character whose encoding starts after index i
# nextind(s, 0, N): return the index of the Nth character of s (or, if there are fewer than N characters, return ncodeunits(s) + (N - length(s)))
# nextind(s, 0, N): return the index of the Nth character of s (or, if there are
# fewer than N characters, return ncodeunits(s) + (N - length(s)))
##############################################################################

View File

@ -1,21 +1,16 @@
##############################################################################
##
## compare
## compare always return a value between 0 and 1.
##
##############################################################################
"""
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
compare returns a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the distance `dist`
"""
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
function compare(s1::AbstractString, s2::AbstractString,
dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
1.0 - evaluate(dist, s1, s2)
end
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0)
function compare(s1::AbstractString, s2::AbstractString,
dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 == 0 && return 1.0
@ -29,7 +24,8 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtei
end
end
function compare(s1::AbstractString, s2::AbstractString, dist::QGramDistance; min_score = 0.0)
function compare(s1::AbstractString, s2::AbstractString,
dist::QGramDistance; min_score = 0.0)
# When string length < q for qgram distance, returns s1 == s2
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@ -41,15 +37,12 @@ function compare(s1::AbstractString, s2::AbstractString, dist::QGramDistance; mi
end
end
##############################################################################
##
## Winkler
##
##############################################################################
"""
Winkler(dist::StringDistance, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4)
Winkler is a `StringDistance` modifier that boosts the similarity score between two strings by a scale `p` when the strings share a common prefix with length lower than `l` (the boost is only applied when the similarity score is above `boosting_threshold`)
Winkler is a `StringDistance` modifier that boosts the similarity score between
two strings by a scale `p` when the strings share a common prefix with length lower
than `l` (the boost is only applied when the similarity score is above `boosting_threshold`)
"""
struct Winkler{T1 <: StringDistance, T2 <: Real, T3 <: Real, T4 <: Integer} <: StringDistance
dist::T1
@ -76,16 +69,12 @@ end
JaroWinkler() = Winkler(Jaro(), 0.1, 0.7)
##############################################################################
##
## Partial
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
##
##############################################################################
"""
Partial(dist::StringDistance)
Partial is a `StringDistance` modifier that returns the maximal similarity score between the shorter string and substrings of the longer string
Partial is a `StringDistance` modifier that returns the maximal similarity score
between the shorter string and substrings of the longer string
"""
struct Partial{T <: StringDistance} <: StringDistance
dist::T
@ -129,42 +118,35 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffO
return out
end
##############################################################################
##
## TokenSort
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
##
##############################################################################
"""
TokenSort(dist::StringDistance)
TokenSort is a `StringDistance` modifier that adjusts for differences in word orders by reordering words alphabetically.
TokenSort is a `StringDistance` modifier that adjusts for differences in word orders
by reordering words alphabetically.
"""
struct TokenSort{T <: StringDistance} <: StringDistance
    # Wrapped base distance; `compare` for `TokenSort` forwards the token-sorted
    # strings to it via `dist.dist`.
    dist::T
end
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
# Similarity score under `TokenSort`: each string is split on whitespace, its
# tokens are sorted and rejoined with single spaces, and the normalized strings
# are then compared with the wrapped distance `dist.dist`. `min_score` is
# forwarded unchanged to the inner `compare` call.
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort; min_score = 0.0)
    # Canonical form of a string: its whitespace-separated tokens in sorted order.
    sorted_tokens(str) = join(sort!(split(str)), " ")
    return compare(sorted_tokens(s1), sorted_tokens(s2), dist.dist; min_score = min_score)
end
##############################################################################
##
## TokenSet
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
##
##############################################################################
"""
TokenSet(dist::StringDistance)
TokenSet is a `StringDistance` modifier that adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string.
TokenSet is a `StringDistance` modifier that adjusts for differences in word orders
and word numbers by comparing the intersection of two strings with each string.
"""
struct TokenSet{T <: StringDistance} <: StringDistance
    # Wrapped base distance that this modifier is applied to.
    # NOTE(review): presumably used by `compare` for `TokenSet` as `dist.dist`;
    # that body is not fully visible here — confirm.
    dist::T
end
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = 0.0)
v1 = SortedSet(split(s1))
v2 = SortedSet(split(s2))
@ -182,15 +164,12 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_sco
end
##############################################################################
##
## TokenMax
##
##############################################################################
"""
TokenMax(dist::StringDistance)
TokenMax is a `StringDistance` modifier that combines similarity scores using the base distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on string lengths.
TokenMax is a `StringDistance` modifier that combines similarity scores using the base
distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on
string lengths.
"""
struct TokenMax{T <: StringDistance} <: StringDistance
dist::T

View File

@ -1,9 +1,3 @@
##############################################################################
##
## Jaro
##
##############################################################################
"""
Jaro()
@ -23,7 +17,8 @@ struct Jaro <: StringDistance end
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
# If both strings are empty, m = 0, so the distance should be 1.0 according to Wikipedia. The check below returns 0.0 instead.
# If both strings are empty, m = 0, so the distance should be 1.0 according to Wikipedia.
# The check below returns 0.0 instead.
len2 == 0 && return 0.0
maxdist = max(0, div(len2, 2) - 1)
flag = fill(false, len2)
@ -75,25 +70,20 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
end
##############################################################################
##
## Levenshtein
##
## Return max_dist + 1 if the distance is higher than max_dist
## This makes it possible to differentiate a distance equal to max_dist vs strictly higher
## This is important for find_all
##
##############################################################################
"""
Levenshtein()
Creates the Levenshtein metric
The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, substitutions of a single character) required to change one string into the other.
The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions,
substitutions of a single character) required to change one string into the other.
"""
struct Levenshtein <: StringDistance end
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
# Return max_dist + 1 if the distance is higher than max_dist
# This makes it possible to differentiate a distance equal to max_dist vs strictly higher
# This is important for find_all
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max_dist = nothing)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@ -133,17 +123,15 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max
return current
end
##############################################################################
##
## Damerau Levenshtein
##
##############################################################################
"""
DamerauLevenshtein()
Creates the DamerauLevenshtein metric
The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions, deletions or substitutions of a single character, or transposition of two adjacent characters) required to change one string into the other.
The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions,
deletions or substitutions of a single character, or transposition of two adjacent characters)
required to change one string into the other.
"""
struct DamerauLevenshtein <: StringDistance end
@ -219,18 +207,15 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
return current
end
##############################################################################
##
## Ratcliff/Obershelp
##
##############################################################################
"""
RatcliffObershelp()
Creates the RatcliffObershelp metric
The distance between two strings is defined as one minus the number of matching characters divided by the total number of characters in the two strings. Matching characters are those in the longest common subsequence plus, recursively, matching characters in the unmatched region on either side of the longest common subsequence.
The distance between two strings is defined as one minus the number of matching characters
divided by the total number of characters in the two strings. Matching characters are those
in the longest common subsequence plus, recursively, matching characters in the unmatched
region on either side of the longest common subsequence.
"""
struct RatcliffObershelp <: StringDistance end
@ -244,7 +229,8 @@ function matching_blocks(s1::AbstractString, s2::AbstractString)
matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1)
end
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer, start1::Integer, start2::Integer)
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString,
len1::Integer, len2::Integer, start1::Integer, start2::Integer)
a = longest_common_substring(s1, s2, len1 , len2)
# exit if there is no common substring
a[3] == 0 && return x
@ -257,6 +243,7 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
# add the longest common substring that happens after
s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))
matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1, len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1,
len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
return x
end

View File

@ -1,9 +1,12 @@
"""
findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0)
`findmax` returns the value and index of the element of `itr` that has the highest similarity score with `s` according to the distance `dist`.
It returns `(nothing, nothing)` if none of the elements has a similarity score higher than or equal to `min_score` (defaults to 0.0)
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
`findmax` returns the value and index of the element of `itr` that has the
highest similarity score with `s` according to the distance `dist`.
It returns `(nothing, nothing)` if none of the elements has a similarity score
higher than or equal to `min_score` (defaults to 0.0)
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances
(potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
"""
function Base.findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0)
vmin = Threads.Atomic{typeof(min_score)}(min_score)
@ -26,8 +29,11 @@ end
"""
findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8)
`findall` returns the vector of indices for elements of `itr` that have a similarity score higher than or equal to `min_score` according to the distance `dist`.
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
`findall` returns the vector of indices for elements of `itr` that have a
similarity score higher than or equal to `min_score` according to the distance `dist`.
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances
(potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
"""
function Base.findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8)
out = [Int[] for _ in 1:Threads.nthreads()]

View File

@ -36,24 +36,23 @@ for x in qgram("hello", 2)
end
```
"""
function qgram(s::AbstractString, q::Integer)
QGramIterator{typeof(s)}(s, q)
qgram(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
##############################################################################
##
## Distance on strings is computed by set distance on qgram sets
##
##############################################################################
abstract type QGramDistance <: StringDistance end
# Generic entry point for q-gram based distances: build the q-gram count map of
# both strings (q-gram length taken from `dist.q`) and delegate to the
# distance-specific `evaluate(dist, counts)` method.
function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
    grams1 = qgram(s1, dist.q)
    grams2 = qgram(s2, dist.q)
    counts = count_map(grams1, grams2)
    return evaluate(dist, counts)
end
##############################################################################
##
##
##
##
##
##############################################################################
"""
count_map(x1, x2)
For two iterators `x1` and `x2`, `count_map(x1, x2)` returns a dictionary
that maps each element of `x1` or `x2` to a tuple with the number of
times it appears in `x1` and the number of times it appears in `x2`
"""
# For two iterators x1 and x2, this returns a dictionary which, for each element in x1 or x2,
# returns a tuple with the numbers of times it appears in x1 and x2
function count_map(s1, s2)
K = promote_type(eltype(s1), eltype(s2))
d = Dict{K, Tuple{Int, Int}}()
@ -83,64 +82,6 @@ function count_map(s1, s2)
return d
end
#= Trie
function count_map(s1, s2)
d = Trie{Tuple{Int, Int}}()
for ch1 in s1
node = d
for char in ch1
if !haskey(node.children, char)
node.children[char] = Trie{Tuple{Int, Int}}()
end
node = node.children[char]
end
node.value = node.is_key ? (node.value[1] + 1, 0) : (1, 0)
node.is_key = true
end
for ch2 in s2
node = d
for char in ch2
if !haskey(node.children, char)
node.children[char] = Trie{Tuple{Int, Int}}()
end
node = node.children[char]
end
node.value = node.is_key ? (node.value[1], node.value[2]+ 1) : (0, 1)
node.is_key = true
end
return iterator(d)
end
function iterator(t::Trie, found = Tuple{Int, Int}[])
if t.is_key
t.is_key = false
push!(found, t.value)
else
for k in values(t.children)
iterator(k, found)
end
end
return found
end
=#
##############################################################################
##
## Distance on strings is computed by set distance on qgram sets
##
##############################################################################
abstract type QGramDistance <: StringDistance end
# Evaluate a q-gram distance on two strings: count the q-grams (of length
# `dist.q`) of each string with `count_map`, then hand the counts to the
# distance-specific `evaluate(dist, counts)` method.
function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
    return evaluate(dist, count_map(qgram(s1, dist.q), qgram(s2, dist.q)))
end
##############################################################################
##
## q-gram
##
##############################################################################
"""
QGram(q::Int)
@ -150,7 +91,8 @@ The distance corresponds to
``||v(s1, q) - v(s2, q)||``
where ``v(s, q)`` denotes the vector on the space of q-grams of length q, that contains the number of times a q-gram appears for the string s
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s
"""
struct QGram <: QGramDistance
q::Int
@ -164,12 +106,6 @@ function evaluate(dist::QGram, count_dict)
n
end
##############################################################################
##
## cosine
##
##
##############################################################################
"""
Cosine(q::Int)
@ -179,7 +115,8 @@ The distance corresponds to
`` 1 - v(s1, q).v(s2, q) / (||v(s1, q)|| * ||v(s2, q)||)``
where ``v(s, q)`` denotes the vector on the space of q-grams of length q, that contains the number of times a q-gram appears for the string s
where ``v(s, q)`` denotes the vector on the space of q-grams of length q,
that contains the number of times a q-gram appears for the string s
"""
struct Cosine <: QGramDistance
q::Int
@ -195,11 +132,6 @@ function evaluate(dist::Cosine, count_dict)
1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
end
##############################################################################
##
## Jaccard
##
##############################################################################
"""
Jaccard(q::Int)
@ -225,11 +157,6 @@ function evaluate(dist::Jaccard, count_dict)
1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
end
##############################################################################
##
## SorensenDice
##
##############################################################################
"""
SorensenDice(q::Int)
@ -255,12 +182,6 @@ function evaluate(dist::SorensenDice, count_dict)
1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)
end
##############################################################################
##
## overlap
##
## 1 - |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q)|)
##############################################################################
"""
Overlap(q::Int)