correct partialsort

pull/22/head
matthieugomez 2019-12-18 10:17:08 -05:00
parent 3cb9576ab4
commit 3b9493f8a9
3 changed files with 47 additions and 64 deletions

View File

@ -3,11 +3,7 @@ module StringDistances
using Distances
import Distances: evaluate, result_type
##############################################################################
##
## include
##
##############################################################################
abstract type StringDistance <: SemiMetric end
include("utils.jl")
include("edit.jl")
@ -58,12 +54,12 @@ TokenMax,
evaluate,
compare,
result_type,
qgram
qgrams
end
##############################################################################
##
## Some memo about Strings
## Some things about Strings
# length: number of characters
# ncodeunits: Return the number of code units in a string (akin to the index of a vector).

View File

@ -2,7 +2,7 @@
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
Return a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the `StringDistance` `dist`
`s2` based on the string distance `dist`.
### Examples
```julia-repl
@ -20,14 +20,9 @@ function compare(s1::AbstractString, s2::AbstractString,
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 == 0 && return 1.0
if min_score == 0.0
return 1.0 - evaluate(dist, s1, s2) / len2
else
d = evaluate(dist, s1, s2; max_dist = ceil(Int, len2 * (1 - min_score)))
out = 1.0 - d / len2
out < min_score && return 0.0
return out
end
d = evaluate(dist, s1, s2; max_dist = ceil(Int, len2 * (1 - min_score)))
out = 1.0 - d / len2
out < min_score ? 0.0 : out
end
function compare(s1::AbstractString, s2::AbstractString,
@ -102,7 +97,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_scor
len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
len1 == 0 && return 1.0
out = 0.0
for x in qgram(s2, len1)
for x in qgrams(s2, len1)
curr = compare(s1, x, dist.dist; min_score = min_score)
out = max(out, curr)
min_score = max(out, min_score)
@ -169,7 +164,7 @@ end
Creates the `TokenSet{dist}` distance
`TokenSet{dist}` modifies the string distance `dist` to adjust for differences
in word orders and word numbers, by comparing the intersection of two strings with each string.
in word orders and word numbers by comparing the intersection of two strings with each string.
### Examples
```julia-repl
@ -192,12 +187,12 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_sco
s1 = join(v1, " ")
s2 = join(v2, " ")
isempty(s0) && return compare(s1, s2, dist.dist; min_score = min_score)
dist0 = compare(s0, s1, dist.dist; min_score = min_score)
min_score = max(min_score, dist0)
dist1 = compare(s0, s2, dist.dist; min_score = min_score)
min_score = max(min_score, dist1)
dist2 = compare(s0, s2, dist.dist; min_score = min_score)
max(dist0, dist1, dist2)
score_01 = compare(s0, s1, dist.dist; min_score = min_score)
min_score = max(min_score, score_01)
score_02 = compare(s0, s2, dist.dist; min_score = min_score)
min_score = max(min_score, score_02)
score_12 = compare(s1, s2, dist.dist; min_score = min_score)
max(score_01, score_02, score_12)
end
@ -225,31 +220,31 @@ end
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
dist0 = compare(s1, s2, dist.dist; min_score = min_score)
min_score = max(min_score, dist0)
score = compare(s1, s2, dist.dist; min_score = min_score)
min_score = max(min_score, score)
unbase_scale = 0.95
# if one string is much shorter than the other, use partial
if length(s2) >= 1.5 * length(s1)
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
dist1 = partial_scale * compare(s1, s2, Partial(dist.dist);
score_partial = partial_scale * compare(s1, s2, Partial(dist.dist);
min_score = min_score / partial_scale)
min_score = max(min_score, dist1)
dist2 = unbase_scale * partial_scale *
min_score = max(min_score, score_partial)
score_sort = unbase_scale * partial_scale *
compare(s1, s2, TokenSort(Partial(dist.dist));
min_score = min_score / (unbase_scale * partial_scale))
min_score = max(min_score, dist2)
dist3 = unbase_scale * partial_scale *
min_score = max(min_score, score_sort)
score_set = unbase_scale * partial_scale *
compare(s1, s2, TokenSet(Partial(dist.dist));
min_score = min_score / (unbase_scale * partial_scale))
return max(dist0, dist1, dist2, dist3)
return max(score, score_partial, score_sort, score_set)
else
dist1 = unbase_scale *
score_sort = unbase_scale *
compare(s1, s2, TokenSort(dist.dist);
min_score = min_score / unbase_scale)
min_score = max(min_score, dist1)
dist2 = unbase_scale *
min_score = max(min_score, score_sort)
score_set = unbase_scale *
compare(s1, s2, TokenSet(dist.dist);
min_score = min_score / unbase_scale)
return max(dist0, dist1, dist2)
return max(score, score_sort, score_set)
end
end

View File

@ -1,11 +1,6 @@
##############################################################################
##
## Define a type that iterates through q-grams of a string
##
############################################################################
struct QGramIterator{S <: AbstractString}
s::S # string
s::S # string
q::Int # Length of Qgram
end
@ -22,43 +17,34 @@ Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
"""
Return an iterator that iterates on the QGram of the string
Return an iterator over the q-grams of a string
### Arguments
* `s::AbstractString`
* `q::Integer`: length of qgram
* `q::Integer`: length of q-gram
## Examples
```julia
using StringDistances
for x in qgram("hello", 2)
for x in qgrams("hello", 2)
println(x)
end
```
"""
qgram(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
qgrams(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
##############################################################################
##
## Distance on strings is computed by set distance on qgram sets
##
##############################################################################
abstract type QGramDistance <: StringDistance end
function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
evaluate(dist, values(x))
end
# For two iterators x1 and x2, this returns a dictionary which, for each element in x1 or x2,
# For two iterators x1 and x2 that define `length` and `eltype` methods,
# this returns a dictionary which, for each element in x1 or x2,
# returns a tuple with the numbers of times it appears in x1 and x2
function count_map(s1, s2)
K = promote_type(eltype(s1), eltype(s2))
d = Dict{K, Tuple{Int, Int}}()
sizehint!(d, length(s1) + length(s2))
# I use a faster way to change a dictionary key
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
sizehint!(d, length(s1) + length(s2))
for x1 in s1
index = Base.ht_keyindex2!(d, x1)
if index > 0
@ -98,8 +84,10 @@ struct QGram <: QGramDistance
q::Int
end
function evaluate(dist::QGram, itr)
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString)
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
n = 0
itr =
for (n1, n2) in itr
n += abs(n1 - n2)
end
@ -122,7 +110,8 @@ struct Cosine <: QGramDistance
q::Int
end
function evaluate(dist::Cosine, itr)
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in itr
norm1 += n1^2
@ -147,7 +136,8 @@ struct Jaccard <: QGramDistance
q::Int
end
function evaluate(dist::Jaccard, itr)
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
ndistinct1 += n1 > 0
@ -172,7 +162,8 @@ struct SorensenDice <: QGramDistance
q::Int
end
function evaluate(dist::SorensenDice, itr)
function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString)
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
ndistinct1 += n1 > 0
@ -197,7 +188,8 @@ struct Overlap <: QGramDistance
q::Int
end
function evaluate(dist::Overlap, itr)
function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString)
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
ndistinct1 += n1 > 0