correct partialsort
parent
3cb9576ab4
commit
3b9493f8a9
|
@ -3,11 +3,7 @@ module StringDistances
|
|||
using Distances
|
||||
import Distances: evaluate, result_type
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## include
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
abstract type StringDistance <: SemiMetric end
|
||||
include("utils.jl")
|
||||
include("edit.jl")
|
||||
|
@ -58,12 +54,12 @@ TokenMax,
|
|||
evaluate,
|
||||
compare,
|
||||
result_type,
|
||||
qgram
|
||||
qgrams
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Some memo about Strings
|
||||
## Some things about Strings
|
||||
|
||||
# length: number of characters
|
||||
# ncodeunits: Return the number of code units in a string (akin to the index of a vector).
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
|
||||
|
||||
return a similarity score between 0 and 1 for the strings `s1` and
|
||||
`s2` based on the `StringDistance` `dist`
|
||||
`s2` based on the string distance `dist`.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
|
@ -20,14 +20,9 @@ function compare(s1::AbstractString, s2::AbstractString,
|
|||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len2 == 0 && return 1.0
|
||||
if min_score == 0.0
|
||||
return 1.0 - evaluate(dist, s1, s2) / len2
|
||||
else
|
||||
d = evaluate(dist, s1, s2; max_dist = ceil(Int, len2 * (1 - min_score)))
|
||||
out = 1.0 - d / len2
|
||||
out < min_score && return 0.0
|
||||
return out
|
||||
end
|
||||
out < min_score ? 0.0 : out
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString,
|
||||
|
@ -102,7 +97,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_scor
|
|||
len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
|
||||
len1 == 0 && return 1.0
|
||||
out = 0.0
|
||||
for x in qgram(s2, len1)
|
||||
for x in qgrams(s2, len1)
|
||||
curr = compare(s1, x, dist.dist; min_score = min_score)
|
||||
out = max(out, curr)
|
||||
min_score = max(out, min_score)
|
||||
|
@ -169,7 +164,7 @@ end
|
|||
Creates the `TokenSet{dist}` distance
|
||||
|
||||
`TokenSet{dist}` modifies the string distance `dist` to adjust for differences
|
||||
in word orders and word numbers, by comparing the intersection of two strings with each string.
|
||||
in word orders and word numbers by comparing the intersection of two strings with each string.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
|
@ -192,12 +187,12 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_sco
|
|||
s1 = join(v1, " ")
|
||||
s2 = join(v2, " ")
|
||||
isempty(s0) && return compare(s1, s2, dist.dist; min_score = min_score)
|
||||
dist0 = compare(s0, s1, dist.dist; min_score = min_score)
|
||||
min_score = max(min_score, dist0)
|
||||
dist1 = compare(s0, s2, dist.dist; min_score = min_score)
|
||||
min_score = max(min_score, dist1)
|
||||
dist2 = compare(s0, s2, dist.dist; min_score = min_score)
|
||||
max(dist0, dist1, dist2)
|
||||
score_01 = compare(s0, s1, dist.dist; min_score = min_score)
|
||||
min_score = max(min_score, score_01)
|
||||
score_02 = compare(s0, s2, dist.dist; min_score = min_score)
|
||||
min_score = max(min_score, score_02)
|
||||
score_12 = compare(s1, s2, dist.dist; min_score = min_score)
|
||||
max(score_01, score_02, score_12)
|
||||
end
|
||||
|
||||
|
||||
|
@ -225,31 +220,31 @@ end
|
|||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
dist0 = compare(s1, s2, dist.dist; min_score = min_score)
|
||||
min_score = max(min_score, dist0)
|
||||
score = compare(s1, s2, dist.dist; min_score = min_score)
|
||||
min_score = max(min_score, score)
|
||||
unbase_scale = 0.95
|
||||
# if one string is much shorter than the other, use partial
|
||||
if length(s2) >= 1.5 * length(s1)
|
||||
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
|
||||
dist1 = partial_scale * compare(s1, s2, Partial(dist.dist);
|
||||
score_partial = partial_scale * compare(s1, s2, Partial(dist.dist);
|
||||
min_score = min_score / partial_scale)
|
||||
min_score = max(min_score, dist1)
|
||||
dist2 = unbase_scale * partial_scale *
|
||||
min_score = max(min_score, score_partial)
|
||||
score_sort = unbase_scale * partial_scale *
|
||||
compare(s1, s2, TokenSort(Partial(dist.dist));
|
||||
min_score = min_score / (unbase_scale * partial_scale))
|
||||
min_score = max(min_score, dist2)
|
||||
dist3 = unbase_scale * partial_scale *
|
||||
min_score = max(min_score, score_sort)
|
||||
score_set = unbase_scale * partial_scale *
|
||||
compare(s1, s2, TokenSet(Partial(dist.dist));
|
||||
min_score = min_score / (unbase_scale * partial_scale))
|
||||
return max(dist0, dist1, dist2, dist3)
|
||||
return max(score, score_partial, score_sort, score_set)
|
||||
else
|
||||
dist1 = unbase_scale *
|
||||
score_sort = unbase_scale *
|
||||
compare(s1, s2, TokenSort(dist.dist);
|
||||
min_score = min_score / unbase_scale)
|
||||
min_score = max(min_score, dist1)
|
||||
dist2 = unbase_scale *
|
||||
min_score = max(min_score, score_sort)
|
||||
score_set = unbase_scale *
|
||||
compare(s1, s2, TokenSet(dist.dist);
|
||||
min_score = min_score / unbase_scale)
|
||||
return max(dist0, dist1, dist2)
|
||||
return max(score, score_sort, score_set)
|
||||
end
|
||||
end
|
46
src/qgram.jl
46
src/qgram.jl
|
@ -1,9 +1,4 @@
|
|||
|
||||
##############################################################################
|
||||
##
|
||||
## Define a type that iterates through q-grams of a string
|
||||
##
|
||||
############################################################################
|
||||
struct QGramIterator{S <: AbstractString}
|
||||
s::S # string
|
||||
q::Int # Length of Qgram
|
||||
|
@ -22,43 +17,34 @@ Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
|
|||
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
|
||||
|
||||
"""
|
||||
Return an iterator that iterates on the QGram of the string
|
||||
Return an iterator over the q-grams of a string
|
||||
|
||||
### Arguments
|
||||
* `s::AbstractString`
|
||||
* `q::Integer`: length of qgram
|
||||
* `q::Integer`: length of q-gram
|
||||
|
||||
## Examples
|
||||
```julia
|
||||
using StringDistances
|
||||
for x in qgram("hello", 2)
|
||||
for x in qgrams("hello", 2)
|
||||
println(x)
|
||||
end
|
||||
```
|
||||
"""
|
||||
qgram(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
|
||||
qgrams(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Distance on strings is computed by set distance on qgram sets
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
abstract type QGramDistance <: StringDistance end
|
||||
|
||||
function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
|
||||
x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
|
||||
evaluate(dist, values(x))
|
||||
end
|
||||
|
||||
# For two iterators x1 and x2, this returns a dictionary which, for each element in x1 or x2,
|
||||
# For two iterators x1 and x2 that define `length` and `eltype` methods,
|
||||
# this returns a dictionary which, for each element in x1 or x2,
|
||||
# returns a tuple with the number of times it appears in x1 and x2
|
||||
function count_map(s1, s2)
|
||||
K = promote_type(eltype(s1), eltype(s2))
|
||||
d = Dict{K, Tuple{Int, Int}}()
|
||||
sizehint!(d, length(s1) + length(s2))
|
||||
# I use a faster way to change a dictionary key
|
||||
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
|
||||
sizehint!(d, length(s1) + length(s2))
|
||||
for x1 in s1
|
||||
index = Base.ht_keyindex2!(d, x1)
|
||||
if index > 0
|
||||
|
@ -98,8 +84,10 @@ struct QGram <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::QGram, itr)
|
||||
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString)
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
n = 0
|
||||
itr =
|
||||
for (n1, n2) in itr
|
||||
n += abs(n1 - n2)
|
||||
end
|
||||
|
@ -122,7 +110,8 @@ struct Cosine <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Cosine, itr)
|
||||
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
norm1, norm2, prodnorm = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
norm1 += n1^2
|
||||
|
@ -147,7 +136,8 @@ struct Jaccard <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Jaccard, itr)
|
||||
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
ndistinct1 += n1 > 0
|
||||
|
@ -172,7 +162,8 @@ struct SorensenDice <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::SorensenDice, itr)
|
||||
function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString)
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
ndistinct1 += n1 > 0
|
||||
|
@ -197,7 +188,8 @@ struct Overlap <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Overlap, itr)
|
||||
function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString)
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
ndistinct1 += n1 > 0
|
||||
|
|
Loading…
Reference in New Issue