correct partialsort
parent
3cb9576ab4
commit
3b9493f8a9
|
@ -3,11 +3,7 @@ module StringDistances
|
||||||
using Distances
|
using Distances
|
||||||
import Distances: evaluate, result_type
|
import Distances: evaluate, result_type
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
##
|
|
||||||
## include
|
|
||||||
##
|
|
||||||
##############################################################################
|
|
||||||
abstract type StringDistance <: SemiMetric end
|
abstract type StringDistance <: SemiMetric end
|
||||||
include("utils.jl")
|
include("utils.jl")
|
||||||
include("edit.jl")
|
include("edit.jl")
|
||||||
|
@ -58,12 +54,12 @@ TokenMax,
|
||||||
evaluate,
|
evaluate,
|
||||||
compare,
|
compare,
|
||||||
result_type,
|
result_type,
|
||||||
qgram
|
qgrams
|
||||||
end
|
end
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## Some memo about Strings
|
## Some things about Strings
|
||||||
|
|
||||||
# length: number of characters
|
# length: number of characters
|
||||||
# ncodeunits: Return the number of code units in a string (akin to the index of a vector).
|
# ncodeunits: Return the number of code units in a string (akin to the index of a vector).
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
|
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
|
||||||
|
|
||||||
return a similarity score between 0 and 1 for the strings `s1` and
|
return a similarity score between 0 and 1 for the strings `s1` and
|
||||||
`s2` based on the `StringDistance` `dist`
|
`s2` based on the string distance `dist`.
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
```julia-repl
|
```julia-repl
|
||||||
|
@ -20,14 +20,9 @@ function compare(s1::AbstractString, s2::AbstractString,
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len2 == 0 && return 1.0
|
len2 == 0 && return 1.0
|
||||||
if min_score == 0.0
|
|
||||||
return 1.0 - evaluate(dist, s1, s2) / len2
|
|
||||||
else
|
|
||||||
d = evaluate(dist, s1, s2; max_dist = ceil(Int, len2 * (1 - min_score)))
|
d = evaluate(dist, s1, s2; max_dist = ceil(Int, len2 * (1 - min_score)))
|
||||||
out = 1.0 - d / len2
|
out = 1.0 - d / len2
|
||||||
out < min_score && return 0.0
|
out < min_score ? 0.0 : out
|
||||||
return out
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString,
|
function compare(s1::AbstractString, s2::AbstractString,
|
||||||
|
@ -102,7 +97,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_scor
|
||||||
len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
|
len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
|
||||||
len1 == 0 && return 1.0
|
len1 == 0 && return 1.0
|
||||||
out = 0.0
|
out = 0.0
|
||||||
for x in qgram(s2, len1)
|
for x in qgrams(s2, len1)
|
||||||
curr = compare(s1, x, dist.dist; min_score = min_score)
|
curr = compare(s1, x, dist.dist; min_score = min_score)
|
||||||
out = max(out, curr)
|
out = max(out, curr)
|
||||||
min_score = max(out, min_score)
|
min_score = max(out, min_score)
|
||||||
|
@ -169,7 +164,7 @@ end
|
||||||
Creates the `TokenSet{dist}` distance
|
Creates the `TokenSet{dist}` distance
|
||||||
|
|
||||||
`TokenSet{dist}` modifies the string distance `dist` to adjust for differences
|
`TokenSet{dist}` modifies the string distance `dist` to adjust for differences
|
||||||
in word orders and word numbers, by comparing the intersection of two strings with each string.
|
in word orders and word numbers by comparing the intersection of two strings with each string.
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
```julia-repl
|
```julia-repl
|
||||||
|
@ -192,12 +187,12 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_sco
|
||||||
s1 = join(v1, " ")
|
s1 = join(v1, " ")
|
||||||
s2 = join(v2, " ")
|
s2 = join(v2, " ")
|
||||||
isempty(s0) && return compare(s1, s2, dist.dist; min_score = min_score)
|
isempty(s0) && return compare(s1, s2, dist.dist; min_score = min_score)
|
||||||
dist0 = compare(s0, s1, dist.dist; min_score = min_score)
|
score_01 = compare(s0, s1, dist.dist; min_score = min_score)
|
||||||
min_score = max(min_score, dist0)
|
min_score = max(min_score, score_01)
|
||||||
dist1 = compare(s0, s2, dist.dist; min_score = min_score)
|
score_02 = compare(s0, s2, dist.dist; min_score = min_score)
|
||||||
min_score = max(min_score, dist1)
|
min_score = max(min_score, score_02)
|
||||||
dist2 = compare(s0, s2, dist.dist; min_score = min_score)
|
score_12 = compare(s1, s2, dist.dist; min_score = min_score)
|
||||||
max(dist0, dist1, dist2)
|
max(score_01, score_02, score_12)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
@ -225,31 +220,31 @@ end
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0)
|
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
dist0 = compare(s1, s2, dist.dist; min_score = min_score)
|
score = compare(s1, s2, dist.dist; min_score = min_score)
|
||||||
min_score = max(min_score, dist0)
|
min_score = max(min_score, score)
|
||||||
unbase_scale = 0.95
|
unbase_scale = 0.95
|
||||||
# if one string is much shorter than the other, use partial
|
# if one string is much shorter than the other, use partial
|
||||||
if length(s2) >= 1.5 * length(s1)
|
if length(s2) >= 1.5 * length(s1)
|
||||||
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
|
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
|
||||||
dist1 = partial_scale * compare(s1, s2, Partial(dist.dist);
|
score_partial = partial_scale * compare(s1, s2, Partial(dist.dist);
|
||||||
min_score = min_score / partial_scale)
|
min_score = min_score / partial_scale)
|
||||||
min_score = max(min_score, dist1)
|
min_score = max(min_score, score_partial)
|
||||||
dist2 = unbase_scale * partial_scale *
|
score_sort = unbase_scale * partial_scale *
|
||||||
compare(s1, s2, TokenSort(Partial(dist.dist));
|
compare(s1, s2, TokenSort(Partial(dist.dist));
|
||||||
min_score = min_score / (unbase_scale * partial_scale))
|
min_score = min_score / (unbase_scale * partial_scale))
|
||||||
min_score = max(min_score, dist2)
|
min_score = max(min_score, score_sort)
|
||||||
dist3 = unbase_scale * partial_scale *
|
score_set = unbase_scale * partial_scale *
|
||||||
compare(s1, s2, TokenSet(Partial(dist.dist));
|
compare(s1, s2, TokenSet(Partial(dist.dist));
|
||||||
min_score = min_score / (unbase_scale * partial_scale))
|
min_score = min_score / (unbase_scale * partial_scale))
|
||||||
return max(dist0, dist1, dist2, dist3)
|
return max(score, score_partial, score_sort, score_set)
|
||||||
else
|
else
|
||||||
dist1 = unbase_scale *
|
score_sort = unbase_scale *
|
||||||
compare(s1, s2, TokenSort(dist.dist);
|
compare(s1, s2, TokenSort(dist.dist);
|
||||||
min_score = min_score / unbase_scale)
|
min_score = min_score / unbase_scale)
|
||||||
min_score = max(min_score, dist1)
|
min_score = max(min_score, score_sort)
|
||||||
dist2 = unbase_scale *
|
score_set = unbase_scale *
|
||||||
compare(s1, s2, TokenSet(dist.dist);
|
compare(s1, s2, TokenSet(dist.dist);
|
||||||
min_score = min_score / unbase_scale)
|
min_score = min_score / unbase_scale)
|
||||||
return max(dist0, dist1, dist2)
|
return max(score, score_sort, score_set)
|
||||||
end
|
end
|
||||||
end
|
end
|
46
src/qgram.jl
46
src/qgram.jl
|
@ -1,9 +1,4 @@
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
##
|
|
||||||
## Define a type that iterates through q-grams of a string
|
|
||||||
##
|
|
||||||
############################################################################
|
|
||||||
struct QGramIterator{S <: AbstractString}
|
struct QGramIterator{S <: AbstractString}
|
||||||
s::S # string
|
s::S # string
|
||||||
q::Int # Length of Qgram
|
q::Int # Length of Qgram
|
||||||
|
@ -22,43 +17,34 @@ Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
|
||||||
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
|
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Return an iterator that iterates on the QGram of the string
|
Return an iterator over the q-grams of a string
|
||||||
|
|
||||||
### Arguments
|
### Arguments
|
||||||
* `s::AbstractString`
|
* `s::AbstractString`
|
||||||
* `q::Integer`: length of qgram
|
* `q::Integer`: length of q-gram
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
```julia
|
```julia
|
||||||
using StringDistances
|
for x in qgrams("hello", 2)
|
||||||
for x in qgram("hello", 2)
|
|
||||||
println(x)
|
println(x)
|
||||||
end
|
end
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
qgram(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
|
qgrams(s::AbstractString, q::Integer) = QGramIterator{typeof(s)}(s, q)
|
||||||
|
|
||||||
|
|
||||||
##############################################################################
|
|
||||||
##
|
|
||||||
## Distance on strings is computed by set distance on qgram sets
|
|
||||||
##
|
|
||||||
##############################################################################
|
|
||||||
|
|
||||||
abstract type QGramDistance <: StringDistance end
|
abstract type QGramDistance <: StringDistance end
|
||||||
|
|
||||||
function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
|
# For two iterators x1 and x2, that define a length and eltype method,
|
||||||
x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
|
# this returns a dictionary which, for each element in x1 or x2,
|
||||||
evaluate(dist, values(x))
|
|
||||||
end
|
|
||||||
|
|
||||||
# For two iterators x1 and x2, this returns a dictionary which, for each element in x1 or x2,
|
|
||||||
# returns a tuple with the numbers of times it appears in x1 and x2
|
# returns a tuple with the numbers of times it appears in x1 and x2
|
||||||
function count_map(s1, s2)
|
function count_map(s1, s2)
|
||||||
K = promote_type(eltype(s1), eltype(s2))
|
K = promote_type(eltype(s1), eltype(s2))
|
||||||
d = Dict{K, Tuple{Int, Int}}()
|
d = Dict{K, Tuple{Int, Int}}()
|
||||||
|
sizehint!(d, length(s1) + length(s2))
|
||||||
# I use a faster way to change a dictionary key
|
# I use a faster way to change a dictionary key
|
||||||
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
|
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
|
||||||
sizehint!(d, length(s1) + length(s2))
|
|
||||||
for x1 in s1
|
for x1 in s1
|
||||||
index = Base.ht_keyindex2!(d, x1)
|
index = Base.ht_keyindex2!(d, x1)
|
||||||
if index > 0
|
if index > 0
|
||||||
|
@ -98,8 +84,10 @@ struct QGram <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::QGram, itr)
|
function evaluate(dist::QGram, s1::AbstractString, s2::AbstractString)
|
||||||
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
n = 0
|
n = 0
|
||||||
|
itr =
|
||||||
for (n1, n2) in itr
|
for (n1, n2) in itr
|
||||||
n += abs(n1 - n2)
|
n += abs(n1 - n2)
|
||||||
end
|
end
|
||||||
|
@ -122,7 +110,8 @@ struct Cosine <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::Cosine, itr)
|
function evaluate(dist::Cosine, s1::AbstractString, s2::AbstractString)
|
||||||
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
norm1, norm2, prodnorm = 0, 0, 0
|
norm1, norm2, prodnorm = 0, 0, 0
|
||||||
for (n1, n2) in itr
|
for (n1, n2) in itr
|
||||||
norm1 += n1^2
|
norm1 += n1^2
|
||||||
|
@ -147,7 +136,8 @@ struct Jaccard <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::Jaccard, itr)
|
function evaluate(dist::Jaccard, s1::AbstractString, s2::AbstractString)
|
||||||
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in itr
|
for (n1, n2) in itr
|
||||||
ndistinct1 += n1 > 0
|
ndistinct1 += n1 > 0
|
||||||
|
@ -172,7 +162,8 @@ struct SorensenDice <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::SorensenDice, itr)
|
function evaluate(dist::SorensenDice, s1::AbstractString, s2::AbstractString)
|
||||||
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in itr
|
for (n1, n2) in itr
|
||||||
ndistinct1 += n1 > 0
|
ndistinct1 += n1 > 0
|
||||||
|
@ -197,7 +188,8 @@ struct Overlap <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::Overlap, itr)
|
function evaluate(dist::Overlap, s1::AbstractString, s2::AbstractString)
|
||||||
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in itr
|
for (n1, n2) in itr
|
||||||
ndistinct1 += n1 > 0
|
ndistinct1 += n1 > 0
|
||||||
|
|
Loading…
Reference in New Issue