matthieugomez 2020-02-24 09:41:38 -05:00
parent afafe93bf6
commit 6f22f2c9f5
3 changed files with 27 additions and 28 deletions

View File

@ -27,6 +27,7 @@ function (dist::Jaro)(s1, s2)
ch1_match = Vector{eltype(s1)}()
for (i1, ch1) in enumerate(s1)
for (i2, ch2) in enumerate(s2)
# greedy alignment
if (i2 <= i1 + maxdist) && (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2]
flag[i2] = true
push!(ch1_match, ch1)
@ -191,16 +192,16 @@ function matching_blocks(s1, s2)
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)
a = longest_common_pattern(s1, s2)
n1, n2, len = longest_common_pattern(s1, s2)
# exit if there is no common substring
a[3] == 0 && return x
len == 0 && return x
# add the info of the common to the existing set
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
push!(x, (n1 + start1 - 1, n2 + start2 - 1, len))
# add the longest common substring that happens before
matching_blocks!(x, _take(s1, a[1] - 1), _take(s2, a[2] - 1), start1, start2)
matching_blocks!(x, _take(s1, n1 - 1), _take(s2, n2 - 1), start1, start2)
# add the longest common substring that happens after
matching_blocks!(x, _drop(s1, a[1] + a[3] - 1), _drop(s2, a[2] + a[3] - 1),
start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
matching_blocks!(x, _drop(s1, n1 + len - 1), _drop(s2, n2 + len - 1),
start1 + n1 + len - 1, start2 + n2 + len - 1)
return x

View File

@ -237,14 +237,15 @@ function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0
unbase_scale = 0.95
# if one string is much shorter than the other, use partial
if length(s2) >= 1.5 * length(s1)
partial_dist = Partial(dist.dist)
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
score_partial = 1 - partial_scale * (1 - Partial(dist.dist)(s1, s2, 1 - (1 - max_dist) / partial_scale))
score_partial = 1 - partial_scale * (1 - partial_dist(s1, s2, 1 - (1 - max_dist) / partial_scale))
min_score = min(max_dist, score_partial)
score_sort = 1 - unbase_scale * partial_scale *
(1 - TokenSort(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
(1 - TokenSort(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
max_dist = min(max_dist, score_sort)
score_set = 1 - unbase_scale * partial_scale *
(1 - TokenSet(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
(1 - TokenSet(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
return min(score, score_partial, score_sort, score_set)
score_sort = 1 - unbase_scale *

View File

@ -15,6 +15,7 @@ function Base.iterate(qgram::QGramIterator{<: AbstractString},
# Element type reported for q-gram iterators over strings: in both methods
# the elements are `SubString{S}`, where `S` is the non-SubString string type.
function Base.eltype(qgram::QGramIterator{SubString{S}}) where {S}
    return SubString{S}
end
function Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString}
    return SubString{S}
end

# Wrap the string `s` and the integer `q` in a QGramIterator.
function qgrams(s::AbstractString, q::Integer)
    return QGramIterator(s, q)
end
#q-grams of AbstractVector
@ -25,9 +26,11 @@ function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstinde
view(qgram.s, state:(state + qgram.q - 1)), state + 1
# Element type of a q-gram iterator over a vector: taken from the type of
# the first q-gram the iterator produces.
function Base.eltype(qgram::QGramIterator{<: AbstractVector})
    return typeof(first(qgram))
end

# Vectors are wrapped directly in a QGramIterator.
function qgrams(s::AbstractVector, q::Integer)
    return QGramIterator(s, q)
end

# Any other input is first materialized with `collect`, then wrapped.
function qgrams(s, q::Integer)
    materialized = collect(s)
    return QGramIterator(materialized, q)
end
@doc """
Return an iterator over the q-grams of a string
### Arguments
@ -40,15 +43,14 @@ for x in qgrams("hello", 2)
function qgrams(s::Union{AbstractString, AbstractVector}, q::Integer)
    # strings and vectors are wrapped as-is
    return QGramIterator(s, q)
end
function qgrams(s, q::Integer)
    # any other input is materialized with `collect` before being wrapped
    materialized = collect(s)
    return QGramIterator(materialized, q)
end
# For two iterators x1 and x2, that define a length and eltype method,
# this returns a dictionary which, for each element in x1 or x2,
# returns a tuple with the numbers of times it appears in x1 and x2
function count_map(s1, s2)
# For two iterators s1 and s2 that define length and eltype methods,
# this returns an iterator that, for each element in s1 or s2,
# yields (number of times it appears in s1, number of times it appears in s2)
function _count(s1, s2)
K = promote_type(eltype(s1), eltype(s2))
d = Dict{K, Tuple{Int, Int}}()
sizehint!(d, length(s1) + length(s2))
@ -74,7 +76,7 @@ function count_map(s1, s2)
@inbounds Base._setindex!(d, (0, 1), x2, -index)
return d
return values(d)
@ -98,9 +100,8 @@ end
function (dist::QGram)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
n = 0
for (n1, n2) in itr
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
n += abs(n1 - n2)
@ -124,9 +125,8 @@ end
function (dist::Cosine)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in itr
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
norm1 += n1^2
norm2 += n2^2
prodnorm += n1 * n2
@ -151,9 +151,8 @@ end
function (dist::Jaccard)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
@ -178,9 +177,8 @@ end
function (dist::SorensenDice)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
@ -205,9 +203,8 @@ end
function (dist::Overlap)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)