clean
parent
afafe93bf6
commit
6f22f2c9f5
13
src/edit.jl
13
src/edit.jl
|
@ -27,6 +27,7 @@ function (dist::Jaro)(s1, s2)
|
|||
ch1_match = Vector{eltype(s1)}()
|
||||
for (i1, ch1) in enumerate(s1)
|
||||
for (i2, ch2) in enumerate(s2)
|
||||
# greedy alignment
|
||||
if (i2 <= i1 + maxdist) && (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2]
|
||||
flag[i2] = true
|
||||
push!(ch1_match, ch1)
|
||||
|
@ -191,16 +192,16 @@ function matching_blocks(s1, s2)
|
|||
end
|
||||
|
||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)
|
||||
a = longest_common_pattern(s1, s2)
|
||||
n1, n2, len = longest_common_pattern(s1, s2)
|
||||
# exit if there is no common substring
|
||||
a[3] == 0 && return x
|
||||
len == 0 && return x
|
||||
# add the info of the common to the existing set
|
||||
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
||||
push!(x, (n1 + start1 - 1, n2 + start2 - 1, len))
|
||||
# add the longest common substring that happens before
|
||||
matching_blocks!(x, _take(s1, a[1] - 1), _take(s2, a[2] - 1), start1, start2)
|
||||
matching_blocks!(x, _take(s1, n1 - 1), _take(s2, n2 - 1), start1, start2)
|
||||
# add the longest common substring that happens after
|
||||
matching_blocks!(x, _drop(s1, a[1] + a[3] - 1), _drop(s2, a[2] + a[3] - 1),
|
||||
start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
|
||||
matching_blocks!(x, _drop(s1, n1 + len - 1), _drop(s2, n2 + len - 1),
|
||||
start1 + n1 + len - 1, start2 + n2 + len - 1)
|
||||
return x
|
||||
end
|
||||
|
||||
|
|
|
@ -237,14 +237,15 @@ function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0
|
|||
unbase_scale = 0.95
|
||||
# if one string is much shorter than the other, use partial
|
||||
if length(s2) >= 1.5 * length(s1)
|
||||
partial_dist = Partial(dist.dist)
|
||||
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
|
||||
score_partial = 1 - partial_scale * (1 - Partial(dist.dist)(s1, s2, 1 - (1 - max_dist) / partial_scale))
|
||||
score_partial = 1 - partial_scale * (1 - partial_dist(s1, s2, 1 - (1 - max_dist) / partial_scale))
|
||||
min_score = min(max_dist, score_partial)
|
||||
score_sort = 1 - unbase_scale * partial_scale *
|
||||
(1 - TokenSort(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
||||
(1 - TokenSort(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
||||
max_dist = min(max_dist, score_sort)
|
||||
score_set = 1 - unbase_scale * partial_scale *
|
||||
(1 - TokenSet(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
||||
(1 - TokenSet(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
||||
return min(score, score_partial, score_sort, score_set)
|
||||
else
|
||||
score_sort = 1 - unbase_scale *
|
||||
|
|
35
src/qgram.jl
35
src/qgram.jl
|
@ -15,6 +15,7 @@ function Base.iterate(qgram::QGramIterator{<: AbstractString},
|
|||
end
|
||||
Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
|
||||
Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
|
||||
qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)
|
||||
|
||||
|
||||
#q-grams of AbstractVector
|
||||
|
@ -25,9 +26,11 @@ function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstinde
|
|||
view(qgram.s, state:(state + qgram.q - 1)), state + 1
|
||||
end
|
||||
Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
|
||||
qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q)
|
||||
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
|
||||
|
||||
|
||||
"""
|
||||
@doc """
|
||||
Return an iterator over the q-grams of a string
|
||||
|
||||
### Arguments
|
||||
|
@ -40,15 +43,14 @@ for x in qgrams("hello", 2)
|
|||
println(x)
|
||||
end
|
||||
```
|
||||
"""
|
||||
qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s, q)
|
||||
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
|
||||
"""
|
||||
qgrams
|
||||
|
||||
|
||||
# For two iterators x1 and x2, that define a length and eltype method,
|
||||
# this returns a dictionary which, for each element in x1 or x2,
|
||||
# returns a tuple with the number of times it appears in x1 and in x2
|
||||
function count_map(s1, s2)
|
||||
# For two iterators s1 and s2, that define a length and eltype method,
|
||||
# this returns an iterator that,
|
||||
# for each element in s1 ∪ s2, yields (number of times it appears in s1, number of times it appears in s2)
|
||||
function _count(s1, s2)
|
||||
K = promote_type(eltype(s1), eltype(s2))
|
||||
d = Dict{K, Tuple{Int, Int}}()
|
||||
sizehint!(d, length(s1) + length(s2))
|
||||
|
@ -74,7 +76,7 @@ function count_map(s1, s2)
|
|||
@inbounds Base._setindex!(d, (0, 1), x2, -index)
|
||||
end
|
||||
end
|
||||
return d
|
||||
return values(d)
|
||||
end
|
||||
|
||||
|
||||
|
@ -98,9 +100,8 @@ end
|
|||
|
||||
function (dist::QGram)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
n = 0
|
||||
for (n1, n2) in itr
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
n += abs(n1 - n2)
|
||||
end
|
||||
n
|
||||
|
@ -124,9 +125,8 @@ end
|
|||
|
||||
function (dist::Cosine)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
norm1, norm2, prodnorm = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
norm1 += n1^2
|
||||
norm2 += n2^2
|
||||
prodnorm += n1 * n2
|
||||
|
@ -151,9 +151,8 @@ end
|
|||
|
||||
function (dist::Jaccard)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
ndistinct1 += n1 > 0
|
||||
ndistinct2 += n2 > 0
|
||||
nintersect += (n1 > 0) & (n2 > 0)
|
||||
|
@ -178,9 +177,8 @@ end
|
|||
|
||||
function (dist::SorensenDice)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
ndistinct1 += n1 > 0
|
||||
ndistinct2 += n2 > 0
|
||||
nintersect += (n1 > 0) & (n2 > 0)
|
||||
|
@ -205,9 +203,8 @@ end
|
|||
|
||||
function (dist::Overlap)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
ndistinct1 += n1 > 0
|
||||
ndistinct2 += n2 > 0
|
||||
nintersect += (n1 > 0) & (n2 > 0)
|
||||
|
|
Loading…
Reference in New Issue