simplify Ratcliff
parent
283ce87ef2
commit
d80071590b
30
src/edit.jl
30
src/edit.jl
|
@ -224,51 +224,47 @@ struct RatcliffObershelp <: SemiMetric end
|
|||
|
||||
# Evaluate the Ratcliff/Obershelp distance between `s1` and `s2`.
#
# Returns `missing` when either argument is `missing`, `0.0` when both inputs
# are empty, and otherwise `1 - 2 * m / (length(s1) + length(s2))`, where `m`
# is the sum of the last element of every tuple returned by `matching_blocks`.
function (dist::RatcliffObershelp)(s1, s2)
    (ismissing(s1) || ismissing(s2)) && return missing
    # NOTE(review): `reorder` is defined elsewhere; presumably it puts the
    # shorter input first — confirm against its definition.
    s1, s2 = reorder(s1, s2)
    total = length(s1) + length(s2)
    # Two empty inputs are at distance zero; this also avoids 0 / 0 below.
    total == 0 && return 0.0
    # Sum the block lengths without materialising an intermediate array;
    # `init = 0` keeps the reduction defined when no block was found.
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    return 1.0 - 2 * n_matched / total
end
|
||||
|
||||
"""
    matching_blocks(s1, s2)

Return a `Set{Tuple{Int, Int, Int}}` filled by `matching_blocks!`, starting
from an empty set and with both start offsets equal to 1.
"""
function matching_blocks(s1, s2)
    return matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, 1, 1)
end
|
||||
|
||||
"""
    matching_blocks!(x, s1, s2, start1, start2)

Recursively add to `x` the common blocks of `s1` and `s2` and return `x`.

Each pushed entry is `(i1, i2, len)`: the block found by
`longest_common_pattern`, with its two start positions shifted by
`start1 - 1` and `start2 - 1`. `start1` and `start2` are the 1-based positions
that the first elements of `s1` and `s2` are counted from in the stored
entries (the top-level call passes 1 and 1).
"""
function matching_blocks!(
    x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer
)
    i1, i2, len = longest_common_pattern(s1, s2)
    # exit if there is no common substring
    len == 0 && return x
    # add the common substring to the existing set, using the shifted positions
    push!(x, (i1 + start1 - 1, i2 + start2 - 1, len))
    # add the longest common substring that happens before
    matching_blocks!(x, _take(s1, i1 - 1), _take(s2, i2 - 1), start1, start2)
    # add the longest common substring that happens after; the offsets advance
    # past the block that was just recorded
    matching_blocks!(
        x,
        _drop(s1, i1 + len - 1),
        _drop(s2, i2 + len - 1),
        start1 + i1 + len - 1,
        start2 + i2 + len - 1,
    )
    return x
end
|
||||
|
||||
function longest_common_pattern(s1, s2, len1::Integer, len2::Integer)
|
||||
if len1 > len2
|
||||
start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
|
||||
function longest_common_pattern(s1, s2)
|
||||
if length(s1) > length(s2)
|
||||
start2, start1, len = longest_common_pattern(s2, s1)
|
||||
else
|
||||
start1, start2, len = 0, 0, 0
|
||||
p = zeros(Int, len2)
|
||||
p = zeros(Int, length(s2))
|
||||
i1 = 0
|
||||
for ch1 in s1
|
||||
i1 += 1
|
||||
i2 = 0
|
||||
oldp = 0
|
||||
i2 = 0
|
||||
for ch2 in s2
|
||||
i2 += 1
|
||||
newp = 0
|
||||
if ch1 == ch2
|
||||
newp = oldp > 0 ? oldp : i2
|
||||
currentlength = (i2 - newp + 1)
|
||||
currentlength = i2 - newp + 1
|
||||
if currentlength > len
|
||||
start1, start2, len = i1 - currentlength + 1, newp, currentlength
|
||||
end
|
||||
|
|
|
@ -63,14 +63,14 @@ function _take(s, n::Integer)
|
|||
Base.Iterators.take(s, n)
|
||||
end
|
||||
"""
    _take(s::AbstractString, n::Integer)

Return the first `n` characters of `s` as a `SubString` wrapped in
`StringWithLength`, passing `n` as the wrapper's second argument.
"""
function _take(s::AbstractString, n::Integer)
    # `nextind(s, 0, n)` is the index of the n-th character (0 when n == 0,
    # which yields an empty SubString).
    # NOTE(review): the second argument to `StringWithLength` is presumably the
    # cached character count — confirm against its definition.
    return StringWithLength(SubString(s, firstindex(s), nextind(s, 0, n)), n)
end
|
||||
|
||||
"""
    _drop(s, n::Integer)

Return a lazy iterator over `s` that skips its first `n` elements
(`Base.Iterators.drop`).
"""
function _drop(s, n::Integer)
    return Base.Iterators.drop(s, n)
end
|
||||
"""
    _drop(s::AbstractString, n::Integer)

Return `s` without its first `n` characters as a `SubString` wrapped in
`StringWithLength`, passing `length(s) - n` as the wrapper's second argument.
"""
function _drop(s::AbstractString, n::Integer)
    # `nextind(s, 0, n + 1)` is the index of the first character that is kept.
    # NOTE(review): the second argument to `StringWithLength` is presumably the
    # cached character count — confirm against its definition.
    return StringWithLength(
        SubString(s, nextind(s, 0, n + 1), lastindex(s)), length(s) - n
    )
end
|
||||
|
||||
function _slice(s, n1::Integer, n2::Integer)
|
||||
|
|
Loading…
Reference in New Issue