simplify Ratcliff

compathelper/new_version/2020-05-20-12-03-08-092-188304956
matthieugomez 2020-02-16 11:12:31 -05:00
parent 283ce87ef2
commit d80071590b
2 changed files with 15 additions and 19 deletions

View File

@ -224,51 +224,47 @@ struct RatcliffObershelp <: SemiMetric end
# Ratcliff/Obershelp distance between `s1` and `s2`.
#
# Returns `missing` when either argument is `missing`; otherwise a `Float64`
# equal to `1 - 2 * matched / (length(s1) + length(s2))`, where `matched` is the
# total length of the blocks reported by `matching_blocks`. Two empty inputs
# give `0.0`.
function (dist::RatcliffObershelp)(s1, s2)
    # A missing value on either side propagates to the result.
    if s1 === missing || s2 === missing
        return missing
    end
    # NOTE(review): `reorder` is defined elsewhere; presumably it puts the two
    # inputs in a canonical order — confirm against its definition.
    s1, s2 = reorder(s1, s2)
    # The last entry of every block tuple is the length of that matched run.
    matched = 0
    for block in matching_blocks(s1, s2)
        matched += last(block)
    end
    total = length(s1) + length(s2)
    # Nothing to compare: treat two empty inputs as identical.
    if total == 0
        return 0.
    end
    return 1.0 - 2 * matched / total
end
# Collect the matching blocks of `s1` and `s2` into a fresh
# `Set{Tuple{Int, Int, Int}}`, starting the recursion in `matching_blocks!` at
# position 1 of both inputs.
# NOTE(review): the two calls below are alternative versions of the same
# statement (with and without explicit lengths), shown together by this diff
# view. Presumably only the second one survives the commit — confirm against the
# repository.
function matching_blocks(s1, s2)
matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1)
matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, 1, 1)
end
# Recursively add to `x` the blocks shared by `s1` and `s2`: first the longest
# common pattern, then the blocks found in the parts of `s1`/`s2` that precede
# it, then the blocks found in the parts that follow it. Each stored entry is
# `(start in s1, start in s2, length)`. `start1` and `start2` are the positions
# that the first elements of the current `s1`/`s2` occupy in the inputs of the
# outermost call, so stored starts refer to those outermost inputs.
# Returns `x`.
# NOTE(review): this span shows the signature and several statements twice —
# once threading explicit `len1`/`len2` arguments and once without them. These
# are the two sides of the commit rendered without +/- markers; presumably the
# `len1`/`len2` form is the removed one — confirm against the repository.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2,
len1::Integer, len2::Integer, start1::Integer, start2::Integer)
a = longest_common_pattern(s1, s2, len1 , len2)
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)
# a[1], a[2]: start of the common pattern in s1 and in s2; a[3]: its length
a = longest_common_pattern(s1, s2)
# exit if there is no common substring
a[3] == 0 && return x
# record the common block, shifting its local starts by the offsets start1/start2
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
# recurse on what precedes the common block: the first a[1] - 1 elements of s1
# and the first a[2] - 1 elements of s2 (offsets are unchanged)
s1before = _take(s1, a[1] - 1)
s2before = _take(s2, a[2] - 1)
matching_blocks!(x, s1before, s2before, a[1] - 1, a[2] - 1, start1, start2)
matching_blocks!(x, _take(s1, a[1] - 1), _take(s2, a[2] - 1), start1, start2)
# recurse on what follows the common block: drop everything up to and including
# it, and advance the offsets by the number of dropped elements
s1after = _drop(s1, a[1] + a[3] - 1)
s2after = _drop(s2, a[2] + a[3] - 1)
matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1,
len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
matching_blocks!(x, _drop(s1, a[1] + a[3] - 1), _drop(s2, a[2] + a[3] - 1),
start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
return x
end
function longest_common_pattern(s1, s2, len1::Integer, len2::Integer)
if len1 > len2
start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
function longest_common_pattern(s1, s2)
if length(s1) > length(s2)
start2, start1, len = longest_common_pattern(s2, s1)
else
start1, start2, len = 0, 0, 0
p = zeros(Int, len2)
p = zeros(Int, length(s2))
i1 = 0
for ch1 in s1
i1 += 1
i2 = 0
oldp = 0
i2 = 0
for ch2 in s2
i2 += 1
newp = 0
if ch1 == ch2
newp = oldp > 0 ? oldp : i2
currentlength = (i2 - newp + 1)
currentlength = i2 - newp + 1
if currentlength > len
start1, start2, len = i1 - currentlength + 1, newp, currentlength
end

View File

@ -63,14 +63,14 @@ function _take(s, n::Integer)
Base.Iterators.take(s, n)
end
# String method of `_take`: the first `n` characters of `s` as a `SubString`.
# `nextind(s, 0, n)` is the index of the n-th character, so the substring runs
# from the first character through the n-th one.
# NOTE(review): the two expressions below are alternative versions of the return
# value shown together by this diff view. The second wraps the same substring in
# `StringWithLength` together with `n`; presumably that stores the character
# count alongside the string — confirm against `StringWithLength`'s definition.
function _take(s::AbstractString, n::Integer)
SubString(s, firstindex(s), nextind(s, 0, n))
StringWithLength(SubString(s, firstindex(s), nextind(s, 0, n)), n)
end
# Generic fallback of `_drop`: a lazy iterator over `s` that skips its first `n`
# elements.
_drop(s, n::Integer) = Base.Iterators.drop(s, n)
# String method of `_drop`: `s` without its first `n` characters, as a
# `SubString`. `nextind(s, 0, n + 1)` is the index of the (n + 1)-th character,
# so the substring runs from that character through the last one.
# NOTE(review): the two expressions below are alternative versions of the return
# value shown together by this diff view. The second wraps the same substring in
# `StringWithLength` together with `length(s) - n`; presumably that stores the
# remaining character count — confirm against `StringWithLength`'s definition.
function _drop(s::AbstractString, n::Integer)
SubString(s, nextind(s, 0, n + 1), lastindex(s))
StringWithLength(SubString(s, nextind(s, 0, n + 1), lastindex(s)), length(s) - n)
end
function _slice(s, n1::Integer, n2::Integer)