# Patch notes (text ahead of the first `diff --git` header; not part of any hunk).
#
# src/edit.jl
#   * (dist::RatcliffObershelp)(s1, s2): adds `s1, s2 = reorder(s1, s2)` before the
#     matching blocks are summed. `reorder` is not defined in this patch.
#   * matching_blocks, matching_blocks!, longest_common_pattern: the explicit
#     len1/len2 arguments are removed; longest_common_pattern now calls
#     length(s1) / length(s2) directly.
#   * matching_blocks!: the s1before/s2before/s1after/s2after temporaries are
#     replaced by inline _take/_drop calls in the recursive calls.
#   * longest_common_pattern: `i2 = 0` moves below `oldp = 0` and the parentheses
#     around the `currentlength` expression are dropped.
# src/utils.jl
#   * _take/_drop for AbstractString: the SubString result is wrapped in
#     StringWithLength, with `n` (take) or `length(s) - n` (drop) as the second
#     argument. StringWithLength is not defined in this patch; presumably it
#     carries a precomputed length -- confirm against its definition.
#
# NOTE(review): the hunk bodies below are laid out one diff line per text line.
# Blank context lines and leading indentation are inferred from the hunk headers
# (51/47 and 14/14 lines) and 4-space Julia indentation; check them against the
# repository before applying.
diff --git a/src/edit.jl b/src/edit.jl
index a6d7bb1..a59d26f 100755
--- a/src/edit.jl
+++ b/src/edit.jl
@@ -224,51 +224,47 @@ struct RatcliffObershelp <: SemiMetric end
 
 function (dist::RatcliffObershelp)(s1, s2)
     ((s1 === missing) | (s2 === missing)) && return missing
+    s1, s2 = reorder(s1, s2)
     n_matched = sum(last.(matching_blocks(s1, s2)))
     len1, len2 = length(s1), length(s2)
     len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2)
 end
 
 function matching_blocks(s1, s2)
-    matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1)
+    matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, 1, 1)
 end
 
-function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2,
-                          len1::Integer, len2::Integer, start1::Integer, start2::Integer)
-    a = longest_common_pattern(s1, s2, len1 , len2)
+function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer)
+    a = longest_common_pattern(s1, s2)
     # exit if there is no common substring
     a[3] == 0 && return x
     # add the info of the common to the existing set
     push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
     # add the longest common substring that happens before
-    s1before = _take(s1, a[1] - 1)
-    s2before = _take(s2, a[2] - 1)
-    matching_blocks!(x, s1before, s2before, a[1] - 1, a[2] - 1, start1, start2)
+    matching_blocks!(x, _take(s1, a[1] - 1), _take(s2, a[2] - 1), start1, start2)
     # add the longest common substring that happens after
-    s1after = _drop(s1, a[1] + a[3] - 1)
-    s2after = _drop(s2, a[2] + a[3] - 1)
-    matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1,
-                     len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
+    matching_blocks!(x, _drop(s1, a[1] + a[3] - 1), _drop(s2, a[2] + a[3] - 1),
+                     start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
     return x
 end
 
-function longest_common_pattern(s1, s2, len1::Integer, len2::Integer)
-    if len1 > len2
-        start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
+function longest_common_pattern(s1, s2)
+    if length(s1) > length(s2)
+        start2, start1, len = longest_common_pattern(s2, s1)
     else
         start1, start2, len = 0, 0, 0
-        p = zeros(Int, len2)
+        p = zeros(Int, length(s2))
         i1 = 0
         for ch1 in s1
             i1 += 1
-            i2 = 0
             oldp = 0
+            i2 = 0
             for ch2 in s2
                 i2 += 1
                 newp = 0
                 if ch1 == ch2
                     newp = oldp > 0 ? oldp : i2
-                    currentlength = (i2 - newp + 1)
+                    currentlength = i2 - newp + 1
                     if currentlength > len
                         start1, start2, len = i1 - currentlength + 1, newp, currentlength
                     end
diff --git a/src/utils.jl b/src/utils.jl
index a2d84bf..54dd656 100755
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -63,14 +63,14 @@ function _take(s, n::Integer)
     Base.Iterators.take(s, n)
 end
 function _take(s::AbstractString, n::Integer)
-    SubString(s, firstindex(s), nextind(s, 0, n))
+    StringWithLength(SubString(s, firstindex(s), nextind(s, 0, n)), n)
 end
 
 function _drop(s, n::Integer)
     Base.Iterators.drop(s, n)
 end
 function _drop(s::AbstractString, n::Integer)
-    SubString(s, nextind(s, 0, n + 1), lastindex(s))
+    StringWithLength(SubString(s, nextind(s, 0, n + 1), lastindex(s)), length(s) - n)
 end
 
 function _slice(s, n1::Integer, n2::Integer)