diff --git a/benchmark/benchmark.jl b/benchmark/benchmark.jl index 90424fe..6b94573 100644 --- a/benchmark/benchmark.jl +++ b/benchmark/benchmark.jl @@ -1,5 +1,5 @@ -using StringDistances, Random +using Revise, StringDistances, Random Random.seed!(2) x = map(Random.randstring, rand(5:25,500_000)) y = map(Random.randstring, rand(5:25,500_000)) @@ -13,6 +13,7 @@ function g(dist, x, y) end +@time f(Hamming(), x, y); @time f(Jaro(), x, y); #0.3s @@ -25,7 +26,7 @@ end @time f(DamerauLevenshtein(), x, y, min_score = 0.8); # 0.08 @time f(RatcliffObershelp(), x, y); -# 1.35s +# 0.8s diff --git a/src/distances/edit.jl b/src/distances/edit.jl index d067bc6..b7650c6 100755 --- a/src/distances/edit.jl +++ b/src/distances/edit.jl @@ -42,26 +42,28 @@ function (dist::Jaro)(s1, s2) (s1 === missing) | (s2 === missing) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) - # Note: if both are empty, formula in Wikipedia gives 1, but it makes more sense to set it to s1 == s2 - len2 == 0 && return Float64(s1 == s2) + # If both iterators empty, formula in Wikipedia gives 1, but it makes more sense to set it to s1 == s2 + len2 > 0 || return Float64(s1 == s2) d = max(0, div(len2, 2) - 1) flag = fill(false, len2) ch1_match = Vector{eltype(s1)}() for (i1, ch1) in enumerate(s1) for (i2, ch2) in enumerate(s2) # for each character in s1, greedy search of matching character in s2 within a distance d - if (i2 >= i1 - d) && (i2 <= i1 + d) && (ch1 == ch2) && !flag[i2] + i2 >= i1 - d || continue + i2 <= i1 + d || break + if ch1 == ch2 && !flag[i2] flag[i2] = true push!(ch1_match, ch1) break end end end - # m counts number matching characters - m = length(ch1_match) - if m == 0 + if isempty(ch1_match) return 1.0 else + # m counts number matching characters + m = length(ch1_match) # t/2 counts number transpositions t = 0 i1 = 0 @@ -134,11 +136,11 @@ function (dist::Levenshtein)(s1, s2) v = collect(1:(len2-k)) current = 0 for (i1, ch1) in enumerate(s1) - i1 <= k && continue + i1 > k || continue left = current = i1 - k - 1 dist.max_dist !== nothing && (value_lb = left - 1) for (i2, ch2) in enumerate(s2) - i2 <= k && continue + i2 > k || continue @inbounds above, current, left = current, left, v[i2 - k] if ch1 != ch2 current = min(current, above, left) + 1 @@ -192,7 +194,7 @@ function (dist::DamerauLevenshtein)(s1, s2) prevch1, prevch2 = first(s1), first(s2) current = 0 for (i1, ch1) in enumerate(s1) - i1 <= k && (prevch1 = ch1 ; continue) + i1 > k || (prevch1 = ch1 ; continue) left = i1 - k - 1 current = left + 1 nextTransCost = 0 @@ -201,17 +203,17 @@ function (dist::DamerauLevenshtein)(s1, s2) i2_end += i2_end < len2 end for (i2, ch2) in enumerate(s2) - i2 <= k && (prevch2 = ch2 ; continue) + i2 > k || (prevch2 = ch2 ; continue) # no need to look beyond window of lower right diagonal - max distance cells #lower right diag is i1 - (len2 - len1)) and the upper left diagonal + dist.max_dist cells (upper left is i1) - (dist.max_dist !== nothing) && !(i2_start <= i2 - k - 1 < i2_end) && (prevch2 = ch2 ; continue) + dist.max_dist !== nothing && !(i2_start <= i2 - k - 1 < i2_end) && (prevch2 = ch2 ; continue) @inbounds above, current, left = current, left, v[i2 - k] @inbounds w[i2 - k], nextTransCost, thisTransCost = current, w[i2 - k], nextTransCost # left now equals current cost (which will be diagonal at next iteration) if ch1 != ch2 current = min(left, current, above) + 1 # never happens at i2 = k + 1 because then the two previous characters were equal - if (i1 - k - 1 > 0) && (i2 - k - 1 > 0) && (ch1 == prevch2) && (prevch1 == ch2) + if i1 - k - 1 > 0 && i2 - k - 1 > 0 && ch1 == prevch2 && prevch1 == ch2 thisTransCost += 1 current = min(current, thisTransCost) end @@ -240,58 +242,52 @@ struct RatcliffObershelp <: SemiMetric end function (dist::RatcliffObershelp)(s1, s2) (s1 === missing) | (s2 === missing) && return missing - s1, s2 = reorder(s1, s2) - n_matched = sum(last.(matching_blocks(s1, s2))) len1, len2 = length(s1), length(s2) - len1 + len2 == 0 ? 0.0 : 1 - 2 * n_matched / (len1 + len2) + n_matched = length_matching_blocks(s1, s2, 1, 1, len1, len2) + len1 + len2 == 0 ? 0.0 : 1 - 2 * n_matched / (len1 + len2) end -function matching_blocks(s1, s2) - matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, 1, 1) -end - -function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer) - n1, n2, len = longest_common_pattern(s1, s2) +function length_matching_blocks(s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer) + end1 >= start1 || return 0 + end2 >= start2 || return 0 + j1, j2, len = longest_common_pattern(s1, s2, start1, start2, end1, end2) # exit if there is no common substring - len == 0 && return x - # add the info of the common to the existing set - push!(x, (n1 + start1 - 1, n2 + start2 - 1, len)) - # add the longest common substring that happens before - matching_blocks!(x, _take(s1, n1 - 1), _take(s2, n2 - 1), start1, start2) - # add the longest common substring that happens after - matching_blocks!(x, _drop(s1, n1 + len - 1), _drop(s2, n2 + len - 1), - start1 + n1 + len - 1, start2 + n2 + len - 1) - return x + len == 0 && return 0 + # add the length of the longest common substring that happens before + len += length_matching_blocks(s1, s2, start1, start2, j1 - 1, j2 - 1) + # add the length of the longest common substring that happens after + len += length_matching_blocks(s1, s2, j1 + len, j2 + len, end1, end2) + return len end -_take(s, n::Integer) = Base.Iterators.take(s, n) -_take(s::StringWithLength, n::Integer) = StringWithLength(SubString(s, firstindex(s), nextind(s, 0, n)), n) - -_drop(s, n::Integer) = Base.Iterators.drop(s, n) -_drop(s::StringWithLength, n::Integer) = StringWithLength(SubString(s, nextind(s, 0, n + 1), lastindex(s)), length(s) - n) - - -function longest_common_pattern(s1, s2) - if length(s1) > length(s2) - start2, start1, len = longest_common_pattern(s2, s1) +function longest_common_pattern(s1, s2, start1, start2, end1, end2) + if end1 - start1 > end2 - start2 + j2, j1, len = longest_common_pattern(s2, s1, start2, start1, end2, end1) else - start1, start2, len = 0, 0, 0 - p = zeros(Int, length(s2)) + j1, j2, len = 0, 0, 0 + # p[i2-start2+1] stores the startingindex of the longest + # common pattern up to i2 with prevch1 as last matching character + p = zeros(Int, end2 - start2 + 1) for (i1, ch1) in enumerate(s1) - oldp = 0 + i1 >= start1 || continue + i1 <= end1 || break + oldj2 = 0 for (i2, ch2) in enumerate(s2) - newp = 0 - if ch1 == ch2 - newp = oldp > 0 ? oldp : i2 - currentlength = i2 - newp + 1 - if currentlength > len - start1, start2, len = i1 - currentlength + 1, newp, currentlength + i2 >= start2 || continue + i2 <= end2 || break + if ch1 != ch2 + newj2 = 0 + else + newj2 = oldj2 > 0 ? oldj2 : i2 + newlen = i2 - newj2 + 1 + if newlen > len + j1, j2, len = i1 - newlen + 1, newj2, newlen end end - p[i2], oldp = newp, p[i2] + p[i2 - start2 + 1], oldj2 = newj2, p[i2 - start2 + 1] end end end - return start1, start2, len + return j1, j2, len end diff --git a/src/modifiers.jl b/src/modifiers.jl index 854494f..d044a56 100755 --- a/src/modifiers.jl +++ b/src/modifiers.jl @@ -34,11 +34,10 @@ end function (dist::Partial{RatcliffObershelp})(s1, s2) (s1 === missing) | (s2 === missing) && return missing - s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len1 == len2 && return dist.dist(s1, s2) out = 1.0 - for r in matching_blocks(s1, s2) + for r in matching_blocks(s1, s2, 1, 1, len1, len2) # Make sure the substring of s2 has length len1 s2_start = r[2] - r[1] + 1 s2_end = s2_start + len1 - 1 @@ -49,13 +48,30 @@ function (dist::Partial{RatcliffObershelp})(s1, s2) s2_start += len2 - s2_end s2_end += len2 - s2_end end - curr = dist.dist(s1, _slice(s2, s2_start - 1, s2_end)) + n_matched = length_matching_blocks(s1, s2, 1, s2_start, len1, s2_end) + curr = 1 - 2 * n_matched / (len1 + s2_end - s2_start + 1) out = min(out, curr) end return out end -_slice(s, n1::Integer, n2::Integer) = Base.Iterators.take(Base.Iterators.drop(s, n1), n2 - n1) -_slice(s::StringWithLength, n1::Integer, n2::Integer) = SubString(s, nextind(s, 0, n1 + 1), nextind(s, 0, n2)) + +function matching_blocks(s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer) + matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, start1, start2, end1, end2) +end + +function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer) + j1, j2, len = longest_common_pattern(s1, s2, start1, start2, end1, end2) + # exit if there is no common substring + len == 0 && return x + # add the info of the common to the existing set + push!(x, (j1, j2, len)) + # add the longest common substring that happens before + matching_blocks!(x, s1, s2, start1, start2, j1 - 1, j2 - 1) + # add the longest common substring that happens after + matching_blocks!(x, s1, s2, j1 + len, j2 + len, end1, end2) + return x +end + """ TokenSort(dist) diff --git a/test/modifiers.jl b/test/modifiers.jl index 9117859..f095991 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -7,6 +7,7 @@ using StringDistances, Unicode, Test @test Partial(QGram(2))("martha", missing) === missing @test Partial(Levenshtein())("martha", "marhta") == 2 @test Partial(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5 + @test Partial(RatcliffObershelp())("martha", "marhtaXXX") ≈ 0.16666666 atol = 1e-5 @test Partial(RatcliffObershelp())("martha", missing) === missing # TokenSort