diff --git a/README.md b/README.md index 2f98974..98620c3 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ compare("martha", "marhta", Hamming()) - [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()` - [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` - [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()` +- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()` #### Q-Grams Distances @@ -34,11 +35,6 @@ Q-gram distances compare the set of all substrings of length `q` in each string. - [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q)` - [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q)` -#### Others -- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()` - - - ## Distance Modifiers The package includes distance "modifiers", that can be applied to any distance. 
diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 9218349..4d57a11 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -16,12 +16,12 @@ Hamming, Levenshtein, DamerauLevenshtein, Jaro, +RatcliffObershelp, QGram, Cosine, Jaccard, SorensenDice, Overlap, -RatcliffObershelp, Winkler, Partial, TokenSort, @@ -37,7 +37,6 @@ qgram_iterator include("utils.jl") include("distances/edit.jl") include("distances/qgram.jl") -include("distances/RatcliffObershelp.jl") include("compare.jl") end diff --git a/src/distances/RatcliffObershelp.jl b/src/distances/RatcliffObershelp.jl deleted file mode 100755 index 475f3e0..0000000 --- a/src/distances/RatcliffObershelp.jl +++ /dev/null @@ -1,60 +0,0 @@ -# Return start of commn substring in s1, start of common substring in s2, and length of substring -# Indexes refer to character number, not index (differ for Unicode strings) -function longest_common_substring(s1::AbstractString, s2::AbstractString) - if length(s1) > length(s2) - start2, start1, len = longest_common_substring(s2, s1) - else - start1, start2, len = 0, 0, 0 - p = zeros(Int, length(s2)) - i1 = 0 - for ch1 in s1 - i1 += 1 - i2 = 0 - oldp = 0 - for ch2 in s2 - i2 += 1 - newp = 0 - if ch1 == ch2 - newp = oldp > 0 ? 
oldp : i2 - currentlength = (i2 - newp + 1) - if currentlength > len - start1, start2, len = i1 - currentlength + 1, newp, currentlength - end - end - p[i2], oldp = newp, p[i2] - end - end - end - return start1, start2, len -end - -function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, start1::Integer, start2::Integer) - a = longest_common_substring(s1, s2) - if a[3] > 0 - push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3])) - s1before = SubString(s1, 1, nextind(s1, 0, a[1] - 1)) - s2before = SubString(s2, 1, nextind(s2, 0, a[2] - 1)) - matching_blocks!(x, s1before, s2before, start1, start2) - if ((a[1] + a[3]) <= lastindex(s1)) & ((a[2] + a[3]) <= lastindex(s2)) - s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1)) - s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2)) - matching_blocks!(x, s1after, s2after, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1) - end - end -end - -function matching_blocks(s1::AbstractString, s2::AbstractString) - x = Set{Tuple{Int, Int, Int}}() - matching_blocks!(x, s1, s2, 1, 1) - return x -end - -struct RatcliffObershelp <: PreMetric end - -function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString) - n_matched = 0 - for x in matching_blocks(s1, s2) - n_matched += x[3] - end - 1.0 - 2 * n_matched / (length(s1) + length(s2)) -end diff --git a/src/distances/edit.jl b/src/distances/edit.jl index 7687a9b..054cf14 100755 --- a/src/distances/edit.jl +++ b/src/distances/edit.jl @@ -182,4 +182,43 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) end end return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0 -end \ No newline at end of file +end + +############################################################################## +## +## Ratcliff/Obershelp +## +############################################################################## + +struct RatcliffObershelp <: PreMetric end + +function 
evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString) + n_matched = sum(last.(matching_blocks(s1, s2))) + 1.0 - 2 * n_matched / (length(s1) + length(s2)) +end + +function matching_blocks(s1::AbstractString, s2::AbstractString) + matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1) +end + +function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer, start1::Integer, start2::Integer) + a = longest_common_substring(s1, s2, len1 , len2) + # if there is a common substring + if a[3] > 0 + # add the info of the common substring to the existing set + push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3])) + # add the longest common substring that happens before + s1before = SubString(s1, 1, nextind(s1, 0, a[1] - 1)) + s2before = SubString(s2, 1, nextind(s2, 0, a[2] - 1)) + matching_blocks!(x, s1before, s2before, a[1] - 1, a[2] - 1, start1, start2) + # add the longest common substring that happens after + s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1)) + s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2)) + matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1, len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1) + end + return x +end + + + + diff --git a/src/utils.jl b/src/utils.jl index 0375751..a290bc8 100755 --- a/src/utils.jl +++ b/src/utils.jl @@ -23,3 +23,35 @@ function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1 end return l, x1, x2 end + + + +# Return start of common substring in s1, start of common substring in s2, and length of substring +# Indexes refer to character number, not index +function longest_common_substring(s1::AbstractString, s2::AbstractString, len1::Int, len2::Int) + if len1 > len2 + start2, start1, len = longest_common_substring(s2, s1, len2, len1) + else + start1, start2, len = 0, 0, 0 + p = zeros(Int, len2) + i1 = 0 + for ch1 in s1 + i1 += 1 + i2
= 0 + oldp = 0 + for ch2 in s2 + i2 += 1 + newp = 0 + if ch1 == ch2 + newp = oldp > 0 ? oldp : i2 + currentlength = (i2 - newp + 1) + if currentlength > len + start1, start2, len = i1 - currentlength + 1, newp, currentlength + end + end + p[i2], oldp = newp, p[i2] + end + end + end + return start1, start2, len +end