simplify Ratcliff

pull/17/head
matthieugomez 2019-08-17 12:57:35 -04:00
parent 402d24997f
commit efcace4f03
5 changed files with 74 additions and 68 deletions

View File

@ -24,6 +24,7 @@ compare("martha", "marhta", Hamming())
- [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()`
- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
#### Q-Grams Distances
@ -34,11 +35,6 @@ Q-gram distances compare the set of all substrings of length `q` in each string.
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q)`
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q)`
#### Others
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
## Distance Modifiers
The package includes distance "modifiers" that can be applied to any distance.

View File

@ -16,12 +16,12 @@ Hamming,
Levenshtein,
DamerauLevenshtein,
Jaro,
RatcliffObershelp,
QGram,
Cosine,
Jaccard,
SorensenDice,
Overlap,
RatcliffObershelp,
Winkler,
Partial,
TokenSort,
@ -37,7 +37,6 @@ qgram_iterator
include("utils.jl")
include("distances/edit.jl")
include("distances/qgram.jl")
include("distances/RatcliffObershelp.jl")
include("compare.jl")
end

View File

@ -1,60 +0,0 @@
# Find the longest common substring of s1 and s2.
# Returns (start in s1, start in s2, length of the substring); (0, 0, 0) when the
# strings share no character.
# Starts are character numbers, not string indices (the two differ for Unicode strings).
function longest_common_substring(s1::AbstractString, s2::AbstractString)
    if length(s1) > length(s2)
        # Solve the mirrored problem, then swap the two starts back.
        mirrored = longest_common_substring(s2, s1)
        return mirrored[2], mirrored[1], mirrored[3]
    end
    best1, best2, bestlen = 0, 0, 0
    # runstart[j] holds, for the previously processed character of s1, the position in s2
    # where the common run ending at character j of s2 begins (0 when there is no run).
    runstart = zeros(Int, length(s2))
    for (i1, c1) in enumerate(s1)
        diag = 0
        for (i2, c2) in enumerate(s2)
            above = runstart[i2]
            here = 0
            if c1 == c2
                # extend the run coming from the diagonal, or open a new one at i2
                if diag > 0
                    here = diag
                else
                    here = i2
                end
                runlen = i2 - here + 1
                if runlen > bestlen
                    best1, best2, bestlen = i1 - runlen + 1, here, runlen
                end
            end
            runstart[i2] = here
            diag = above
        end
    end
    return best1, best2, bestlen
end
# Recursively collect into x the matching blocks between s1 and s2.
# A block is (start in s1, start in s2, length), expressed in character numbers of the
# outermost strings; start1 and start2 are the character numbers at which s1 and s2
# begin inside those outermost strings.
# Returns x.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, start1::Integer, start2::Integer)
    i1, i2, n = longest_common_substring(s1, s2)
    # no common character left: nothing to add
    n > 0 || return x
    push!(x, (i1 + start1 - 1, i2 + start2 - 1, n))
    # recurse on what precedes the block; nextind(s, 0, k) is the index of the k-th character
    s1before = SubString(s1, 1, nextind(s1, 0, i1 - 1))
    s2before = SubString(s2, 1, nextind(s2, 0, i2 - 1))
    matching_blocks!(x, s1before, s2before, start1, start2)
    # recurse on what follows the block, only when both sides still have characters.
    # i1 + n and i2 + n are character numbers, so they are compared with length
    # (a character count) rather than lastindex (a code-unit index).
    if i1 + n <= length(s1) && i2 + n <= length(s2)
        s1after = SubString(s1, nextind(s1, 0, i1 + n), lastindex(s1))
        s2after = SubString(s2, nextind(s2, 0, i2 + n), lastindex(s2))
        matching_blocks!(x, s1after, s2after, start1 + i1 + n - 1, start2 + i2 + n - 1)
    end
    return x
end
# Return the set of matching blocks between s1 and s2, each stored as a
# Tuple{Int, Int, Int}; the set is filled by matching_blocks! starting at position 1
# of both strings.
function matching_blocks(s1::AbstractString, s2::AbstractString)
    blocks = Set{Tuple{Int, Int, Int}}()
    matching_blocks!(blocks, s1, s2, 1, 1)
    blocks
end
# Ratcliff/Obershelp distance: a field-less subtype of PreMetric, used to select the
# corresponding `evaluate` method.
struct RatcliffObershelp <: PreMetric end
# Ratcliff/Obershelp distance between s1 and s2: one minus twice the number of characters
# covered by the matching blocks, divided by the total number of characters in both strings.
# Two empty strings are at distance 0.0 (the ratio would otherwise be 0/0, i.e. NaN).
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
    total = length(s1) + length(s2)
    total == 0 && return 0.0
    n_matched = 0
    for block in matching_blocks(s1, s2)
        # third entry of a block is its length
        n_matched += block[3]
    end
    return 1.0 - 2 * n_matched / total
end

View File

@ -182,4 +182,43 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
end
end
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
end
end
##############################################################################
##
## Ratcliff/Obershelp
##
##############################################################################
# Ratcliff/Obershelp distance: a field-less subtype of PreMetric, used to select the
# corresponding `evaluate` method.
struct RatcliffObershelp <: PreMetric end
# Ratcliff/Obershelp distance between s1 and s2: one minus twice the number of characters
# covered by the matching blocks, divided by the total number of characters in both strings.
# Two empty strings are at distance 0.0 (the ratio would otherwise be 0/0, i.e. NaN).
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
    total = length(s1) + length(s2)
    total == 0 && return 0.0
    # sum the block lengths (last entry of each tuple); init covers the "no block" case
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    return 1.0 - 2 * n_matched / total
end
# Return the set of matching blocks between s1 and s2, each stored as a
# Tuple{Int, Int, Int}. The work is delegated to matching_blocks!, seeded with an empty
# set, the character counts of both strings, and start position 1 on both sides.
function matching_blocks(s1::AbstractString, s2::AbstractString)
    blocks = Set{Tuple{Int, Int, Int}}()
    n1 = length(s1)
    n2 = length(s2)
    return matching_blocks!(blocks, s1, s2, n1, n2, 1, 1)
end
# Recursively collect into x the matching blocks between s1 and s2, and return x.
# A block is (start in s1, start in s2, length), expressed in character numbers of the
# outermost strings.
# len1, len2: number of characters of s1 and s2.
# start1, start2: character numbers at which s1 and s2 begin inside the outermost strings.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer, start1::Integer, start2::Integer)
    # This method accepts any Integer length, so convert before calling the helper,
    # which may only be defined for Int lengths.
    i1, i2, n = longest_common_substring(s1, s2, Int(len1), Int(len2))
    # no common substring: nothing to add
    n > 0 || return x
    # record the block, shifted to positions in the outermost strings
    push!(x, (i1 + start1 - 1, i2 + start2 - 1, n))
    # blocks located before the one just found; nextind(s, 0, k) is the index of the
    # k-th character
    s1before = SubString(s1, 1, nextind(s1, 0, i1 - 1))
    s2before = SubString(s2, 1, nextind(s2, 0, i2 - 1))
    matching_blocks!(x, s1before, s2before, i1 - 1, i2 - 1, start1, start2)
    # blocks located after the one just found
    s1after = SubString(s1, nextind(s1, 0, i1 + n), lastindex(s1))
    s2after = SubString(s2, nextind(s2, 0, i2 + n), lastindex(s2))
    matching_blocks!(x, s1after, s2after, len1 - (i1 + n) + 1, len2 - (i2 + n) + 1, start1 + i1 + n - 1, start2 + i2 + n - 1)
    return x
end

View File

@ -23,3 +23,35 @@ function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1
end
return l, x1, x2
end
# Find the longest common substring of s1 and s2.
# len1 and len2 are the numbers of characters of s1 and s2, supplied by the caller;
# any Integer type is accepted.
# Returns (start in s1, start in s2, length of the substring); (0, 0, 0) when the
# strings share no character.
# Starts are character numbers, not string indices.
function longest_common_substring(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
    if len1 > len2
        # Solve the mirrored problem, then swap the two starts back.
        start2, start1, len = longest_common_substring(s2, s1, len2, len1)
        return start1, start2, len
    end
    start1, start2, len = 0, 0, 0
    # p[j] holds, for the previously processed character of s1, the position in s2 where
    # the common run ending at character j of s2 begins (0 when there is no run).
    p = zeros(Int, len2)
    for (i1, ch1) in enumerate(s1)
        oldp = 0
        for (i2, ch2) in enumerate(s2)
            newp = 0
            if ch1 == ch2
                # extend the run coming from the diagonal, or open a new one at i2
                newp = oldp > 0 ? oldp : i2
                currentlength = i2 - newp + 1
                if currentlength > len
                    start1, start2, len = i1 - currentlength + 1, newp, currentlength
                end
            end
            # store this row's value and keep the previous row's value as next diagonal
            p[i2], oldp = newp, p[i2]
        end
    end
    return start1, start2, len
end