simplify Ratcliff
parent
402d24997f
commit
efcace4f03
|
@ -24,6 +24,7 @@ compare("martha", "marhta", Hamming())
|
|||
- [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()`
|
||||
- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
|
||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
|
||||
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
|
||||
|
||||
|
||||
#### Q-Grams Distances
|
||||
|
@ -34,11 +35,6 @@ Q-gram distances compare the set of all substrings of length `q` in each string.
|
|||
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q)`
|
||||
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q)`
|
||||
|
||||
#### Others
|
||||
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
|
||||
|
||||
|
||||
|
||||
## Distance Modifiers
|
||||
The package includes distance "modifiers" that can be applied to any distance.
|
||||
|
||||
|
|
|
@ -16,12 +16,12 @@ Hamming,
|
|||
Levenshtein,
|
||||
DamerauLevenshtein,
|
||||
Jaro,
|
||||
RatcliffObershelp,
|
||||
QGram,
|
||||
Cosine,
|
||||
Jaccard,
|
||||
SorensenDice,
|
||||
Overlap,
|
||||
RatcliffObershelp,
|
||||
Winkler,
|
||||
Partial,
|
||||
TokenSort,
|
||||
|
@ -37,7 +37,6 @@ qgram_iterator
|
|||
include("utils.jl")
|
||||
include("distances/edit.jl")
|
||||
include("distances/qgram.jl")
|
||||
include("distances/RatcliffObershelp.jl")
|
||||
include("compare.jl")
|
||||
|
||||
end
|
||||
|
|
|
@ -1,60 +0,0 @@
|
|||
# Return the start of the longest common substring in s1, its start in s2, and its length.
# Positions count characters (1-based), not byte indexes (the two differ for Unicode
# strings). Returns (0, 0, 0) when the strings have no character in common.
function longest_common_substring(s1::AbstractString, s2::AbstractString)
    # Always scan with the first argument no longer than the second; when the
    # arguments are swapped, swap the two start positions back before returning.
    if length(s1) > length(s2)
        pos2, pos1, best = longest_common_substring(s2, s1)
        return pos1, pos2, best
    end
    pos1, pos2, best = 0, 0, 0
    # runstart[col] holds, for the previous character of s1, the start in s2 of the
    # common run ending at column col (0 when the characters differed).
    runstart = zeros(Int, length(s2))
    for (row, c1) in enumerate(s1)
        # Value of runstart for the previous row and previous column (the diagonal).
        diag = 0
        for (col, c2) in enumerate(s2)
            here = 0
            if c1 == c2
                # Extend the diagonal run if there is one, otherwise start a new run.
                here = diag > 0 ? diag : col
                runlen = col - here + 1
                if runlen > best
                    pos1, pos2, best = row - runlen + 1, here, runlen
                end
            end
            diag = runstart[col]
            runstart[col] = here
        end
    end
    return pos1, pos2, best
end
|
||||
|
||||
# Add to x the matching blocks of s1 and s2 as (start in s1, start in s2, length)
# tuples. start1 and start2 are the character positions of s1 and s2 inside the
# strings the search originally started from, so stored positions refer to those.
# Always returns nothing; the result is accumulated in x.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString,
                          s2::AbstractString, start1::Integer, start2::Integer)
    pos1, pos2, n = longest_common_substring(s1, s2)
    # No common character left: nothing to record
    n > 0 || return nothing
    push!(x, (pos1 + start1 - 1, pos2 + start2 - 1, n))

    # Recurse on what lies before the block in both strings
    head1 = SubString(s1, 1, nextind(s1, 0, pos1 - 1))
    head2 = SubString(s2, 1, nextind(s2, 0, pos2 - 1))
    matching_blocks!(x, head1, head2, start1, start2)

    # Recurse on what lies after the block in both strings
    next1 = pos1 + n
    next2 = pos2 + n
    if next1 <= lastindex(s1) && next2 <= lastindex(s2)
        tail1 = SubString(s1, nextind(s1, 0, next1), lastindex(s1))
        tail2 = SubString(s2, nextind(s2, 0, next2), lastindex(s2))
        matching_blocks!(x, tail1, tail2, start1 + next1 - 1, start2 + next2 - 1)
    end
    return nothing
end
|
||||
|
||||
# Return the set of matching blocks between s1 and s2, each stored as a
# (start in s1, start in s2, length) tuple.
function matching_blocks(s1::AbstractString, s2::AbstractString)
    blocks = Set{Tuple{Int, Int, Int}}()
    # Both strings are searched from their first character
    matching_blocks!(blocks, s1, s2, 1, 1)
    return blocks
end
|
||||
|
||||
# Ratcliff/Obershelp distance. The type has no fields; it only selects the
# corresponding `evaluate` method.
struct RatcliffObershelp <: PreMetric
end
|
||||
|
||||
# Ratcliff/Obershelp distance between s1 and s2:
# 1 - 2 * (characters covered by matching blocks) / (length(s1) + length(s2)).
# Returns a Float64.
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
    total = length(s1) + length(s2)
    # Two empty strings would give 0 / 0 = NaN below; return a distance of zero instead.
    total == 0 && return 0.0
    # Each block is (start in s1, start in s2, length): add up the lengths.
    # init = 0 keeps the reduction defined when there is no matching block.
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    return 1.0 - 2 * n_matched / total
end
|
|
@ -182,4 +182,43 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
|||
end
|
||||
end
|
||||
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
|
||||
end
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Ratcliff/Obershelp
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
# Ratcliff/Obershelp distance. The type has no fields; it only selects the
# corresponding `evaluate` method.
struct RatcliffObershelp <: PreMetric
end
|
||||
|
||||
# Ratcliff/Obershelp distance between s1 and s2:
# 1 - 2 * (characters covered by matching blocks) / (length(s1) + length(s2)).
# Returns a Float64.
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
    total = length(s1) + length(s2)
    # Two empty strings would give 0 / 0 = NaN below; return a distance of zero instead.
    total == 0 && return 0.0
    # Each block is (start in s1, start in s2, length): add up the lengths without
    # materialising intermediate arrays. init = 0 keeps the reduction defined when
    # there is no matching block.
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    return 1.0 - 2 * n_matched / total
end
|
||||
|
||||
# Return the set of matching blocks between s1 and s2, each stored as a
# (start in s1, start in s2, length) tuple.
function matching_blocks(s1::AbstractString, s2::AbstractString)
    blocks = Set{Tuple{Int, Int, Int}}()
    # Search both strings in full, starting from their first character
    return matching_blocks!(blocks, s1, s2, length(s1), length(s2), 1, 1)
end
|
||||
|
||||
# Add to x the matching blocks of s1 and s2 as (start in s1, start in s2, length)
# tuples, and return x.
# len1 and len2 are the number of characters of s1 and s2. start1 and start2 are the
# character positions of s1 and s2 inside the strings the search originally started
# from, so that stored positions refer to those strings.
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString,
                          s2::AbstractString, len1::Integer, len2::Integer,
                          start1::Integer, start2::Integer)
    # Int(...) lets any Integer length be forwarded to the Int-typed helper
    pos1, pos2, n = longest_common_substring(s1, s2, Int(len1), Int(len2))
    # No common substring: nothing to add
    n > 0 || return x
    # Record the block, translated to positions in the original strings
    push!(x, (pos1 + start1 - 1, pos2 + start2 - 1, n))

    # Blocks before the common substring. Skipped when either side has no character
    # before it, since an empty slice cannot contain a match.
    if pos1 > 1 && pos2 > 1
        s1before = SubString(s1, 1, nextind(s1, 0, pos1 - 1))
        s2before = SubString(s2, 1, nextind(s2, 0, pos2 - 1))
        matching_blocks!(x, s1before, s2before, pos1 - 1, pos2 - 1, start1, start2)
    end

    # Blocks after the common substring. Skipped when either side has no character
    # after it.
    next1 = pos1 + n
    next2 = pos2 + n
    if next1 <= len1 && next2 <= len2
        s1after = SubString(s1, nextind(s1, 0, next1), lastindex(s1))
        s2after = SubString(s2, nextind(s2, 0, next2), lastindex(s2))
        matching_blocks!(x, s1after, s2after, len1 - next1 + 1, len2 - next2 + 1,
                         start1 + next1 - 1, start2 + next2 - 1)
    end
    return x
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
32
src/utils.jl
32
src/utils.jl
|
@ -23,3 +23,35 @@ function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1
|
|||
end
|
||||
return l, x1, x2
|
||||
end
|
||||
|
||||
|
||||
|
||||
# Return the start of the longest common substring in s1, its start in s2, and its length.
# Positions count characters (1-based), not byte indexes. Returns (0, 0, 0) when the
# strings have no character in common.
# len1 and len2 are expected to be the number of characters of s1 and s2; the larger
# of the two sizes the work buffer. Any Integer type is accepted.
function longest_common_substring(s1::AbstractString, s2::AbstractString,
                                  len1::Integer, len2::Integer)
    if len1 > len2
        # Run the scan with the arguments swapped so that the first string is never
        # the longer one, then swap the two start positions back.
        start2, start1, len = longest_common_substring(s2, s1, len2, len1)
    else
        start1, start2, len = 0, 0, 0
        # p[i2] holds, for the previous character of s1, the start in s2 of the
        # common run ending at position i2 (0 when the characters differed).
        p = zeros(Int, len2)
        i1 = 0
        for ch1 in s1
            i1 += 1
            i2 = 0
            # Value of p for the previous row and previous column (the diagonal)
            oldp = 0
            for ch2 in s2
                i2 += 1
                newp = 0
                if ch1 == ch2
                    # Extend the diagonal run if there is one, else start a new run
                    newp = oldp > 0 ? oldp : i2
                    currentlength = i2 - newp + 1
                    if currentlength > len
                        start1, start2, len = i1 - currentlength + 1, newp, currentlength
                    end
                end
                # Store this row's value and keep the previous row's one as the
                # diagonal for the next column
                p[i2], oldp = newp, p[i2]
            end
        end
    end
    return start1, start2, len
end
|
||||
|
|
Loading…
Reference in New Issue