refinement
parent
b557386f07
commit
7edca83311
|
@ -1 +1 @@
|
||||||
@time f(RatcliffObershelp(), x, y)
|
@time f(Winkler(Jaro()), x, y; min_dist = 0.9)
|
||||||
|
|
|
@ -10,45 +10,35 @@
|
||||||
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
|
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# String with Length
|
function compare(s1::AbstractString, s2::AbstractString, dist::RatcliffObershelp; min_dist = 0.0)
|
||||||
# Caches the length so that it is computed only once
|
max(1.0 - evaluate(dist, s1, s2), min_dist)
|
||||||
struct StringWithLength{T} <: AbstractString
|
|
||||||
s::T
|
|
||||||
l::Int
|
|
||||||
end
|
end
|
||||||
string_with_length(s::AbstractString) = StringWithLength(s, length(s))
|
|
||||||
string_with_length(s::StringWithLength) = s
|
|
||||||
Base.length(s::StringWithLength) = s.l
|
|
||||||
Base.iterate(s::StringWithLength) = iterate(s.s)
|
|
||||||
Base.iterate(s::StringWithLength, i::Integer) = iterate(s.s, i)
|
|
||||||
# Equality with a plain string is decided on the wrapped string `s1.s`.
Base.isequal(s1::StringWithLength, s2::AbstractString) = isequal(s1.s, s2)
|
|
||||||
Base.isequal(s1::AbstractString, s2::StringWithLength) = isequal(s1, s2.s)
|
|
||||||
Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n)
|
|
||||||
Base.ncodeunits(s::StringWithLength) = ncodeunits(s.s)
|
|
||||||
Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i)
|
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_dist = 0.0)
|
function compare(s1::AbstractString, s2::AbstractString, dist::Jaro; min_dist = 0.0)
|
||||||
1.0 - evaluate(dist, s1, s2; max_dist = 1.0 - min_dist)
|
s1, s2 = reorder(s1, s2)
|
||||||
|
len1, len2 = length(s1), length(s2)
|
||||||
|
# http://ceur-ws.org/Vol-1317/om2014_Tpaper4.pdf formula 4
|
||||||
|
bound = 2 / 3 + len1 / (3 * len2)
|
||||||
|
bound <= min_dist && return min_dist
|
||||||
|
max(1.0 - evaluate(dist, s1, s2), min_dist)
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString,
|
function compare(s1::AbstractString, s2::AbstractString,
|
||||||
dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = 0.0)
|
dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = 0.0)
|
||||||
s1 = string_with_length(s1)
|
s1, s2 = reorder(s1, s2)
|
||||||
s2 = string_with_length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len = max(length(s1), length(s2))
|
len2 == 0 && return 1.0
|
||||||
len == 0 && return 1.0
|
max_dist = ceil(Int, len2 * (1 - min_dist))
|
||||||
max_dist = ceil(Int, len * (1 - min_dist))
|
max(1.0 - evaluate(dist, s1, s2; max_dist = max_dist) / len2, min_dist)
|
||||||
max(1.0 - evaluate(dist, s1, s2; max_dist = max_dist) / len, min_dist)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString,
|
function compare(s1::AbstractString, s2::AbstractString,
|
||||||
dist::AbstractQGramDistance)
|
dist::AbstractQGramDistance)
|
||||||
# When string length < q for qgram distance, returns s1 == s2
|
# When string length < q for qgram distance, returns s1 == s2
|
||||||
s1 = string_with_length(s1)
|
s1, s2 = reorder(s1, s2)
|
||||||
s2 = string_with_length(s2)
|
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
min(len1, len2) <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
||||||
if typeof(dist) <: QGram
|
if typeof(dist) <: QGram
|
||||||
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
|
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
|
||||||
else
|
else
|
||||||
|
@ -75,13 +65,28 @@ struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
|
||||||
end
|
end
|
||||||
Winkler(x) = Winkler(x, 0.1, 0.7)
|
Winkler(x) = Winkler(x, 0.1, 0.7)
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
|
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler{Jaro}; min_dist = 0.0)
|
||||||
score = compare(s1, s2, dist.dist)
|
s1, s2 = reorder(s1, s2)
|
||||||
|
len1, len2 = length(s1), length(s2)
|
||||||
l = remove_prefix(s1, s2, 4)[1]
|
l = remove_prefix(s1, s2, 4)[1]
|
||||||
|
# http://ceur-ws.org/Vol-1317/om2014_Tpaper4.pdf formula 5
|
||||||
|
bound = 2 / 3 + len1 / (3 * len2) + l * dist.scaling_factor * (1 / 3 - len1 / (3 * len2))
|
||||||
|
bound <= min_dist && return min_dist
|
||||||
|
score = compare(s1, s2, dist.dist)
|
||||||
if score >= dist.boosting_threshold
|
if score >= dist.boosting_threshold
|
||||||
score += l * dist.scaling_factor * (1 - score)
|
score += l * dist.scaling_factor * (1 - score)
|
||||||
end
|
end
|
||||||
return score
|
return max(score, min_dist)
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_dist = 0.0)
|
||||||
|
l = remove_prefix(s1, s2, 4)[1]
|
||||||
|
score = compare(s1, s2, dist.dist; min_dist = min_dist)
|
||||||
|
if score >= dist.boosting_threshold
|
||||||
|
score += l * dist.scaling_factor * (1 - score)
|
||||||
|
end
|
||||||
|
return max(score, min_dist)
|
||||||
end
|
end
|
||||||
|
|
||||||
JaroWinkler() = Winkler(Jaro(), 0.1, 0.7)
|
JaroWinkler() = Winkler(Jaro(), 0.1, 0.7)
|
||||||
|
@ -102,11 +107,7 @@ struct Partial{T <: PreMetric} <: PreMetric
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
|
function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
|
||||||
s1 = string_with_length(s1)
|
s1, s2 = reorder(s1, s2)
|
||||||
s2 = string_with_length(s2)
|
|
||||||
if length(s1) > length(s2)
|
|
||||||
s2, s1 = s1, s2
|
|
||||||
end
|
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||||
len1 == 0 && return compare("", "", dist.dist)
|
len1 == 0 && return compare("", "", dist.dist)
|
||||||
|
@ -119,11 +120,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
|
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
|
||||||
s1 = string_with_length(s1)
|
s1, s2 = reorder(s1, s2)
|
||||||
s2 = string_with_length(s2)
|
|
||||||
if length(s1) > length(s2)
|
|
||||||
s2, s1 = s1, s2
|
|
||||||
end
|
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||||
out = 0.0
|
out = 0.0
|
||||||
|
@ -211,11 +208,7 @@ struct TokenMax{T <: PreMetric} <: PreMetric
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
|
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
|
||||||
s1 = string_with_length(s1)
|
s1, s2 = reorder(s1, s2)
|
||||||
s2 = string_with_length(s2)
|
|
||||||
if length(s1) > length(s2)
|
|
||||||
s2, s1 = s1, s2
|
|
||||||
end
|
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
dist0 = compare(s1, s2, dist.dist)
|
dist0 = compare(s1, s2, dist.dist)
|
||||||
unbase_scale = 0.95
|
unbase_scale = 0.95
|
||||||
|
|
35
src/edit.jl
35
src/edit.jl
|
@ -36,19 +36,12 @@ where ``m`` is the number of matching characters and
|
||||||
struct Jaro <: SemiMetric end
|
struct Jaro <: SemiMetric end
|
||||||
|
|
||||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||||
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString;
|
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
||||||
max_dist = 1.0)
|
s1, s2 = reorder(s1, s2)
|
||||||
s1 = string_with_length(s1)
|
|
||||||
s2 = string_with_length(s2)
|
|
||||||
if length(s1) > length(s2)
|
|
||||||
s2, s1 = s1, s2
|
|
||||||
end
|
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
|
maxdist = max(0, div(len2, 2) - 1)
|
||||||
# if both are empty, m = 0 so should be 1.0 according to wikipedia. Add this line so that not the case
|
# if both are empty, m = 0 so should be 1.0 according to wikipedia. Add this line so that not the case
|
||||||
len2 == 0 && return 0.0
|
len2 == 0 && return 0.0
|
||||||
# Time-Efficient Execution of Bounded Jaro-Winkler Distances Equation (4)
|
|
||||||
1 - (2 / 3 + len1 / (3 * len2)) >= max_dist && return max_dist
|
|
||||||
maxdist = max(0, div(len2, 2) - 1)
|
|
||||||
flag = fill(false, len2)
|
flag = fill(false, len2)
|
||||||
prevstate1 = firstindex(s1)
|
prevstate1 = firstindex(s1)
|
||||||
i1_match = prevstate1 * ones(Int, len1)
|
i1_match = prevstate1 * ones(Int, len1)
|
||||||
|
@ -83,7 +76,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString;
|
||||||
i1 += 1
|
i1 += 1
|
||||||
prevstate1 = state1
|
prevstate1 = state1
|
||||||
end
|
end
|
||||||
m == 0 && return min(1.0, max_dist)
|
m == 0 && return 1.0
|
||||||
# t counts number of transpositions
|
# t counts number of transpositions
|
||||||
t = 0
|
t = 0
|
||||||
i1 = 0
|
i1 = 0
|
||||||
|
@ -96,7 +89,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
current = 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
|
current = 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
|
||||||
return min(current, max_dist)
|
return current
|
||||||
end
|
end
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
@ -116,12 +109,8 @@ struct Levenshtein <: SemiMetric end
|
||||||
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
||||||
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
|
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
|
||||||
max_dist = max(length(s1), length(s2)))
|
max_dist = max(length(s1), length(s2)))
|
||||||
s1 = string_with_length(s1)
|
s1, s2 = reorder(s1, s2)
|
||||||
s2 = string_with_length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
if length(s1) > length(s2)
|
|
||||||
s2, s1 = s1, s2
|
|
||||||
end
|
|
||||||
len1, len2 = length(s1), length(s2)
|
|
||||||
len2 - len1 >= max_dist && return max_dist
|
len2 - len1 >= max_dist && return max_dist
|
||||||
# prefix common to both strings can be ignored
|
# prefix common to both strings can be ignored
|
||||||
k, x1, x2start = remove_prefix(s1, s2)
|
k, x1, x2start = remove_prefix(s1, s2)
|
||||||
|
@ -175,11 +164,7 @@ struct DamerauLevenshtein <: SemiMetric end
|
||||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||||
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString;
|
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString;
|
||||||
max_dist = max(length(s1), length(s2)))
|
max_dist = max(length(s1), length(s2)))
|
||||||
s1 = string_with_length(s1)
|
s1, s2 = reorder(s1, s2)
|
||||||
s2 = string_with_length(s2)
|
|
||||||
if length(s1) > length(s2)
|
|
||||||
s2, s1 = s1, s2
|
|
||||||
end
|
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len2 - len1 >= max_dist && return max_dist
|
len2 - len1 >= max_dist && return max_dist
|
||||||
# prefix common to both strings can be ignored
|
# prefix common to both strings can be ignored
|
||||||
|
@ -254,10 +239,10 @@ The distance between two strings is defined as one minus the number of matching
|
||||||
"""
|
"""
|
||||||
struct RatcliffObershelp <: PreMetric end
|
struct RatcliffObershelp <: PreMetric end
|
||||||
|
|
||||||
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString; max_dist = 1.0)
|
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
|
||||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 + len2 == 0 ? 0 : min(1.0 - 2 * n_matched / (len1 + len2), max_dist)
|
len1 + len2 == 0 ? 0 : 1.0 - 2 * n_matched / (len1 + len2)
|
||||||
end
|
end
|
||||||
|
|
||||||
function matching_blocks(s1::AbstractString, s2::AbstractString)
|
function matching_blocks(s1::AbstractString, s2::AbstractString)
|
||||||
|
|
25
src/utils.jl
25
src/utils.jl
|
@ -1,3 +1,28 @@
|
||||||
|
# String with Length
|
||||||
|
# Caches the length so that it is computed only once
|
||||||
|
struct StringWithLength{T} <: AbstractString
|
||||||
|
s::T
|
||||||
|
l::Int
|
||||||
|
end
|
||||||
|
string_with_length(s::AbstractString) = StringWithLength(s, length(s))
|
||||||
|
string_with_length(s::StringWithLength) = s
|
||||||
|
Base.length(s::StringWithLength) = s.l
|
||||||
|
Base.iterate(s::StringWithLength) = iterate(s.s)
|
||||||
|
Base.iterate(s::StringWithLength, i::Integer) = iterate(s.s, i)
|
||||||
|
# Equality with a plain string is decided on the wrapped string `s1.s`.
Base.isequal(s1::StringWithLength, s2::AbstractString) = isequal(s1.s, s2)
|
||||||
|
Base.isequal(s1::AbstractString, s2::StringWithLength) = isequal(s1, s2.s)
|
||||||
|
Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n)
|
||||||
|
Base.ncodeunits(s::StringWithLength) = ncodeunits(s.s)
|
||||||
|
Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i)
|
||||||
|
function reorder(s1::AbstractString, s2::AbstractString)
|
||||||
|
s1 = string_with_length(s1)
|
||||||
|
s2 = string_with_length(s2)
|
||||||
|
if length(s1) > length(s2)
|
||||||
|
s2, s1 = s1, s2
|
||||||
|
end
|
||||||
|
return s1, s2
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
## Find common prefixes (up to lim. -1 means Inf)
|
## Find common prefixes (up to lim. -1 means Inf)
|
||||||
function remove_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
|
function remove_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
|
||||||
|
|
|
@ -99,18 +99,6 @@ for x in solutions
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
for dist in (Hamming, Levenshtein, DamerauLevenshtein)
|
|
||||||
for i in eachindex(strings)
|
|
||||||
@test evaluate(dist(), strings[i]..., max_dist = 3) == min(evaluate(dist(), strings[i]...), 3)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
for i in eachindex(strings)
|
|
||||||
@test evaluate(Jaro(), strings[i]..., max_dist = 0.6) == min(evaluate(Jaro(), strings[i]...), 0.6)
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
for dist in (Hamming, Levenshtein, DamerauLevenshtein, Jaro)
|
for dist in (Hamming, Levenshtein, DamerauLevenshtein, Jaro)
|
||||||
for i in eachindex(strings)
|
for i in eachindex(strings)
|
||||||
@test compare(strings[i]..., dist() ; min_dist = 1/ 3) ≈ max(compare(strings[i]..., dist()), 1 / 3)
|
@test compare(strings[i]..., dist() ; min_dist = 1/ 3) ≈ max(compare(strings[i]..., dist()), 1 / 3)
|
||||||
|
|
Loading…
Reference in New Issue