refinement

pull/17/head
matthieugomez 2019-08-19 13:54:38 -04:00
parent b557386f07
commit 7edca83311
5 changed files with 73 additions and 82 deletions

View File

@ -1 +1 @@
@time f(RatcliffObershelp(), x, y)
@time f(Winkler(Jaro()), x, y; min_dist = 0.9)

View File

@ -10,45 +10,35 @@
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
"""
# String with Length
# This allows the length to be computed once and only once
struct StringWithLength{T} <: AbstractString
s::T
l::Int
# Similarity for RatcliffObershelp: one minus the evaluated distance,
# never reported below `min_dist`.
function compare(s1::AbstractString, s2::AbstractString, dist::RatcliffObershelp; min_dist = 0.0)
    similarity = 1.0 - evaluate(dist, s1, s2)
    return max(similarity, min_dist)
end
string_with_length(s::AbstractString) = StringWithLength(s, length(s))
string_with_length(s::StringWithLength) = s
Base.length(s::StringWithLength) = s.l
Base.iterate(s::StringWithLength) = iterate(s.s)
Base.iterate(s::StringWithLength, i::Integer) = iterate(s.s, i)
# Compare through the wrapped string: the field `s` lives on the argument `s1`
Base.isequal(s1::StringWithLength, s2::AbstractString) = isequal(s1.s, s2)
Base.isequal(s1::AbstractString, s2::StringWithLength) = isequal(s1, s2.s)
Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n)
Base.ncodeunits(s::StringWithLength) = ncodeunits(s.s)
Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i)
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_dist = 0.0)
1.0 - evaluate(dist, s1, s2; max_dist = 1.0 - min_dist)
# Jaro similarity, floored at `min_dist`.
# The strings are first ordered by length (shorter first) so that a
# length-only bound can short-circuit the full evaluation.
function compare(s1::AbstractString, s2::AbstractString, dist::Jaro; min_dist = 0.0)
    shorter, longer = reorder(s1, s2)
    n_short, n_long = length(shorter), length(longer)
    # http://ceur-ws.org/Vol-1317/om2014_Tpaper4.pdf formula 4
    upper = 2 / 3 + n_short / (3 * n_long)
    if upper <= min_dist
        # The bound already sits at or below the floor: skip the evaluation
        return min_dist
    end
    return max(1.0 - evaluate(dist, shorter, longer), min_dist)
end
function compare(s1::AbstractString, s2::AbstractString,
dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = 0.0)
s1 = string_with_length(s1)
s2 = string_with_length(s2)
len = max(length(s1), length(s2))
len == 0 && return 1.0
max_dist = ceil(Int, len * (1 - min_dist))
max(1.0 - evaluate(dist, s1, s2; max_dist = max_dist) / len, min_dist)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 == 0 && return 1.0
max_dist = ceil(Int, len2 * (1 - min_dist))
max(1.0 - evaluate(dist, s1, s2; max_dist = max_dist) / len2, min_dist)
end
function compare(s1::AbstractString, s2::AbstractString,
dist::AbstractQGramDistance)
# When string length < q for qgram distance, returns s1 == s2
s1 = string_with_length(s1)
s2 = string_with_length(s2)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
min(len1, len2) <= (dist.q - 1) && return convert(Float64, s1 == s2)
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
if typeof(dist) <: QGram
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
else
@ -75,13 +65,28 @@ struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
end
Winkler(x) = Winkler(x, 0.1, 0.7)
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
score = compare(s1, s2, dist.dist)
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler{Jaro}; min_dist = 0.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
l = remove_prefix(s1, s2, 4)[1]
# http://ceur-ws.org/Vol-1317/om2014_Tpaper4.pdf formula 5
bound = 2 / 3 + len1 / (3 * len2) + l * dist.scaling_factor * (1 / 3 - len1 / (3 * len2))
bound <= min_dist && return min_dist
score = compare(s1, s2, dist.dist)
if score >= dist.boosting_threshold
score += l * dist.scaling_factor * (1 - score)
end
return score
return max(score, min_dist)
end
# Winkler adjustment on top of the wrapped distance `dist.dist`.
# The underlying similarity is boosted only when it reaches
# `dist.boosting_threshold`; the result is never reported below `min_dist`.
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_dist = 0.0)
    # First element returned by remove_prefix with limit 4
    # (presumably the common-prefix count -- confirm against remove_prefix)
    prefix_len = remove_prefix(s1, s2, 4)[1]
    base = compare(s1, s2, dist.dist; min_dist = min_dist)
    boosted = if base >= dist.boosting_threshold
        base + prefix_len * dist.scaling_factor * (1 - base)
    else
        base
    end
    return max(boosted, min_dist)
end
JaroWinkler() = Winkler(Jaro(), 0.1, 0.7)
@ -102,11 +107,7 @@ struct Partial{T <: PreMetric} <: PreMetric
end
function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
s1 = string_with_length(s1)
s2 = string_with_length(s2)
if length(s1) > length(s2)
s2, s1 = s1, s2
end
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return compare(s1, s2, dist.dist)
len1 == 0 && return compare("", "", dist.dist)
@ -119,11 +120,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
end
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
s1 = string_with_length(s1)
s2 = string_with_length(s2)
if length(s1) > length(s2)
s2, s1 = s1, s2
end
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return compare(s1, s2, dist.dist)
out = 0.0
@ -211,11 +208,7 @@ struct TokenMax{T <: PreMetric} <: PreMetric
end
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
s1 = string_with_length(s1)
s2 = string_with_length(s2)
if length(s1) > length(s2)
s2, s1 = s1, s2
end
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
dist0 = compare(s1, s2, dist.dist)
unbase_scale = 0.95

View File

@ -36,19 +36,12 @@ where ``m`` is the number of matching characters and
struct Jaro <: SemiMetric end
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString;
max_dist = 1.0)
s1 = string_with_length(s1)
s2 = string_with_length(s2)
if length(s1) > length(s2)
s2, s1 = s1, s2
end
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
maxdist = max(0, div(len2, 2) - 1)
# If both strings are empty, m = 0, so the distance would be 1.0 by the Wikipedia definition; the next line returns 0.0 instead
len2 == 0 && return 0.0
# Time-Efficient Execution of Bounded Jaro-Winkler Distances Equation (4)
1 - (2 / 3 + len1 / (3 * len2)) >= max_dist && return max_dist
maxdist = max(0, div(len2, 2) - 1)
flag = fill(false, len2)
prevstate1 = firstindex(s1)
i1_match = prevstate1 * ones(Int, len1)
@ -83,7 +76,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString;
i1 += 1
prevstate1 = state1
end
m == 0 && return min(1.0, max_dist)
m == 0 && return 1.0
# t counts number of transpositions
t = 0
i1 = 0
@ -96,7 +89,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString;
end
end
current = 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
return min(current, max_dist)
return current
end
##############################################################################
@ -116,12 +109,8 @@ struct Levenshtein <: SemiMetric end
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
max_dist = max(length(s1), length(s2)))
s1 = string_with_length(s1)
s2 = string_with_length(s2)
if length(s1) > length(s2)
s2, s1 = s1, s2
end
len1, len2 = length(s1), length(s2)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 - len1 >= max_dist && return max_dist
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
@ -175,11 +164,7 @@ struct DamerauLevenshtein <: SemiMetric end
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString;
max_dist = max(length(s1), length(s2)))
s1 = string_with_length(s1)
s2 = string_with_length(s2)
if length(s1) > length(s2)
s2, s1 = s1, s2
end
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 - len1 >= max_dist && return max_dist
# prefix common to both strings can be ignored
@ -254,10 +239,10 @@ The distance between two strings is defined as one minus the number of matching
"""
struct RatcliffObershelp <: PreMetric end
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString; max_dist = 1.0)
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
n_matched = sum(last.(matching_blocks(s1, s2)))
len1, len2 = length(s1), length(s2)
len1 + len2 == 0 ? 0 : min(1.0 - 2 * n_matched / (len1 + len2), max_dist)
len1 + len2 == 0 ? 0 : 1.0 - 2 * n_matched / (len1 + len2)
end
function matching_blocks(s1::AbstractString, s2::AbstractString)

View File

@ -1,4 +1,29 @@
# String with Length
# Wraps a string together with its precomputed length, so that the length
# is computed once and only once.
struct StringWithLength{T} <: AbstractString
    s::T    # wrapped string
    l::Int  # cached length(s)
end
# Wrap a plain string, computing its length a single time
string_with_length(s::AbstractString) = StringWithLength(s, length(s))
# Already wrapped: return it unchanged instead of wrapping twice
string_with_length(s::StringWithLength) = s
# Length comes from the cache; everything below forwards to the wrapped string
Base.length(s::StringWithLength) = s.l
Base.iterate(s::StringWithLength) = iterate(s.s)
Base.iterate(s::StringWithLength, i::Integer) = iterate(s.s, i)
Base.isequal(s1::StringWithLength, s2::AbstractString) = isequal(s1.s, s2)
Base.isequal(s1::AbstractString, s2::StringWithLength) = isequal(s1, s2.s)
# Resolves the ambiguity between the two methods above when both arguments are wrapped
Base.isequal(s1::StringWithLength, s2::StringWithLength) = isequal(s1.s, s2.s)
Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n)
Base.ncodeunits(s::StringWithLength) = ncodeunits(s.s)
Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i)
# Wrap both strings with `string_with_length` and return them as a pair
# ordered so that the first is no longer than the second.
function reorder(s1::AbstractString, s2::AbstractString)
    a = string_with_length(s1)
    b = string_with_length(s2)
    return length(a) > length(b) ? (b, a) : (a, b)
end
## Find common prefixes (up to lim. -1 means Inf)
function remove_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
l = 0

View File

@ -99,18 +99,6 @@ for x in solutions
end
for dist in (Hamming, Levenshtein, DamerauLevenshtein)
for i in eachindex(strings)
@test evaluate(dist(), strings[i]..., max_dist = 3) == min(evaluate(dist(), strings[i]...), 3)
end
end
for i in eachindex(strings)
@test evaluate(Jaro(), strings[i]..., max_dist = 0.6) == min(evaluate(Jaro(), strings[i]...), 0.6)
end
for dist in (Hamming, Levenshtein, DamerauLevenshtein, Jaro)
for i in eachindex(strings)
@test compare(strings[i]..., dist() ; min_dist = 1/ 3) max(compare(strings[i]..., dist()), 1 / 3)