clarify default max_dist

pull/17/head
matthieugomez 2019-08-19 13:33:33 -04:00
parent 6aef47bc89
commit b557386f07
5 changed files with 51 additions and 32 deletions

View File

@ -1,6 +1 @@
## Consistency check for the `max_dist` keyword: for every index of `x`, the
## distance computed with the bound must equal the unbounded distance capped at
## `max_dist`. Returns `true` only if this holds for every pair `(x[i], y[i])`.
function h(t, x, y; max_dist = Inf)
    return all(eachindex(x)) do i
        bounded = evaluate(t, x[i], y[i]; max_dist = max_dist)
        unbounded = evaluate(t, x[i], y[i])
        bounded == min(max_dist, unbounded)
    end
end
h(Jaro(), x, y)
h(Levenshtein(), x, y)
h(DamerauLevenshtein(), x, y)
@time f(RatcliffObershelp(), x, y)

View File

@ -4,17 +4,19 @@ Random.seed!(2)
x = map(Random.randstring, rand(5:25,500_000))
y = map(Random.randstring, rand(5:25,500_000))
function f(t, x, y; max_dist = Inf)
[evaluate(t, x[i], y[i]; max_dist = max_dist) for i in 1:length(x)]
function f(t, x, y; min_dist = 0.0)
[compare(x[i], y[i], t; min_dist = min_dist) for i in 1:length(x)]
end
@time f(Hamming(), x, y)
@time f(Jaro(), x, y)
@time f(Jaro(), x, y; min_dist = 0.9)
@time f(Levenshtein(), x, y)
# 0.3s. A bit faster than StringDist
@time f(Levenshtein(), x, y, max_dist = 10)
@time f(Levenshtein(), x, y, min_dist = 0.8)
@time f(DamerauLevenshtein(), x, y)
@time f(DamerauLevenshtein(), x, y, max_dist = 10)
@time f(DamerauLevenshtein(), x, y, min_dist = 0.8)
# 0.39s. Much faster than StringDist
@time f(RatcliffObershelp(), x, y)

View File

@ -17,6 +17,7 @@ struct StringWithLength{T} <: AbstractString
l::Int
end
string_with_length(s::AbstractString) = StringWithLength(s, length(s))
string_with_length(s::StringWithLength) = s
Base.length(s::StringWithLength) = s.l
Base.iterate(s::StringWithLength) = iterate(s.s)
Base.iterate(s::StringWithLength, i::Integer) = iterate(s.s, i)
@ -101,7 +102,12 @@ struct Partial{T <: PreMetric} <: PreMetric
end
function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
s2, len2, s1, len1 = reorder(s1, s2)
s1 = string_with_length(s1)
s2 = string_with_length(s2)
if length(s1) > length(s2)
s2, s1 = s1, s2
end
len1, len2 = length(s1), length(s2)
len1 == len2 && return compare(s1, s2, dist.dist)
len1 == 0 && return compare("", "", dist.dist)
out = 0.0
@ -113,7 +119,12 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
end
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
s2, len2, s1, len1 = reorder(s1, s2)
s1 = string_with_length(s1)
s2 = string_with_length(s2)
if length(s1) > length(s2)
s2, s1 = s1, s2
end
len1, len2 = length(s1), length(s2)
len1 == len2 && return compare(s1, s2, dist.dist)
out = 0.0
for r in matching_blocks(s1, s2)
@ -200,15 +211,20 @@ struct TokenMax{T <: PreMetric} <: PreMetric
end
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
s1 = string_with_length(s1)
s2 = string_with_length(s2)
if length(s1) > length(s2)
s2, s1 = s1, s2
end
len1, len2 = length(s1), length(s2)
dist0 = compare(s1, s2, dist.dist)
s2, len2, s1, len1 = reorder(s1, s2)
unbase_scale = 0.95
# if one string is much shorter than the other, use partial
if len2 >= 1.5 * len1
if length(s2) >= 1.5 * length(s1)
partial = compare(s1, s2, Partial(dist.dist))
ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)))
ptser = compare(s1, s2, TokenSet(Partial(dist.dist)))
partial_scale = len2 > (8 * len1) ? 0.6 : 0.9
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
return max(dist0,
partial * partial_scale,
ptsor * unbase_scale * partial_scale,

View File

@ -5,7 +5,7 @@
##
##############################################################################
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString;
max_dist = typemax(Int))
max_dist = max(length(s1), length(s2)))
current = abs(length(s2) - length(s1))
current >= max_dist && return max_dist
for (ch1, ch2) in zip(s1, s2)
@ -37,8 +37,13 @@ struct Jaro <: SemiMetric end
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString;
max_dist = Inf)
s2, len2, s1, len1 = reorder(s1, s2)
max_dist = 1.0)
s1 = string_with_length(s1)
s2 = string_with_length(s2)
if length(s1) > length(s2)
s2, s1 = s1, s2
end
len1, len2 = length(s1), length(s2)
# If both strings are empty, m = 0, so the distance should be 1.0 according to Wikipedia. Return 0.0 early so that this is not the case.
len2 == 0 && return 0.0
# Time-Efficient Execution of Bounded Jaro-Winkler Distances Equation (4)
@ -110,8 +115,13 @@ struct Levenshtein <: SemiMetric end
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
max_dist = typemax(Int))
s2, len2, s1, len1 = reorder(s1, s2)
max_dist = max(length(s1), length(s2)))
s1 = string_with_length(s1)
s2 = string_with_length(s2)
if length(s1) > length(s2)
s2, s1 = s1, s2
end
len1, len2 = length(s1), length(s2)
len2 - len1 >= max_dist && return max_dist
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
@ -164,8 +174,13 @@ struct DamerauLevenshtein <: SemiMetric end
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString;
max_dist = typemax(Int))
s2, len2, s1, len1 = reorder(s1, s2)
max_dist = max(length(s1), length(s2)))
s1 = string_with_length(s1)
s2 = string_with_length(s2)
if length(s1) > length(s2)
s2, s1 = s1, s2
end
len1, len2 = length(s1), length(s2)
len2 - len1 >= max_dist && return max_dist
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
@ -239,7 +254,7 @@ The distance between two strings is defined as one minus the number of matching
"""
struct RatcliffObershelp <: PreMetric end
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString; max_dist = Inf)
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString; max_dist = 1.0)
n_matched = sum(last.(matching_blocks(s1, s2)))
len1, len2 = length(s1), length(s2)
len1 + len2 == 0 ? 0 : min(1.0 - 2 * n_matched / (len1 + len2), max_dist)

View File

@ -1,12 +1,3 @@
## Order two strings by length. Returns the tuple
## `(longer, length(longer), shorter, length(shorter))`.
## When both strings have the same length, `s1` is returned first.
function reorder(s1::AbstractString, s2::AbstractString)
    n1, n2 = length(s1), length(s2)
    return n2 > n1 ? (s2, n2, s1, n1) : (s1, n1, s2, n2)
end
## Find common prefixes (up to lim; lim = -1 means no limit)
function remove_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)