clarify default max_dist
parent
6aef47bc89
commit
b557386f07
|
@ -1,6 +1 @@
|
|||
# Check that, for every index of `x`, calling `evaluate` with the `max_dist`
# keyword gives the same value as capping the result of the plain `evaluate`
# call at `max_dist`.
function h(t, x, y; max_dist = Inf)
    return all(eachindex(x)) do i
        bounded = evaluate(t, x[i], y[i]; max_dist = max_dist)
        capped = min(max_dist, evaluate(t, x[i], y[i]))
        bounded == capped
    end
end
|
||||
h(Jaro(), x, y)
|
||||
h(Levenshtein(), x, y)
|
||||
h(DamerauLevenshtein(), x, y)
|
||||
@time f(RatcliffObershelp(), x, y)
|
|
@ -4,17 +4,19 @@ Random.seed!(2)
|
|||
x = map(Random.randstring, rand(5:25,500_000))
|
||||
y = map(Random.randstring, rand(5:25,500_000))
|
||||
|
||||
function f(t, x, y; max_dist = Inf)
|
||||
[evaluate(t, x[i], y[i]; max_dist = max_dist) for i in 1:length(x)]
|
||||
function f(t, x, y; min_dist = 0.0)
|
||||
[compare(x[i], y[i], t; min_dist = min_dist) for i in 1:length(x)]
|
||||
end
|
||||
|
||||
@time f(Hamming(), x, y)
|
||||
@time f(Jaro(), x, y)
|
||||
@time f(Jaro(), x, y; min_dist = 0.9)
|
||||
|
||||
@time f(Levenshtein(), x, y)
|
||||
# 0.3s. A bit faster than StringDist
|
||||
@time f(Levenshtein(), x, y, max_dist = 10)
|
||||
@time f(Levenshtein(), x, y, min_dist = 0.8)
|
||||
@time f(DamerauLevenshtein(), x, y)
|
||||
@time f(DamerauLevenshtein(), x, y, max_dist = 10)
|
||||
@time f(DamerauLevenshtein(), x, y, min_dist = 0.8)
|
||||
# 0.39s. Much faster than StringDist
|
||||
@time f(RatcliffObershelp(), x, y)
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ struct StringWithLength{T} <: AbstractString
|
|||
l::Int
|
||||
end
|
||||
# Pair `s` with its length, computed once here via `length(s)`.
function string_with_length(s::AbstractString)
    return StringWithLength(s, length(s))
end
|
||||
# Already wrapped: return the argument unchanged.
function string_with_length(s::StringWithLength)
    return s
end
|
||||
# Report the length stored in the `l` field.
function Base.length(s::StringWithLength)
    return s.l
end
|
||||
# Start iteration by delegating to the wrapped value held in the `s` field.
function Base.iterate(s::StringWithLength)
    return iterate(s.s)
end
|
||||
# Continue iteration from state `i`, delegating to the wrapped value held in
# the `s` field.
function Base.iterate(s::StringWithLength, i::Integer)
    return iterate(s.s, i)
end
|
||||
|
@ -101,7 +102,12 @@ struct Partial{T <: PreMetric} <: PreMetric
|
|||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
s1 = string_with_length(s1)
|
||||
s2 = string_with_length(s2)
|
||||
if length(s1) > length(s2)
|
||||
s2, s1 = s1, s2
|
||||
end
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||
len1 == 0 && return compare("", "", dist.dist)
|
||||
out = 0.0
|
||||
|
@ -113,7 +119,12 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
|
|||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
s1 = string_with_length(s1)
|
||||
s2 = string_with_length(s2)
|
||||
if length(s1) > length(s2)
|
||||
s2, s1 = s1, s2
|
||||
end
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||
out = 0.0
|
||||
for r in matching_blocks(s1, s2)
|
||||
|
@ -200,15 +211,20 @@ struct TokenMax{T <: PreMetric} <: PreMetric
|
|||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
|
||||
s1 = string_with_length(s1)
|
||||
s2 = string_with_length(s2)
|
||||
if length(s1) > length(s2)
|
||||
s2, s1 = s1, s2
|
||||
end
|
||||
len1, len2 = length(s1), length(s2)
|
||||
dist0 = compare(s1, s2, dist.dist)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
unbase_scale = 0.95
|
||||
# if one string is much shorter than the other, use partial
|
||||
if len2 >= 1.5 * len1
|
||||
if length(s2) >= 1.5 * length(s1)
|
||||
partial = compare(s1, s2, Partial(dist.dist))
|
||||
ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)))
|
||||
ptser = compare(s1, s2, TokenSet(Partial(dist.dist)))
|
||||
partial_scale = len2 > (8 * len1) ? 0.6 : 0.9
|
||||
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
|
||||
return max(dist0,
|
||||
partial * partial_scale,
|
||||
ptsor * unbase_scale * partial_scale,
|
||||
|
|
31
src/edit.jl
31
src/edit.jl
|
@ -5,7 +5,7 @@
|
|||
##
|
||||
##############################################################################
|
||||
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString;
|
||||
max_dist = typemax(Int))
|
||||
max_dist = max(length(s1), length(s2)))
|
||||
current = abs(length(s2) - length(s1))
|
||||
current >= max_dist && return max_dist
|
||||
for (ch1, ch2) in zip(s1, s2)
|
||||
|
@ -37,8 +37,13 @@ struct Jaro <: SemiMetric end
|
|||
|
||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString;
|
||||
max_dist = Inf)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
max_dist = 1.0)
|
||||
s1 = string_with_length(s1)
|
||||
s2 = string_with_length(s2)
|
||||
if length(s1) > length(s2)
|
||||
s2, s1 = s1, s2
|
||||
end
|
||||
len1, len2 = length(s1), length(s2)
|
||||
# If both strings are empty, m = 0, so the distance would be 1.0 according to Wikipedia; return 0.0 early so that this is not the case
|
||||
len2 == 0 && return 0.0
|
||||
# Time-Efficient Execution of Bounded Jaro-Winkler Distances Equation (4)
|
||||
|
@ -110,8 +115,13 @@ struct Levenshtein <: SemiMetric end
|
|||
|
||||
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
||||
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
|
||||
max_dist = typemax(Int))
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
max_dist = max(length(s1), length(s2)))
|
||||
s1 = string_with_length(s1)
|
||||
s2 = string_with_length(s2)
|
||||
if length(s1) > length(s2)
|
||||
s2, s1 = s1, s2
|
||||
end
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len2 - len1 >= max_dist && return max_dist
|
||||
# prefix common to both strings can be ignored
|
||||
k, x1, x2start = remove_prefix(s1, s2)
|
||||
|
@ -164,8 +174,13 @@ struct DamerauLevenshtein <: SemiMetric end
|
|||
|
||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString;
|
||||
max_dist = typemax(Int))
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
max_dist = max(length(s1), length(s2)))
|
||||
s1 = string_with_length(s1)
|
||||
s2 = string_with_length(s2)
|
||||
if length(s1) > length(s2)
|
||||
s2, s1 = s1, s2
|
||||
end
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len2 - len1 >= max_dist && return max_dist
|
||||
# prefix common to both strings can be ignored
|
||||
k, x1, x2start = remove_prefix(s1, s2)
|
||||
|
@ -239,7 +254,7 @@ The distance between two strings is defined as one minus the number of matching
|
|||
"""
|
||||
struct RatcliffObershelp <: PreMetric end
|
||||
|
||||
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString; max_dist = Inf)
|
||||
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString; max_dist = 1.0)
|
||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 + len2 == 0 ? 0 : min(1.0 - 2 * n_matched / (len1 + len2), max_dist)
|
||||
|
|
|
@ -1,12 +1,3 @@
|
|||
# Return both strings with their lengths as `(longer, its length, shorter, its length)`.
# When the lengths are equal, `s1` and its length come first.
function reorder(s1::AbstractString, s2::AbstractString)
    n1, n2 = length(s1), length(s2)
    return n2 > n1 ? (s2, n2, s1, n1) : (s1, n1, s2, n2)
end
|
||||
|
||||
## Find the common prefix of two strings (up to `lim`; `lim = -1` means no limit)
|
||||
function remove_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
|
||||
|
|
Loading…
Reference in New Issue