Only keep compare for Levenshtein and Damerau
parent
c2a6fea477
commit
5ea65c389a
|
@ -1 +1 @@
|
||||||
@time f(Winkler(Jaro()), x, y; min_dist = 0.9)
|
@time f(DamerauLevenshtein(), x, y)
|
||||||
|
|
|
@ -4,14 +4,15 @@ Random.seed!(2)
|
||||||
x = map(Random.randstring, rand(5:25,500_000))
|
x = map(Random.randstring, rand(5:25,500_000))
|
||||||
y = map(Random.randstring, rand(5:25,500_000))
|
y = map(Random.randstring, rand(5:25,500_000))
|
||||||
|
|
||||||
function f(t, x, y; min_dist = 0.0)
|
function f(t, x, y; min_dist = nothing)
|
||||||
[compare(x[i], y[i], t; min_dist = min_dist) for i in 1:length(x)]
|
[compare(x[i], y[i], t; min_dist = min_dist) for i in 1:length(x)]
|
||||||
end
|
end
|
||||||
|
function f(t, x, y; min_dist = nothing)
|
||||||
|
[evaluate(t, x[i], y[i]; min_dist = min_dist) for i in 1:length(x)]
|
||||||
|
end
|
||||||
@time f(Hamming(), x, y)
|
@time f(Hamming(), x, y)
|
||||||
@time f(Jaro(), x, y)
|
@time f(Jaro(), x, y)
|
||||||
@time f(Jaro(), x, y; min_dist = 0.9)
|
|
||||||
@time f(Winkler(Jaro()), x, y; min_dist = 0.9)
|
|
||||||
|
|
||||||
@time f(Levenshtein(), x, y)
|
@time f(Levenshtein(), x, y)
|
||||||
# 0.3s. A bit faster than StringDist
|
# 0.3s. A bit faster than StringDist
|
||||||
|
@ -19,7 +20,6 @@ end
|
||||||
@time f(DamerauLevenshtein(), x, y)
|
@time f(DamerauLevenshtein(), x, y)
|
||||||
@time f(DamerauLevenshtein(), x, y, min_dist = 0.8)
|
@time f(DamerauLevenshtein(), x, y, min_dist = 0.8)
|
||||||
# 0.39s. Much faster than StringDist
|
# 0.39s. Much faster than StringDist
|
||||||
@time f(RatcliffObershelp(), x, y)
|
|
||||||
|
|
||||||
function g(t, x, y)
|
function g(t, x, y)
|
||||||
[evaluate(t, x[i], y[i]) for i in 1:length(x)]
|
[evaluate(t, x[i], y[i]) for i in 1:length(x)]
|
||||||
|
|
111
src/compare.jl
111
src/compare.jl
|
@ -10,35 +10,30 @@
|
||||||
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
|
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::RatcliffObershelp; min_dist = 0.0)
|
|
||||||
max(1.0 - evaluate(dist, s1, s2), min_dist)
|
|
||||||
end
|
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::Jaro; min_dist = 0.0)
|
|
||||||
s1, s2 = reorder(s1, s2)
|
|
||||||
len1, len2 = length(s1), length(s2)
|
|
||||||
# http://ceur-ws.org/Vol-1317/om2014_Tpaper4.pdf formula 4
|
|
||||||
bound = 2 / 3 + len1 / (3 * len2)
|
|
||||||
bound <= min_dist && return min_dist
|
|
||||||
max(1.0 - evaluate(dist, s1, s2), min_dist)
|
|
||||||
end
|
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString,
|
function compare(s1::AbstractString, s2::AbstractString,
|
||||||
dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = 0.0)
|
dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = nothing)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len2 == 0 && return 1.0
|
len2 == 0 && return 1.0
|
||||||
max_dist = ceil(Int, len2 * (1 - min_dist))
|
if min_dist === nothing
|
||||||
max(1.0 - evaluate(dist, s1, s2; max_dist = max_dist) / len2, min_dist)
|
1.0 - evaluate(dist, s1, s2) / len2
|
||||||
|
else
|
||||||
|
max_dist = ceil(Int, len2 * (1 - min_dist))
|
||||||
|
# max is needed because max_dist is rounded up to an integer, so the raw score can fall below min_dist
|
||||||
|
max(1.0 - evaluate(dist, s1, s2; max_dist = max_dist) / len2, min_dist)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_dist::Nothing = nothing)
|
||||||
|
1.0 - evaluate(dist, s1, s2)
|
||||||
|
end
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString,
|
function compare(s1::AbstractString, s2::AbstractString, dist::AbstractQGramDistance;
|
||||||
dist::AbstractQGramDistance)
|
min_dist::Nothing = nothing)
|
||||||
# When string length < q for qgram distance, returns s1 == s2
|
# When string length < q for qgram distance, returns s1 == s2
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
len1 <= dist.q - 1 && return convert(Float64, s1 == s2)
|
||||||
if typeof(dist) <: QGram
|
if typeof(dist) <: QGram
|
||||||
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
|
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
|
||||||
else
|
else
|
||||||
|
@ -54,39 +49,32 @@ end
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
"""
|
"""
|
||||||
Winkler(dist::Premetric, scaling_factor::Real = 0.1, boosting_limit::Real = 0.7)
|
Winkler(dist::Premetric, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4)
|
||||||
|
|
||||||
Winkler is a `PreMetric` modifier that boosts the similarity score between two strings by a scale `scaling_factor` when the strings share a common prefix (the boost is only applied when the similarity score is above `boosting_threshold`)
|
Winkler is a `PreMetric` modifier that boosts the similarity score between two strings by a scale `p` when the strings share a common prefix of length up to `l` (the boost is only applied when the similarity score is above `boosting_threshold`)
|
||||||
"""
|
"""
|
||||||
struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
|
struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real, T4 <: Integer} <: PreMetric
|
||||||
dist::T1
|
dist::T1
|
||||||
scaling_factor::T2 # scaling factor. Default to 0.1
|
p::T2 # scaling factor. Default to 0.1
|
||||||
boosting_threshold::T3 # boost threshold. Default to 0.7
|
boosting_threshold::T3 # boost threshold. Default to 0.7
|
||||||
|
l::Integer # length of common prefix. Default to 4
|
||||||
|
function Winkler(dist::T1, p::T2, boosting_threshold::T3, l::T4) where {T1, T2, T3, T4}
|
||||||
|
p * l >= 1 && throw("scaling factor times length of common prefix must be lower than one")
|
||||||
|
new{T1, T2, T3, T4}(dist, p, boosting_threshold, l)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
Winkler(x) = Winkler(x, 0.1, 0.7)
|
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler{Jaro}; min_dist = 0.0)
|
Winkler(x) = Winkler(x, 0.1, 0.7, 4)
|
||||||
s1, s2 = reorder(s1, s2)
|
|
||||||
len1, len2 = length(s1), length(s2)
|
# min_dist is hard to use here because whether the boost applies is only known at the end
|
||||||
l = remove_prefix(s1, s2, 4)[1]
|
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_dist::Nothing = nothing)
|
||||||
# http://ceur-ws.org/Vol-1317/om2014_Tpaper4.pdf formula 5
|
l = remove_prefix(s1, s2, dist.l)[1]
|
||||||
bound = 2 / 3 + len1 / (3 * len2) + l * dist.scaling_factor * (1 / 3 - len1 / (3 * len2))
|
# cannot do min_dist because of boosting threshold
|
||||||
bound <= min_dist && return min_dist
|
|
||||||
score = compare(s1, s2, dist.dist)
|
score = compare(s1, s2, dist.dist)
|
||||||
if score >= dist.boosting_threshold
|
if score >= dist.boosting_threshold
|
||||||
score += l * dist.scaling_factor * (1 - score)
|
score += l * dist.p * (1 - score)
|
||||||
end
|
end
|
||||||
return max(score, min_dist)
|
return score
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_dist = 0.0)
|
|
||||||
l = remove_prefix(s1, s2, 4)[1]
|
|
||||||
score = compare(s1, s2, dist.dist; min_dist = min_dist)
|
|
||||||
if score >= dist.boosting_threshold
|
|
||||||
score += l * dist.scaling_factor * (1 - score)
|
|
||||||
end
|
|
||||||
return max(score, min_dist)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
JaroWinkler() = Winkler(Jaro(), 0.1, 0.7)
|
JaroWinkler() = Winkler(Jaro(), 0.1, 0.7)
|
||||||
|
@ -106,20 +94,21 @@ struct Partial{T <: PreMetric} <: PreMetric
|
||||||
dist::T
|
dist::T
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
|
function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_dist = nothing)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
len1 == len2 && return compare(s1, s2, dist.dist; min_dist = min_dist)
|
||||||
len1 == 0 && return compare("", "", dist.dist)
|
len1 == 0 && return 1.0
|
||||||
out = 0.0
|
out = 0.0
|
||||||
for x in qgram(s2, len1)
|
for x in qgram(s2, len1)
|
||||||
curr = compare(s1, x, dist.dist)
|
curr = compare(s1, x, dist.dist; min_dist = min_dist)
|
||||||
out = max(out, curr)
|
out = max(out, curr)
|
||||||
end
|
end
|
||||||
return out
|
return out
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
|
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp};
|
||||||
|
min_dist = nothing)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||||
|
@ -158,10 +147,10 @@ struct TokenSort{T <: PreMetric} <: PreMetric
|
||||||
dist::T
|
dist::T
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort)
|
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort; min_dist = nothing)
|
||||||
s1 = join(sort!(split(s1)), " ")
|
s1 = join(sort!(split(s1)), " ")
|
||||||
s2 = join(sort!(split(s2)), " ")
|
s2 = join(sort!(split(s2)), " ")
|
||||||
compare(s1, s2, dist.dist)
|
compare(s1, s2, dist.dist; min_dist = min_dist)
|
||||||
end
|
end
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
@ -179,17 +168,17 @@ struct TokenSet{T <: PreMetric} <: PreMetric
|
||||||
dist::T
|
dist::T
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet)
|
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_dist = nothing)
|
||||||
v1 = SortedSet(split(s1))
|
v1 = SortedSet(split(s1))
|
||||||
v2 = SortedSet(split(s2))
|
v2 = SortedSet(split(s2))
|
||||||
v0 = intersect(v1, v2)
|
v0 = intersect(v1, v2)
|
||||||
s0 = join(v0, " ")
|
s0 = join(v0, " ")
|
||||||
s1 = join(v1, " ")
|
s1 = join(v1, " ")
|
||||||
s2 = join(v2, " ")
|
s2 = join(v2, " ")
|
||||||
isempty(s0) && return compare(s1, s2, dist.dist)
|
isempty(s0) && return compare(s1, s2, dist.dist; min_dist = min_dist)
|
||||||
max(compare(s0, s1, dist.dist),
|
max(compare(s0, s1, dist.dist; min_dist = min_dist),
|
||||||
compare(s0, s2, dist.dist),
|
compare(s0, s2, dist.dist; min_dist = min_dist),
|
||||||
compare(s1, s2, dist.dist))
|
compare(s1, s2, dist.dist; min_dist = min_dist))
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
@ -207,24 +196,24 @@ struct TokenMax{T <: PreMetric} <: PreMetric
|
||||||
dist::T
|
dist::T
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
|
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_dist = nothing)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
dist0 = compare(s1, s2, dist.dist)
|
dist0 = compare(s1, s2, dist.dist)
|
||||||
unbase_scale = 0.95
|
unbase_scale = 0.95
|
||||||
# if one string is much shorter than the other, use partial
|
# if one string is much shorter than the other, use partial
|
||||||
if length(s2) >= 1.5 * length(s1)
|
if length(s2) >= 1.5 * length(s1)
|
||||||
partial = compare(s1, s2, Partial(dist.dist))
|
partial = compare(s1, s2, Partial(dist.dist); min_dist = min_dist)
|
||||||
ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)))
|
ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)); min_dist = min_dist)
|
||||||
ptser = compare(s1, s2, TokenSet(Partial(dist.dist)))
|
ptser = compare(s1, s2, TokenSet(Partial(dist.dist)); min_dist = min_dist)
|
||||||
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
|
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
|
||||||
return max(dist0,
|
return max(dist0,
|
||||||
partial * partial_scale,
|
partial * partial_scale,
|
||||||
ptsor * unbase_scale * partial_scale,
|
ptsor * unbase_scale * partial_scale,
|
||||||
ptser * unbase_scale * partial_scale)
|
ptser * unbase_scale * partial_scale)
|
||||||
else
|
else
|
||||||
ptsor = compare(s1, s2, TokenSort(dist.dist))
|
ptsor = compare(s1, s2, TokenSort(dist.dist); min_dist = min_dist)
|
||||||
ptser = compare(s1, s2, TokenSet(dist.dist))
|
ptser = compare(s1, s2, TokenSet(dist.dist); min_dist = min_dist)
|
||||||
return max(dist0,
|
return max(dist0,
|
||||||
ptsor * unbase_scale,
|
ptsor * unbase_scale,
|
||||||
ptser * unbase_scale)
|
ptser * unbase_scale)
|
||||||
|
|
36
src/edit.jl
36
src/edit.jl
|
@ -4,13 +4,12 @@
|
||||||
## Hamming
|
## Hamming
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString;
|
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString; max_dist = nothing)
|
||||||
max_dist = max(length(s1), length(s2)))
|
|
||||||
current = abs(length(s2) - length(s1))
|
current = abs(length(s2) - length(s1))
|
||||||
current >= max_dist && return max_dist
|
max_dist !== nothing && current >= max_dist && return max_dist
|
||||||
for (ch1, ch2) in zip(s1, s2)
|
for (ch1, ch2) in zip(s1, s2)
|
||||||
current += ch1 != ch2
|
current += ch1 != ch2
|
||||||
current >= max_dist && return max_dist
|
max_dist !== nothing && current >= max_dist && return max_dist
|
||||||
end
|
end
|
||||||
return current
|
return current
|
||||||
end
|
end
|
||||||
|
@ -39,12 +38,12 @@ struct Jaro <: SemiMetric end
|
||||||
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
maxdist = max(0, div(len2, 2) - 1)
|
|
||||||
# if both strings are empty, m = 0, so the distance should be 1.0 according to Wikipedia. This line is added so that it is 0.0 instead
|
# if both strings are empty, m = 0, so the distance should be 1.0 according to Wikipedia. This line is added so that it is 0.0 instead
|
||||||
len2 == 0 && return 0.0
|
len2 == 0 && return 0.0
|
||||||
|
maxdist = max(0, div(len2, 2) - 1)
|
||||||
flag = fill(false, len2)
|
flag = fill(false, len2)
|
||||||
prevstate1 = firstindex(s1)
|
prevstate1 = firstindex(s1)
|
||||||
i1_match = prevstate1 * ones(Int, len1)
|
i1_match = fill(prevstate1, len1)
|
||||||
# m counts number matching characters
|
# m counts number matching characters
|
||||||
m = 0
|
m = 0
|
||||||
i1 = 1
|
i1 = 1
|
||||||
|
@ -61,9 +60,9 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
||||||
i2curr = i2
|
i2curr = i2
|
||||||
x2curr = x2
|
x2curr = x2
|
||||||
while x2curr !== nothing
|
while x2curr !== nothing
|
||||||
(i2curr > i1 + maxdist) && break
|
i2curr > i1 + maxdist && break
|
||||||
ch2, state2 = x2curr
|
ch2, state2 = x2curr
|
||||||
if (ch1 == ch2) & !flag[i2curr]
|
if (ch1 == ch2) && !flag[i2curr]
|
||||||
m += 1
|
m += 1
|
||||||
flag[i2curr] = true
|
flag[i2curr] = true
|
||||||
i1_match[m] = prevstate1
|
i1_match[m] = prevstate1
|
||||||
|
@ -108,13 +107,13 @@ struct Levenshtein <: SemiMetric end
|
||||||
|
|
||||||
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
||||||
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
|
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
|
||||||
max_dist = max(length(s1), length(s2)))
|
max_dist = nothing)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len2 - len1 >= max_dist && return max_dist
|
max_dist !== nothing && len2 - len1 >= max_dist && return max_dist
|
||||||
# prefix common to both strings can be ignored
|
# prefix common to both strings can be ignored
|
||||||
k, x1, x2start = remove_prefix(s1, s2)
|
k, x1, x2start = remove_prefix(s1, s2)
|
||||||
(x1 == nothing) && return len2 - k
|
x1 == nothing && return len2 - k
|
||||||
# distance initialized to first row of matrix
|
# distance initialized to first row of matrix
|
||||||
# => distance between "" and s2[1:i]
|
# => distance between "" and s2[1:i]
|
||||||
v0 = collect(1:(len2 - k))
|
v0 = collect(1:(len2 - k))
|
||||||
|
@ -140,11 +139,12 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
|
||||||
x2 = iterate(s2, state2)
|
x2 = iterate(s2, state2)
|
||||||
i2 += 1
|
i2 += 1
|
||||||
end
|
end
|
||||||
min_dist >= max_dist && return max_dist
|
max_dist !== nothing && min_dist >= max_dist && return max_dist
|
||||||
x1 = iterate(s1, state1)
|
x1 = iterate(s1, state1)
|
||||||
i1 += 1
|
i1 += 1
|
||||||
end
|
end
|
||||||
return min(current, max_dist)
|
max_dist !== nothing && return min(current, max_dist)
|
||||||
|
return current
|
||||||
end
|
end
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
@ -163,10 +163,10 @@ struct DamerauLevenshtein <: SemiMetric end
|
||||||
|
|
||||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||||
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString;
|
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString;
|
||||||
max_dist = max(length(s1), length(s2)))
|
max_dist = nothing)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len2 - len1 >= max_dist && return max_dist
|
max_dist !== nothing && len2 - len1 >= max_dist && return max_dist
|
||||||
# prefix common to both strings can be ignored
|
# prefix common to both strings can be ignored
|
||||||
k, x1, x2start = remove_prefix(s1, s2)
|
k, x1, x2start = remove_prefix(s1, s2)
|
||||||
(x1 == nothing) && return len2 - k
|
(x1 == nothing) && return len2 - k
|
||||||
|
@ -214,11 +214,12 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
||||||
i2 += 1
|
i2 += 1
|
||||||
prevch2 = ch2
|
prevch2 = ch2
|
||||||
end
|
end
|
||||||
(v0[i1 + len2 - len1] >= max_dist) && return max_dist
|
max_dist !== nothing && (v0[i1 + len2 - len1] >= max_dist) && return max_dist
|
||||||
x1 = iterate(s1, state1)
|
x1 = iterate(s1, state1)
|
||||||
i1 += 1
|
i1 += 1
|
||||||
prevch1 = ch1
|
prevch1 = ch1
|
||||||
end
|
end
|
||||||
|
max_dist !== nothing && return min(current, max_dist)
|
||||||
return current
|
return current
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -239,7 +240,8 @@ The distance between two strings is defined as one minus the number of matching
|
||||||
"""
|
"""
|
||||||
struct RatcliffObershelp <: PreMetric end
|
struct RatcliffObershelp <: PreMetric end
|
||||||
|
|
||||||
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString;
|
||||||
|
max_dist::Nothing = nothing)
|
||||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 + len2 == 0 ? 0 : 1.0 - 2 * n_matched / (len1 + len2)
|
len1 + len2 == 0 ? 0 : 1.0 - 2 * n_matched / (len1 + len2)
|
||||||
|
|
|
@ -17,11 +17,12 @@ Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i)
|
||||||
function reorder(s1::AbstractString, s2::AbstractString)
|
function reorder(s1::AbstractString, s2::AbstractString)
|
||||||
s1 = string_with_length(s1)
|
s1 = string_with_length(s1)
|
||||||
s2 = string_with_length(s2)
|
s2 = string_with_length(s2)
|
||||||
if length(s1) > length(s2)
|
if length(s1) <= length(s2)
|
||||||
s2, s1 = s1, s2
|
return s1, s2
|
||||||
|
else
|
||||||
|
return s2, s1
|
||||||
end
|
end
|
||||||
return s1, s2
|
end
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
## Find common prefixes (up to lim. -1 means Inf)
|
## Find common prefixes (up to lim. -1 means Inf)
|
||||||
|
|
|
@ -99,7 +99,7 @@ for x in solutions
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
for dist in (Hamming, Levenshtein, DamerauLevenshtein, Jaro)
|
for dist in (Hamming, Levenshtein, DamerauLevenshtein)
|
||||||
for i in eachindex(strings)
|
for i in eachindex(strings)
|
||||||
@test compare(strings[i]..., dist() ; min_dist = 1/ 3) ≈ max(compare(strings[i]..., dist()), 1 / 3)
|
@test compare(strings[i]..., dist() ; min_dist = 1/ 3) ≈ max(compare(strings[i]..., dist()), 1 / 3)
|
||||||
end
|
end
|
||||||
|
|
|
@ -20,13 +20,13 @@ using StringDistances, Test
|
||||||
|
|
||||||
|
|
||||||
# Winkler
|
# Winkler
|
||||||
@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.9611 atol = 1e-4
|
@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.9611 atol = 1e-4
|
||||||
@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.84 atol = 1e-4
|
@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.84 atol = 1e-4
|
||||||
@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.81333 atol = 1e-4
|
@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.81333 atol = 1e-4
|
||||||
@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.975 atol = 1e-4
|
@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.975 atol = 1e-4
|
||||||
@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol = 1e-4
|
@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
|
||||||
@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0)) ≈ 1.0 atol = 1e-4
|
@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 1.0 atol = 1e-4
|
||||||
@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol = 1e-4
|
@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
|
||||||
|
|
||||||
strings = [
|
strings = [
|
||||||
("martha", "marhta"),
|
("martha", "marhta"),
|
||||||
|
@ -37,7 +37,7 @@ strings = [
|
||||||
]
|
]
|
||||||
solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000]
|
solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000]
|
||||||
for i in 1:length(solutions)
|
for i in 1:length(solutions)
|
||||||
@test compare(strings[i]..., Winkler(Jaro(), 0.1, 0.0)) ≈ (1 - solutions[i]) atol = 1e-4
|
@test compare(strings[i]..., Winkler(Jaro(), 0.1, 0.0, 4)) ≈ (1 - solutions[i]) atol = 1e-4
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue