From 5ea65c389a1170bf8796a372843687697977889a Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Tue, 20 Aug 2019 11:59:23 -0400 Subject: [PATCH] Only keep compare for Levenshtein and Damerau --- benchmark/.sublime2Terminal.jl | 2 +- benchmark/benchmark.jl | 10 +-- src/compare.jl | 111 +++++++++++++++------------------ src/edit.jl | 36 ++++++----- src/utils.jl | 9 +-- test/distances.jl | 2 +- test/modifiers.jl | 16 ++--- 7 files changed, 89 insertions(+), 97 deletions(-) diff --git a/benchmark/.sublime2Terminal.jl b/benchmark/.sublime2Terminal.jl index 411fab5..79ead95 100644 --- a/benchmark/.sublime2Terminal.jl +++ b/benchmark/.sublime2Terminal.jl @@ -1 +1 @@ -@time f(Winkler(Jaro()), x, y; min_dist = 0.9) +@time f(DamerauLevenshtein(), x, y) diff --git a/benchmark/benchmark.jl b/benchmark/benchmark.jl index 79d2d2d..ccb7428 100644 --- a/benchmark/benchmark.jl +++ b/benchmark/benchmark.jl @@ -4,14 +4,15 @@ Random.seed!(2) x = map(Random.randstring, rand(5:25,500_000)) y = map(Random.randstring, rand(5:25,500_000)) -function f(t, x, y; min_dist = 0.0) +function f(t, x, y; min_dist = nothing) [compare(x[i], y[i], t; min_dist = min_dist) for i in 1:length(x)] end - +function f(t, x, y; min_dist = nothing) + [evaluate(t, x[i], y[i]; min_dist = min_dist) for i in 1:length(x)] +end @time f(Hamming(), x, y) @time f(Jaro(), x, y) -@time f(Jaro(), x, y; min_dist = 0.9) -@time f(Winkler(Jaro()), x, y; min_dist = 0.9) + @time f(Levenshtein(), x, y) # 0.3s. A bit faster than StringDist @@ -19,7 +20,6 @@ end @time f(DamerauLevenshtein(), x, y) @time f(DamerauLevenshtein(), x, y, min_dist = 0.8) # 0.39s. Much faster than StringDist -@time f(RatcliffObershelp(), x, y) function g(t, x, y) [evaluate(t, x[i], y[i]) for i in 1:length(x)] diff --git a/src/compare.jl b/src/compare.jl index 3162331..19a2f12 100755 --- a/src/compare.jl +++ b/src/compare.jl @@ -10,35 +10,30 @@ compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist` """ -function compare(s1::AbstractString, s2::AbstractString, dist::RatcliffObershelp; min_dist = 0.0) - max(1.0 - evaluate(dist, s1, s2), min_dist) -end - -function compare(s1::AbstractString, s2::AbstractString, dist::Jaro; min_dist = 0.0) - s1, s2 = reorder(s1, s2) - len1, len2 = length(s1), length(s2) - # http://ceur-ws.org/Vol-1317/om2014_Tpaper4.pdf formula 4 - bound = 2 / 3 + len1 / (3 * len2) - bound <= min_dist && return min_dist - max(1.0 - evaluate(dist, s1, s2), min_dist) -end - function compare(s1::AbstractString, s2::AbstractString, - dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = 0.0) + dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = nothing) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len2 == 0 && return 1.0 - max_dist = ceil(Int, len2 * (1 - min_dist)) - max(1.0 - evaluate(dist, s1, s2; max_dist = max_dist) / len2, min_dist) + if min_dist === nothing + 1.0 - evaluate(dist, s1, s2) / len2 + else + max_dist = ceil(Int, len2 * (1 - min_dist)) + # need to add max in case of integer stuff + max(1.0 - evaluate(dist, s1, s2; max_dist = max_dist) / len2, min_dist) + end end +function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_dist::Nothing = nothing) + 1.0 - evaluate(dist, s1, s2) +end -function compare(s1::AbstractString, s2::AbstractString, - dist::AbstractQGramDistance) +function compare(s1::AbstractString, s2::AbstractString, dist::AbstractQGramDistance; + min_dist::Nothing = nothing) # When string length < q for qgram distance, returns s1 == s2 s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) - len1 <= (dist.q - 1) && return convert(Float64, s1 == s2) + len1 <= dist.q - 1 && return convert(Float64, s1 == s2) if typeof(dist) <: QGram 1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2) else @@ -54,39 +49,32 @@ end ## ############################################################################## """ - Winkler(dist::Premetric, scaling_factor::Real = 0.1, boosting_limit::Real = 0.7) + Winkler(dist::Premetric, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4) -Winkler is a `PreMetric` modifier that boosts the similarity score between two strings by a scale `scaling_factor` when the strings share a common prefix (the boost is only applied the similarity score above `boosting_threshold`) +Winkler is a `PreMetric` modifier that boosts the similarity score between two strings by a scale `p` when the strings share a common prefix with lenth lower than `l` (the boost is only applied the similarity score above `boosting_threshold`) """ -struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric +struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real, T4 <: Integer} <: PreMetric dist::T1 - scaling_factor::T2 # scaling factor. Default to 0.1 + p::T2 # scaling factor. Default to 0.1 boosting_threshold::T3 # boost threshold. Default to 0.7 + l::Integer # length of common prefix. Default to 4 + function Winkler(dist::T1, p::T2, boosting_threshold::T3, l::T4) where {T1, T2, T3, T4} + p * l >= 1 && throw("scaling factor times length of common prefix must be lower than one") + new{T1, T2, T3, T4}(dist, p, boosting_threshold, l) + end end -Winkler(x) = Winkler(x, 0.1, 0.7) -function compare(s1::AbstractString, s2::AbstractString, dist::Winkler{Jaro}; min_dist = 0.0) - s1, s2 = reorder(s1, s2) - len1, len2 = length(s1), length(s2) - l = remove_prefix(s1, s2, 4)[1] - # http://ceur-ws.org/Vol-1317/om2014_Tpaper4.pdf formula 5 - bound = 2 / 3 + len1 / (3 * len2) + l * dist.scaling_factor * (1 / 3 - len1 / (3 * len2)) - bound <= min_dist && return min_dist +Winkler(x) = Winkler(x, 0.1, 0.7, 4) + +# hard to use min_dist because of whether there is boost or not in the end +function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_dist::Nothing = nothing) + l = remove_prefix(s1, s2, dist.l)[1] + # cannot do min_dist because of boosting threshold score = compare(s1, s2, dist.dist) if score >= dist.boosting_threshold - score += l * dist.scaling_factor * (1 - score) + score += l * dist.p * (1 - score) end - return max(score, min_dist) -end - - -function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_dist = 0.0) - l = remove_prefix(s1, s2, 4)[1] - score = compare(s1, s2, dist.dist; min_dist = min_dist) - if score >= dist.boosting_threshold - score += l * dist.scaling_factor * (1 - score) - end - return max(score, min_dist) + return score end JaroWinkler() = Winkler(Jaro(), 0.1, 0.7) @@ -106,20 +94,21 @@ struct Partial{T <: PreMetric} <: PreMetric dist::T end -function compare(s1::AbstractString, s2::AbstractString, dist::Partial) +function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_dist = nothing) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) - len1 == len2 && return compare(s1, s2, dist.dist) - len1 == 0 && return compare("", "", dist.dist) + len1 == len2 && return compare(s1, s2, dist.dist; min_dist = min_dist) + len1 == 0 && return 1.0 out = 0.0 for x in qgram(s2, len1) - curr = compare(s1, x, dist.dist) + curr = compare(s1, x, dist.dist; min_dist = min_dist) out = max(out, curr) end return out end -function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}) +function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; + min_dist = nothing) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len1 == len2 && return compare(s1, s2, dist.dist) @@ -158,10 +147,10 @@ struct TokenSort{T <: PreMetric} <: PreMetric dist::T end -function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort) +function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort; min_dist = nothing) s1 = join(sort!(split(s1)), " ") s2 = join(sort!(split(s2)), " ") - compare(s1, s2, dist.dist) + compare(s1, s2, dist.dist; min_dist = min_dist) end ############################################################################## @@ -179,17 +168,17 @@ struct TokenSet{T <: PreMetric} <: PreMetric dist::T end -function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet) +function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_dist = nothing) v1 = SortedSet(split(s1)) v2 = SortedSet(split(s2)) v0 = intersect(v1, v2) s0 = join(v0, " ") s1 = join(v1, " ") s2 = join(v2, " ") - isempty(s0) && return compare(s1, s2, dist.dist) - max(compare(s0, s1, dist.dist), - compare(s0, s2, dist.dist), - compare(s1, s2, dist.dist)) + isempty(s0) && return compare(s1, s2, dist.dist; min_dist = min_dist) + max(compare(s0, s1, dist.dist; min_dist = min_dist), + compare(s0, s2, dist.dist; min_dist = min_dist), + compare(s1, s2, dist.dist; min_dist = min_dist)) end @@ -207,24 +196,24 @@ struct TokenMax{T <: PreMetric} <: PreMetric dist::T end -function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax) +function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_dist = nothing) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) dist0 = compare(s1, s2, dist.dist) unbase_scale = 0.95 # if one string is much shorter than the other, use partial if length(s2) >= 1.5 * length(s1) - partial = compare(s1, s2, Partial(dist.dist)) - ptsor = compare(s1, s2, TokenSort(Partial(dist.dist))) - ptser = compare(s1, s2, TokenSet(Partial(dist.dist))) + partial = compare(s1, s2, Partial(dist.dist); min_dist = min_dist) + ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)); min_dist = min_dist) + ptser = compare(s1, s2, TokenSet(Partial(dist.dist)); min_dist = min_dist) partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9 return max(dist0, partial * partial_scale, ptsor * unbase_scale * partial_scale, ptser * unbase_scale * partial_scale) else - ptsor = compare(s1, s2, TokenSort(dist.dist)) - ptser = compare(s1, s2, TokenSet(dist.dist)) + ptsor = compare(s1, s2, TokenSort(dist.dist); min_dist = min_dist) + ptser = compare(s1, s2, TokenSet(dist.dist); min_dist = min_dist) return max(dist0, ptsor * unbase_scale, ptser * unbase_scale) diff --git a/src/edit.jl b/src/edit.jl index 9a48c47..2765661 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -4,13 +4,12 @@ ## Hamming ## ############################################################################## -function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString; - max_dist = max(length(s1), length(s2))) +function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString; max_dist = nothing) current = abs(length(s2) - length(s1)) - current >= max_dist && return max_dist + max_dist !== nothing && current >= max_dist && return max_dist for (ch1, ch2) in zip(s1, s2) current += ch1 != ch2 - current >= max_dist && return max_dist + max_dist !== nothing && current >= max_dist && return max_dist end return current end @@ -39,12 +38,12 @@ struct Jaro <: SemiMetric end function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) - maxdist = max(0, div(len2, 2) - 1) # if both are empty, m = 0 so should be 1.0 according to wikipedia. Add this line so that not the case len2 == 0 && return 0.0 + maxdist = max(0, div(len2, 2) - 1) flag = fill(false, len2) prevstate1 = firstindex(s1) - i1_match = prevstate1 * ones(Int, len1) + i1_match = fill(prevstate1, len1) # m counts number matching characters m = 0 i1 = 1 @@ -61,9 +60,9 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) i2curr = i2 x2curr = x2 while x2curr !== nothing - (i2curr > i1 + maxdist) && break + i2curr > i1 + maxdist && break ch2, state2 = x2curr - if (ch1 == ch2) & !flag[i2curr] + if (ch1 == ch2) && !flag[i2curr] m += 1 flag[i2curr] = true i1_match[m] = prevstate1 @@ -108,13 +107,13 @@ struct Levenshtein <: SemiMetric end ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; - max_dist = max(length(s1), length(s2))) + max_dist = nothing) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) - len2 - len1 >= max_dist && return max_dist + max_dist !== nothing && len2 - len1 >= max_dist && return max_dist # prefix common to both strings can be ignored k, x1, x2start = remove_prefix(s1, s2) - (x1 == nothing) && return len2 - k + x1 == nothing && return len2 - k # distance initialized to first row of matrix # => distance between "" and s2[1:i} v0 = collect(1:(len2 - k)) @@ -140,11 +139,12 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; x2 = iterate(s2, state2) i2 += 1 end - min_dist >= max_dist && return max_dist + max_dist !== nothing && min_dist >= max_dist && return max_dist x1 = iterate(s1, state1) i1 += 1 end - return min(current, max_dist) + max_dist !== nothing && return min(current, max_dist) + return current end ############################################################################## @@ -163,10 +163,10 @@ struct DamerauLevenshtein <: SemiMetric end ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString; - max_dist = max(length(s1), length(s2))) + max_dist = nothing) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) - len2 - len1 >= max_dist && return max_dist + max_dist !== nothing && len2 - len1 >= max_dist && return max_dist # prefix common to both strings can be ignored k, x1, x2start = remove_prefix(s1, s2) (x1 == nothing) && return len2 - k @@ -214,11 +214,12 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri i2 += 1 prevch2 = ch2 end - (v0[i1 + len2 - len1] >= max_dist) && return max_dist + max_dist !== nothing && (v0[i1 + len2 - len1] >= max_dist) && return max_dist x1 = iterate(s1, state1) i1 += 1 prevch1 = ch1 end + max_dist !== nothing && return min(current, max_dist) return current end @@ -239,7 +240,8 @@ The distance between two strings is defined as one minus the number of matching """ struct RatcliffObershelp <: PreMetric end -function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString) +function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString; + max_dist::Nothing = nothing) n_matched = sum(last.(matching_blocks(s1, s2))) len1, len2 = length(s1), length(s2) len1 + len2 == 0 ? 0 : 1.0 - 2 * n_matched / (len1 + len2) diff --git a/src/utils.jl b/src/utils.jl index 6a393a1..98f8559 100755 --- a/src/utils.jl +++ b/src/utils.jl @@ -17,11 +17,12 @@ Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i) function reorder(s1::AbstractString, s2::AbstractString) s1 = string_with_length(s1) s2 = string_with_length(s2) - if length(s1) > length(s2) - s2, s1 = s1, s2 + if length(s1) <= length(s2) + return s1, s2 + else + return s2, s1 end - return s1, s2 - end +end ## Find common prefixes (up to lim. -1 means Inf) diff --git a/test/distances.jl b/test/distances.jl index 62298d7..cbd92a8 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -99,7 +99,7 @@ for x in solutions end -for dist in (Hamming, Levenshtein, DamerauLevenshtein, Jaro) +for dist in (Hamming, Levenshtein, DamerauLevenshtein) for i in eachindex(strings) @test compare(strings[i]..., dist() ; min_dist = 1/ 3) ≈ max(compare(strings[i]..., dist()), 1 / 3) end diff --git a/test/modifiers.jl b/test/modifiers.jl index 18ef928..7108b50 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -20,13 +20,13 @@ using StringDistances, Test # Winkler -@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.9611 atol = 1e-4 -@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.84 atol = 1e-4 -@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.81333 atol = 1e-4 -@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.975 atol = 1e-4 -@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol = 1e-4 -@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0)) ≈ 1.0 atol = 1e-4 -@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0)) ≈ 0.0 atol = 1e-4 +@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.9611 atol = 1e-4 +@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.84 atol = 1e-4 +@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.81333 atol = 1e-4 +@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.975 atol = 1e-4 +@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4 +@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 1.0 atol = 1e-4 +@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4 strings = [ ("martha", "marhta"), @@ -37,7 +37,7 @@ strings = [ ] solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000] for i in 1:length(solutions) - @test compare(strings[i]..., Winkler(Jaro(), 0.1, 0.0)) ≈ (1 - solutions[i]) atol = 1e-4 + @test compare(strings[i]..., Winkler(Jaro(), 0.1, 0.0, 4)) ≈ (1 - solutions[i]) atol = 1e-4 end