From 7edca83311ae76b93bf952e446bba7f8f7183b4c Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Mon, 19 Aug 2019 13:54:38 -0400 Subject: [PATCH] refinement --- benchmark/.sublime2Terminal.jl | 2 +- src/compare.jl | 81 ++++++++++++++++------------------ src/edit.jl | 35 +++++---------- src/utils.jl | 25 +++++++++++ test/distances.jl | 12 ----- 5 files changed, 73 insertions(+), 82 deletions(-) diff --git a/benchmark/.sublime2Terminal.jl b/benchmark/.sublime2Terminal.jl index 2ddffcc..411fab5 100644 --- a/benchmark/.sublime2Terminal.jl +++ b/benchmark/.sublime2Terminal.jl @@ -1 +1 @@ -@time f(RatcliffObershelp(), x, y) \ No newline at end of file +@time f(Winkler(Jaro()), x, y; min_dist = 0.9) diff --git a/src/compare.jl b/src/compare.jl index 89dd1b6..3162331 100755 --- a/src/compare.jl +++ b/src/compare.jl @@ -10,45 +10,35 @@ compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist` """ -# String with Length -# This allows to compute length once and only once -struct StringWithLength{T} <: AbstractString - s::T - l::Int +function compare(s1::AbstractString, s2::AbstractString, dist::RatcliffObershelp; min_dist = 0.0) + max(1.0 - evaluate(dist, s1, s2), min_dist) end -string_with_length(s::AbstractString) = StringWithLength(s, length(s)) -string_with_length(s::StringWithLength) = s -Base.length(s::StringWithLength) = s.l -Base.iterate(s::StringWithLength) = iterate(s.s) -Base.iterate(s::StringWithLength, i::Integer) = iterate(s.s, i) -Base.isequal(s1::StringWithLength, s2::AbstractString) = isequal(s.s1, s2) -Base.isequal(s1::AbstractString, s2::StringWithLength) = isequal(s1, s2.s) -Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n) -Base.ncodeunits(s::StringWithLength) = ncodeunits(s.s) -Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i) -function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_dist = 0.0) - 1.0 - evaluate(dist, s1, s2; max_dist = 1.0 - min_dist) +function compare(s1::AbstractString, s2::AbstractString, dist::Jaro; min_dist = 0.0) + s1, s2 = reorder(s1, s2) + len1, len2 = length(s1), length(s2) + # http://ceur-ws.org/Vol-1317/om2014_Tpaper4.pdf formula 4 + bound = 2 / 3 + len1 / (3 * len2) + bound <= min_dist && return min_dist + max(1.0 - evaluate(dist, s1, s2), min_dist) end function compare(s1::AbstractString, s2::AbstractString, dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = 0.0) - s1 = string_with_length(s1) - s2 = string_with_length(s2) - len = max(length(s1), length(s2)) - len == 0 && return 1.0 - max_dist = ceil(Int, len * (1 - min_dist)) - max(1.0 - evaluate(dist, s1, s2; max_dist = max_dist) / len, min_dist) + s1, s2 = reorder(s1, s2) + len1, len2 = length(s1), length(s2) + len2 == 0 && return 1.0 + max_dist = ceil(Int, len2 * (1 - min_dist)) + max(1.0 - evaluate(dist, s1, s2; max_dist = max_dist) / len2, min_dist) end function compare(s1::AbstractString, s2::AbstractString, dist::AbstractQGramDistance) # When string length < q for qgram distance, returns s1 == s2 - s1 = string_with_length(s1) - s2 = string_with_length(s2) + s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) - min(len1, len2) <= (dist.q - 1) && return convert(Float64, s1 == s2) + len1 <= (dist.q - 1) && return convert(Float64, s1 == s2) if typeof(dist) <: QGram 1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2) else @@ -75,13 +65,28 @@ struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric end Winkler(x) = Winkler(x, 0.1, 0.7) -function compare(s1::AbstractString, s2::AbstractString, dist::Winkler) - score = compare(s1, s2, dist.dist) +function compare(s1::AbstractString, s2::AbstractString, dist::Winkler{Jaro}; min_dist = 0.0) + s1, s2 = reorder(s1, s2) + len1, len2 = length(s1), length(s2) l = remove_prefix(s1, s2, 4)[1] + # http://ceur-ws.org/Vol-1317/om2014_Tpaper4.pdf formula 5 + bound = 2 / 3 + len1 / (3 * len2) + l * dist.scaling_factor * (1 / 3 - len1 / (3 * len2)) + bound <= min_dist && return min_dist + score = compare(s1, s2, dist.dist) if score >= dist.boosting_threshold score += l * dist.scaling_factor * (1 - score) end - return score + return max(score, min_dist) +end + + +function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_dist = 0.0) + l = remove_prefix(s1, s2, 4)[1] + score = compare(s1, s2, dist.dist; min_dist = min_dist) + if score >= dist.boosting_threshold + score += l * dist.scaling_factor * (1 - score) + end + return max(score, min_dist) end JaroWinkler() = Winkler(Jaro(), 0.1, 0.7) @@ -102,11 +107,7 @@ struct Partial{T <: PreMetric} <: PreMetric end function compare(s1::AbstractString, s2::AbstractString, dist::Partial) - s1 = string_with_length(s1) - s2 = string_with_length(s2) - if length(s1) > length(s2) - s2, s1 = s1, s2 - end + s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len1 == len2 && return compare(s1, s2, dist.dist) len1 == 0 && return compare("", "", dist.dist) @@ -119,11 +120,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial) end function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}) - s1 = string_with_length(s1) - s2 = string_with_length(s2) - if length(s1) > length(s2) - s2, s1 = s1, s2 - end + s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len1 == len2 && return compare(s1, s2, dist.dist) out = 0.0 @@ -211,11 +208,7 @@ struct TokenMax{T <: PreMetric} <: PreMetric end function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax) - s1 = string_with_length(s1) - s2 = string_with_length(s2) - if length(s1) > length(s2) - s2, s1 = s1, s2 - end + s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) dist0 = compare(s1, s2, dist.dist) unbase_scale = 0.95 diff --git a/src/edit.jl b/src/edit.jl index 5014b92..31b1358 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -36,19 +36,12 @@ where ``m`` is the number of matching characters and struct Jaro <: SemiMetric end ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html -function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString; - max_dist = 1.0) - s1 = string_with_length(s1) - s2 = string_with_length(s2) - if length(s1) > length(s2) - s2, s1 = s1, s2 - end +function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) + s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) + maxdist = max(0, div(len2, 2) - 1) # if both are empty, m = 0 so should be 1.0 according to wikipedia. Add this line so that not the case len2 == 0 && return 0.0 - # Time-Efficient Execution of Bounded Jaro-Winkler Distances Equation (4) - 1 - (2 / 3 + len1 / (3 * len2)) >= max_dist && return max_dist - maxdist = max(0, div(len2, 2) - 1) flag = fill(false, len2) prevstate1 = firstindex(s1) i1_match = prevstate1 * ones(Int, len1) @@ -83,7 +76,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString; i1 += 1 prevstate1 = state1 end - m == 0 && return min(1.0, max_dist) + m == 0 && return 1.0 # t counts number of transpositions t = 0 i1 = 0 @@ -96,7 +89,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString; end end current = 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0 - return min(current, max_dist) + return current end ############################################################################## @@ -116,12 +109,8 @@ struct Levenshtein <: SemiMetric end ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max_dist = max(length(s1), length(s2))) - s1 = string_with_length(s1) - s2 = string_with_length(s2) - if length(s1) > length(s2) - s2, s1 = s1, s2 - end - len1, len2 = length(s1), length(s2) +s1, s2 = reorder(s1, s2) +len1, len2 = length(s1), length(s2) len2 - len1 >= max_dist && return max_dist # prefix common to both strings can be ignored k, x1, x2start = remove_prefix(s1, s2) @@ -175,11 +164,7 @@ struct DamerauLevenshtein <: SemiMetric end ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString; max_dist = max(length(s1), length(s2))) - s1 = string_with_length(s1) - s2 = string_with_length(s2) - if length(s1) > length(s2) - s2, s1 = s1, s2 - end + s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len2 - len1 >= max_dist && return max_dist # prefix common to both strings can be ignored @@ -254,10 +239,10 @@ The distance between two strings is defined as one minus the number of matching """ struct RatcliffObershelp <: PreMetric end -function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString; max_dist = 1.0) +function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString) n_matched = sum(last.(matching_blocks(s1, s2))) len1, len2 = length(s1), length(s2) - len1 + len2 == 0 ? 0 : min(1.0 - 2 * n_matched / (len1 + len2), max_dist) + len1 + len2 == 0 ? 0 : 1.0 - 2 * n_matched / (len1 + len2) end function matching_blocks(s1::AbstractString, s2::AbstractString) diff --git a/src/utils.jl b/src/utils.jl index d718c60..6a393a1 100755 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,29 @@ +# String with Length +# This allows to compute length once and only once +struct StringWithLength{T} <: AbstractString + s::T + l::Int +end +string_with_length(s::AbstractString) = StringWithLength(s, length(s)) +string_with_length(s::StringWithLength) = s +Base.length(s::StringWithLength) = s.l +Base.iterate(s::StringWithLength) = iterate(s.s) +Base.iterate(s::StringWithLength, i::Integer) = iterate(s.s, i) +Base.isequal(s1::StringWithLength, s2::AbstractString) = isequal(s.s1, s2) +Base.isequal(s1::AbstractString, s2::StringWithLength) = isequal(s1, s2.s) +Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n) +Base.ncodeunits(s::StringWithLength) = ncodeunits(s.s) +Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i) +function reorder(s1::AbstractString, s2::AbstractString) + s1 = string_with_length(s1) + s2 = string_with_length(s2) + if length(s1) > length(s2) + s2, s1 = s1, s2 + end + return s1, s2 + end + ## Find common prefixes (up to lim. -1 means Inf) function remove_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1) l = 0 diff --git a/test/distances.jl b/test/distances.jl index 1537aca..62298d7 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -99,18 +99,6 @@ for x in solutions end -for dist in (Hamming, Levenshtein, DamerauLevenshtein) - for i in eachindex(strings) - @test evaluate(dist(), strings[i]..., max_dist = 3) == min(evaluate(dist(), strings[i]...), 3) - end -end - -for i in eachindex(strings) - @test evaluate(Jaro(), strings[i]..., max_dist = 0.6) == min(evaluate(Jaro(), strings[i]...), 0.6) -end - - - for dist in (Hamming, Levenshtein, DamerauLevenshtein, Jaro) for i in eachindex(strings) @test compare(strings[i]..., dist() ; min_dist = 1/ 3) ≈ max(compare(strings[i]..., dist()), 1 / 3)