From 5aa033bf05d40befd6161134324eca223d4809e3 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Tue, 20 Aug 2019 12:32:52 -0400 Subject: [PATCH] add extract + handle Missing --- README.md | 7 +++++++ benchmark/.sublime2Terminal.jl | 2 +- benchmark/benchmark.jl | 10 ++++------ src/StringDistances.jl | 3 ++- src/compare.jl | 35 ++++++++++++++++++++++++++++++++-- test/modifiers.jl | 11 +++++++++++ 6 files changed, 58 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 7601cf6..e2a8dd8 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,13 @@ evaluate(Levenshtein(), "New York", "New York") #> 0 ``` +## Extract +An experimental `extract` function finds the best match in an iterator of AbstractStrings: +```julia +extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) = "NewYork" +``` +The function works for `Levenshtein`, `DamerauLevenshtein`, possibly modified by `Partial`, `TokenSort`, `TokenSet`, `TokenMax` + ## Which distance should I use? As a rule of thumb, diff --git a/benchmark/.sublime2Terminal.jl b/benchmark/.sublime2Terminal.jl index 79ead95..d220e56 100644 --- a/benchmark/.sublime2Terminal.jl +++ b/benchmark/.sublime2Terminal.jl @@ -1 +1 @@ -@time f(DamerauLevenshtein(), x, y) +@time extract.(x[1:10], Ref(y), Ref(DamerauLevenshtein())) diff --git a/benchmark/benchmark.jl b/benchmark/benchmark.jl index ccb7428..11ace76 100644 --- a/benchmark/benchmark.jl +++ b/benchmark/benchmark.jl @@ -7,15 +7,13 @@ y = map(Random.randstring, rand(5:25,500_000)) function f(t, x, y; min_dist = nothing) [compare(x[i], y[i], t; min_dist = min_dist) for i in 1:length(x)] end -function f(t, x, y; min_dist = nothing) - [evaluate(t, x[i], y[i]; min_dist = min_dist) for i in 1:length(x)] -end + @time f(Hamming(), x, y) +#0.1s @time f(Jaro(), x, y) - - +#0.3s @time f(Levenshtein(), x, y) -# 0.35s. 
A bit faster than StringDist @time f(Levenshtein(), x, y, min_dist = 0.8) @time f(DamerauLevenshtein(), x, y) @time f(DamerauLevenshtein(), x, y, min_dist = 0.8) diff --git a/src/StringDistances.jl b/src/StringDistances.jl index ca1041b..7b7d92e 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -26,7 +26,8 @@ Partial, TokenSort, TokenSet, TokenMax, -qgram +qgram, +extract ############################################################################## ## diff --git a/src/compare.jl b/src/compare.jl index 19a2f12..653f856 100755 --- a/src/compare.jl +++ b/src/compare.jl @@ -9,7 +9,6 @@ compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist` """ - function compare(s1::AbstractString, s2::AbstractString, dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = nothing) s1, s2 = reorder(s1, s2) @@ -43,6 +42,16 @@ end @deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist) +# Handle missing values +function compare(s1::AbstractString, ::Missing, dist::PreMetric; min_dist = nothing) + missing +end +function compare(::Missing, s2::AbstractString, dist::PreMetric; min_dist = nothing) + missing +end +function compare(::Missing, ::Missing, dist::PreMetric; min_dist = nothing) + missing +end ############################################################################## ## ## Winkler @@ -218,4 +227,26 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_dis ptsor * unbase_scale, ptser * unbase_scale) end -end \ No newline at end of file +end + +############################################################################## +## +## Extract +## +############################################################################## +function extract(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}) where T <: Union{Levenshtein, DamerauLevenshtein} + best_score = 0.0 + best_s2 = nothing + for s2 in iter_s2 + 
score = compare(s1, s2, dist; min_dist = best_score) + if (score !== missing) && (score > best_score) + best_s2 = s2 + best_score = score + end + end + return best_s2 +end +function extract(::Missing, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}) where T <: Union{Levenshtein, DamerauLevenshtein} + missing +end + diff --git a/test/modifiers.jl b/test/modifiers.jl index 7108b50..b90264b 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -145,3 +145,14 @@ s1 = SubString(s1, 1, 4) s2 = SubString(s2, 1, 4) dist = QGram(2) @test evaluate(dist, s1, s2) == 2 + + +# check missing +@test compare(s1, missing, Levenshtein()) === missing + +# check extract +@test extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork" +@test extract("New York", ["NewYork", "Newark", missing], Levenshtein()) == "NewYork" +@test extract("New York", [missing, "NewYork", "Newark"], Levenshtein()) == "NewYork" + +