add extract + handle Missing

pull/17/head
matthieugomez 2019-08-20 12:32:52 -04:00
parent 5ea65c389a
commit 5aa033bf05
6 changed files with 58 additions and 10 deletions

View File

@ -99,6 +99,13 @@ evaluate(Levenshtein(), "New York", "New York")
#> 0
```
## Extract
An experimental `extract` function finds the best match in an iterator of AbstractStrings:
```julia
extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein())
#> "NewYork"
```
The function works for `Levenshtein` and `DamerauLevenshtein`, either used directly or wrapped in `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`.
## Which distance should I use?
As a rule of thumb,

View File

@ -1 +1 @@
# Time the helper f with the DamerauLevenshtein distance on x and y.
@time f(DamerauLevenshtein(), x, y)
# Time extract broadcast over the first 10 elements of x; wrapping y and the
# distance in Ref keeps them from being broadcast over.
@time extract.(x[1:10], Ref(y), Ref(DamerauLevenshtein()))

View File

@ -7,15 +7,13 @@ y = map(Random.randstring, rand(5:25,500_000))
# Benchmark helper: call `compare(x[i], y[i], t)` for every index i of x,
# forwarding `min_dist`, and collect the results in an array.
# `eachindex(x)` is used instead of `1:length(x)` so the loop follows x's own
# indices rather than assuming they start at 1.
function f(t, x, y; min_dist = nothing)
    return [compare(x[i], y[i], t; min_dist = min_dist) for i in eachindex(x)]
end
# Benchmark helper: call `evaluate(t, x[i], y[i])` for every index i of x,
# forwarding `min_dist`, and collect the results in an array.
# `eachindex(x)` is used instead of `1:length(x)` so the loop follows x's own
# indices rather than assuming they start at 1.
function f(t, x, y; min_dist = nothing)
    return [evaluate(t, x[i], y[i]; min_dist = min_dist) for i in eachindex(x)]
end
# Time f under each distance. The comments after a call are presumably the
# timings the author observed on their machine.
@time f(Hamming(), x, y)
#0.1s
@time f(Jaro(), x, y)
#0.3s
@time f(Levenshtein(), x, y)
# 0.3s. A bit faster than StringDist
# 0.35s. A bit faster than StringDist
# Same distance again, this time passing min_dist = 0.8 through f.
@time f(Levenshtein(), x, y, min_dist = 0.8)
@time f(DamerauLevenshtein(), x, y)
# Same distance again, this time passing min_dist = 0.8 through f.
@time f(DamerauLevenshtein(), x, y, min_dist = 0.8)

View File

@ -26,7 +26,8 @@ Partial,
TokenSort,
TokenSet,
TokenMax,
qgram
qgram,
extract
##############################################################################
##

View File

@ -9,7 +9,6 @@
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
"""
function compare(s1::AbstractString, s2::AbstractString,
dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = nothing)
s1, s2 = reorder(s1, s2)
@ -43,6 +42,16 @@ end
# Deprecated argument order with the distance first; it forwards to
# compare(s1, s2, dist).
@deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist)
# Handle missing values
# A missing second string gives a missing score; `dist` and `min_dist` are
# accepted but not used.
compare(s1::AbstractString, ::Missing, dist::PreMetric; min_dist = nothing) = missing
# A missing first string gives a missing score; `dist` and `min_dist` are
# accepted but not used.
compare(::Missing, s2::AbstractString, dist::PreMetric; min_dist = nothing) = missing
# Two missing strings give a missing score; `dist` and `min_dist` are
# accepted but not used.
compare(::Missing, ::Missing, dist::PreMetric; min_dist = nothing) = missing
##############################################################################
##
## Winkler
@ -218,4 +227,26 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_dis
ptsor * unbase_scale,
ptser * unbase_scale)
end
end
end
##############################################################################
##
## Extract
##
##############################################################################
"""
    extract(s1, iter_s2, dist)

Return the element of `iter_s2` whose `compare` score against `s1` under `dist` is
the highest.

Elements for which `compare` returns `missing` are skipped. A candidate only replaces
the current best when its score is strictly greater, so on ties the earliest element
wins, and `nothing` is returned when no element scores above `0.0`. The best score
found so far is passed to `compare` as `min_dist`.
"""
function extract(
    s1::AbstractString,
    iter_s2,
    dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}
) where {T <: Union{Levenshtein, DamerauLevenshtein}}
    winner = nothing
    threshold = 0.0
    for candidate in iter_s2
        similarity = compare(s1, candidate, dist; min_dist = threshold)
        # A missing score means the candidate cannot be ranked.
        ismissing(similarity) && continue
        if similarity > threshold
            winner, threshold = candidate, similarity
        end
    end
    return winner
end
# A missing query has no best match: return `missing` without iterating `iter_s2`.
function extract(
    ::Missing,
    iter_s2,
    dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}
) where {T <: Union{Levenshtein, DamerauLevenshtein}}
    return missing
end

View File

@ -145,3 +145,14 @@ s1 = SubString(s1, 1, 4)
s2 = SubString(s2, 1, 4)
dist = QGram(2)
@test evaluate(dist, s1, s2) == 2
# check missing: compare is expected to return missing when the second string is missing
@test compare(s1, missing, Levenshtein()) === missing
# check extract: the closest candidate is expected back, and a missing entry in the
# candidates (at the end or at the start) should not change the result
@test extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork"
@test extract("New York", ["NewYork", "Newark", missing], Levenshtein()) == "NewYork"
@test extract("New York", [missing, "NewYork", "Newark"], Levenshtein()) == "NewYork"