add extract + handle Missing
parent
5ea65c389a
commit
5aa033bf05
|
@ -99,6 +99,13 @@ evaluate(Levenshtein(), "New York", "New York")
|
|||
#> 0
|
||||
```
|
||||
|
||||
## Extract
|
||||
An experimental `extract` function finds the best match in an iterator of AbstractStrings:
|
||||
```julia
|
||||
extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein())  #> "NewYork"
|
||||
```
|
||||
The function works for `Levenshtein` and `DamerauLevenshtein`, optionally wrapped in `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`.
|
||||
|
||||
## Which distance should I use?
|
||||
|
||||
As a rule of thumb,
|
||||
|
|
|
@ -1 +1 @@
|
|||
@time f(DamerauLevenshtein(), x, y)
|
||||
@time extract.(x[1:10], Ref(y), Ref(DamerauLevenshtein()))
|
||||
|
|
|
@ -7,15 +7,13 @@ y = map(Random.randstring, rand(5:25,500_000))
|
|||
# Benchmark helper: score each pair (x[i], y[i]) with `compare` under distance `t`,
# forwarding `min_dist` unchanged, and collect the scores in index order.
function f(t, x, y; min_dist = nothing)
    return map(1:length(x)) do i
        compare(x[i], y[i], t; min_dist = min_dist)
    end
end
|
||||
# Benchmark helper: apply `evaluate` with distance `t` to each pair (x[i], y[i]),
# forwarding `min_dist` unchanged, and collect the results in index order.
function f(t, x, y; min_dist = nothing)
    return map(1:length(x)) do i
        evaluate(t, x[i], y[i]; min_dist = min_dist)
    end
end
|
||||
|
||||
@time f(Hamming(), x, y)
|
||||
#0.1s
|
||||
@time f(Jaro(), x, y)
|
||||
|
||||
|
||||
#0.3s
|
||||
@time f(Levenshtein(), x, y)
|
||||
# 0.3s. A bit faster than StringDist
|
||||
# 0.35s. A bit faster than StringDist
|
||||
@time f(Levenshtein(), x, y, min_dist = 0.8)
|
||||
@time f(DamerauLevenshtein(), x, y)
|
||||
@time f(DamerauLevenshtein(), x, y, min_dist = 0.8)
|
||||
|
|
|
@ -26,7 +26,8 @@ Partial,
|
|||
TokenSort,
|
||||
TokenSet,
|
||||
TokenMax,
|
||||
qgram
|
||||
qgram,
|
||||
extract
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
|
|
|
@ -9,7 +9,6 @@
|
|||
|
||||
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
|
||||
"""
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString,
|
||||
dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = nothing)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
|
@ -43,6 +42,16 @@ end
|
|||
|
||||
@deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist)
|
||||
|
||||
# Handle missing values
|
||||
# A missing second string yields no score: return `missing` whatever the distance.
compare(s1::AbstractString, ::Missing, dist::PreMetric; min_dist = nothing) = missing
|
||||
# A missing first string yields no score: return `missing` whatever the distance.
compare(::Missing, s2::AbstractString, dist::PreMetric; min_dist = nothing) = missing
|
||||
# Both strings missing: return `missing` whatever the distance.
compare(::Missing, ::Missing, dist::PreMetric; min_dist = nothing) = missing
|
||||
##############################################################################
|
||||
##
|
||||
## Winkler
|
||||
|
@ -218,4 +227,26 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_dis
|
|||
ptsor * unbase_scale,
|
||||
ptser * unbase_scale)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Extract
|
||||
##
|
||||
##############################################################################
|
||||
"""
    extract(s1, iter_s2, dist)

Return the element of `iter_s2` whose `compare` score against `s1` is highest
under `dist`.

Elements for which `compare` returns `missing` are skipped. A candidate replaces
the current best only when its score is strictly greater, so the first of several
equally scored candidates is kept. Returns `nothing` when no element scores
above `0.0`.
"""
function extract(
    s1::AbstractString,
    iter_s2,
    dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}},
) where {T <: Union{Levenshtein, DamerauLevenshtein}}
    top_score = 0.0
    top_match = nothing
    for candidate in iter_s2
        # The running best score is handed to `compare` as `min_dist`.
        similarity = compare(s1, candidate, dist; min_dist = top_score)
        ismissing(similarity) && continue
        similarity > top_score || continue
        top_match, top_score = candidate, similarity
    end
    return top_match
end
|
||||
# A missing query string has no best match: return `missing` without
# looking at `iter_s2`.
function extract(
    ::Missing,
    iter_s2,
    dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}},
) where {T <: Union{Levenshtein, DamerauLevenshtein}}
    return missing
end
|
||||
|
||||
|
|
|
@ -145,3 +145,14 @@ s1 = SubString(s1, 1, 4)
|
|||
s2 = SubString(s2, 1, 4)
|
||||
dist = QGram(2)
|
||||
@test evaluate(dist, s1, s2) == 2
|
||||
|
||||
|
||||
# check missing
|
||||
@test compare(s1, missing, Levenshtein()) === missing
|
||||
|
||||
# check extract
|
||||
@test extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork"
|
||||
@test extract("New York", ["NewYork", "Newark", missing], Levenshtein()) == "NewYork"
|
||||
@test extract("New York", [missing, "NewYork", "Newark"], Levenshtein()) == "NewYork"
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue