add extract + handle Missing
parent
5ea65c389a
commit
5aa033bf05
|
@ -99,6 +99,13 @@ evaluate(Levenshtein(), "New York", "New York")
|
||||||
#> 0
|
#> 0
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Extract
|
||||||
|
An experimental `extract` function finds the best match in an iterator of `AbstractString`s:
|
||||||
|
```julia
|
||||||
|
extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein())
#> "NewYork"
|
||||||
|
```
|
||||||
|
The function works for the distances `Levenshtein` and `DamerauLevenshtein`, either used directly or wrapped in one of the modifiers `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`.
|
||||||
|
|
||||||
## Which distance should I use?
|
## Which distance should I use?
|
||||||
|
|
||||||
As a rule of thumb,
|
As a rule of thumb,
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
@time f(DamerauLevenshtein(), x, y)
|
@time extract.(x[1:10], Ref(y), Ref(DamerauLevenshtein()))
|
||||||
|
|
|
@ -7,15 +7,13 @@ y = map(Random.randstring, rand(5:25,500_000))
|
||||||
# Benchmark helper: score each pair (x[i], y[i]) with `compare` under the
# distance `t`, forwarding `min_dist`, and collect the results in index order.
function f(t, x, y; min_dist = nothing)
    return map(1:length(x)) do i
        compare(x[i], y[i], t; min_dist = min_dist)
    end
end
|
||||||
# Benchmark helper: call `evaluate` with the distance `t` on each pair
# (x[i], y[i]), forwarding `min_dist`, and collect the results in index order.
function f(t, x, y; min_dist = nothing)
    return map(1:length(x)) do i
        evaluate(t, x[i], y[i]; min_dist = min_dist)
    end
end
|
|
||||||
@time f(Hamming(), x, y)
|
@time f(Hamming(), x, y)
|
||||||
|
#0.1s
|
||||||
@time f(Jaro(), x, y)
|
@time f(Jaro(), x, y)
|
||||||
|
#0.3s
|
||||||
|
|
||||||
@time f(Levenshtein(), x, y)
|
@time f(Levenshtein(), x, y)
|
||||||
# 0.3s. A bit faster than StringDist
|
# 0.35s. A bit faster than StringDist
|
||||||
@time f(Levenshtein(), x, y, min_dist = 0.8)
|
@time f(Levenshtein(), x, y, min_dist = 0.8)
|
||||||
@time f(DamerauLevenshtein(), x, y)
|
@time f(DamerauLevenshtein(), x, y)
|
||||||
@time f(DamerauLevenshtein(), x, y, min_dist = 0.8)
|
@time f(DamerauLevenshtein(), x, y, min_dist = 0.8)
|
||||||
|
|
|
@ -26,7 +26,8 @@ Partial,
|
||||||
TokenSort,
|
TokenSort,
|
||||||
TokenSet,
|
TokenSet,
|
||||||
TokenMax,
|
TokenMax,
|
||||||
qgram
|
qgram,
|
||||||
|
extract
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
|
|
|
@ -9,7 +9,6 @@
|
||||||
|
|
||||||
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
|
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString,
|
function compare(s1::AbstractString, s2::AbstractString,
|
||||||
dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = nothing)
|
dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = nothing)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
|
@ -43,6 +42,16 @@ end
|
||||||
|
|
||||||
@deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist)
|
@deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist)
|
||||||
|
|
||||||
|
# Handle missing values
|
||||||
|
# A missing second string yields `missing`; `min_dist` is accepted but unused.
compare(s1::AbstractString, ::Missing, dist::PreMetric; min_dist = nothing) = missing
|
||||||
|
# A missing first string yields `missing`; `min_dist` is accepted but unused.
compare(::Missing, s2::AbstractString, dist::PreMetric; min_dist = nothing) = missing
|
||||||
|
# Two missing strings yield `missing`; `min_dist` is accepted but unused.
compare(::Missing, ::Missing, dist::PreMetric; min_dist = nothing) = missing
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## Winkler
|
## Winkler
|
||||||
|
@ -219,3 +228,25 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_dis
|
||||||
ptser * unbase_scale)
|
ptser * unbase_scale)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
##
|
||||||
|
## Extract
|
||||||
|
##
|
||||||
|
##############################################################################
|
||||||
|
"""
    extract(s1, iter_s2, dist)

Return the element of `iter_s2` with the highest `compare(s1, s2, dist)` score.

Elements whose score is `missing` are skipped. A score replaces the running best
(initially `0.0`) only when it is strictly greater, so on ties the earliest element
wins, and `nothing` is returned when no element scores above `0.0`. The running best
is passed to `compare` as `min_dist`.
"""
function extract(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}) where T <: Union{Levenshtein, DamerauLevenshtein}
    top_score = 0.0
    top_match = nothing
    for candidate in iter_s2
        current = compare(s1, candidate, dist; min_dist = top_score)
        # Skip candidates with a missing score or one that does not beat the best so far.
        (ismissing(current) || !(current > top_score)) && continue
        top_match = candidate
        top_score = current
    end
    return top_match
end
|
||||||
|
# A missing query string yields `missing`, whatever the candidates in `iter_s2` are.
function extract(::Missing, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}) where {T <: Union{Levenshtein, DamerauLevenshtein}}
    return missing
end
|
||||||
|
|
||||||
|
|
|
@ -145,3 +145,14 @@ s1 = SubString(s1, 1, 4)
|
||||||
s2 = SubString(s2, 1, 4)
|
s2 = SubString(s2, 1, 4)
|
||||||
dist = QGram(2)
|
dist = QGram(2)
|
||||||
@test evaluate(dist, s1, s2) == 2
|
@test evaluate(dist, s1, s2) == 2
|
||||||
|
|
||||||
|
|
||||||
|
# check missing
|
||||||
|
@test compare(s1, missing, Levenshtein()) === missing
|
||||||
|
|
||||||
|
# check extract
|
||||||
|
@test extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork"
|
||||||
|
@test extract("New York", ["NewYork", "Newark", missing], Levenshtein()) == "NewYork"
|
||||||
|
@test extract("New York", [missing, "NewYork", "Newark"], Levenshtein()) == "NewYork"
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue