extract
parent
bdf13765d3
commit
1196915726
|
@ -100,12 +100,12 @@ evaluate(Levenshtein(), "New York", "New York")
|
|||
```
|
||||
|
||||
## Extract
|
||||
An experimental `extract` function finds the best match in an iterator of AbstractStrings:
|
||||
On master, there is an experimental `extract` function that returns the best match in an iterator of AbstractStrings:
|
||||
```julia
|
||||
extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein())
|
||||
#> "NewYork"
|
||||
```
|
||||
The function works for `Levenshtein` and `DamerauLevenshtein`, possibly modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`.
|
||||
The function is particularly fast for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
|
||||
|
||||
## Which distance should I use?
|
||||
|
||||
|
|
|
@ -19,6 +19,10 @@ end
|
|||
@time f(DamerauLevenshtein(), x, y, min_dist = 0.8)
|
||||
# 0.39s. Much faster than StringDist
|
||||
|
||||
@time extract.(x[1:10], Ref(y), Ref(DamerauLevenshtein()))
|
||||
|
||||
|
||||
|
||||
# Benchmark helper: apply `evaluate(t, x[i], y[i])` for every index `i` of `x`
# and collect the results into an array.
function g(t, x, y)
    # `eachindex(x)` rather than `1:length(x)` so the loop does not assume
    # that `x` is indexed starting at 1.
    return [evaluate(t, x[i], y[i]) for i in eachindex(x)]
end
|
||||
|
|
|
@ -246,7 +246,21 @@ function extract(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSo
|
|||
end
|
||||
return best_s2
|
||||
end
|
||||
function extract(::Missing, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}) where T <: Union{Levenshtein, DamerauLevenshtein}
|
||||
missing
|
||||
"""
    extract(s1::AbstractString, iter_s2, dist::PreMetric)

Return the element of `iter_s2` whose `compare(s1, s2, dist)` score is the highest.

Candidates whose score is `missing` are skipped. On ties the earliest candidate
is kept. Returns `nothing` when no candidate scores strictly above `0.0`
(which includes the case of an empty `iter_s2`).
"""
function extract(s1::AbstractString, iter_s2, dist::PreMetric)
    winner = nothing
    top = 0.0
    for candidate in iter_s2
        current = compare(s1, candidate, dist)
        # Skip missing scores and anything that does not strictly beat the best so far.
        (current === missing || !(current > top)) && continue
        winner = candidate
        top = current
    end
    return winner
end
|
||||
|
||||
|
||||
# A `missing` query has no best match: return `missing` without looking at `iter_s2`.
extract(::Missing, iter_s2, dist::PreMetric) = missing
|
||||
|
||||
|
|
|
@ -152,6 +152,7 @@ dist = QGram(2)
|
|||
|
||||
# check extract
|
||||
@test extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork"
|
||||
@test extract("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == "NewYork"
|
||||
@test extract("New York", ["NewYork", "Newark", missing], Levenshtein()) == "NewYork"
|
||||
@test extract("New York", [missing, "NewYork", "Newark"], Levenshtein()) == "NewYork"
|
||||
|
||||
|
|
Loading…
Reference in New Issue