pull/17/head
matthieugomez 2019-08-20 12:43:21 -04:00
parent bdf13765d3
commit 1196915726
4 changed files with 23 additions and 4 deletions

View File

@ -100,12 +100,12 @@ evaluate(Levenshtein(), "New York", "New York")
```
## Extract
An experimental `extract` function finds the best match in an iterator of AbstractStrings:
On master, there is an experimental `extract` function that returns the best match in an iterator of AbstractStrings:
```julia
extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein())
#> "NewYork"
```
The function works for `Levenshtein`, `DamerauLevenshtein`, possibly modified by `Partial`, `TokenSort`, `TokenSet`, `TokenMax`
The function is particularly fast for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
## Which distance should I use?

View File

@ -19,6 +19,10 @@ end
# Time `f` with the Damerau-Levenshtein distance and `min_dist = 0.8`.
@time f(DamerauLevenshtein(), x, y, min_dist = 0.8)
# 0.39s. Much faster than StringDist
# Time `extract` broadcast over the first 10 elements of `x`; wrapping `y` and the
# distance in `Ref` makes broadcasting treat each of them as a single value.
@time extract.(x[1:10], Ref(y), Ref(DamerauLevenshtein()))
# Evaluate `t` on each index-aligned pair `(x[k], y[k])`, for `k` from 1 to `length(x)`,
# and collect the results into an array.
function g(t, x, y)
    return map(k -> evaluate(t, x[k], y[k]), 1:length(x))
end

View File

@ -246,7 +246,21 @@ function extract(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSo
end
return best_s2
end
function extract(::Missing, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}) where T <: Union{Levenshtein, DamerauLevenshtein}
missing
"""
    extract(s1::AbstractString, iter_s2, dist::PreMetric)

Return the element of `iter_s2` with the highest `compare(s1, s2, dist)` score.

Candidates whose score is `missing` are skipped. A candidate only replaces the current
best when its score is strictly greater, so the earliest candidate wins ties. Return
`nothing` when no candidate scores strictly above `0.0`, including when `iter_s2` is empty.
"""
function extract(s1::AbstractString, iter_s2, dist::PreMetric)
    winner = nothing
    top = 0.0
    for candidate in iter_s2
        current = compare(s1, candidate, dist)
        # A missing score carries no ranking information; move on to the next candidate.
        ismissing(current) && continue
        if current > top
            top = current
            winner = candidate
        end
    end
    return winner
end
"""
    extract(::Missing, iter_s2, dist::PreMetric)

Return `missing` when the query string is `missing`; `iter_s2` and `dist` are not used.
"""
extract(::Missing, iter_s2, dist::PreMetric) = missing

View File

@ -152,6 +152,7 @@ dist = QGram(2)
# check extract
# The closest candidate to "New York" is expected to be "NewYork" with a Levenshtein distance...
@test extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork"
# ...and with a Jaro distance.
@test extract("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == "NewYork"
# The expected best match is unchanged when the candidates include `missing`,
# whether it comes last or first in the collection.
@test extract("New York", ["NewYork", "Newark", missing], Levenshtein()) == "NewYork"
@test extract("New York", [missing, "NewYork", "Newark"], Levenshtein()) == "NewYork"