findclosest

compathelper/new_version/2020-10-08-17-05-17-769-1797568811
matthieugomez 2020-09-28 14:55:18 -07:00
parent 46ae721329
commit e6898f5274
5 changed files with 19 additions and 18 deletions

View File

@ -3,6 +3,7 @@ os:
- linux - linux
julia: julia:
- 1.0 - 1.0
- 1.5
- nightly - nightly
matrix: matrix:
allow_failures: allow_failures:

View File

@ -58,9 +58,9 @@ compare("martha", "martha", Levenshtein())
### Find ### Find
- `findbest` returns the value and index of the element in `itr` with the highest similarity score with `s`. Its syntax is: - `findclosest` returns the value and index of the element in `itr` with the smallest distance to `s`. Its syntax is:
```julia ```julia
findbest(s, itr, dist::StringDistance; min_score = 0.0) findclosest(s, itr, dist::StringDistance; min_score = 0.0)
``` ```
- `findall` returns the indices of all elements in `itr` with a similarity score with `s` higher than a minimum value (defaults to 0.8). Its syntax is: - `findall` returns the indices of all elements in `itr` with a similarity score with `s` higher than a minimum value (defaults to 0.8). Its syntax is:
@ -68,7 +68,7 @@ compare("martha", "martha", Levenshtein())
findall(s, itr, dist::StringDistance; min_score = 0.8) findall(s, itr, dist::StringDistance; min_score = 0.8)
``` ```
The functions `findbest` and `findall` are particularly optimized for `Levenshtein` and `DamerauLevenshtein` distances (as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`). The functions `findclosest` and `findall` are particularly optimized for `Levenshtein` and `DamerauLevenshtein` distances (as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
## References ## References

View File

@ -54,6 +54,6 @@ compare,
result_type, result_type,
qgrams, qgrams,
normalize, normalize,
findbest findclosest
end end

View File

@ -1,7 +1,7 @@
""" """
findbest(s, itr, dist::StringDistance; min_score = 0.0) -> (x, index) findclosest(s, itr, dist::StringDistance; min_score = 0.0) -> (x, index)
`findbest` returns the value and index of the element of `itr` that has the `findclosest` returns the value and index of the element of `itr` that has the
highest similarity score with `s` according to the distance `dist`. highest similarity score with `s` according to the distance `dist`.
It returns `(nothing, nothing)` if none of the elements has a similarity score It returns `(nothing, nothing)` if none of the elements has a similarity score
higher than or equal to `min_score` (defaults to 0.0). higher than or equal to `min_score` (defaults to 0.0).
@ -14,13 +14,13 @@ It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`
julia> using StringDistances julia> using StringDistances
julia> s = "Newark" julia> s = "Newark"
julia> iter = ["New York", "Princeton", "San Francisco"] julia> iter = ["New York", "Princeton", "San Francisco"]
julia> findbest(s, iter, Levenshtein()) julia> findclosest(s, iter, Levenshtein())
("New York", 1) ("New York", 1)
julia> findbest(s, iter, Levenshtein(); min_score = 0.9) julia> findclosest(s, iter, Levenshtein(); min_score = 0.9)
(nothing, nothing) (nothing, nothing)
``` ```
""" """
function findbest(s, itr, dist::StringDistance; min_score = 0.0) function findclosest(s, itr, dist::StringDistance; min_score = 0.0)
min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score) min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score)
scores = [0.0 for _ in 1:Threads.nthreads()] scores = [0.0 for _ in 1:Threads.nthreads()]
is = [0 for _ in 1:Threads.nthreads()] is = [0 for _ in 1:Threads.nthreads()]
@ -39,8 +39,8 @@ end
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0) function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findbest(s, itr, dist; min_score)" @warn "findmax(s, itr, dist; min_score) is deprecated. Use findclosest(s, itr, dist; min_score)"
findbest(s, itr, dist; min_score = min_score) findclosest(s, itr, dist; min_score = min_score)
end end
""" """
findall(s, itr , dist::StringDistance; min_score = 0.8) findall(s, itr , dist::StringDistance; min_score = 0.8)

View File

@ -99,18 +99,18 @@ using StringDistances, Unicode, Test
end end
# check find_best and find_all # check find_best and find_all
@test findbest("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ("NewYork", 1) @test findclosest("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ("NewYork", 1)
@test findbest("New York", ["San Francisco", "NewYork", "Newark"], Levenshtein()) == ("NewYork", 2) @test findclosest("New York", ["San Francisco", "NewYork", "Newark"], Levenshtein()) == ("NewYork", 2)
@test findbest("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3) @test findclosest("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3)
@test findbest("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.99) == (nothing, nothing) @test findclosest("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.99) == (nothing, nothing)
@test findbest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1) @test findclosest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1] @test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == [1, 2] @test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == [1, 2]
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro(); min_score = 0.99) == Int[] @test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro(); min_score = 0.99) == Int[]
if VERSION >= v"1.2.0" if VERSION >= v"1.2.0"
@test findbest("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == ("NewYork", 1) @test findclosest("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == ("NewYork", 1)
@test findbest("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == (nothing, nothing) @test findclosest("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == (nothing, nothing)
@test findall("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == [1] @test findall("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == [1]
@test findall("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == [] @test findall("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == []
end end