findclosest
parent
46ae721329
commit
e6898f5274
|
@ -3,6 +3,7 @@ os:
|
||||||
- linux
|
- linux
|
||||||
julia:
|
julia:
|
||||||
- 1.0
|
- 1.0
|
||||||
|
- 1.5
|
||||||
- nightly
|
- nightly
|
||||||
matrix:
|
matrix:
|
||||||
allow_failures:
|
allow_failures:
|
||||||
|
|
|
@ -58,9 +58,9 @@ compare("martha", "martha", Levenshtein())
|
||||||
|
|
||||||
|
|
||||||
### Find
|
### Find
|
||||||
- `findbest` returns the value and index of the element in `itr` with the highest similarity score to `s`. Its syntax is:
|
- `findclosest` returns the value and index of the element in `itr` with the lowest distance to `s`. Its syntax is:
|
||||||
```julia
|
```julia
|
||||||
findbest(s, itr, dist::StringDistance; min_score = 0.0)
|
findclosest(s, itr, dist::StringDistance; min_score = 0.0)
|
||||||
```
|
```
|
||||||
|
|
||||||
- `findall` returns the indices of all elements in `itr` whose similarity score with `s` is higher than a minimum value (defaults to 0.8). Its syntax is:
|
- `findall` returns the indices of all elements in `itr` whose similarity score with `s` is higher than a minimum value (defaults to 0.8). Its syntax is:
|
||||||
|
@ -68,7 +68,7 @@ compare("martha", "martha", Levenshtein())
|
||||||
findall(s, itr, dist::StringDistance; min_score = 0.8)
|
findall(s, itr, dist::StringDistance; min_score = 0.8)
|
||||||
```
|
```
|
||||||
|
|
||||||
The functions `findbest` and `findall` are particularly optimized for `Levenshtein` and `DamerauLevenshtein` distances (as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
|
The functions `findclosest` and `findall` are particularly optimized for `Levenshtein` and `DamerauLevenshtein` distances (as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
|
||||||
|
|
||||||
|
|
||||||
## References
|
## References
|
||||||
|
|
|
@ -54,6 +54,6 @@ compare,
|
||||||
result_type,
|
result_type,
|
||||||
qgrams,
|
qgrams,
|
||||||
normalize,
|
normalize,
|
||||||
findbest
|
findclosest
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
14
src/find.jl
14
src/find.jl
|
@ -1,7 +1,7 @@
|
||||||
"""
|
"""
|
||||||
findbest(s, itr, dist::StringDistance; min_score = 0.0) -> (x, index)
|
findclosest(s, itr, dist::StringDistance; min_score = 0.0) -> (x, index)
|
||||||
|
|
||||||
`findbest` returns the value and index of the element of `itr` that has the
|
`findclosest` returns the value and index of the element of `itr` that has the
|
||||||
highest similarity score with `s` according to the distance `dist`.
|
highest similarity score with `s` according to the distance `dist`.
|
||||||
It returns `(nothing, nothing)` if none of the elements has a similarity score
|
It returns `(nothing, nothing)` if none of the elements has a similarity score
|
||||||
higher than or equal to `min_score` (defaults to 0.0).
|
higher than or equal to `min_score` (defaults to 0.0).
|
||||||
|
@ -14,13 +14,13 @@ It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`
|
||||||
julia> using StringDistances
|
julia> using StringDistances
|
||||||
julia> s = "Newark"
|
julia> s = "Newark"
|
||||||
julia> iter = ["New York", "Princeton", "San Francisco"]
|
julia> iter = ["New York", "Princeton", "San Francisco"]
|
||||||
julia> findbest(s, iter, Levenshtein())
|
julia> findclosest(s, iter, Levenshtein())
|
||||||
("New York", 1)
|
("New York", 1)
|
||||||
julia> findbest(s, iter, Levenshtein(); min_score = 0.9)
|
julia> findclosest(s, iter, Levenshtein(); min_score = 0.9)
|
||||||
(nothing, nothing)
|
(nothing, nothing)
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
function findbest(s, itr, dist::StringDistance; min_score = 0.0)
|
function findclosest(s, itr, dist::StringDistance; min_score = 0.0)
|
||||||
min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score)
|
min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score)
|
||||||
scores = [0.0 for _ in 1:Threads.nthreads()]
|
scores = [0.0 for _ in 1:Threads.nthreads()]
|
||||||
is = [0 for _ in 1:Threads.nthreads()]
|
is = [0 for _ in 1:Threads.nthreads()]
|
||||||
|
@ -39,8 +39,8 @@ end
|
||||||
|
|
||||||
|
|
||||||
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
|
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
|
||||||
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findbest(s, itr, dist; min_score)"
|
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findclosest(s, itr, dist; min_score)"
|
||||||
findbest(s, itr, dist; min_score = min_score)
|
findclosest(s, itr, dist; min_score = min_score)
|
||||||
end
|
end
|
||||||
"""
|
"""
|
||||||
findall(s, itr , dist::StringDistance; min_score = 0.8)
|
findall(s, itr , dist::StringDistance; min_score = 0.8)
|
||||||
|
|
|
@ -99,18 +99,18 @@ using StringDistances, Unicode, Test
|
||||||
end
|
end
|
||||||
|
|
||||||
# check findbest and findall
|
# check findclosest and findall
|
||||||
@test findbest("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ("NewYork", 1)
|
@test findclosest("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ("NewYork", 1)
|
||||||
@test findbest("New York", ["San Francisco", "NewYork", "Newark"], Levenshtein()) == ("NewYork", 2)
|
@test findclosest("New York", ["San Francisco", "NewYork", "Newark"], Levenshtein()) == ("NewYork", 2)
|
||||||
@test findbest("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3)
|
@test findclosest("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3)
|
||||||
|
|
||||||
@test findbest("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.99) == (nothing, nothing)
|
@test findclosest("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.99) == (nothing, nothing)
|
||||||
@test findbest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
|
@test findclosest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
|
||||||
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]
|
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]
|
||||||
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == [1, 2]
|
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == [1, 2]
|
||||||
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro(); min_score = 0.99) == Int[]
|
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro(); min_score = 0.99) == Int[]
|
||||||
if VERSION >= v"1.2.0"
|
if VERSION >= v"1.2.0"
|
||||||
@test findbest("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == ("NewYork", 1)
|
@test findclosest("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == ("NewYork", 1)
|
||||||
@test findbest("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == (nothing, nothing)
|
@test findclosest("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == (nothing, nothing)
|
||||||
@test findall("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == [1]
|
@test findall("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == [1]
|
||||||
@test findall("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == []
|
@test findall("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == []
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue