allow skipmissing iterator
parent
de44cc2e41
commit
164448f5d7
|
@ -1,3 +1,3 @@
|
|||
benchmark/benchmark.md
|
||||
benchmark/
|
||||
PC25
|
||||
Manifest.toml
|
||||
|
|
10
README.md
10
README.md
|
@ -46,17 +46,17 @@ compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet
|
|||
compare("mariners vs angels", "los angeles angels at seattle mariners", TokenMax(RatcliffObershelp()))
|
||||
```
|
||||
|
||||
A good distance to link adresses etc (where the word order does not matter) is `TokenMax(Levenshtein())`
|
||||
In case the word order does not matter, a good distance is `TokenMax(Levenshtein())`
|
||||
|
||||
## Find
|
||||
- `findmax` returns the value and index of the element in `iter` with the highest similarity score with `x`. Its syntax is:
|
||||
- `findmax` returns the value and index of the element in `itr` with the highest similarity score with `x`. Its syntax is:
|
||||
```julia
|
||||
findmax(x::AbstractString, iter::AbstractString, dist::StringDistance; min_score = 0.0)
|
||||
findmax(x::AbstractString, itr, dist::StringDistance; min_score = 0.0)
|
||||
```
|
||||
|
||||
- `findall` returns the indices of all elements in `iter` with a similarity score with `x` higher than a minimum value (default to 0.8). Its syntax is:
|
||||
- `findall` returns the indices of all elements in `itr` with a similarity score with `x` higher than a minimum value (defaults to 0.8). Its syntax is:
|
||||
```julia
|
||||
findall(x::AbstractString, iter::AbstractVector, dist::StringDistance; min_score = 0.8)
|
||||
findall(x::AbstractString, itr, dist::StringDistance; min_score = 0.8)
|
||||
```
|
||||
|
||||
The functions `findmax` and `findall` are particularly optimized for `Levenshtein` and `DamerauLevenshtein` distances (as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
|
||||
|
||||
|
||||
|
||||
# check
|
||||
function h(t, x, y; min_score = 1/3)
|
||||
out = fill(false, length(x))
|
||||
for i in eachindex(x)
|
||||
if compare(x[i], y[i], t) < min_score
|
||||
out[i] = compare(x[i], y[i], t ; min_score = min_score) ≈ 0.0
|
||||
else
|
||||
out[i] = compare(x[i], y[i], t ; min_score = min_score) ≈ compare(x[i], y[i], t)
|
||||
end
|
||||
end
|
||||
all(out)
|
||||
end
|
||||
h(Levenshtein(), x, y)
|
||||
h(DamerauLevenshtein(), x, y)
|
38
src/find.jl
38
src/find.jl
|
@ -1,40 +1,38 @@
|
|||
"""
|
||||
findmax(s::AbstractString, iter::AbstractVector, dist::StringDistance; min_score = 0.0)
|
||||
findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0)
|
||||
|
||||
`findmax` returns the value and index of the element of `iter` that has the highest similarity score with `s` according to the distance `dist`.
|
||||
`findmax` returns the value and index of the element of `itr` that has the highest similarity score with `s` according to the distance `dist`.
|
||||
It returns `(nothing, nothing)` if none of the elements has a similarity score higher than or equal to `min_score` (defaults to 0.0).
|
||||
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
|
||||
"""
|
||||
function Base.findmax(s::AbstractString, iter::AbstractVector, dist::StringDistance; min_score = 0.0)
|
||||
min_score >= 0 || throw("min_score should be positive")
|
||||
function Base.findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0)
|
||||
vmin = Threads.Atomic{typeof(min_score)}(min_score)
|
||||
vs = [0.0 for _ in 1:Threads.nthreads()]
|
||||
xs = eltype(itr)["" for _ in 1:Threads.nthreads()]
|
||||
is = [0 for _ in 1:Threads.nthreads()]
|
||||
xs = eltype(iter)["" for _ in 1:Threads.nthreads()]
|
||||
scores = [-1.0 for _ in 1:Threads.nthreads()]
|
||||
min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score)
|
||||
Threads.@threads for i in 1:length(iter)
|
||||
score = compare(s, iter[i], dist; min_score = min_score_atomic[])
|
||||
min_score_atomic_old = Threads.atomic_max!(min_score_atomic, score)
|
||||
if score >= min_score_atomic_old
|
||||
score == 1.0 && return i
|
||||
Threads.@threads for i in (itr isa Base.SkipMissing ? collect(keys(itr)) : keys(itr))
|
||||
v = compare(s, itr[i], dist; min_score = vmin[])
|
||||
v_old = Threads.atomic_max!(vmin, v)
|
||||
if v >= v_old
|
||||
vs[Threads.threadid()] = v
|
||||
xs[Threads.threadid()] = itr[i]
|
||||
is[Threads.threadid()] = i
|
||||
xs[Threads.threadid()] = iter[i]
|
||||
scores[Threads.threadid()] = score
|
||||
end
|
||||
end
|
||||
i = argmax(scores)
|
||||
i = argmax(vs)
|
||||
is[i] == 0 ? (nothing, nothing) : (xs[i], is[i])
|
||||
end
|
||||
|
||||
|
||||
"""
|
||||
findall(s::AbstractString, iter::AbstractVector, dist::StringDistance; min_score = 0.8)
|
||||
`findall` returns the vector of indices for elements of `iter` that have a similarity score higher or equal than `min_score` according to the distance `dist`.
|
||||
findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8)
|
||||
`findall` returns the vector of indices for elements of `itr` that have a similarity score higher than or equal to `min_score` according to the distance `dist`.
|
||||
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
|
||||
"""
|
||||
function Base.findall(s::AbstractString, iter::AbstractVector, dist::StringDistance; min_score = 0.8)
|
||||
function Base.findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8)
|
||||
out = [Int[] for _ in 1:Threads.nthreads()]
|
||||
Threads.@threads for i in 1:length(iter)
|
||||
score = compare(s, iter[i], dist; min_score = min_score)
|
||||
Threads.@threads for i in (itr isa Base.SkipMissing ? collect(keys(itr)) : keys(itr))
|
||||
score = compare(s, itr[i], dist; min_score = min_score)
|
||||
if score >= min_score
|
||||
push!(out[Threads.threadid()], i)
|
||||
end
|
||||
|
|
|
@ -102,6 +102,12 @@ using StringDistances, Test
|
|||
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]
|
||||
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == [1, 2]
|
||||
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro(); min_score = 0.99) == Int[]
|
||||
if VERSION >= v"1.2.0"
|
||||
@test findmax("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == ("NewYork", 1)
|
||||
@test findmax("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == (nothing, nothing)
|
||||
@test findall("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == [1]
|
||||
@test findall("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == []
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue