StringDistances.jl/src/find.jl

"""
    findmax(s, itr, dist::StringDistance; min_score = 0.0) -> (x, index)

`findmax` returns the value and index of the element of `itr` that has the 
highest similarity score with `s` according to the distance `dist`. 
It returns `(nothing, nothing)` if none of the elements has a similarity score 
greater than or equal to `min_score` (defaults to 0.0).

It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances 
(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).

### Examples
```julia-repl
julia> using StringDistances
julia> s = "Newark"
julia> iter = ["New York", "Princeton", "San Francisco"]
julia> findmax(s, iter, Levenshtein())
("New York", 1)
julia> findmax(s, iter, Levenshtein(); min_score = 0.9)
(nothing, nothing)
```
"""
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
    # Materialize the indices once: this gives random access for chunking and
    # lets the bookkeeping below use plain Int positions, whatever the index
    # type of `itr` is.
    inds = collect(eachindex(itr))
    n = length(inds)
    # One contiguous chunk of positions per thread (at least one chunk, so the
    # reduction below also works when `itr` is empty).
    nchunks = max(1, min(n, Threads.nthreads()))
    # Best score seen so far by any chunk. It is stored as Float64 so that an
    # integer `min_score` (e.g. `min_score = 0`) works with `atomic_max!`.
    threshold = Threads.Atomic{Float64}(min_score)
    # Per-chunk results, indexed by chunk number rather than by
    # `Threads.threadid()`: a thread id is not a reliable buffer index
    # (tasks may migrate, and ids can exceed `nthreads()`).
    chunk_scores = fill(-Inf, nchunks)
    chunk_pos = zeros(Int, nchunks)   # 0 means "nothing found in this chunk"
    Threads.@threads for c in 1:nchunks
        lo = div((c - 1) * n, nchunks) + 1
        hi = div(c * n, nchunks)
        best_score, best_pos = -Inf, 0
        for p in lo:hi
            # The running best is passed as `min_score`; presumably this lets
            # `compare` give up early on candidates that cannot beat it.
            score = Float64(compare(s, itr[inds[p]], dist; min_score = threshold[]))
            score_old = Threads.atomic_max!(threshold, score)
            # `score >= score_old` implies `score >= min_score`. The strict `>`
            # keeps the earliest position of this chunk when scores tie.
            if score >= score_old && score > best_score
                best_score, best_pos = score, p
            end
        end
        chunk_scores[c] = best_score
        chunk_pos[c] = best_pos
    end
    # `argmax` returns the first maximum and chunks are ordered by position, so
    # ties across chunks resolve to the earliest element.
    pos = chunk_pos[argmax(chunk_scores)]
    pos == 0 && return (nothing, nothing)
    imax = inds[pos]
    return (itr[imax], imax)
end

"""
    findall(s, itr, dist::StringDistance; min_score = 0.8)
    
`findall` returns the vector of indices for elements of `itr` that have a 
similarity score greater than or equal to `min_score` according to the distance `dist`.
If there are no such elements, return an empty array. 

It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances 
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).

### Examples
```julia-repl
julia> using StringDistances
julia> s = "Newark"
julia> iter = ["Newwark", "Princeton", "San Francisco"]
julia> findall(s, iter, Levenshtein())
1-element Array{Int64,1}:
 1
julia> findall(s, iter, Levenshtein(); min_score = 0.9)
0-element Array{Int64,1}
```
"""
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
    # Materialize the indices once: this gives random access for chunking and
    # fixes the element type of the result to the index type of `itr`.
    inds = collect(eachindex(itr))
    n = length(inds)
    # One contiguous chunk of positions per thread (at least one chunk, so the
    # final reduction is well defined when `itr` is empty).
    nchunks = max(1, min(n, Threads.nthreads()))
    # One output buffer per chunk, indexed by chunk number rather than by
    # `Threads.threadid()`: a thread id is not a reliable buffer index
    # (tasks may migrate, and ids can exceed `nthreads()`), whereas each
    # chunk number is handled by exactly one loop iteration.
    out = [similar(inds, 0) for _ in 1:nchunks]
    Threads.@threads for c in 1:nchunks
        lo = div((c - 1) * n, nchunks) + 1
        hi = div(c * n, nchunks)
        hits = out[c]
        for p in lo:hi
            i = inds[p]
            score = compare(s, itr[i], dist; min_score = min_score)
            if score >= min_score
                push!(hits, i)
            end
        end
    end
    # Chunks cover increasing positions, so concatenating them in chunk order
    # yields the matches in iteration order regardless of thread scheduling.
    return reduce(vcat, out)
end
update 2019-08-20 19:21:31 +02:00			`"""`
Update find.jl 2020-04-20 20:27:03 +02:00			`findmax(s, itr, dist::StringDistance; min_score = 0.0) -> (x, index)`
update 2019-08-20 19:21:31 +02:00
remove trie 2019-12-13 00:55:41 +01:00			`findmax` returns the value and index of the element of `itr` that has the
			highest similarity score with `s` according to the distance `dist`.
			It returns `(nothing, nothing)` if none of the elements has a similarity score
Update find.jl 2019-12-13 15:32:23 +01:00			higher or equal to `min_score` (default to 0.0).
rmv datastructures + add docs 2019-12-13 16:33:06 +01:00
			It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
			(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).

			`### Examples`
			```julia-repl
			`julia> using StringDistances`
Update find.jl 2020-02-08 17:54:40 +01:00			`julia> s = "Newark"`
rmv datastructures + add docs 2019-12-13 16:33:06 +01:00			`julia> iter = ["New York", "Princeton", "San Francisco"]`
			`julia> findmax(s, iter, Levenshtein())`
			`("NewYork", 1)`
			`julia> findmax(s, iter, Levenshtein(); min_score = 0.9)`
			`(nothing, nothing)`
			```
update 2019-08-20 19:21:31 +02:00			`"""`
Update find.jl 2020-04-20 20:27:03 +02:00			`function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)`
rmv datastructures + add docs 2019-12-13 16:33:06 +01:00			`min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score)`
Update find.jl 2019-12-13 15:32:23 +01:00			`scores = [0.0 for _ in 1:Threads.nthreads()]`
return indices 2019-12-12 20:26:25 +01:00			`is = [0 for _ in 1:Threads.nthreads()]`
Update find.jl 2020-04-20 20:09:52 +02:00			`# need collect since @threads requires a length method`
Update find.jl 2020-04-20 20:08:29 +02:00			`Threads.@threads for i in collect(eachindex(itr))`
rmv datastructures + add docs 2019-12-13 16:33:06 +01:00			`score = compare(s, itr[i], dist; min_score = min_score_atomic[])`
			`score_old = Threads.atomic_max!(min_score_atomic, score)`
Update find.jl 2019-12-13 15:32:23 +01:00			`if score >= score_old`
			`scores[Threads.threadid()] = score`
return indices 2019-12-12 20:26:25 +01:00			`is[Threads.threadid()] = i`
update 2019-08-20 19:21:31 +02:00			`end`
			`end`
Update find.jl 2019-12-13 15:32:23 +01:00			`imax = is[argmax(scores)]`
Update find.jl 2019-12-13 15:15:39 +01:00			`imax == 0 ? (nothing, nothing) : (itr[imax], imax)`
update 2019-08-20 19:21:31 +02:00			`end`

			`"""`
Update find.jl 2020-04-20 20:27:03 +02:00			`findall(s, itr , dist::StringDistance; min_score = 0.8)`
remove trie 2019-12-13 00:55:41 +01:00
			`findall` returns the vector of indices for elements of `itr` that have a
Update find.jl 2019-12-13 15:32:23 +01:00			similarity score higher or equal than `min_score` according to the distance `dist`.
			`If there are no such elements, return an empty array.`
rmv datastructures + add docs 2019-12-13 16:33:06 +01:00
			It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
Update find.jl 2019-12-13 15:32:23 +01:00			(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
rmv datastructures + add docs 2019-12-13 16:33:06 +01:00
			`### Examples`
			```julia-repl
			`julia> using StringDistances`
			`julia> s = "Newark"`
			`julia> iter = ["Newwark", "Princeton", "San Francisco"]`
			`julia> findall(s, iter, Levenshtein())`
			`1-element Array{Int64,1}:`
			`1`
			`julia> findall(s, iter, Levenshtein(); min_score = 0.9)`
			`0-element Array{Int64,1}`
			```
update 2019-08-20 19:21:31 +02:00			`"""`
Update find.jl 2020-04-20 20:27:03 +02:00			`function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)`
return indices 2019-12-12 20:26:25 +01:00			`out = [Int[] for _ in 1:Threads.nthreads()]`
Update find.jl 2020-04-20 20:09:52 +02:00			`# need collect since @threads requires a length method`
Update find.jl 2020-04-20 20:08:29 +02:00			`Threads.@threads for i in collect(eachindex(itr))`
allow skipmissing iterator 2019-12-12 22:49:20 +01:00			`score = compare(s, itr[i], dist; min_score = min_score)`
parellelize find functions 2019-12-12 19:21:36 +01:00			`if score >= min_score`
return indices 2019-12-12 20:26:25 +01:00			`push!(out[Threads.threadid()], i)`
parellelize find functions 2019-12-12 19:21:36 +01:00			`end`
			`end`
return indices 2019-12-12 20:26:25 +01:00			`vcat(out...)`
update 2019-08-20 19:21:31 +02:00			`end`