re-organize and default to _preprocess = true

pull/57/head
matthieugomez 2021-09-11 10:04:03 -04:00
parent 7dfd864d63
commit 69491d9f78
6 changed files with 78 additions and 84 deletions

View File

@ -14,8 +14,8 @@ The available distances are:
- Hamming Distance `Hamming()`
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
- [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
- [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
- Q-gram distances compare the set of all substrings of length `q` in each string.
- QGram Distance `Qgram(q::Int)`
@ -28,7 +28,7 @@ The available distances are:
## Basic Use
### evaluate
### distance
You can always compute a certain distance between two strings using the following syntax:
```julia
@ -43,8 +43,15 @@ evaluate(Levenshtein(), "martha", "marhta")
Levenshtein()("martha", "marhta")
```
In contrast, the function `compare` returns the similarity score, defined as 1 minus the normalized distance between two strings. It always returns an element of type `Float64`. A value of 0.0 means completely different and a value of 1.0 means completely similar.
```julia
compare("martha", "martha", Levenshtein())
#> 1.0
```
### pairwise
`pairwise` returns the matrix of distance between two `AbstractVectors` of AbstractStrings
`pairwise` returns the matrix of distances between two `AbstractVector`s of `AbstractString`s (or iterators):
```julia
pairwise(Jaccard(3), ["martha", "kitten"], ["marhta", "sitting"])
@ -52,16 +59,17 @@ pairwise(Jaccard(3), ["martha", "kitten"], ["marhta", "sitting"])
The function `pairwise` is particularly optimized for QGram-distances (each element is processed only once).
### distance modifiers
The package also defines distance "modifiers", modeled on those of the Python package [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/). These modifiers are particularly helpful to match strings composed of multiple words.
### similarity score
- The function `compare` returns the similarity score, defined as 1 minus the normalized distance between two strings. It always returns a Float64. A value of 0.0 means completely different and a value of 1.0 means completely similar.
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the distance between the shorter string and substrings of the longer string.
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the distance of the two strings, after re-ordering words alphabetically.
- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by returning the distance between the intersection of two strings with each string.
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) normalizes the distance, and combines the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. This is a good distance to match strings composed of multiple words, like addresses. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/).
```julia
Levenshtein()("martha", "martha")
#> 0.0
compare("martha", "martha", Levenshtein())
#> 1.0
```
### find
The package also adds some convenience functions to find the elements in a list that are closest to a given string:
- `findnearest` returns the value and index of the element in `itr` with the highest similarity score with `s`. Its syntax is:
```julia
@ -76,17 +84,4 @@ The function `pairwise` is particularly optimized for QGram-distances (each elem
The functions `findnearest` and `findall` are particularly optimized for the `Levenshtein` and `OptimalStringAlignement` distances (these distances stop early if the distance is higher than a certain threshold).
### distance modifiers
The package also defines Distance "modifiers" that can be applied to any distance.
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the distance between the shorter string and substrings of the longer string.
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the distance of the two strings, after re-ordering words alphabetically.
- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by returning the distance between the intersection of two strings with each string.
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) normalizes the distance, and combines the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. This is a good distance to match strings composed of multiple words, like addresses. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/).
## References
- [The stringdist Package for Approximate String Matching](https://journal.r-project.org/archive/2014-1/loo.pdf) Mark P.J. van der Loo
- [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)

View File

@ -9,11 +9,32 @@ include("distances/qgram.jl")
include("normalize.jl")
include("fuzzywuzzy.jl")
const StringDistance = Union{Hamming, Jaro, JaroWinkler, Levenshtein, OptimalStringAlignement, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Normalized, Partial, TokenSort, TokenSet, TokenMax}
include("compare.jl")
"""
    compare(s1, s2, dist::StringDistance; min_score = 0.0)

Return the similarity score between `s1` and `s2` under the distance `dist`,
computed as one minus the normalized distance. The score lies between 0 and 1.

# Keywords
- `min_score`: forwarded to `normalize` as `max_dist = 1 - min_score`
  (presumably lets the normalized distance give up early on pairs that cannot
  reach `min_score` -- confirm against `Normalized`).

# Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
    normalized = normalize(dist; max_dist = 1 - min_score)
    return 1 - normalized(s1, s2)
end
include("find.jl")
include("pairwise.jl")
# Distances API
Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
# Result type of `dist` for arguments of types `s1` and `s2`: the type of
# `dist("", "")`, widened with `Missing` when either argument type admits `missing`.
function Distances.result_type(dist::StringDistance, s1::Type, s2::Type)
    T = typeof(dist("", ""))
    return (Missing <: s1 || Missing <: s2) ? Union{T, Missing} : T
end

# Value-based entry point: defer to the type-based method above.
function Distances.result_type(dist::StringDistance, s1, s2)
    return result_type(dist, typeof(s1), typeof(s2))
end

View File

@ -192,9 +192,7 @@ Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)
#q-grams of AbstractVector
# Alternatively, I could also use partition in IterTools but it creates a vector for each iteration
# so it does not seem to be worth it.
# q-grams of General Iterators
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
state + qgram.q - 1 > lastindex(qgram.s) && return nothing
view(qgram.s, state:(state + qgram.q - 1)), state + 1

View File

@ -1,19 +1,3 @@
"""
compare(s1, s2, dist)
return a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the distance `dist`.
### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
end
"""
findnearest(s, itr, dist::StringDistance) -> (x, index)
@ -38,10 +22,10 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
min_score_atomic = Threads.Atomic{Float64}(min_score)
scores = [0.0 for _ in 1:Threads.nthreads()]
is = [0 for _ in 1:Threads.nthreads()]
s = _helper(dist, s)
s = _preprocess(dist, s)
# need collect since @threads requires a length method
Threads.@threads for i in collect(eachindex(itr))
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
score = compare(s, _preprocess(dist, itr[i]), dist; min_score = min_score_atomic[])
score_old = Threads.atomic_max!(min_score_atomic, score)
if score >= score_old
scores[Threads.threadid()] = score
@ -51,9 +35,9 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
imax = is[argmax(scores)]
imax == 0 ? (nothing, nothing) : (itr[imax], imax)
end
_helper(dist::AbstractQGramDistance, ::Missing) = missing
_helper(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
_helper(dist::StringDistance, s) = s
# Per-element preprocessing applied once before repeated comparisons.
# q-gram distances: `missing` passes through untouched ...
_preprocess(::AbstractQGramDistance, ::Missing) = missing
# ... and any other value is wrapped in a `QGramSortedVector` built with the distance's `q`.
_preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
# Every other distance works on the raw input.
_preprocess(::StringDistance, s) = s
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
@ -84,10 +68,10 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9)
"""
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
out = [Int[] for _ in 1:Threads.nthreads()]
s = _helper(dist, s)
s = _preprocess(dist, s)
# need collect since @threads requires a length method
Threads.@threads for i in collect(eachindex(itr))
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
score = compare(s, _preprocess(dist, itr[i]), dist; min_score = min_score)
if score >= min_score
push!(out[Threads.threadid()], i)
end

View File

@ -60,4 +60,5 @@ julia> StringDistances.normalize(Levenshtein())(s1, s2)
```
"""
normalize(dist::SemiMetric; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)

View File

@ -1,4 +1,3 @@
"""
pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
@ -26,11 +25,8 @@ julia> pairwise(Levenshtein(), iter, iter2)
10.0
```
"""
function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
T = result_type(dist, eltype(xs), eltype(ys))
if Missing <: Union{eltype(xs), eltype(ys)}
T = Union{T, Missing}
end
R = Matrix{T}(undef, length(xs), length(ys))
pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
@ -45,7 +41,7 @@ For AbstractQGramDistances preprocessing will be used either if `preprocess` is
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
false if no preprocessing should be used, regardless of length.
"""
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
((xs === ys) & (dist isa SemiMetric)) ?
@ -53,36 +49,35 @@ function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector,
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
end
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
objs = _preprocess(xs, dist, preprocess)
for i in 1:length(objs)
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = true)
if preprocess
xs = _preprocess_list(dist, xs)
end
for i in 1:length(xs)
# handle missing
R[i, i] = objs[i] != objs[i]
Threads.@threads for j in (i+1):length(objs)
R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
R[i, i] = xs[i] != xs[i]
Threads.@threads for j in (i+1):length(xs)
R[i, j] = R[j, i] = evaluate(dist, xs[i], xs[j])
end
end
return R
end
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
objsxs = _preprocess(xs, dist, preprocess)
objsys = xs === ys ? objsxs : _preprocess(ys, dist, preprocess)
for i in 1:length(objsxs)
Threads.@threads for j in 1:length(objsys)
R[i, j] = evaluate(dist, objsxs[i], objsys[j])
end
end
return R
end
function _preprocess(xs, dist::StringDistance, preprocess)
if preprocess === nothing
preprocess = length(xs) >= 5
end
if (dist isa AbstractQGramDistance) && preprocess
return fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = true)
if preprocess
objxs = _preprocess_list(dist, xs)
objys = xs === ys ? objxs : _preprocess_list(dist, ys)
else
return xs
objxs = xs
objys = ys
end
end
for i in 1:length(objxs)
Threads.@threads for j in 1:length(objys)
R[i, j] = evaluate(dist, objxs[i], objys[j])
end
end
return R
end
# Preprocess a whole collection before the pairwise loops.
# Generic distances need no preprocessing: hand the collection back as is.
_preprocess_list(::StringDistance, xs) = xs

# q-gram distances: build one `QGramSortedVector` per non-missing element, each in its
# own spawned task, then collect the results in the original order.
function _preprocess_list(dist::AbstractQGramDistance, xs)
    tasks = map(xs) do x
        Threads.@spawn ismissing(x) ? x : QGramSortedVector(x, dist.q)
    end
    return fetch.(tasks)
end