re-organize and default to _preprocess = true
parent
7dfd864d63
commit
69491d9f78
43
README.md
43
README.md
|
@ -14,8 +14,8 @@ The available distances are:
|
|||
- Hamming Distance `Hamming()`
|
||||
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()`
|
||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
|
||||
- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
|
||||
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
|
||||
- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
|
||||
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
|
||||
- Q-gram distances compare the set of all substrings of length `q` in each string.
|
||||
- QGram Distance `Qgram(q::Int)`
|
||||
|
@ -28,7 +28,7 @@ The available distances are:
|
|||
|
||||
|
||||
## Basic Use
|
||||
### evaluate
|
||||
### distance
|
||||
You can always compute a certain distance between two strings using the following syntax:
|
||||
|
||||
```julia
|
||||
|
@ -43,8 +43,15 @@ evaluate(Levenshtein(), "martha", "marhta")
|
|||
Levenshtein()("martha", "marhta")
|
||||
```
|
||||
|
||||
In contrast, the function `compare` returns the similarity score, defined as 1 minus the normalized distance between two strings. It always returns an element of type `Float64`. A value of 0.0 means completely different and a value of 1.0 means completely similar.
|
||||
|
||||
```julia
|
||||
compare("martha", "martha", Levenshtein())
|
||||
#> 1.0
|
||||
```
|
||||
|
||||
### pairwise
|
||||
`pairwise` returns the matrix of distances between two `AbstractVector`s of `AbstractString`s
|
||||
`pairwise` returns the matrix of distances between two `AbstractVector`s of `AbstractString`s (or iterators)
|
||||
|
||||
```julia
|
||||
pairwise(Jaccard(3), ["martha", "kitten"], ["marhta", "sitting"])
|
||||
|
@ -52,16 +59,17 @@ pairwise(Jaccard(3), ["martha", "kitten"], ["marhta", "sitting"])
|
|||
The function `pairwise` is particularly optimized for QGram-distances (each element is processed only once).
|
||||
|
||||
|
||||
### distance modifiers
|
||||
The package also defines distance "modifiers", modeled on those defined in the Python package [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/). These modifiers are particularly helpful to match strings composed of multiple words.
|
||||
|
||||
### similarity score
|
||||
- The function `compare` returns the similarity score, defined as 1 minus the normalized distance between two strings. It always returns a Float64. A value of 0.0 means completely different and a value of 1.0 means completely similar.
|
||||
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the distance between the shorter string and substrings of the longer string.
|
||||
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the distance of the two strings, after re-ordering words alphabetically.
|
||||
- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by returning the distance between the intersection of two strings with each string.
|
||||
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) normalizes the distance and combines the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. This is a good distance to match strings composed of multiple words, like addresses. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/).
|
||||
|
||||
```julia
|
||||
Levenshtein()("martha", "martha")
|
||||
#> 0.0
|
||||
compare("martha", "martha", Levenshtein())
|
||||
#> 1.0
|
||||
```
|
||||
|
||||
### find
|
||||
The package also adds some convenience functions to find the elements in a list that are closest to a given string.
|
||||
|
||||
- `findnearest` returns the value and index of the element in `itr` with the highest similarity score with `s`. Its syntax is:
|
||||
```julia
|
||||
|
@ -76,17 +84,4 @@ The function `pairwise` is particularly optimized for QGram-distances (each elem
|
|||
The functions `findnearest` and `findall` are particularly optimized for the `Levenshtein` and `OptimalStringAlignement` distances (these distances stop early if the distance is higher than a certain threshold).
|
||||
|
||||
|
||||
### distance modifiers
|
||||
The package also defines Distance "modifiers" that can be applied to any distance.
|
||||
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the distance between the shorter string and substrings of the longer string.
|
||||
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the distance of the two strings, after re-ordering words alphabetically.
|
||||
- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by returning the distance between the intersection of two strings with each string.
|
||||
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) normalizes the distance and combines the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. This is a good distance to match strings composed of multiple words, like addresses. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/).
|
||||
|
||||
|
||||
|
||||
## References
|
||||
- [The stringdist Package for Approximate String Matching](https://journal.r-project.org/archive/2014-1/loo.pdf) Mark P.J. van der Loo
|
||||
- [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)
|
||||
|
||||
|
||||
|
|
|
@ -9,11 +9,32 @@ include("distances/qgram.jl")
|
|||
include("normalize.jl")
|
||||
include("fuzzywuzzy.jl")
|
||||
const StringDistance = Union{Hamming, Jaro, JaroWinkler, Levenshtein, OptimalStringAlignement, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Normalized, Partial, TokenSort, TokenSet, TokenMax}
|
||||
include("compare.jl")
|
||||
"""
|
||||
compare(s1, s2, dist)
|
||||
|
||||
return a similarity score between 0 and 1 for the strings `s1` and
|
||||
`s2` based on the distance `dist`.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> compare("martha", "marhta", Levenshtein())
|
||||
0.6666666666666667
|
||||
```
|
||||
"""
|
||||
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
|
||||
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
|
||||
end
|
||||
include("find.jl")
|
||||
include("pairwise.jl")
|
||||
|
||||
# Distances API
|
||||
Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
|
||||
function Distances.result_type(dist::StringDistance, s1::Type, s2::Type)
|
||||
T = typeof(dist("", ""))
|
||||
if (Missing <: s1) | (Missing <: s2)
|
||||
T = Union{T, Missing}
|
||||
end
|
||||
return T
|
||||
end
|
||||
Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
|
||||
|
||||
|
||||
|
|
|
@ -192,9 +192,7 @@ Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
|
|||
qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)
|
||||
|
||||
|
||||
#q-grams of AbstractVector
|
||||
# Alternatively, I could also use partition in IterTools but it creates a vector for each iteration
|
||||
# so it does not seem to be worth it.
|
||||
# q-grams of General Iterators
|
||||
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
|
||||
state + qgram.q - 1 > lastindex(qgram.s) && return nothing
|
||||
view(qgram.s, state:(state + qgram.q - 1)), state + 1
|
||||
|
|
|
@ -1,19 +1,3 @@
|
|||
"""
|
||||
compare(s1, s2, dist)
|
||||
|
||||
return a similarity score between 0 and 1 for the strings `s1` and
|
||||
`s2` based on the distance `dist`.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> compare("martha", "marhta", Levenshtein())
|
||||
0.6666666666666667
|
||||
```
|
||||
"""
|
||||
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
|
||||
1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
|
||||
end
|
||||
|
||||
"""
|
||||
findnearest(s, itr, dist::StringDistance) -> (x, index)
|
||||
|
||||
|
@ -38,10 +22,10 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
|||
min_score_atomic = Threads.Atomic{Float64}(min_score)
|
||||
scores = [0.0 for _ in 1:Threads.nthreads()]
|
||||
is = [0 for _ in 1:Threads.nthreads()]
|
||||
s = _helper(dist, s)
|
||||
s = _preprocess(dist, s)
|
||||
# need collect since @threads requires a length method
|
||||
Threads.@threads for i in collect(eachindex(itr))
|
||||
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
|
||||
score = compare(s, _preprocess(dist, itr[i]), dist; min_score = min_score_atomic[])
|
||||
score_old = Threads.atomic_max!(min_score_atomic, score)
|
||||
if score >= score_old
|
||||
scores[Threads.threadid()] = score
|
||||
|
@ -51,9 +35,9 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
|
|||
imax = is[argmax(scores)]
|
||||
imax == 0 ? (nothing, nothing) : (itr[imax], imax)
|
||||
end
|
||||
_helper(dist::AbstractQGramDistance, ::Missing) = missing
|
||||
_helper(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
|
||||
_helper(dist::StringDistance, s) = s
|
||||
_preprocess(dist::AbstractQGramDistance, ::Missing) = missing
|
||||
_preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
|
||||
_preprocess(dist::StringDistance, s) = s
|
||||
|
||||
function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
|
||||
@warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
|
||||
|
@ -84,10 +68,10 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9)
|
|||
"""
|
||||
function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
|
||||
out = [Int[] for _ in 1:Threads.nthreads()]
|
||||
s = _helper(dist, s)
|
||||
s = _preprocess(dist, s)
|
||||
# need collect since @threads requires a length method
|
||||
Threads.@threads for i in collect(eachindex(itr))
|
||||
score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
|
||||
score = compare(s, _preprocess(dist, itr[i]), dist; min_score = min_score)
|
||||
if score >= min_score
|
||||
push!(out[Threads.threadid()], i)
|
||||
end
|
|
@ -60,4 +60,5 @@ julia> StringDistances.normalize(Levenshtein())(s1, s2)
|
|||
```
|
||||
"""
|
||||
normalize(dist::SemiMetric; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
|
||||
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
|
||||
normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
"""
|
||||
pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||
|
||||
|
@ -26,11 +25,8 @@ julia> pairwise(Levenshtein(), iter, iter2)
|
|||
10.0
|
||||
```
|
||||
"""
|
||||
function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
|
||||
function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||
T = result_type(dist, eltype(xs), eltype(ys))
|
||||
if Missing <: Union{eltype(xs), eltype(ys)}
|
||||
T = Union{T, Missing}
|
||||
end
|
||||
R = Matrix{T}(undef, length(xs), length(ys))
|
||||
pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||
end
|
||||
|
@ -45,7 +41,7 @@ For AbstractQGramDistances preprocessing will be used either if `preprocess` is
|
|||
to true or if there are more than 5 elements in `xs`. Set `preprocess` to
|
||||
false if no preprocessing should be used, regardless of length.
|
||||
"""
|
||||
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
|
||||
function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
|
||||
length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
|
||||
length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
|
||||
((xs === ys) & (dist isa SemiMetric)) ?
|
||||
|
@ -53,36 +49,35 @@ function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector,
|
|||
_asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
|
||||
end
|
||||
|
||||
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
|
||||
objs = _preprocess(xs, dist, preprocess)
|
||||
for i in 1:length(objs)
|
||||
function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = true)
|
||||
if preprocess
|
||||
xs = _preprocess_list(dist, xs)
|
||||
end
|
||||
for i in 1:length(xs)
|
||||
# handle missing
|
||||
R[i, i] = objs[i] != objs[i]
|
||||
Threads.@threads for j in (i+1):length(objs)
|
||||
R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
|
||||
R[i, i] = xs[i] != xs[i]
|
||||
Threads.@threads for j in (i+1):length(xs)
|
||||
R[i, j] = R[j, i] = evaluate(dist, xs[i], xs[j])
|
||||
end
|
||||
end
|
||||
return R
|
||||
end
|
||||
|
||||
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
|
||||
objsxs = _preprocess(xs, dist, preprocess)
|
||||
objsys = xs === ys ? objsxs : _preprocess(ys, dist, preprocess)
|
||||
for i in 1:length(objsxs)
|
||||
Threads.@threads for j in 1:length(objsys)
|
||||
R[i, j] = evaluate(dist, objsxs[i], objsys[j])
|
||||
end
|
||||
end
|
||||
return R
|
||||
end
|
||||
|
||||
function _preprocess(xs, dist::StringDistance, preprocess)
|
||||
if preprocess === nothing
|
||||
preprocess = length(xs) >= 5
|
||||
end
|
||||
if (dist isa AbstractQGramDistance) && preprocess
|
||||
return fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
|
||||
function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = true)
|
||||
if preprocess
|
||||
objxs = _preprocess_list(dist, xs)
|
||||
objys = xs === ys ? objxs : _preprocess_list(dist, ys)
|
||||
else
|
||||
return xs
|
||||
objxs = xs
|
||||
objys = ys
|
||||
end
|
||||
end
|
||||
for i in 1:length(objxs)
|
||||
Threads.@threads for j in 1:length(objys)
|
||||
R[i, j] = evaluate(dist, objxs[i], objys[j])
|
||||
end
|
||||
end
|
||||
return R
|
||||
end
|
||||
|
||||
_preprocess_list(dist::StringDistance, xs) = xs
|
||||
_preprocess_list(dist::AbstractQGramDistance, xs) = fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
|
Loading…
Reference in New Issue