re-organize and default to _preprocess = true

2021-09-11 10:04:03 -04:00 · 2021-09-11 10:04:03 -04:00 · 69491d9f78
parent 7dfd864d63
commit 69491d9f78
6 changed files with 78 additions and 84 deletions
--- a/README.md
+++ b/README.md
@ -14,8 +14,8 @@ The available distances are:
 	- Hamming Distance `Hamming()`
 	- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()`
 	- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
-	- [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
 	- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
+	- [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
 	- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
 - Q-gram distances compare the set of all substrings of length `q` in each string.
 	- QGram Distance `Qgram(q::Int)`
@ -28,7 +28,7 @@ The available distances are:


 ## Basic Use
-### evaluate
+### distance
 You can always compute a certain distance between two strings using the following syntax:

 ```julia
@ -43,8 +43,15 @@ evaluate(Levenshtein(), "martha", "marhta")
 Levenshtein()("martha", "marhta")
 ```

+In contrast, the function `compare` returns the similarity score, defined as 1 minus the normalized distance between two strings. It always returns an element of type `Float64`. A value of 0.0 means completely different and a value of 1.0 means completely similar.
+
+```julia
+compare("martha", "martha", Levenshtein())
+#> 1.0
+```
+
 ### pairwise
-`pairwise` returns the matrix of distance between two `AbstractVectors` of AbstractStrings
+`pairwise` returns the matrix of distances between two `AbstractVector`s of `AbstractString`s (or iterators)

 ```julia
 pairwise(Jaccard(3), ["martha", "kitten"], ["marhta", "sitting"])
@ -52,16 +59,17 @@ pairwise(Jaccard(3), ["martha", "kitten"], ["marhta", "sitting"])
 The function `pairwise` is particularly optimized for QGram-distances (each element is processed only once).


+### distance modifiers
+The package also defines distance "modifiers", corresponding to those defined in the Python package [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/). These modifiers are particularly helpful for matching strings composed of multiple words.

-### similarly score
- The function `compare` returns the similarity score, defined as 1 minus the normalized distance between two strings. It always returns a Float64. A value of 0.0 means completely different and a value of 1.0 means completely similar.
+- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the distance between the shorter string and substrings of the longer string.
+- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the distance of the two strings, after re-ordering words alphabetically. 
+- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by returning the distance between the intersection of two strings with each string.
+- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) normalizes the distance and combines the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. This is a good distance to match strings composed of multiple words, like addresses. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/).

-	```julia
-	Levenshtein()("martha", "martha")
-	#> 0.0
-	compare("martha", "martha", Levenshtein())
-	#> 1.0
-	```
+
+### find
+The package also adds some convenience functions to find the element in a list that is closest to a given string.

 - `findnearest` returns the value and index of the element in `itr` with the highest similarity score with `s`. Its syntax is:
 	```julia
@ -76,17 +84,4 @@ The function `pairwise` is particularly optimized for QGram-distances (each elem
 The functions `findnearest` and `findall` are particularly optimized for the `Levenshtein` and `OptimalStringAlignement` distances (these distances stop early if the distance is higher than a certain threshold).


-### distance modifiers
-The package also defines Distance "modifiers" that can be applied to any distance.
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the distance between the shorter string and substrings of the longer string.
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the distance of the two strings, after re-ordering words alphabetically. 
- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by returning the distance between the intersection of two strings with each string.
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) normalizes the distance, and combine the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. This is a good distance to match strings composed of multiple words, like addresses.   `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)
-
-
-
-## References
- [The stringdist Package for Approximate String Matching](https://journal.r-project.org/archive/2014-1/loo.pdf) Mark P.J. van der Loo
- [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)
-

--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -9,11 +9,32 @@ include("distances/qgram.jl")
 include("normalize.jl")
 include("fuzzywuzzy.jl")
 const StringDistance = Union{Hamming, Jaro, JaroWinkler, Levenshtein, OptimalStringAlignement, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Normalized, Partial, TokenSort, TokenSet, TokenMax}
-include("compare.jl")
+"""
+    compare(s1, s2, dist)
+
+return a similarity score between 0 and 1 for the strings `s1` and 
+`s2` based on the distance `dist`.
+
+### Examples
+```julia-repl
+julia> compare("martha", "marhta", Levenshtein())
+0.6666666666666667
+```
+"""
+function compare(s1, s2, dist::StringDistance; min_score = 0.0)
+    1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
+end 
+include("find.jl")
 include("pairwise.jl")

 # Distances API
-Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
+function Distances.result_type(dist::StringDistance, s1::Type, s2::Type)
+    T = typeof(dist("", ""))
+    if (Missing <: s1) | (Missing <: s2)
+        T = Union{T, Missing}
+    end
+    return T
+end
 Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))


--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -192,9 +192,7 @@ Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
 qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)


-#q-grams of AbstractVector
-# Alternatively, I could also use partition in IterTools but it creates a vector for each iteration
-# so it does not seem to be worth it.
+# q-grams of AbstractVectors: each q-gram is a view of `q` consecutive elements
 function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
 	state + qgram.q - 1 > lastindex(qgram.s) && return nothing
 	view(qgram.s, state:(state + qgram.q - 1)), state + 1
--- a/src/compare.jl
+++ b/src/compare.jl
@ -1,19 +1,3 @@
-"""
-    compare(s1, s2, dist)
-
-return a similarity score between 0 and 1 for the strings `s1` and 
-`s2` based on the distance `dist`.
-
-### Examples
-```julia-repl
-julia> compare("martha", "marhta", Levenshtein())
-0.6666666666666667
-```
-"""
-function compare(s1, s2, dist::StringDistance; min_score = 0.0)
-    1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
-end 
-
 """
    findnearest(s, itr, dist::StringDistance) -> (x, index)

@ -38,10 +22,10 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
    min_score_atomic = Threads.Atomic{Float64}(min_score)
    scores = [0.0 for _ in 1:Threads.nthreads()]
    is = [0 for _ in 1:Threads.nthreads()]
-    s = _helper(dist, s)
+    s = _preprocess(dist, s)
    # need collect since @threads requires a length method
    Threads.@threads for i in collect(eachindex(itr))
-        score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
+        score = compare(s, _preprocess(dist, itr[i]), dist; min_score = min_score_atomic[])
        score_old = Threads.atomic_max!(min_score_atomic, score)
        if score >= score_old
            scores[Threads.threadid()] = score
@ -51,9 +35,9 @@ function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
    imax = is[argmax(scores)]
    imax == 0 ? (nothing, nothing) : (itr[imax], imax)
 end
-_helper(dist::AbstractQGramDistance, ::Missing) = missing
-_helper(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
-_helper(dist::StringDistance, s) = s
+_preprocess(dist::AbstractQGramDistance, ::Missing) = missing
+_preprocess(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
+_preprocess(dist::StringDistance, s) = s

 function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
    @warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
@ -84,10 +68,10 @@ julia> findall(s, iter, Levenshtein(); min_score = 0.9)
 """
 function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
    out = [Int[] for _ in 1:Threads.nthreads()]
-    s = _helper(dist, s)
+    s = _preprocess(dist, s)
    # need collect since @threads requires a length method
    Threads.@threads for i in collect(eachindex(itr))
-        score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
+        score = compare(s, _preprocess(dist, itr[i]), dist; min_score = min_score)
        if score >= min_score
            push!(out[Threads.threadid()], i)
        end
--- a/src/normalize.jl
+++ b/src/normalize.jl
@ -60,4 +60,5 @@ julia> StringDistances.normalize(Levenshtein())(s1, s2)
 ```
 """
 normalize(dist::SemiMetric; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
-normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
+normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
+
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@ -1,4 +1,3 @@
-
 """
    pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)

@ -26,11 +25,8 @@ julia> pairwise(Levenshtein(), iter, iter2)
 10.0
 ```
 """
-function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
+function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
    T = result_type(dist, eltype(xs), eltype(ys))
-    if Missing <: Union{eltype(xs), eltype(ys)}
-        T = Union{T, Missing}
-    end
    R = Matrix{T}(undef, length(xs), length(ys))
    pairwise!(R, dist, xs, ys; preprocess = preprocess)
 end
@ -45,7 +41,7 @@ For AbstractQGramDistances preprocessing will be used either if `preprocess` is
 to true or if there are more than 5 elements in `xs`. Set `preprocess` to 
 false if no preprocessing should be used, regardless of length.
 """
-function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
+function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = true)
    length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
    length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
    ((xs === ys) & (dist isa SemiMetric)) ?
@ -53,36 +49,35 @@ function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector,
        _asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
 end

-function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
-    objs = _preprocess(xs, dist, preprocess)
-    for i in 1:length(objs)
+function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = true)
+    if preprocess
+        xs = _preprocess_list(dist, xs)
+    end
+    for i in 1:length(xs)
        # handle missing
-        R[i, i] = objs[i] != objs[i]
-        Threads.@threads for j in (i+1):length(objs)
-            R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
+        R[i, i] = xs[i] != xs[i]
+        Threads.@threads for j in (i+1):length(xs)
+            R[i, j] = R[j, i] = evaluate(dist, xs[i], xs[j])
        end
    end
    return R
 end

-function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
-    objsxs = _preprocess(xs, dist, preprocess)
-    objsys = xs === ys ? objsxs : _preprocess(ys, dist, preprocess)
-    for i in 1:length(objsxs)
-        Threads.@threads for j in 1:length(objsys)
-            R[i, j] = evaluate(dist, objsxs[i], objsys[j])
-        end
-    end
-    return R
-end
-
-function _preprocess(xs, dist::StringDistance, preprocess)
-    if preprocess === nothing
-        preprocess = length(xs) >= 5
-    end
-    if (dist isa AbstractQGramDistance) && preprocess
-        return fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
+function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = true)
+    if preprocess
+        objxs = _preprocess_list(dist, xs)
+        objys = xs === ys ? objxs : _preprocess_list(dist, ys)
    else
-        return xs
+        objxs = xs
+        objys = ys
    end
-end
+    for i in 1:length(objxs)
+        Threads.@threads for j in 1:length(objys)
+            R[i, j] = evaluate(dist, objxs[i], objys[j])
+        end
+    end
+    return R
+end
+
+_preprocess_list(dist::StringDistance, xs)  = xs
+_preprocess_list(dist::AbstractQGramDistance, xs) = fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))