reorganize

2021-09-10 22:31:14 -04:00 · 2021-09-10 22:31:14 -04:00 · 7dfd864d63
parent 4c73b55825
commit 7dfd864d63
6 changed files with 262 additions and 255 deletions
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -6,10 +6,12 @@ import StatsAPI: pairwise, pairwise!
 include("distances/utils.jl")
 include("distances/edit.jl")
 include("distances/qgram.jl")
-
-include("modifiers.jl")
 include("normalize.jl")
-include("convenience.jl")
+include("fuzzywuzzy.jl")
+const StringDistance = Union{Hamming, Jaro, JaroWinkler, Levenshtein, OptimalStringAlignement, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Normalized, Partial, TokenSort, TokenSet, TokenMax}
+include("compare.jl")
+include("pairwise.jl")
+
 # Distances API
 Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
 Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
--- a/src/compare.jl
+++ b/src/compare.jl
@ -0,0 +1,96 @@
+"""
+    compare(s1, s2, dist)
+
+Return a similarity score between 0 and 1 for the strings `s1` and 
+`s2` based on the distance `dist`.
+
+### Examples
+```julia-repl
+julia> compare("martha", "marhta", Levenshtein())
+0.6666666666666667
+```
+"""
+function compare(s1, s2, dist::StringDistance; min_score = 0.0)
+    1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
+end 
+
+"""
+    findnearest(s, itr, dist::StringDistance) -> (x, index)
+
+`findnearest` returns the value and index of the element of `itr` that has the 
+lowest distance with `s` according to the distance `dist`. 
+
+It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances 
+(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
+
+### Examples
+```julia-repl
+julia> using StringDistances
+julia> s = "Newark"
+julia> iter = ["New York", "Princeton", "San Francisco"]
+julia> findnearest(s, iter, Levenshtein())
+("New York", 1)
+julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
+(nothing, nothing)
+```
+"""
+function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
+    min_score_atomic = Threads.Atomic{Float64}(min_score)
+    scores = [0.0 for _ in 1:Threads.nthreads()]
+    is = [0 for _ in 1:Threads.nthreads()]
+    s = _helper(dist, s)
+    # need collect since @threads requires a length method
+    Threads.@threads for i in collect(eachindex(itr))
+        score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
+        score_old = Threads.atomic_max!(min_score_atomic, score)
+        if score >= score_old
+            scores[Threads.threadid()] = score
+            is[Threads.threadid()] = i
+        end
+    end
+    imax = is[argmax(scores)]
+    imax == 0 ? (nothing, nothing) : (itr[imax], imax)
+end
+_helper(dist::AbstractQGramDistance, ::Missing) = missing
+_helper(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
+_helper(dist::StringDistance, s) = s
+
+function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
+    @warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
+    findnearest(s, itr, dist; min_score = min_score)
+end
+
+"""
+    findall(s, itr , dist::StringDistance; min_score = 0.8)
+    
+`findall` returns the vector of indices for elements of `itr` that have a 
+similarity score greater than or equal to `min_score` according to the distance `dist`.
+If there are no such elements, return an empty array. 
+
+It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances 
+(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
+
+### Examples
+```julia-repl
+julia> using StringDistances
+julia> s = "Newark"
+julia> iter = ["Newwark", "Princeton", "San Francisco"]
+julia> findall(s, iter, Levenshtein())
+1-element Array{Int64,1}:
+ 1
+julia> findall(s, iter, Levenshtein(); min_score = 0.9)
+0-element Array{Int64,1}
+```
+"""
+function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
+    out = [Int[] for _ in 1:Threads.nthreads()]
+    s = _helper(dist, s)
+    # need collect since @threads requires a length method
+    Threads.@threads for i in collect(eachindex(itr))
+        score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
+        if score >= min_score
+            push!(out[Threads.threadid()], i)
+        end
+    end
+    vcat(out...)
+end
--- a/src/convenience.jl
+++ b/src/convenience.jl
@ -1,187 +0,0 @@
-const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, OptimalStringAlignement, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
-
-"""
-    compare(s1, s2, dist)
-
-return a similarity score between 0 and 1 for the strings `s1` and 
-`s2` based on the distance `dist`.
-
-### Examples
-```julia-repl
-julia> compare("martha", "marhta", Levenshtein())
-0.6666666666666667
-```
-"""
-function compare(s1, s2, dist::StringDistance; min_score = 0.0)
-    1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
-end 
-
-"""
-    findnearest(s, itr, dist::StringDistance) -> (x, index)
-
-`findnearest` returns the value and index of the element of `itr` that has the 
-lowest distance with `s` according to the distance `dist`. 
-
-It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances 
-(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
-
-### Examples
-```julia-repl
-julia> using StringDistances
-julia> s = "Newark"
-julia> iter = ["New York", "Princeton", "San Francisco"]
-julia> findnearest(s, iter, Levenshtein())
-("NewYork", 1)
-julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
-(nothing, nothing)
-```
-"""
-function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
-    min_score_atomic = Threads.Atomic{Float64}(min_score)
-    scores = [0.0 for _ in 1:Threads.nthreads()]
-    is = [0 for _ in 1:Threads.nthreads()]
-    s = _helper(dist, s)
-    # need collect since @threads requires a length method
-    Threads.@threads for i in collect(eachindex(itr))
-        score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
-        score_old = Threads.atomic_max!(min_score_atomic, score)
-        if score >= score_old
-            scores[Threads.threadid()] = score
-            is[Threads.threadid()] = i
-        end
-    end
-    imax = is[argmax(scores)]
-    imax == 0 ? (nothing, nothing) : (itr[imax], imax)
-end
-_helper(dist::AbstractQGramDistance, ::Missing) = missing
-_helper(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
-_helper(dist::StringDistance, s) = s
-
-function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
-    @warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
-    findnearest(s, itr, dist; min_score = min_score)
-end
-
-"""
-    findall(s, itr , dist::StringDistance; min_score = 0.8)
-    
-`findall` returns the vector of indices for elements of `itr` that have a 
-similarity score higher or equal than `min_score` according to the distance `dist`.
-If there are no such elements, return an empty array. 
-
-It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances 
-(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
-
-### Examples
-```julia-repl
-julia> using StringDistances
-julia> s = "Newark"
-julia> iter = ["Newwark", "Princeton", "San Francisco"]
-julia> findall(s, iter, Levenshtein())
-1-element Array{Int64,1}:
- 1
-julia> findall(s, iter, Levenshtein(); min_score = 0.9)
-0-element Array{Int64,1}
-```
-"""
-function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
-    out = [Int[] for _ in 1:Threads.nthreads()]
-    s = _helper(dist, s)
-    # need collect since @threads requires a length method
-    Threads.@threads for i in collect(eachindex(itr))
-        score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
-        if score >= min_score
-            push!(out[Threads.threadid()], i)
-        end
-    end
-    vcat(out...)
-end
-
-
-"""
-    pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
-
-Compute distances between all pairs of elements in `xs`  and `ys` according to the
-`StringDistance` `dist`. Returns a matrix R such that `R[i, j]` corrresponds to the distance between `xs[i]` and `ys[j]`.
-
-For AbstractQGramDistances preprocessing will be used either if `preprocess` is set 
-to true or if there are more than 5 elements in `xs`. Set `preprocess` to 
-false if no preprocessing should be used, regardless of length.
-
-Both symmetric and asymmetric versions are available.
-
-### Examples
-```julia-repl
-julia> using StringDistances
-julia> iter = ["New York", "Princeton"]
-julia> pairwise(Levenshtein(), iter)
-2×2 Array{Float64,2}:
- 0.0  9.0
- 9.0  0.0
-julia> iter2 = ["San Francisco"]
-julia> pairwise(Levenshtein(), iter, iter2)
-2×1 Array{Float64,2}:
- 12.0
- 10.0
-```
-"""
-function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
-    T = result_type(dist, eltype(xs), eltype(ys))
-    if Missing <: Union{eltype(xs), eltype(ys)}
-        T = Union{T, Missing}
-    end
-    R = Matrix{T}(undef, length(xs), length(ys))
-    pairwise!(R, dist, xs, ys; preprocess = preprocess)
-end
-
-"""
-    pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
-
-Compute distances between all pairs of elements in `xs` and `ys` according to the
-`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
-
-For AbstractQGramDistances preprocessing will be used either if `preprocess` is set 
-to true or if there are more than 5 elements in `xs`. Set `preprocess` to 
-false if no preprocessing should be used, regardless of length.
-"""
-function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
-    length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
-    length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
-    ((xs === ys) & (dist isa SemiMetric)) ?
-        _symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
-        _asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
-end
-
-function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
-    objs = _preprocess(xs, dist, preprocess)
-    for i in 1:length(objs)
-        # handle missing
-        R[i, i] = objs[i] != objs[i]
-        Threads.@threads for j in (i+1):length(objs)
-            R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
-        end
-    end
-    return R
-end
-
-function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
-    objsxs = _preprocess(xs, dist, preprocess)
-    objsys = xs === ys ? objsxs : _preprocess(ys, dist, preprocess)
-    for i in 1:length(objsxs)
-        Threads.@threads for j in 1:length(objsys)
-            R[i, j] = evaluate(dist, objsxs[i], objsys[j])
-        end
-    end
-    return R
-end
-
-function _preprocess(xs, dist::StringDistance, preprocess)
-    if preprocess === nothing
-        preprocess = length(xs) >= 5
-    end
-    if (dist isa AbstractQGramDistance) && preprocess
-        return fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
-    else
-        return xs
-    end
-end
--- a/src/fuzzywuzzy.jl
+++ b/src/fuzzywuzzy.jl
@ -74,6 +74,10 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, p::Vector{Int}, s1, s2,
    return x
 end

+function normalize(dist::Partial; max_dist = 1.0)
+    Partial(normalize(dist.dist; max_dist = max_dist))
+end
+
 """
   TokenSort(dist)

@ -104,10 +108,14 @@ function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{Abstrac
    out = dist.dist(s1, s2)
 end

+function normalize(dist::TokenSort; max_dist = 1.0)
+    TokenSort(normalize(dist.dist; max_dist = max_dist))
+end
+
 """
   TokenSet(dist)

-Creates the `TokenSet{dist}` distance.
+Creates the `TokenSet{dist}` distance, which is only defined on AbstractStrings.

 `TokenSet{dist}` returns the minimum the distances between:
 [SORTED_INTERSECTION]
@ -115,8 +123,6 @@ Creates the `TokenSet{dist}` distance.
 [SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
 See: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/

-It is only defined on AbstractStrings.
-
 ### Examples
 ```julia-repl
 julia> s1 = "New York Mets vs Atlanta"
@ -144,4 +150,67 @@ function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{Abstract
    min(score_01, score_02, score_12)
 end

+function normalize(dist::TokenSet; max_dist = 1.0)
+    TokenSet(normalize(dist.dist; max_dist = max_dist))
+end

+
+"""
+   TokenMax(dist)
+
+Creates the `TokenMax{dist}` distance, which is only defined on AbstractStrings.
+
+`TokenMax{dist}` normalizes the distance `dist` and returns the minimum of the distance,
+its [`Partial`](@ref) modifier, its [`TokenSort`](@ref) modifier, and its 
+[`TokenSet`](@ref) modifier, with penalty terms depending on the lengths of the strings.
+
+
+### Examples
+```julia-repl
+julia> s1 = "New York Mets vs Atlanta"
+julia> s2 = "Atlanta Braves vs New York Mets"
+julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2)
+0.05
+```
+"""
+struct TokenMax{S <: SemiMetric} <: SemiMetric
+    dist::S
+    max_dist::Float64
+end
+TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist)
+
+function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
+    (s1 === missing) | (s2 === missing) && return missing
+    s1, s2 = reorder(s1, s2)
+    len1, len2 = length(s1), length(s2)
+    max_dist = dist.max_dist
+    dist0 = normalize(dist.dist; max_dist = max_dist)
+    score = dist0(s1, s2)
+    min_score = min(max_dist, score)
+    unbase_scale = 0.95
+    # if one string is much shorter than the other, use partial
+    if length(s2) >= 1.5 * length(s1)
+        partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
+        dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / partial_scale)
+        score_partial = 1 - partial_scale * (1 - Partial(dist0)(s1, s2))
+        min_score = min(max_dist, score_partial)
+        dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / (unbase_scale * partial_scale))
+        score_sort = 1 - unbase_scale * partial_scale * (1 - TokenSort(Partial(dist0))(s1, s2))
+        max_dist = min(max_dist, score_sort)
+        dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / (unbase_scale * partial_scale))
+        score_set = 1 - unbase_scale * partial_scale * (1 - TokenSet(Partial(dist0))(s1, s2)) 
+        out = min(score, score_partial, score_sort, score_set)
+    else
+        dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / unbase_scale)
+        score_sort = 1 - unbase_scale * (1 - TokenSort(dist0)(s1, s2))
+        max_dist = min(max_dist, score_sort)
+        dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / unbase_scale)
+        score_set = 1 - unbase_scale * (1 - TokenSet(dist0)(s1, s2))
+        out = min(score, score_sort, score_set)
+    end
+    out > max_dist ? 1.0 : out
+end
+
+function normalize(dist::TokenMax; max_dist = 1.0)
+    TokenMax(dist.dist, max_dist)
+end
--- a/src/normalize.jl
+++ b/src/normalize.jl
@ -60,65 +60,4 @@ julia> StringDistances.normalize(Levenshtein())(s1, s2)
 ```
 """
 normalize(dist::SemiMetric; max_dist = 1.0) = Normalized{typeof(dist)}(dist, max_dist)
-normalize(dist::Partial; max_dist = 1.0) = Partial(normalize(dist.dist; max_dist = max_dist))
-normalize(dist::TokenSort; max_dist = 1.0) = TokenSort(normalize(dist.dist; max_dist = max_dist))
-normalize(dist::TokenSet; max_dist = 1.0) = TokenSet(normalize(dist.dist; max_dist = max_dist))
-normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
-
-
-"""
-   TokenMax(dist)
-
-Creates the `TokenMax{dist}` distance.
-
-`TokenMax{dist}` normalizes the distance `dist` and returns the minimum of the distance,
-its [`Partial`](@ref) modifier, its [`TokenSort`](@ref) modifier, and its 
-[`TokenSet`](@ref) modifier, with penalty terms depending on the iterator length.
-
-
-### Examples
-```julia-repl
-julia> s1 = "New York Mets vs Atlanta"
-julia> s2 = "Atlanta Braves vs New York Mets"
-julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2)
-0.05
-```
-"""
-struct TokenMax{S <: SemiMetric} <: SemiMetric
-    dist::S
-    max_dist::Float64
-end
-TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist)
-normalize(dist::TokenMax; max_dist = 1.0) = TokenMax(dist.dist, max_dist)
-
-function (dist::TokenMax)(s1, s2)
-    (s1 === missing) | (s2 === missing) && return missing
-    s1, s2 = reorder(s1, s2)
-    len1, len2 = length(s1), length(s2)
-    max_dist = dist.max_dist
-    dist0 = normalize(dist.dist; max_dist = max_dist)
-    score = dist0(s1, s2)
-    min_score = min(max_dist, score)
-    unbase_scale = 0.95
-    # if one string is much shorter than the other, use partial
-    if length(s2) >= 1.5 * length(s1)
-        partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
-        dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / partial_scale)
-        score_partial = 1 - partial_scale * (1 - Partial(dist0)(s1, s2))
-        min_score = min(max_dist, score_partial)
-        dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / (unbase_scale * partial_scale))
-        score_sort = 1 - unbase_scale * partial_scale * (1 - TokenSort(Partial(dist0))(s1, s2))
-        max_dist = min(max_dist, score_sort)
-        dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / (unbase_scale * partial_scale))
-        score_set = 1 - unbase_scale * partial_scale * (1 - TokenSet(Partial(dist0))(s1, s2)) 
-        out = min(score, score_partial, score_sort, score_set)
-    else
-        dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / unbase_scale)
-        score_sort = 1 - unbase_scale * (1 - TokenSort(dist0)(s1, s2))
-        max_dist = min(max_dist, score_sort)
-        dist0 = normalize(dist0, max_dist = 1 - (1 - max_dist) / unbase_scale)
-        score_set = 1 - unbase_scale * (1 - TokenSet(dist0)(s1, s2))
-        out = min(score, score_sort, score_set)
-    end
-    out > max_dist ? 1.0 : out
-end
+normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@ -0,0 +1,88 @@
+
+"""
+    pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
+
+Compute distances between all pairs of elements in `xs` and `ys` according to the
+`StringDistance` `dist`. Returns a matrix R such that `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
+
+For AbstractQGramDistances preprocessing will be used either if `preprocess` is set 
+to true or if there are at least 5 elements in `xs`. Set `preprocess` to 
+false if no preprocessing should be used, regardless of length.
+
+Both symmetric and asymmetric versions are available.
+
+### Examples
+```julia-repl
+julia> using StringDistances
+julia> iter = ["New York", "Princeton"]
+julia> pairwise(Levenshtein(), iter)
+2×2 Array{Float64,2}:
+ 0.0  9.0
+ 9.0  0.0
+julia> iter2 = ["San Francisco"]
+julia> pairwise(Levenshtein(), iter, iter2)
+2×1 Array{Float64,2}:
+ 12.0
+ 10.0
+```
+"""
+function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
+    T = result_type(dist, eltype(xs), eltype(ys))
+    if Missing <: Union{eltype(xs), eltype(ys)}
+        T = Union{T, Missing}
+    end
+    R = Matrix{T}(undef, length(xs), length(ys))
+    pairwise!(R, dist, xs, ys; preprocess = preprocess)
+end
+
+"""
+    pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
+
+Compute distances between all pairs of elements in `xs` and `ys` according to the
+`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
+
+For AbstractQGramDistances preprocessing will be used either if `preprocess` is set 
+to true or if there are at least 5 elements in `xs`. Set `preprocess` to 
+false if no preprocessing should be used, regardless of length.
+"""
+function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
+    length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
+    length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
+    ((xs === ys) & (dist isa SemiMetric)) ?
+        _symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
+        _asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
+end
+
+function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
+    objs = _preprocess(xs, dist, preprocess)
+    for i in 1:length(objs)
+        # handle missing
+        R[i, i] = objs[i] != objs[i]
+        Threads.@threads for j in (i+1):length(objs)
+            R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
+        end
+    end
+    return R
+end
+
+function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
+    objsxs = _preprocess(xs, dist, preprocess)
+    objsys = xs === ys ? objsxs : _preprocess(ys, dist, preprocess)
+    for i in 1:length(objsxs)
+        Threads.@threads for j in 1:length(objsys)
+            R[i, j] = evaluate(dist, objsxs[i], objsys[j])
+        end
+    end
+    return R
+end
+
+function _preprocess(xs, dist::StringDistance, preprocess)
+    if preprocess === nothing
+        preprocess = length(xs) >= 5
+    end
+    if (dist isa AbstractQGramDistance) && preprocess
+        return fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
+    else
+        return xs
+    end
+end