use correct (unrestricted) DamerauLevenshtein.

The old DamerauLevenshtein distance is renamed to OptimalStringAlignement.
2021-09-10 17:14:21 -04:00 · 2021-09-10 17:14:21 -04:00 · 5bec23d357
parent 0faf255f93
commit 5bec23d357
10 changed files with 316 additions and 251 deletions
--- a/README.md
+++ b/README.md
@ -14,6 +14,7 @@ The available distances are:
 	- Hamming Distance `Hamming()`
 	- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler()`
 	- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
+	- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement()`
 	- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
 	- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
 - Q-gram distances compare the set of all substrings of length `q` in each string.
--- a/benchmark/benchmark.jl
+++ b/benchmark/benchmark.jl
@ -21,10 +21,12 @@ end
 # 0.36s
@time f(Levenshtein(), x, y, min_score = 0.8);
 # 0.11 
-@time f(DamerauLevenshtein(), x, y);
+@time f(OptimalStringAlignement(), x, y);
 # 0.56s.
-@time f(DamerauLevenshtein(), x, y, min_score = 0.8);
+@time f(OptimalStringAlignement(), x, y, min_score = 0.8);
 # 0.08
+@time f(DamerauLevenshtein(), x, y);
+# 2s
@time f(RatcliffObershelp(), x, y);
 # 0.65s

@ -33,7 +35,7 @@ end

@time findnearest(x[1], y, Levenshtein());
 # 0.1
-@time findnearest(x[1], y, DamerauLevenshtein());
+@time findnearest(x[1], y, OptimalStringAlignement());
 # 0.1
@time findnearest(x[1], y, QGram(2));
 # 0.75
@ -42,17 +44,17 @@ end

@time findall(x[1], y, Levenshtein());
 # 0.05
-@time findall(x[1], y, DamerauLevenshtein());
+@time findall(x[1], y, OptimalStringAlignement());
 # 0.05
-@time findall(x[1], y, Partial(DamerauLevenshtein()));
+@time findall(x[1], y, Partial(OptimalStringAlignement()));
 # 0.96
@time findall(x[1], y, QGram(2));
 # 0.81
-@time findall(x[1], y, TokenSort(DamerauLevenshtein()));
+@time findall(x[1], y, TokenSort(OptimalStringAlignement()));
 # 0.27 (now 0.32)
-@time findall(x[1], y, TokenSet(DamerauLevenshtein()));
+@time findall(x[1], y, TokenSet(OptimalStringAlignement()));
 # 0.55
-@time findall(x[1], y, TokenMax(DamerauLevenshtein()));
+@time findall(x[1], y, TokenMax(OptimalStringAlignement()));
 # 2.25 (now 3.6)


--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -9,7 +9,7 @@ include("distances/qgram.jl")

 include("modifiers.jl")
 include("normalize.jl")
-include("pairwise.jl")
+include("convenience.jl")
 # Distances API
 Distances.result_type(dist::StringDistance, s1::Type, s2::Type) = typeof(dist("", ""))
 Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
@ -27,10 +27,11 @@ Distances.result_type(dist::StringDistance, s1, s2) = result_type(dist, typeof(s
 export
 StringDistance,
 Hamming,
-Levenshtein,
-DamerauLevenshtein,
 Jaro,
 JaroWinkler,
+Levenshtein,
+OptimalStringAlignement,
+DamerauLevenshtein,
 RatcliffObershelp,
 AbstractQGramDistance,
 QGramDict,
--- a/src/convenience.jl
+++ b/src/convenience.jl
@ -0,0 +1,187 @@
+const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, OptimalStringAlignement, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
+
+"""
+    compare(s1, s2, dist)
+
+return a similarity score between 0 and 1 for the strings `s1` and 
+`s2` based on the distance `dist`.
+
+### Examples
+```julia-repl
+julia> compare("martha", "marhta", Levenshtein())
+0.6666666666666667
+```
+"""
+function compare(s1, s2, dist::StringDistance; min_score = 0.0)
+    1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
+end 
+
+"""
+    findnearest(s, itr, dist::StringDistance) -> (x, index)
+
+`findnearest` returns the value and index of the element of `itr` that has the 
+lowest distance with `s` according to the distance `dist`. 
+
+It is particularly optimized for [`Levenshtein`](@ref) and [`OptimalStringAlignement`](@ref) distances 
+(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
+
+### Examples
+```julia-repl
+julia> using StringDistances
+julia> s = "Newark"
+julia> iter = ["New York", "Princeton", "San Francisco"]
+julia> findnearest(s, iter, Levenshtein())
+("New York", 1)
+julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
+(nothing, nothing)
+```
+"""
+function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
+    min_score_atomic = Threads.Atomic{Float64}(min_score)
+    scores = [0.0 for _ in 1:Threads.nthreads()]
+    is = [0 for _ in 1:Threads.nthreads()]
+    s = _helper(dist, s)
+    # need collect since @threads requires a length method
+    Threads.@threads for i in collect(eachindex(itr))
+        score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
+        score_old = Threads.atomic_max!(min_score_atomic, score)
+        if score >= score_old
+            scores[Threads.threadid()] = score
+            is[Threads.threadid()] = i
+        end
+    end
+    imax = is[argmax(scores)]
+    imax == 0 ? (nothing, nothing) : (itr[imax], imax)
+end
+_helper(dist::AbstractQGramDistance, ::Missing) = missing
+_helper(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
+_helper(dist::StringDistance, s) = s
+
+function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
+    @warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
+    findnearest(s, itr, dist; min_score = min_score)
+end
+
+"""
+    findall(s, itr , dist::StringDistance; min_score = 0.8)
+    
+`findall` returns the vector of indices for elements of `itr` that have a 
+similarity score higher than or equal to `min_score` according to the distance `dist`.
+If there are no such elements, return an empty array. 
+
+It is particularly optimized for [`Levenshtein`](@ref) and [`OptimalStringAlignement`](@ref) distances 
+(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
+
+### Examples
+```julia-repl
+julia> using StringDistances
+julia> s = "Newark"
+julia> iter = ["Newwark", "Princeton", "San Francisco"]
+julia> findall(s, iter, Levenshtein())
+1-element Array{Int64,1}:
+ 1
+julia> findall(s, iter, Levenshtein(); min_score = 0.9)
+0-element Array{Int64,1}
+```
+"""
+function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
+    out = [Int[] for _ in 1:Threads.nthreads()]
+    s = _helper(dist, s)
+    # need collect since @threads requires a length method
+    Threads.@threads for i in collect(eachindex(itr))
+        score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
+        if score >= min_score
+            push!(out[Threads.threadid()], i)
+        end
+    end
+    vcat(out...)
+end
+
+
+"""
+    pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
+
+Compute distances between all pairs of elements in `xs`  and `ys` according to the
+`StringDistance` `dist`. Returns a matrix R such that `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
+
+For AbstractQGramDistances preprocessing will be used either if `preprocess` is set 
+to true or if there are at least 5 elements in `xs`. Set `preprocess` to 
+false if no preprocessing should be used, regardless of length.
+
+Both symmetric and asymmetric versions are available.
+
+### Examples
+```julia-repl
+julia> using StringDistances
+julia> iter = ["New York", "Princeton"]
+julia> pairwise(Levenshtein(), iter)
+2×2 Array{Float64,2}:
+ 0.0  9.0
+ 9.0  0.0
+julia> iter2 = ["San Francisco"]
+julia> pairwise(Levenshtein(), iter, iter2)
+2×1 Array{Float64,2}:
+ 12.0
+ 10.0
+```
+"""
+function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
+    T = result_type(dist, eltype(xs), eltype(ys))
+    if Missing <: Union{eltype(xs), eltype(ys)}
+        T = Union{T, Missing}
+    end
+    R = Matrix{T}(undef, length(xs), length(ys))
+    pairwise!(R, dist, xs, ys; preprocess = preprocess)
+end
+
+"""
+    pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
+
+Compute distances between all pairs of elements in `xs` and `ys` according to the
+`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
+
+For AbstractQGramDistances preprocessing will be used either if `preprocess` is set 
+to true or if there are at least 5 elements in `xs`. Set `preprocess` to 
+false if no preprocessing should be used, regardless of length.
+"""
+function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
+    length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
+    length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
+    ((xs === ys) & (dist isa SemiMetric)) ?
+        _symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
+        _asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
+end
+
+function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
+    objs = _preprocess(xs, dist, preprocess)
+    for i in 1:length(objs)
+        # handle missing
+        R[i, i] = objs[i] != objs[i]
+        Threads.@threads for j in (i+1):length(objs)
+            R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
+        end
+    end
+    return R
+end
+
+function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
+    objsxs = _preprocess(xs, dist, preprocess)
+    objsys = xs === ys ? objsxs : _preprocess(ys, dist, preprocess)
+    for i in 1:length(objsxs)
+        Threads.@threads for j in 1:length(objsys)
+            R[i, j] = evaluate(dist, objsxs[i], objsys[j])
+        end
+    end
+    return R
+end
+
+function _preprocess(xs, dist::StringDistance, preprocess)
+    if preprocess === nothing
+        preprocess = length(xs) >= 5
+    end
+    if (dist isa AbstractQGramDistance) && preprocess
+        return fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
+    else
+        return xs
+    end
+end
--- a/src/distances/edit.jl
+++ b/src/distances/edit.jl
@ -155,29 +155,29 @@ function (dist::Levenshtein)(s1, s2)
 end

 """
-    DamerauLevenshtein()
+        OptimalStringAlignement()

-Creates the restricted DamerauLevenshtein distance
+    Creates the OptimalStringAlignement distance (also known as the restricted DamerauLevenshtein distance).

-The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions, 
-deletions or substitutions of a single character, or transposition of two adjacent characters) 
-required to change one string into the other.
+    It is the minimum number of operations (consisting of insertions, 
+    deletions or substitutions of a single character, or transposition of two adjacent characters) 
+    required to change one string into the other.

-The restricted distance differs slightly from the classic Damerau-Levenshtein algorithm by imposing 
-the restriction that no substring is edited more than once. So for example, "CA" to "ABC" has an edit 
-distanceof 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
-uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy 
-the triangle inequality.
+    The distance differs slightly from the Damerau-Levenshtein algorithm by imposing 
+    the restriction that no substring is edited more than once. So for example, "CA" to "ABC" has an edit 
+    distance of 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
+    uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy 
+    the triangle inequality.
 """

-struct DamerauLevenshtein{V <: Union{Integer, Nothing}} <: SemiMetric
+struct OptimalStringAlignement{V <: Union{Integer, Nothing}} <: SemiMetric
   max_dist::V
 end
-DamerauLevenshtein() = DamerauLevenshtein(nothing)
+OptimalStringAlignement() = OptimalStringAlignement(nothing)

 ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
 # Return max_dist + 1 if distance higher than max_dist
-function (dist::DamerauLevenshtein)(s1, s2)
+function (dist::OptimalStringAlignement)(s1, s2)
    (s1 === missing) | (s2 === missing) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
@ -228,6 +228,59 @@ function (dist::DamerauLevenshtein)(s1, s2)
    return current
 end

+
+"""
+    DamerauLevenshtein()
+
+Creates the DamerauLevenshtein distance
+
+The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions, 
+deletions or substitutions of a single character, or transposition of two adjacent characters) 
+required to change one string into the other.
+"""
+
+struct DamerauLevenshtein <: Metric
+end
+
+## https://en.wikipedia.org/wiki/Damerau–Levenshtein_distance
+function (dist::DamerauLevenshtein)(s1, s2)
+    (s1 === missing) | (s2 === missing) && return missing
+    s1, s2 = reorder(s1, s2)
+    len1, len2 = length(s1), length(s2)
+    T = promote_type(eltype(s1), eltype(s2))
+    da = Dict{T, Int}(x => 0 for x in Iterators.flatten((s1, s2)))
+    d = zeros(Int, len1 + 2, len2 + 2)
+    md = len1 + len2
+    @inbounds for i in 0:len1
+        d[i + 2, 1] = md
+        d[i + 2, 2] = i
+    end
+    @inbounds for j in 0:len2
+        d[1, j + 2] = md
+        d[2, j + 2] = j
+    end
+    # fill in the distance matrix d
+    for (i1, ch1) in enumerate(s1)
+        db = 0
+        for (i2, ch2) in enumerate(s2)
+            j1 = da[ch2]
+            j2 = db
+            if ch1 == ch2
+                cost = 0
+                db = i2
+            else
+                cost = 1
+            end
+            @inbounds d[i1 + 2, i2 + 2] = min(d[i1 + 1, i2 + 1] + cost, 
+                                  d[i1 + 2, i2 + 1] + 1,
+                                  d[i1 + 1, i2 + 2] + 1,
+                                  d[j1 + 1, j2 + 1] + (i1 - j1 - 1) + 1 + (i2 - j2 - 1))
+        end
+        da[ch1] = i1
+    end
+    return d[end, end]
+end
+
 """
    RatcliffObershelp()

--- a/src/normalize.jl
+++ b/src/normalize.jl
@ -3,24 +3,27 @@ struct Normalized{V <: SemiMetric} <: SemiMetric
    max_dist::Float64
 end

-function (dist::Normalized{<:Hamming})(s1, s2)
-    (s1 === missing) | (s2 === missing) && return missing
-    s1, s2 = reorder(s1, s2)
-    len1, len2 = length(s1), length(s2)
-    len2 == 0 && return 1.0
-    out = dist.dist(s1, s2) / len2
+function (dist::Normalized{<: Union{Jaro, JaroWinkler, RatcliffObershelp}})(s1, s2)
+    out = dist.dist(s1, s2)
    out > dist.max_dist ? 1.0 : out
 end

-function (dist::Normalized{<:Union{Levenshtein{Nothing}, DamerauLevenshtein{Nothing}}})(s1, s2)
+# Normalized Hamming / DamerauLevenshtein: `missing` propagates, two empty inputs give 0.0, scores above `max_dist` become 1.0.
+function (dist::Normalized{<:Union{Hamming, DamerauLevenshtein}})(s1, s2)
    (s1 === missing) | (s2 === missing) && return missing
+    isempty(s1) && isempty(s2) && return 0.0
+    out = dist.dist(s1, s2) / max(length(s1), length(s2))  # longer input, so argument order does not change the score
+    out > dist.max_dist ? 1.0 : out
+end
+
+function (dist::Normalized{<:Union{Levenshtein{Nothing}, OptimalStringAlignement{Nothing}}})(s1, s2)
+    (s1 === missing) | (s2 === missing) && return missing
+    isempty(s1) && isempty(s2) && return 0.0
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
-    len2 == 0 && return 1.0
    if dist.dist isa Levenshtein
        d = Levenshtein(ceil(Int, len2 * dist.max_dist))(s1, s2)
    else
-        d = DamerauLevenshtein(ceil(Int, len2 * dist.max_dist))(s1, s2)
+        d = OptimalStringAlignement(ceil(Int, len2 * dist.max_dist))(s1, s2)
    end
    out = d / len2
    out > dist.max_dist ? 1.0 : out
@ -40,10 +43,6 @@ function (dist::Normalized{<:AbstractQGramDistance})(s1, s2)
    out > dist.max_dist ? 1.0 : out
 end

-function (dist::Normalized)(s1, s2)
-    out = dist.dist(s1, s2)
-    out > dist.max_dist ? 1.0 : out
-end

 """
   normalize(dist)
@ -70,13 +69,12 @@ normalize(dist::Normalized; max_dist = 1.0) = Normalized(dist.dist, max_dist)
 """
   TokenMax(dist)

-Creates the `TokenMax{dist}` distance
+Creates the `TokenMax{dist}` distance.

 `TokenMax{dist}` normalizes the distance `dist` and returns the minimum of the distance,
 its [`Partial`](@ref) modifier, its [`TokenSort`](@ref) modifier, and its 
-[`TokenSet`](@ref) modifier, with penalty terms depending on string length.
+[`TokenSet`](@ref) modifier, with penalty terms depending on the iterator length.

-It is only defined on AbstractStrings

 ### Examples
 ```julia-repl
@ -93,7 +91,7 @@ end
 TokenMax(dist::SemiMetric; max_dist = 1.0) = TokenMax(dist, max_dist)
 normalize(dist::TokenMax; max_dist = 1.0) = TokenMax(dist.dist, max_dist)

-function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
+function (dist::TokenMax)(s1, s2)
    (s1 === missing) | (s2 === missing) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
@ -124,104 +122,3 @@ function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{Abstract
    end
    out > max_dist ? 1.0 : out
 end
-
-
-
-
-const StringDistance = Union{Hamming, Jaro, JaroWinkler,Levenshtein, DamerauLevenshtein, RatcliffObershelp, AbstractQGramDistance, Partial, TokenSort, TokenSet, TokenMax, Normalized}
-
-"""
-    compare(s1, s2, dist)
-
-return a similarity score between 0 and 1 for the strings `s1` and 
-`s2` based on the distance `dist`.
-
-### Examples
-```julia-repl
-julia> compare("martha", "marhta", Levenshtein())
-0.6666666666666667
-```
-"""
-function compare(s1, s2, dist::StringDistance; min_score = 0.0)
-    1 - normalize(dist, max_dist = 1 - min_score)(s1, s2)
-end 
-
-"""
-    findnearest(s, itr, dist::StringDistance) -> (x, index)
-
-`findnearest` returns the value and index of the element of `itr` that has the 
-lowest distance with `s` according to the distance `dist`. 
-
-It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances 
-(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
-
-### Examples
-```julia-repl
-julia> using StringDistances
-julia> s = "Newark"
-julia> iter = ["New York", "Princeton", "San Francisco"]
-julia> findnearest(s, iter, Levenshtein())
-("NewYork", 1)
-julia> findnearest(s, iter, Levenshtein(); min_score = 0.9)
-(nothing, nothing)
-```
-"""
-function findnearest(s, itr, dist::StringDistance; min_score = 0.0)
-    min_score_atomic = Threads.Atomic{Float64}(min_score)
-    scores = [0.0 for _ in 1:Threads.nthreads()]
-    is = [0 for _ in 1:Threads.nthreads()]
-    s = _helper(dist, s)
-    # need collect since @threads requires a length method
-    Threads.@threads for i in collect(eachindex(itr))
-        score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score_atomic[])
-        score_old = Threads.atomic_max!(min_score_atomic, score)
-        if score >= score_old
-            scores[Threads.threadid()] = score
-            is[Threads.threadid()] = i
-        end
-    end
-    imax = is[argmax(scores)]
-    imax == 0 ? (nothing, nothing) : (itr[imax], imax)
-end
-_helper(dist::AbstractQGramDistance, ::Missing) = missing
-_helper(dist::AbstractQGramDistance, s) = QGramSortedVector(s, dist.q)
-_helper(dist::StringDistance, s) = s
-
-function Base.findmax(s, itr, dist::StringDistance; min_score = 0.0)
-    @warn "findmax(s, itr, dist; min_score) is deprecated. Use findnearest(s, itr, dist; min_score)"
-    findnearest(s, itr, dist; min_score = min_score)
-end
-"""
-    findall(s, itr , dist::StringDistance; min_score = 0.8)
-    
-`findall` returns the vector of indices for elements of `itr` that have a 
-similarity score higher or equal than `min_score` according to the distance `dist`.
-If there are no such elements, return an empty array. 
-
-It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances 
-(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
-
-### Examples
-```julia-repl
-julia> using StringDistances
-julia> s = "Newark"
-julia> iter = ["Newwark", "Princeton", "San Francisco"]
-julia> findall(s, iter, Levenshtein())
-1-element Array{Int64,1}:
- 1
-julia> findall(s, iter, Levenshtein(); min_score = 0.9)
-0-element Array{Int64,1}
-```
-"""
-function Base.findall(s, itr, dist::StringDistance; min_score = 0.8)
-    out = [Int[] for _ in 1:Threads.nthreads()]
-    s = _helper(dist, s)
-    # need collect since @threads requires a length method
-    Threads.@threads for i in collect(eachindex(itr))
-        score = compare(s, _helper(dist, itr[i]), dist; min_score = min_score)
-        if score >= min_score
-            push!(out[Threads.threadid()], i)
-        end
-    end
-    vcat(out...)
-end
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@ -1,87 +0,0 @@
-@doc """
-    pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
-
-Compute distances between all pairs of elements in `xs`  and `ys` according to the
-`StringDistance` `dist`. Returns a matrix R such that `R[i, j]` corrresponds to the distance between `xs[i]` and `ys[j]`.
-
-For AbstractQGramDistances preprocessing will be used either if `preprocess` is set 
-to true or if there are more than 5 elements in `xs`. Set `preprocess` to 
-false if no preprocessing should be used, regardless of length.
-
-Both symmetric and asymmetric versions are available.
-
-### Examples
-```julia-repl
-julia> using StringDistances
-julia> iter = ["New York", "Princeton"]
-julia> pairwise(Levenshtein(), iter)
-2×2 Array{Float64,2}:
- 0.0  9.0
- 9.0  0.0
-julia> iter2 = ["San Francisco"]
-julia> pairwise(Levenshtein(), iter, iter2)
-2×1 Array{Float64,2}:
- 12.0
- 10.0
-```
-"""
-function pairwise(dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
-    T = result_type(dist, eltype(xs), eltype(ys))
-    if Missing <: Union{eltype(xs), eltype(ys)}
-        T = Union{T, Missing}
-    end
-    R = Matrix{T}(undef, length(xs), length(ys))
-    pairwise!(R, dist, xs, ys; preprocess = preprocess)
-end
-
-@doc """
-    pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
-
-Compute distances between all pairs of elements in `xs` and `ys` according to the
-`StringDistance` `dist` and write the result in `R`. `R[i, j]` corresponds to the distance between `xs[i]` and `ys[j]`.
-
-For AbstractQGramDistances preprocessing will be used either if `preprocess` is set 
-to true or if there are more than 5 elements in `xs`. Set `preprocess` to 
-false if no preprocessing should be used, regardless of length.
-"""
-function pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector = xs; preprocess = nothing)
-    length(xs) == size(R, 1) || throw(DimensionMismatch("inconsistent length"))
-    length(ys) == size(R, 2) || throw(DimensionMismatch("inconsistent length"))
-    ((xs === ys) & (dist isa SemiMetric)) ?
-        _symmetric_pairwise!(R, dist, xs; preprocess = preprocess) :
-        _asymmetric_pairwise!(R, dist, xs, ys; preprocess = preprocess)
-end
-
-function _symmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector; preprocess = nothing)
-    objs = _preprocess(xs, dist, preprocess)
-    for i in 1:length(objs)
-        # handle missing
-        R[i, i] = objs[i] != objs[i]
-        Threads.@threads for j in (i+1):length(objs)
-            R[i, j] = R[j, i] = evaluate(dist, objs[i], objs[j])
-        end
-    end
-    return R
-end
-
-function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::AbstractVector, ys::AbstractVector; preprocess = nothing)
-    objsxs = _preprocess(xs, dist, preprocess)
-    objsys = xs === ys ? objsxs : _preprocess(ys, dist, preprocess)
-    for i in 1:length(objsxs)
-        Threads.@threads for j in 1:length(objsys)
-            R[i, j] = evaluate(dist, objsxs[i], objsys[j])
-        end
-    end
-    return R
-end
-
-function _preprocess(xs, dist::StringDistance, preprocess)
-    if preprocess === nothing
-        preprocess = length(xs) >= 5
-    end
-    if (dist isa AbstractQGramDistance) && preprocess
-        return fetch.(map(x -> (Threads.@spawn x === missing ? x : QGramSortedVector(x, dist.q)), xs))
-    else
-        return xs
-    end
-end
--- a/test/distances.jl
+++ b/test/distances.jl
@ -38,20 +38,31 @@ using StringDistances, Unicode, Test, Random
 		@test ismissing(evaluate(Levenshtein(), "", missing))
 	end

+	@testset "OptimalStringAlignement" begin
+		@test evaluate(OptimalStringAlignement(), "", "") == 0
+		@test evaluate(OptimalStringAlignement(), "abc", "") == 3
+		@test evaluate(OptimalStringAlignement(), "bc", "abc") == 1
+		@test evaluate(OptimalStringAlignement(), "fuor", "four") == 1
+		@test evaluate(OptimalStringAlignement(), "abcd", "acb") == 2
+		@test evaluate(OptimalStringAlignement(), "cape sand recycling ", "edith ann graham") == 17
+		@test evaluate(OptimalStringAlignement(), "jellyifhs", "jellyfish") == 2
+		@test evaluate(OptimalStringAlignement(), "ifhs", "fish") == 2
+		@test OptimalStringAlignement(2)("abcdef", "abcxyf") == 2
+
+		@test evaluate(OptimalStringAlignement(), [1, 2, 3], [1,2, 4]) == 1
+		@test evaluate(OptimalStringAlignement(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(OptimalStringAlignement(), "alborgów", "amoniak")
+		@test OptimalStringAlignement()("bc", "abc") == 1
+		@test result_type(OptimalStringAlignement(), "hello", "world") == Int
+		@inferred evaluate(OptimalStringAlignement(), "", "")
+		@test ismissing(evaluate(OptimalStringAlignement(), "", missing))
+	end
+
 	@testset "DamerauLevenshtein" begin
 		@test evaluate(DamerauLevenshtein(), "", "") == 0
-		@test evaluate(DamerauLevenshtein(), "abc", "") == 3
-		@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
-		@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1
-		@test evaluate(DamerauLevenshtein(), "abcd", "acb") == 2
-		@test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
-		@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
-		@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
-		@test DamerauLevenshtein(2)("abcdef", "abcxyf") == 2
-
-		@test evaluate(DamerauLevenshtein(), [1, 2, 3], [1,2, 4]) == 1
-		@test evaluate(DamerauLevenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(DamerauLevenshtein(), "alborgów", "amoniak")
-		@test DamerauLevenshtein()("bc", "abc") == 1
+		@test evaluate(DamerauLevenshtein(), "CA", "ABC") == 2
+		@test evaluate(DamerauLevenshtein(), "ABCDEF", "ABDCEF") == 1
+		@test evaluate(DamerauLevenshtein(), "ABCDEF", "BACDFE") == 2
+		@test evaluate(DamerauLevenshtein(), "ABCDEF", "ABCDE") == 1
 		@test result_type(DamerauLevenshtein(), "hello", "world") == Int
 		@inferred evaluate(DamerauLevenshtein(), "", "")
 		@test ismissing(evaluate(DamerauLevenshtein(), "", missing))
@ -292,7 +303,7 @@ using StringDistances, Unicode, Test, Random
 	]

 	solutions = ((Levenshtein(), [2  2  4  1  3  0  3  2  3  3  4  6 17  3  3  2]),
-			(DamerauLevenshtein(), [1  2  4  1  3  0  3  2  3  3  4  6 17  2  2  2]),
+			(OptimalStringAlignement(), [1  2  4  1  3  0  3  2  3  3  4  6 17  2  2  2]),
 			(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.2805556 0.2285714 0.48809524 0.3916667 0.07407407 0.16666667 0.21666667]),
 			(QGram(1), [0   3   3   1 3  0   6   4   5   4   4  11  14   0   0   3]),
 			(QGram(2), [  6   7   7   1 2 0   4   4   7   8   4  13  32   8   6   5]),
@ -320,13 +331,13 @@ using StringDistances, Unicode, Test, Random
 	for i in eachindex(strings)
 		d = Levenshtein()(strings[i]...)
 		@test Levenshtein(d)(strings[i]...) == d
-		d = DamerauLevenshtein()(strings[i]...)
-		@test DamerauLevenshtein(d)(strings[i]...) == d
+		d = OptimalStringAlignement()(strings[i]...)
+		@test OptimalStringAlignement(d)(strings[i]...) == d
 	end
 end

-d = DamerauLevenshtein()("abcdef", "abcxyf")
-@test DamerauLevenshtein(d)("abcdef", "abcxyf") == d
+d = OptimalStringAlignement()("abcdef", "abcxyf")
+@test OptimalStringAlignement(d)("abcdef", "abcxyf") == d



--- a/test/modifiers.jl
+++ b/test/modifiers.jl
@ -60,9 +60,9 @@ end
 	#Levenshtein
 	compare("aüa", "aua", Levenshtein())
 	@test compare("ok", missing, Levenshtein()) === missing
-	compare("aüa", "aua", DamerauLevenshtein())
-	@test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
-	@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
+	compare("aüa", "aua", OptimalStringAlignement())
+	@test StringDistances.normalize(Partial(OptimalStringAlignement()))("ab", "cde") == 1.0
+	@test compare("ab", "de", Partial(OptimalStringAlignement())) == 0

 	# RatcliffObershelp
 	@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp())  ≈ 0.0
@ -115,7 +115,7 @@ end
 	("ifhs", "fish"),
 	("leia", "leela"),
 	]
-	for dist in (Levenshtein, DamerauLevenshtein)
+	for dist in (Levenshtein, OptimalStringAlignement)
 		for i in eachindex(strings)
 			if compare(strings[i]..., dist()) <  1 / 3
 				@test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ 0.0
--- a/test/pairwise.jl
+++ b/test/pairwise.jl
@ -7,7 +7,7 @@ using StringDistances, Unicode, Test, Random
 	TestStrings1missing = ["", "abc", "bc", missing]
 	TestStrings2missing = ["mew", missing]

-	for d in [Jaro(), Levenshtein(), DamerauLevenshtein(), RatcliffObershelp(),
+	for d in [Jaro(), Levenshtein(), OptimalStringAlignement(), RatcliffObershelp(),
 				QGram(2), Cosine(2), Jaccard(2), SorensenDice(2), Overlap(2)]

 		R = pairwise(d, TestStrings1)