Rename `OptimalStringAlignement` to `OptimalStringAlignment` (#57)

The former is a misspelling of the latter: note the extra 'e' in
"alignement". In renaming this, I've used `Base.@deprecate_binding`, which
correctly handles deprecations for types and also exports the old name.

I've also corrected two issues with the docstring for this type:
- The whole docstring was indented by 4 spaces, which Markdown
  interprets as a code block, so the whole docstring was code-quoted.
- The docstring erroneously said that OSA is the unrestricted D-L
  distance, but it's actually the restricted D-L distance.
pull/59/head
Alex Arslan 2021-12-02 10:23:41 -08:00 committed by GitHub
parent 71b4e42ead
commit c149761927
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 59 additions and 57 deletions

View File

@ -12,7 +12,7 @@ The available distances are:
- Hamming Distance `Hamming() <: SemiMetric`
- [Jaro and Jaro-Winkler Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()` `JaroWinkler() <: SemiMetric`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein() <: Metric`
- [Optimal String Alignement Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignement() <: SemiMetric`
- [Optimal String Alignment Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance) (a.k.a. restricted Damerau-Levenshtein) `OptimalStringAlignment() <: SemiMetric`
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions) `DamerauLevenshtein() <: Metric`
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp() <: SemiMetric`
- Q-gram distances compare the set of all substrings of length `q` in each string (and which
@ -72,7 +72,7 @@ The package also adds convenience functions to find elements in a iterator of st
findall(s, itr, dist; min_score = 0.8)
```
The functions `findnearest` and `findall` are particularly optimized for the `Levenshtein` and `OptimalStringAlignement` distances, as these algorithm can stop early if the distance becomes higher than a certain threshold.
The functions `findnearest` and `findall` are particularly optimized for the `Levenshtein` and `OptimalStringAlignment` distances, as these algorithms can stop early if the distance becomes higher than a certain threshold.

View File

@ -21,9 +21,9 @@ end
# 0.33s
@time f(Levenshtein(), x, y, min_score = 0.8);
# 0.11
@time f(OptimalStringAlignement(), x, y);
@time f(OptimalStringAlignment(), x, y);
# 0.44s.
@time f(OptimalStringAlignement(), x, y, min_score = 0.8);
@time f(OptimalStringAlignment(), x, y, min_score = 0.8);
# 0.08
@time f(DamerauLevenshtein(), x, y);
# 0.8s
@ -35,7 +35,7 @@ end
@time findnearest(x[1], y, Levenshtein());
# 0.1
@time findnearest(x[1], y, OptimalStringAlignement());
@time findnearest(x[1], y, OptimalStringAlignment());
# 0.1
@time findnearest(x[1], y, QGram(2));
# 0.75
@ -44,17 +44,17 @@ end
@time findall(x[1], y, Levenshtein());
# 0.05
@time findall(x[1], y, OptimalStringAlignement());
@time findall(x[1], y, OptimalStringAlignment());
# 0.05
@time findall(x[1], y, Partial(OptimalStringAlignement()));
@time findall(x[1], y, Partial(OptimalStringAlignment()));
# 0.96
@time findall(x[1], y, QGram(2));
# 0.81
@time findall(x[1], y, TokenSort(OptimalStringAlignement()));
@time findall(x[1], y, TokenSort(OptimalStringAlignment()));
# 0.27 (now 0.32)
@time findall(x[1], y, TokenSet(OptimalStringAlignement()));
@time findall(x[1], y, TokenSet(OptimalStringAlignment()));
# 0.55
@time findall(x[1], y, TokenMax(OptimalStringAlignement()));
@time findall(x[1], y, TokenMax(OptimalStringAlignment()));
# 2.25 (now 3.6)

View File

@ -42,7 +42,7 @@ Hamming,
Jaro,
JaroWinkler,
Levenshtein,
OptimalStringAlignement,
OptimalStringAlignment,
DamerauLevenshtein,
RatcliffObershelp,
# Qgram distances

View File

@ -165,25 +165,25 @@ function (dist::Levenshtein)(s1, s2; max_dist::Union{Integer, Nothing} = nothing
end
"""
OptimalStringAlignement()
OptimalStringAlignment()
Creates the OptimalStringAlignement distance (also known ad the unrestricted DamerauLevenshtein distance).
Creates the OptimalStringAlignment distance (also known as the restricted DamerauLevenshtein distance).
It is the minimum number of operations (consisting of insertions,
deletions or substitutions of a single character, or transposition of two adjacent characters)
required to change one string into the other.
It is the minimum number of operations (consisting of insertions,
deletions or substitutions of a single character, or transposition of two adjacent characters)
required to change one string into the other.
The distance differs slightly from the Damerau-Levenshtein algorithm by imposing
the restriction that no substring is edited more than once. So for example, "CA" to "ABC" has an edit
distance of 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy
the triangle inequality.
The distance differs slightly from the Damerau-Levenshtein algorithm by imposing
the restriction that no substring is edited more than once. So for example, "CA" to "ABC" has an edit
distance of 2 by a complete application of Damerau-Levenshtein, but a distance of 3 by this method that
uses the optimal string alignment algorithm. In particular, the restricted distance does not satisfy
the triangle inequality.
"""
struct OptimalStringAlignement <: StringSemiMetric end
struct OptimalStringAlignment <: StringSemiMetric end
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
# Return max_dist + 1 if distance higher than max_dist
function (dist::OptimalStringAlignement)(s1, s2; max_dist::Union{Integer, Nothing} = nothing)
function (dist::OptimalStringAlignment)(s1, s2; max_dist::Union{Integer, Nothing} = nothing)
(s1 === missing) | (s2 === missing) && return missing
len1, len2 = length(s1), length(s2)
if len1 > len2
@ -246,6 +246,8 @@ function (dist::OptimalStringAlignement)(s1, s2; max_dist::Union{Integer, Nothin
return Int(current)
end
Base.@deprecate_binding OptimalStringAlignement OptimalStringAlignment
"""
DamerauLevenshtein()
@ -370,4 +372,4 @@ function longest_common_pattern!(p, s1, s2, start1, start2, end1, end2)
end
end
return j1, j2, len
end
end

View File

@ -38,7 +38,7 @@ function (dist::Normalized{<:Union{Hamming, DamerauLevenshtein}})(s1, s2; max_di
return out
end
function (dist::Normalized{<:Union{Levenshtein, OptimalStringAlignement}})(s1, s2; max_dist = 1.0)
function (dist::Normalized{<:Union{Levenshtein, OptimalStringAlignment}})(s1, s2; max_dist = 1.0)
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@ -66,4 +66,4 @@ function (dist::Normalized{<:AbstractQGramDistance})(s1, s2; max_dist = 1.0)
end
max_dist !== nothing && out > max_dist && return 1.0
return out
end
end

View File

@ -41,28 +41,28 @@ using StringDistances, Unicode, Test, Random
@test ismissing(Levenshtein()("", missing))
end
@testset "OptimalStringAlignement" begin
@test OptimalStringAlignement()("", "") == 0
@test OptimalStringAlignement()("abc", "") == 3
@test OptimalStringAlignement()("bc", "abc") == 1
@test OptimalStringAlignement()("fuor", "four") == 1
@test OptimalStringAlignement()("abcd", "acb") == 2
@test OptimalStringAlignement()("cape sand recycling ", "edith ann graham") == 17
@test OptimalStringAlignement()("jellyifhs", "jellyfish") == 2
@test OptimalStringAlignement()("ifhs", "fish") == 2
@test OptimalStringAlignement()("a cat", "an act") == 2
@test OptimalStringAlignement()("a cat", "an abct") == 4
@test OptimalStringAlignement()("a cat", "a tc") == 3
@test OptimalStringAlignement()("abcdef", "abcxyf") == 2
@test OptimalStringAlignement()("abcdef", "abcxyf"; max_dist = 2) == 2
@testset "OptimalStringAlignment" begin
@test OptimalStringAlignment()("", "") == 0
@test OptimalStringAlignment()("abc", "") == 3
@test OptimalStringAlignment()("bc", "abc") == 1
@test OptimalStringAlignment()("fuor", "four") == 1
@test OptimalStringAlignment()("abcd", "acb") == 2
@test OptimalStringAlignment()("cape sand recycling ", "edith ann graham") == 17
@test OptimalStringAlignment()("jellyifhs", "jellyfish") == 2
@test OptimalStringAlignment()("ifhs", "fish") == 2
@test OptimalStringAlignment()("a cat", "an act") == 2
@test OptimalStringAlignment()("a cat", "an abct") == 4
@test OptimalStringAlignment()("a cat", "a tc") == 3
@test OptimalStringAlignment()("abcdef", "abcxyf") == 2
@test OptimalStringAlignment()("abcdef", "abcxyf"; max_dist = 2) == 2
prefix = "my_prefix"
@test OptimalStringAlignement()(prefix * "alborgów", prefix * "amoniak") == OptimalStringAlignement()("alborgów", "amoniak")
@test OptimalStringAlignement()([1, 2, 3], [1,2, 4]) == 1
@test OptimalStringAlignement()(graphemes("alborgów"), graphemes("amoniak")) == OptimalStringAlignement()("alborgów", "amoniak")
@test OptimalStringAlignement()("bc", "abc") == 1
@test result_type(OptimalStringAlignement(), "hello", "world") == Int
@inferred OptimalStringAlignement()("", "")
@test ismissing(OptimalStringAlignement()("", missing))
@test OptimalStringAlignment()(prefix * "alborgów", prefix * "amoniak") == OptimalStringAlignment()("alborgów", "amoniak")
@test OptimalStringAlignment()([1, 2, 3], [1,2, 4]) == 1
@test OptimalStringAlignment()(graphemes("alborgów"), graphemes("amoniak")) == OptimalStringAlignment()("alborgów", "amoniak")
@test OptimalStringAlignment()("bc", "abc") == 1
@test result_type(OptimalStringAlignment(), "hello", "world") == Int
@inferred OptimalStringAlignment()("", "")
@test ismissing(OptimalStringAlignment()("", missing))
end
@testset "DamerauLevenshtein" begin
@ -316,7 +316,7 @@ using StringDistances, Unicode, Test, Random
]
solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
(OptimalStringAlignement(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
(OptimalStringAlignment(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.2805556 0.2285714 0.48809524 0.3916667 0.07407407 0.16666667 0.21666667]),
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
@ -344,8 +344,8 @@ using StringDistances, Unicode, Test, Random
for i in eachindex(strings)
d = Levenshtein()(strings[i]...)
@test Levenshtein()(strings[i]...; max_dist = d) == d
d = OptimalStringAlignement()(strings[i]...)
@test OptimalStringAlignement()(strings[i]...; max_dist = d) == d
d = OptimalStringAlignment()(strings[i]...)
@test OptimalStringAlignment()(strings[i]...; max_dist = d) == d
end
end

View File

@ -60,9 +60,9 @@ end
#Levenshtein
compare("aüa", "aua", Levenshtein())
@test compare("ok", missing, Levenshtein()) === missing
compare("aüa", "aua", OptimalStringAlignement())
@test StringDistances.Normalized(Partial(OptimalStringAlignement()))("ab", "cde") == 1.0
@test compare("ab", "de", Partial(OptimalStringAlignement())) == 0
compare("aüa", "aua", OptimalStringAlignment())
@test StringDistances.Normalized(Partial(OptimalStringAlignment()))("ab", "cde") == 1.0
@test compare("ab", "de", Partial(OptimalStringAlignment())) == 0
# RatcliffObershelp
@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0
@ -115,7 +115,7 @@ end
("ifhs", "fish"),
("leia", "leela"),
]
for dist in (Levenshtein, OptimalStringAlignement)
for dist in (Levenshtein, OptimalStringAlignment)
for i in eachindex(strings)
if compare(strings[i]..., dist()) < 1 / 3
@test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ 0.0
@ -150,4 +150,4 @@ end
@test findall("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == [1]
@test findall("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == []
end
end
end

View File

@ -7,7 +7,7 @@ using StringDistances, Unicode, Test, Random
TestStrings1missing = ["", "abc", "bc", missing]
TestStrings2missing = ["mew", missing]
for d in [Jaro(), Levenshtein(), OptimalStringAlignement(), RatcliffObershelp(),
for d in [Jaro(), Levenshtein(), OptimalStringAlignment(), RatcliffObershelp(),
QGram(2), Cosine(2), Jaccard(2), SorensenDice(2), Overlap(2)]
R = pairwise(d, TestStrings1)
@ -82,4 +82,4 @@ using StringDistances, Unicode, Test, Random
R5 = pairwise(d, TestStrings1missing; preprocess = true)
@test eltype(R5) == Union{result_type(d, String, String), Missing}
end
end
end