7 changed files

@ -1,6 +1,6 @@
name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.7.1"
version = "0.7.2"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

The available distances are:
- [Overlap Distance]( `Overlap(q::Int)`
- [Sorensen-Dice Distance]( `SorensenDice(q::Int)`
- Distance "modifiers" that can be applied to any distance:
- [Partial]( returns the minimum distance between the shorter string and substrings of the longer string.
- [TokenSort]( adjusts for differences in word orders by reordering words alphabetically.
- [TokenSet]( adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string.
- [Partial]( returns the minimum of the normalized distance between the shorter string and substrings of the longer string.
- [TokenSort]( adjusts for differences in word orders by returning the normalized distance of the two strings, after re-ordering words alphabetically.
- [TokenSet]( adjusts for differences in word orders and word numbers by returning the normalized distance between the intersection of two strings with each string.
- [TokenMax]( combines the normalized distance, the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. This is a good distance to match strings composed of multiple words, like addresses. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](
- [Winkler]( diminishes the normalized distance of strings with common prefixes. The Winkler adjustment was originally defined for the Jaro similarity score but it can be defined for any string distance.

@ -3,12 +3,11 @@ module StringDistances
using Distances
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalized}
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
# Distances API
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))

@ -14,7 +14,7 @@ where ``m`` is the number of matching characters and
struct Jaro <: SemiMetric end
function (dist::Jaro)(s1, s2, nothing::Nothing = nothing)
function (dist::Jaro)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@ -180,7 +180,7 @@ region on either side of the longest common subsequence.
struct RatcliffObershelp <: SemiMetric end
function (dist::RatcliffObershelp)(s1, s2, nothing::Nothing = nothing)
function (dist::RatcliffObershelp)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
n_matched = sum(last.(matching_blocks(s1, s2)))

@ -1,9 +1,19 @@
struct Normalized{S <: SemiMetric} <: SemiMetric
struct Normalize{S <: SemiMetric} <: SemiMetric
Normalize a metric, so that `evaluate` always returns a Float64 between 0 and 1
normalize(dist::SemiMetric) = Normalize(dist)
normalize(dist::Normalize) = dist
# A normalized distance is between 0 and 1, and accepts a third argument, max_dist.
function (dist::Normalized{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@ -13,7 +23,7 @@ function (dist::Normalized{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, m
out > max_dist ? 1.0 : out
function (dist::Normalized{<: QGramDistance})(s1, s2, max_dist = 1.0)
function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0)
((s1 === missing) | (s2 === missing)) && return missing
# When string length < q for qgram distance, returns s1 == s2
s1, s2 = reorder(s1, s2)
@ -27,21 +37,141 @@ function (dist::Normalized{<: QGramDistance})(s1, s2, max_dist = 1.0)
out > max_dist ? 1.0 : out
function (dist::Normalized)(s1, s2, max_dist = 1.0)
function (dist::Normalize)(s1, s2, max_dist = 1.0)
out = dist.dist(s1, s2)
out > max_dist ? 1.0 : out
Normalize a metric, so that `evaluate` always returns a Float64 between 0 and 1
Creates the `Partial{dist}` distance.
`Partial{dist}` normalizes the string distance `dist` and modifies it to return the
minimum distance between the shorter string and substrings of the longer string
### Examples
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> evaluate(Partial(RatcliffObershelp()), s1, s2)
normalize(dist::Partial) = Partial(normalize(dist.dist))
normalize(dist::TokenSort) = TokenSort(normalize(dist.dist))
normalize(dist::TokenSet) = TokenSet(normalize(dist.dist))
normalize(dist::SemiMetric) = Normalized(dist)
normalize(dist::Normalized) = dist
struct Partial{S <: SemiMetric} <: SemiMetric
Partial{S}(dist::S) where {S <: SemiMetric} = new(dist)
Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
normalize(dist::Partial) = dist
# Functor for `Partial`: minimum of the wrapped distance between s1 and s2 and
# between s1 and each piece of s2 produced by `qgrams(s2, len1)`.
# `max_dist` is forwarded to every call of the wrapped distance.
function (dist::Partial)(s1, s2, max_dist = 1.0)
# `reorder` is defined elsewhere; presumably it puts the shorter string first — confirm.
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
# Baseline: wrapped distance on the two full strings.
out = dist.dist(s1, s2, max_dist)
# Equal lengths, or an empty s1: no scan is performed and the baseline is returned.
len1 == len2 && return out
len1 == 0 && return out
# `qgrams` is defined elsewhere; presumably yields the length-`len1` substrings of s2 — confirm.
for x in qgrams(s2, len1)
curr = dist.dist(s1, x, max_dist)
out = min(out, curr)
# Shrink the cutoff handed to later comparisons to the best value found so far.
max_dist = min(out, max_dist)
return out
# Specialisation of the `Partial` functor for `RatcliffObershelp`: instead of scanning
# every window of s2, only windows aligned on the blocks returned by `matching_blocks`
# are compared.
# NOTE(review): `max_dist` is accepted but not used in the lines shown here.
# NOTE(review): the outer constructor `Partial(dist)` stores `normalize(dist)`; if that
# wraps `RatcliffObershelp` in `Normalize`, the signature `Partial{RatcliffObershelp}`
# would not match such instances — confirm this method is still reachable.
function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0)
# `reorder` is defined elsewhere; presumably it puts the shorter string first — confirm.
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
# Same length: a single direct comparison is enough.
len1 == len2 && return dist.dist(s1, s2)
out = 1.0
# presumably r[1] / r[2] are the block's start positions in s1 / s2 — confirm
# against `matching_blocks`.
for r in matching_blocks(s1, s2)
# Make sure the substring of s2 has length len1
s2_start = r[2] - r[1] + 1
s2_end = s2_start + len1 - 1
# Window would begin before position 1: shift it right so it starts at 1.
if s2_start < 1
s2_end += 1 - s2_start
s2_start += 1 - s2_start
# Window would run past the end of s2: shift it left so it ends at len2.
elseif s2_end > len2
s2_start += len2 - s2_end
s2_end += len2 - s2_end
# `_slice` is defined elsewhere; presumably extracts s2[s2_start..s2_end] — confirm.
curr = dist.dist(s1, _slice(s2, s2_start, s2_end))
out = min(out, curr)
return out
Creates the `TokenSort{dist}` distance.
`TokenSort{dist}` normalizes the string distance `dist` and modifies it to adjust for differences
in word orders by reordering words alphabetically.
### Examples
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> evaluate(TokenSort(RatcliffObershelp()), s1, s2)
struct TokenSort{S <: SemiMetric} <: SemiMetric
TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist)
TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist))
normalize(dist::TokenSort) = dist
# Functor for `TokenSort`: compares the two strings after putting their words in sorted order.
# `max_dist` is forwarded unchanged to the wrapped distance.
function (dist::TokenSort)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
# Split on whitespace, sort the tokens, and rejoin them with single spaces.
s1 = join(sort!(split(s1)), " ")
s2 = join(sort!(split(s2)), " ")
# Wrapped distance on the re-ordered strings.
out = dist.dist(s1, s2, max_dist)
Creates the `TokenSet{dist}` distance.
`TokenSet{dist}` normalizes the string distance `dist` and modifies it to adjust for differences
in word orders and word numbers by comparing the intersection of two strings with each string.
### Examples
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> evaluate(TokenSet(RatcliffObershelp()), s1, s2)
struct TokenSet{S <: SemiMetric} <: SemiMetric
TokenSet{S}(dist::S) where {S <: SemiMetric} = new(dist)
TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
normalize(dist::TokenSet) = dist
# Functor for `TokenSet`: compares the words shared by both strings with each string's
# own word set, and the two word sets with each other, keeping the smallest score.
function (dist::TokenSet)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
# Sorted, de-duplicated whitespace-separated tokens of each input.
v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2)))
# Tokens present in both inputs.
v0 = intersect(v1, v2)
# Rebuild single-space-separated strings from the token lists.
s0 = join(v0, " ")
s1 = join(v1, " ")
s2 = join(v2, " ")
# No shared token: only the direct comparison of the two rebuilt strings is made.
isempty(s0) && return dist.dist(s1, s2, max_dist)
# Each score tightens the cutoff passed to the next comparison.
score_01 = dist.dist(s0, s1, max_dist)
max_dist = min(max_dist, score_01)
score_02 = dist.dist(s0, s2, max_dist)
max_dist = min(max_dist, score_02)
score_12 = dist.dist(s1, s2, max_dist)
# Take the smallest of the three scores.
min(score_01, score_02, score_12)
@ -50,7 +180,7 @@ normalize(dist::Normalized) = dist
Creates the `Winkler{dist, p, threshold, maxlength}` distance.
`Winkler{dist, p, threshold, maxlength}` modifies the string distance `normalize(dist)` to decrease the
`Winkler{dist, p, threshold, maxlength}` normalizes the string distance `dist` and modifies it to decrease the
distance between two strings, when their original distance is below some `threshold`.
The boost is equal to `min(l, maxlength) * p * dist` where `l` denotes the
length of their common prefix and `dist` denotes the original distance
@ -103,10 +233,7 @@ struct TokenMax{S <: SemiMetric} <: SemiMetric
TokenMax{S}(dist::S) where {S <: SemiMetric} = new(dist)
function TokenMax(dist::SemiMetric)
dist = normalize(dist)
TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
normalize(dist::TokenMax) = dist
function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)

@ -98,7 +98,7 @@ struct QGram <: QGramDistance
function (dist::QGram)(s1, s2, nothing::Nothing = nothing)
function (dist::QGram)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
n = 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@ -124,7 +124,7 @@ struct Cosine <: QGramDistance
function (dist::Cosine)(s1, s2, nothing::Nothing = nothing)
function (dist::Cosine)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@ -151,7 +151,7 @@ struct Jaccard <: QGramDistance
function (dist::Jaccard)(s1, s2, nothing::Nothing = nothing)
function (dist::Jaccard)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@ -178,7 +178,7 @@ struct SorensenDice <: QGramDistance
function (dist::SorensenDice)(s1, s2, nothing::Nothing = nothing)
function (dist::SorensenDice)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@ -205,7 +205,7 @@ struct Overlap <: QGramDistance
function (dist::Overlap)(s1, s2, nothing::Nothing = nothing)
function (dist::Overlap)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))

@ -23,7 +23,7 @@ using StringDistances, Unicode, Test
compare("aüa", "aua", Levenshtein())
compare("aüa", "aua", DamerauLevenshtein())
@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
@test normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
# Winkler
# Winkler-adjusted Jaro similarity scores; compared approximately (`≈` with `atol`)
# because the expected values are rounded to a few decimals.
@test compare("martha", "marhta", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.9611 atol = 1e-4
@test compare("dwayne", "duane", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.84 atol = 1e-4