add back normalize for Partial/TokenSort/TokenSet

parent e0ef0e8ec1
commit 04b1902f9e
@@ -1,6 +1,6 @@
 name = "StringDistances"
 uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
-version = "0.7.1"
+version = "0.7.2"

 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

@@ -22,9 +22,9 @@ The available distances are:
 - [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
 - [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
 - Distance "modifiers" that can be applied to any distance:
-    - [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum distance between the shorter string and substrings of the longer string.
-    - [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by reording words alphabetically.
-    - [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string.
+    - [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string.
+    - [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the normalized distance of the two strings, after re-ordering words alphabetically.
+    - [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by returning the normalized distance between the intersection of two strings with each string.
     - [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) combines the normalized distance, the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. This is a good distance to match strings composed of multiple words, like addresses. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)
     - [Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) diminishes the normalized distance of strings with common prefixes. The Winkler adjustment was originally defined for the Jaro similarity score but it can be defined for any string distance.

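For readers skimming the README hunk above, here is a brief usage sketch (not taken from the diff) contrasting a plain distance with the `TokenSort` and `Partial` modifiers. It assumes the `compare(s1, s2, dist)` helper seen in the test hunk at the bottom of this commit, which returns a similarity score between 0 and 1:

```julia
using StringDistances

s1 = "New York Mets vs Atlanta Braves"
s2 = "Atlanta Braves vs New York Mets"

# Plain RatcliffObershelp is sensitive to word order.
compare(s1, s2, RatcliffObershelp())

# TokenSort sorts the words first, so these two strings become identical
# (the TokenSort docstring below reports a distance of 0.0 for this pair).
compare(s1, s2, TokenSort(RatcliffObershelp()))

# Partial compares the shorter string against same-length substrings of the longer one
# (the Partial docstring below reports a distance of roughly 0.548 for this pair).
compare(s1, s2, Partial(RatcliffObershelp()))
```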
@@ -3,12 +3,11 @@ module StringDistances
 using Distances

 include("utils.jl")
-include("distances/edit.jl")
-include("distances/qgram.jl")
-include("modifiers.jl")
+include("edit.jl")
+include("qgram.jl")
+include("normalize.jl")

-const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalized}
+const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
 # Distances API
 Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))

@@ -14,7 +14,7 @@ where ``m`` is the number of matching characters and
 struct Jaro <: SemiMetric end

 ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
-function (dist::Jaro)(s1, s2, nothing::Nothing = nothing)
+function (dist::Jaro)(s1, s2)
     ((s1 === missing) | (s2 === missing)) && return missing
     s1, s2 = reorder(s1, s2)
     len1, len2 = length(s1), length(s2)

@@ -180,7 +180,7 @@ region on either side of the longest common subsequence.
 """
 struct RatcliffObershelp <: SemiMetric end

-function (dist::RatcliffObershelp)(s1, s2, nothing::Nothing = nothing)
+function (dist::RatcliffObershelp)(s1, s2)
     ((s1 === missing) | (s2 === missing)) && return missing
     s1, s2 = reorder(s1, s2)
     n_matched = sum(last.(matching_blocks(s1, s2)))
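A small aside on the change above (not part of the diff): after dropping the dummy `nothing` argument, the basic distances are plain two-argument callables, and `missing` inputs still short-circuit as in the guard clause. A minimal sketch, assuming the exported names of this release:

```julia
using StringDistances

Jaro()("martha", "marhta")               # a Float64 distance in [0, 1]
RatcliffObershelp()("martha", "marhta")  # likewise
Jaro()(missing, "marhta")                # missing, per the guard clause above
```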

src/normalize.jl
@@ -1,9 +1,19 @@
-struct Normalized{S <: SemiMetric} <: SemiMetric
+struct Normalize{S <: SemiMetric} <: SemiMetric
     dist::S
 end

+"""
+    normalize(dist::SemiMetric)
+
+Normalize a metric, so that `evaluate` always return a Float64 between 0 and 1
+"""
+normalize(dist::SemiMetric) = Normalize(dist)
+normalize(dist::Normalize) = dist
+
+
 # A normalized distance is between 0 and 1, and accept a third argument, max_dist.
-function (dist::Normalized{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
+function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
     ((s1 === missing) | (s2 === missing)) && return missing
     s1, s2 = reorder(s1, s2)
     len1, len2 = length(s1), length(s2)
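For context (not from the diff itself): `normalize` wraps any distance in `Normalize` so that calls return a `Float64` between 0 and 1 and accept an optional `max_dist` cutoff, with anything above the cutoff reported as 1.0. A hedged sketch, using the qualified name in case `normalize` is not exported at this version:

```julia
using StringDistances

d = StringDistances.normalize(Levenshtein())
d("martha", "marhta")        # normalized distance in [0, 1] instead of a raw edit count
d("martha", "marhta", 0.1)   # with max_dist = 0.1: here 1.0, since the true value (about 0.33) exceeds the cutoff
```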
|
@@ -13,7 +23,7 @@ function (dist::Normalized{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, m
     out > max_dist ? 1.0 : out
 end

-function (dist::Normalized{<: QGramDistance})(s1, s2, max_dist = 1.0)
+function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0)
     ((s1 === missing) | (s2 === missing)) && return missing
     # When string length < q for qgram distance, returns s1 == s2
     s1, s2 = reorder(s1, s2)
@@ -27,21 +37,141 @@ function (dist::Normalized{<: QGramDistance})(s1, s2, max_dist = 1.0)
     out > max_dist ? 1.0 : out
 end

-function (dist::Normalized)(s1, s2, max_dist = 1.0)
+function (dist::Normalize)(s1, s2, max_dist = 1.0)
     out = dist.dist(s1, s2)
     out > max_dist ? 1.0 : out
 end

-"""
-    normalize(dist::SemiMetric)
-
-Normalize a metric, so that `evaluate` always return a Float64 between 0 and 1
-"""
-normalize(dist::Partial) = Partial(normalize(dist.dist))
-normalize(dist::TokenSort) = TokenSort(normalize(dist.dist))
-normalize(dist::TokenSet) = TokenSet(normalize(dist.dist))
-normalize(dist::SemiMetric) = Normalized(dist)
-normalize(dist::Normalized) = dist
+"""
+    Partial(dist)
+
+Creates the `Partial{dist}` distance.
+
+`Partial{dist}` normalizes the string distance `dist` and modify it to return the
+minimum distance between the shorter string and substrings of the longer string
+
+### Examples
+```julia-repl
+julia> s1 = "New York Mets vs Atlanta Braves"
+julia> s2 = "Atlanta Braves vs New York Mets"
+julia> evaluate(Partial(RatcliffObershelp()), s1, s2)
+0.5483870967741935
+```
+"""
+struct Partial{S <: SemiMetric} <: SemiMetric
+    dist::S
+    Partial{S}(dist::S) where {S <: SemiMetric} = new(dist)
+end
+Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
+normalize(dist::Partial) = dist
+
+function (dist::Partial)(s1, s2, max_dist = 1.0)
+    s1, s2 = reorder(s1, s2)
+    len1, len2 = length(s1), length(s2)
+    out = dist.dist(s1, s2, max_dist)
+    len1 == len2 && return out
+    len1 == 0 && return out
+    for x in qgrams(s2, len1)
+        curr = dist.dist(s1, x, max_dist)
+        out = min(out, curr)
+        max_dist = min(out, max_dist)
+    end
+    return out
+end
+
+function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0)
+    s1, s2 = reorder(s1, s2)
+    len1, len2 = length(s1), length(s2)
+    len1 == len2 && return dist.dist(s1, s2)
+    out = 1.0
+    for r in matching_blocks(s1, s2)
+        # Make sure the substring of s2 has length len1
+        s2_start = r[2] - r[1] + 1
+        s2_end = s2_start + len1 - 1
+        if s2_start < 1
+            s2_end += 1 - s2_start
+            s2_start += 1 - s2_start
+        elseif s2_end > len2
+            s2_start += len2 - s2_end
+            s2_end += len2 - s2_end
+        end
+        curr = dist.dist(s1, _slice(s2, s2_start, s2_end))
+        out = min(out, curr)
+    end
+    return out
+end
+
+"""
+    TokenSort(dist)
+
+Creates the `TokenSort{dist}` distance.
+
+`TokenSort{dist}` normalizes the string distance `dist` and modify it to adjust for differences
+in word orders by reording words alphabetically.
+
+### Examples
+```julia-repl
+julia> s1 = "New York Mets vs Atlanta Braves"
+julia> s2 = "Atlanta Braves vs New York Mets"
+julia> evaluate(TokenSort(RatcliffObershelp()), s1, s2)
+0.0
+```
+"""
+struct TokenSort{S <: SemiMetric} <: SemiMetric
+    dist::S
+    TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist)
+end
+TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist))
+normalize(dist::TokenSort) = dist
+
+# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
+function (dist::TokenSort)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
+    s1 = join(sort!(split(s1)), " ")
+    s2 = join(sort!(split(s2)), " ")
+    out = dist.dist(s1, s2, max_dist)
+end
+
+"""
+    TokenSet(dist)
+
+Creates the `TokenSet{dist}` distance.
+
+`TokenSet{dist}` normalizes the string distance `dist` and modify it to adjust for differences
+in word orders and word numbers by comparing the intersection of two strings with each string.
+
+### Examples
+```julia-repl
+julia> s1 = "New York Mets vs Atlanta"
+julia> s2 = "Atlanta Braves vs New York Mets"
+julia> evaluate(TokenSet(RatcliffObershelp()), s1, s2)
+0.0
+```
+"""
+struct TokenSet{S <: SemiMetric} <: SemiMetric
+    dist::S
+    TokenSet{S}(dist::S) where {S <: SemiMetric} = new(dist)
+end
+TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
+normalize(dist::TokenSet) = dist
+
+# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
+function (dist::TokenSet)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
+    v1 = unique!(sort!(split(s1)))
+    v2 = unique!(sort!(split(s2)))
+    v0 = intersect(v1, v2)
+    s0 = join(v0, " ")
+    s1 = join(v1, " ")
+    s2 = join(v2, " ")
+    isempty(s0) && return dist.dist(s1, s2, max_dist)
+    score_01 = dist.dist(s0, s1, max_dist)
+    max_dist = min(max_dist, score_01)
+    score_02 = dist.dist(s0, s2, max_dist)
+    max_dist = min(max_dist, score_02)
+    score_12 = dist.dist(s1, s2, max_dist)
+    min(score_01, score_02, score_12)
+end
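To illustrate the `TokenSet` logic just added (this sketch is not part of the commit): the comparison is driven by the intersection of the two word sets, so words present in only one string stop mattering once the shared words line up. The strings and the 0.0 result come from the `TokenSet` docstring above; the callable form relies on the new `dist(s1, s2)` API:

```julia
using StringDistances

s1 = "New York Mets vs Atlanta"
s2 = "Atlanta Braves vs New York Mets"

# The shared words should sort to exactly s1's sorted form, so score_01 is 0
# and the overall TokenSet distance is 0.0.
TokenSet(RatcliffObershelp())(s1, s2)   # 0.0, matching the docstring example

# The unmodified distance still penalizes the extra word "Braves".
RatcliffObershelp()(s1, s2)
```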
|
@@ -50,7 +180,7 @@ normalize(dist::Normalized) = dist

 Creates the `Winkler{dist, p, threshold, maxlength}` distance.

-`Winkler{dist, p, threshold, length)` modifies the string distance `normalize(dist)` to decrease the
+`Winkler{dist, p, threshold, length)` normalizes the string distance `dist` and modify it to decrease the
 distance between two strings, when their original distance is below some `threshold`.
 The boost is equal to `min(l, maxlength) * p * dist` where `l` denotes the
 length of their common prefix and `dist` denotes the original distance
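To make the boost formula concrete, here is a hedged sketch of the arithmetic behind the "martha"/"marhta" case that the test hunk at the end of this commit checks against 0.9611 (the constructor arguments are taken from that test; the intermediate numbers are rounded):

```julia
using StringDistances

d = Jaro()("martha", "marhta")   # Jaro distance, roughly 1 - 0.9444 = 0.0556 for this classic pair
l = 3                            # length of the common prefix "mar"
p = 0.1
boost = min(l, 4) * p * d        # min(l, maxlength) * p * dist, roughly 0.0167
d - boost                        # roughly 0.0389, i.e. a similarity of roughly 0.9611

compare("martha", "marhta", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4))  # ≈ 0.9611
```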
|
@@ -103,10 +233,7 @@ struct TokenMax{S <: SemiMetric} <: SemiMetric
     TokenMax{S}(dist::S) where {S <: SemiMetric} = new(dist)
 end

-function TokenMax(dist::SemiMetric)
-    dist = normalize(dist)
-    TokenMax{typeof(dist)}(dist)
-end
+TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
+normalize(dist::TokenMax) = dist

 function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
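As background on `TokenMax` (not part of the diff): it mixes the plain normalized distance with the `Partial`, `TokenSort` and `TokenSet` variants, with length-dependent penalties, which is why the README recommends it for multi-word strings such as addresses. The address strings below are made up for illustration:

```julia
using StringDistances

a = "120 east main street springfield"
b = "main street 120 springfield"

# TokenMax(Levenshtein()) corresponds to the fuzzywuzzy scorer mentioned in the README;
# it should usually tolerate the word reordering better than plain Levenshtein,
# though the exact scores depend on the penalty terms.
compare(a, b, TokenMax(Levenshtein()))
compare(a, b, Levenshtein())
```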
@@ -98,7 +98,7 @@ struct QGram <: QGramDistance
     q::Int
 end

-function (dist::QGram)(s1, s2, nothing::Nothing = nothing)
+function (dist::QGram)(s1, s2)
     ((s1 === missing) | (s2 === missing)) && return missing
     n = 0
     for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))

@@ -124,7 +124,7 @@ struct Cosine <: QGramDistance
     q::Int
 end

-function (dist::Cosine)(s1, s2, nothing::Nothing = nothing)
+function (dist::Cosine)(s1, s2)
     ((s1 === missing) | (s2 === missing)) && return missing
     norm1, norm2, prodnorm = 0, 0, 0
     for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))

@@ -151,7 +151,7 @@ struct Jaccard <: QGramDistance
     q::Int
 end

-function (dist::Jaccard)(s1, s2, nothing::Nothing = nothing)
+function (dist::Jaccard)(s1, s2)
     ((s1 === missing) | (s2 === missing)) && return missing
     ndistinct1, ndistinct2, nintersect = 0, 0, 0
     for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))

@@ -178,7 +178,7 @@ struct SorensenDice <: QGramDistance
     q::Int
 end

-function (dist::SorensenDice)(s1, s2, nothing::Nothing = nothing)
+function (dist::SorensenDice)(s1, s2)
     ((s1 === missing) | (s2 === missing)) && return missing
     ndistinct1, ndistinct2, nintersect = 0, 0, 0
     for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))

@@ -205,7 +205,7 @@ struct Overlap <: QGramDistance
     q::Int
 end

-function (dist::Overlap)(s1, s2, nothing::Nothing = nothing)
+function (dist::Overlap)(s1, s2)
     ((s1 === missing) | (s2 === missing)) && return missing
     ndistinct1, ndistinct2, nintersect = 0, 0, 0
     for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
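One last aside (not from the diff): the q-gram distances above now use the same two-argument call form. A small sketch of what `QGram` computes, assuming the usual definition of the q-gram distance as the sum of absolute differences of q-gram counts, which is what the `_count(qgrams(...), qgrams(...))` loops above iterate over:

```julia
using StringDistances

# 2-grams of "abc" are {"ab", "bc"}; 2-grams of "abd" are {"ab", "bd"}.
# Summing |n1 - n2| over all 2-grams gives 0 + 1 + 1 = 2.
QGram(2)("abc", "abd")    # expected 2
QGram(2)(missing, "abd")  # missing, per the guard clause
```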
@@ -23,7 +23,7 @@ using StringDistances, Unicode, Test
 compare("aüa", "aua", Levenshtein())
 compare("aüa", "aua", DamerauLevenshtein())
 @test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
-
+@test normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
 # Winkler
 @test compare("martha", "marhta", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.9611 atol = 1e-4
 @test compare("dwayne", "duane", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.84 atol = 1e-4