From 04b1902f9ed1f4c629b999283a123e7a63dc552c Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Mon, 20 Jul 2020 08:25:53 -0700 Subject: [PATCH] add back normalize for Partial/TokenSort/TokenSet --- Project.toml | 2 +- README.md | 6 +- src/StringDistances.jl | 7 +- src/{distances => }/edit.jl | 4 +- src/normalize.jl | 159 +++++++++++++++++++++++++++++++---- src/{distances => }/qgram.jl | 10 +-- test/modifiers.jl | 2 +- 7 files changed, 158 insertions(+), 32 deletions(-) rename src/{distances => }/edit.jl (98%) rename src/{distances => }/qgram.jl (94%) diff --git a/Project.toml b/Project.toml index 2977cc5..fc88258 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "StringDistances" uuid = "88034a9c-02f8-509d-84a9-84ec65e18404" -version = "0.7.1" +version = "0.7.2" [deps] Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" diff --git a/README.md b/README.md index 4ee4133..f02b1b6 100644 --- a/README.md +++ b/README.md @@ -22,9 +22,9 @@ The available distances are: - [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)` - [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)` - Distance "modifiers" that can be applied to any distance: - - [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum distance between the shorter string and substrings of the longer string. - - [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by reording words alphabetically. - - [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string. 
+ - [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string. + - [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the normalized distance of the two strings, after re-ordering words alphabetically. + - [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by returning the normalized distance between the intersection of two strings with each string. - [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) combines the normalized distance, the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. This is a good distance to match strings composed of multiple words, like addresses. `TokenMax(Levenshtein())` corresponds to the distance defined in [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) - [Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) diminishes the normalized distance of strings with common prefixes. The Winkler adjustment was originally defined for the Jaro similarity score but it can be defined for any string distance. 
diff --git a/src/StringDistances.jl b/src/StringDistances.jl index d5b40e0..b5d7b92 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -3,12 +3,11 @@ module StringDistances using Distances include("utils.jl") -include("distances/edit.jl") -include("distances/qgram.jl") -include("modifiers.jl") +include("edit.jl") +include("qgram.jl") include("normalize.jl") -const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalized} +const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize} # Distances API Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", "")) diff --git a/src/distances/edit.jl b/src/edit.jl similarity index 98% rename from src/distances/edit.jl rename to src/edit.jl index e355570..7c572ab 100755 --- a/src/distances/edit.jl +++ b/src/edit.jl @@ -14,7 +14,7 @@ where ``m`` is the number of matching characters and struct Jaro <: SemiMetric end ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html -function (dist::Jaro)(s1, s2, nothing::Nothing = nothing) +function (dist::Jaro)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) @@ -180,7 +180,7 @@ region on either side of the longest common subsequence. 
""" struct RatcliffObershelp <: SemiMetric end -function (dist::RatcliffObershelp)(s1, s2, nothing::Nothing = nothing) +function (dist::RatcliffObershelp)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing s1, s2 = reorder(s1, s2) n_matched = sum(last.(matching_blocks(s1, s2))) diff --git a/src/normalize.jl b/src/normalize.jl index d7259e0..7525a4c 100755 --- a/src/normalize.jl +++ b/src/normalize.jl @@ -1,9 +1,19 @@ -struct Normalized{S <: SemiMetric} <: SemiMetric +struct Normalize{S <: SemiMetric} <: SemiMetric dist::S end + +""" + normalize(dist::SemiMetric) + + Normalize a metric, so that `evaluate` always return a Float64 between 0 and 1 +""" +normalize(dist::SemiMetric) = Normalize(dist) +normalize(dist::Normalize) = dist + + # A normalized distance is between 0 and 1, and accept a third argument, max_dist. -function (dist::Normalized{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0) +function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0) ((s1 === missing) | (s2 === missing)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) @@ -13,7 +23,7 @@ function (dist::Normalized{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, m out > max_dist ? 1.0 : out end -function (dist::Normalized{<: QGramDistance})(s1, s2, max_dist = 1.0) +function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0) ((s1 === missing) | (s2 === missing)) && return missing # When string length < q for qgram distance, returns s1 == s2 s1, s2 = reorder(s1, s2) @@ -27,21 +37,141 @@ function (dist::Normalized{<: QGramDistance})(s1, s2, max_dist = 1.0) out > max_dist ? 1.0 : out end -function (dist::Normalized)(s1, s2, max_dist = 1.0) +function (dist::Normalize)(s1, s2, max_dist = 1.0) out = dist.dist(s1, s2) out > max_dist ? 
1.0 : out end """ - normalize(dist::SemiMetric) + Partial(dist) - Normalize a metric, so that `evaluate` always return a Float64 between 0 and 1 +Creates the `Partial{dist}` distance. + +`Partial{dist}` normalizes the string distance `dist` and modify it to return the +minimum distance between the shorter string and substrings of the longer string + +### Examples +```julia-repl +julia> s1 = "New York Mets vs Atlanta Braves" +julia> s2 = "Atlanta Braves vs New York Mets" +julia> evaluate(Partial(RatcliffObershelp()), s1, s2) +0.5483870967741935 +``` """ -normalize(dist::Partial) = Partial(normalize(dist.dist)) -normalize(dist::TokenSort) = TokenSort(normalize(dist.dist)) -normalize(dist::TokenSet) = TokenSet(normalize(dist.dist)) -normalize(dist::SemiMetric) = Normalized(dist) -normalize(dist::Normalized) = dist +struct Partial{S <: SemiMetric} <: SemiMetric + dist::S + Partial{S}(dist::S) where {S <: SemiMetric} = new(dist) +end +Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist)) +normalize(dist::Partial) = dist + +function (dist::Partial)(s1, s2, max_dist = 1.0) + s1, s2 = reorder(s1, s2) + len1, len2 = length(s1), length(s2) + out = dist.dist(s1, s2, max_dist) + len1 == len2 && return out + len1 == 0 && return out + for x in qgrams(s2, len1) + curr = dist.dist(s1, x, max_dist) + out = min(out, curr) + max_dist = min(out, max_dist) + end + return out +end + +function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0) + s1, s2 = reorder(s1, s2) + len1, len2 = length(s1), length(s2) + len1 == len2 && return dist.dist(s1, s2) + out = 1.0 + for r in matching_blocks(s1, s2) + # Make sure the substring of s2 has length len1 + s2_start = r[2] - r[1] + 1 + s2_end = s2_start + len1 - 1 + if s2_start < 1 + s2_end += 1 - s2_start + s2_start += 1 - s2_start + elseif s2_end > len2 + s2_start += len2 - s2_end + s2_end += len2 - s2_end + end + curr = dist.dist(s1, _slice(s2, s2_start, s2_end)) + out = min(out, curr) + end + return out +end + 
+""" + TokenSort(dist) + +Creates the `TokenSort{dist}` distance. + +`TokenSort{dist}` normalizes the string distance `dist` and modify it to adjust for differences +in word orders by reording words alphabetically. + +### Examples +```julia-repl +julia> s1 = "New York Mets vs Atlanta Braves" +julia> s1 = "New York Mets vs Atlanta Braves" +julia> s2 = "Atlanta Braves vs New York Mets" +julia> evaluate(TokenSort(RatcliffObershelp()), s1, s2) +0.0 +``` +""" +struct TokenSort{S <: SemiMetric} <: SemiMetric + dist::S + TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist) +end +TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist)) +normalize(dist::TokenSort) = dist + +# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ +function (dist::TokenSort)(s1::AbstractString, s2::AbstractString, max_dist = 1.0) + s1 = join(sort!(split(s1)), " ") + s2 = join(sort!(split(s2)), " ") + out = dist.dist(s1, s2, max_dist) +end + +""" + TokenSet(dist) + +Creates the `TokenSet{dist}` distance. + +`TokenSet{dist}` normalizes the string distance `dist` and modify it to adjust for differences +in word orders and word numbers by comparing the intersection of two strings with each string. 
+ +### Examples +```julia-repl +julia> s1 = "New York Mets vs Atlanta" +julia> s2 = "Atlanta Braves vs New York Mets" +julia> evaluate(TokenSet(RatcliffObershelp()), s1, s2) +0.0 +``` +""" +struct TokenSet{S <: SemiMetric} <: SemiMetric + dist::S + TokenSet{S}(dist::S) where {S <: SemiMetric} = new(dist) +end +TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist)) +normalize(dist::TokenSet) = dist + +# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ +function (dist::TokenSet)(s1::AbstractString, s2::AbstractString, max_dist = 1.0) + v1 = unique!(sort!(split(s1))) + v2 = unique!(sort!(split(s2))) + v0 = intersect(v1, v2) + s0 = join(v0, " ") + s1 = join(v1, " ") + s2 = join(v2, " ") + isempty(s0) && return dist.dist(s1, s2, max_dist) + score_01 = dist.dist(s0, s1, max_dist) + max_dist = min(max_dist, score_01) + score_02 = dist.dist(s0, s2, max_dist) + max_dist = min(max_dist, score_02) + score_12 = dist.dist(s1, s2, max_dist) + min(score_01, score_02, score_12) +end + @@ -50,7 +180,7 @@ normalize(dist::Normalized) = dist Creates the `Winkler{dist, p, threshold, maxlength}` distance. -`Winkler{dist, p, threshold, length)` modifies the string distance `normalize(dist)` to decrease the +`Winkler{dist, p, threshold, maxlength}` normalizes the string distance `dist` and modifies it to decrease the distance between two strings, when their original distance is below some `threshold`. 
The boost is equal to `min(l, maxlength) * p * dist` where `l` denotes the length of their common prefix and `dist` denotes the original distance @@ -103,10 +233,7 @@ struct TokenMax{S <: SemiMetric} <: SemiMetric TokenMax{S}(dist::S) where {S <: SemiMetric} = new(dist) end -function TokenMax(dist::SemiMetric) - dist = normalize(dist) - TokenMax{typeof(dist)}(dist) -end +TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist)) normalize(dist::TokenMax) = dist function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0) diff --git a/src/distances/qgram.jl b/src/qgram.jl similarity index 94% rename from src/distances/qgram.jl rename to src/qgram.jl index 03a716e..edc518d 100755 --- a/src/distances/qgram.jl +++ b/src/qgram.jl @@ -98,7 +98,7 @@ struct QGram <: QGramDistance q::Int end -function (dist::QGram)(s1, s2, nothing::Nothing = nothing) +function (dist::QGram)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing n = 0 for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) @@ -124,7 +124,7 @@ struct Cosine <: QGramDistance q::Int end -function (dist::Cosine)(s1, s2, nothing::Nothing = nothing) +function (dist::Cosine)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing norm1, norm2, prodnorm = 0, 0, 0 for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) @@ -151,7 +151,7 @@ struct Jaccard <: QGramDistance q::Int end -function (dist::Jaccard)(s1, s2, nothing::Nothing = nothing) +function (dist::Jaccard)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing ndistinct1, ndistinct2, nintersect = 0, 0, 0 for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) @@ -178,7 +178,7 @@ struct SorensenDice <: QGramDistance q::Int end -function (dist::SorensenDice)(s1, s2, nothing::Nothing = nothing) +function (dist::SorensenDice)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing ndistinct1, ndistinct2, nintersect = 0, 0, 0 for (n1, n2) in _count(qgrams(s1, 
dist.q), qgrams(s2, dist.q)) @@ -205,7 +205,7 @@ struct Overlap <: QGramDistance q::Int end -function (dist::Overlap)(s1, s2, nothing::Nothing = nothing) +function (dist::Overlap)(s1, s2) ((s1 === missing) | (s2 === missing)) && return missing ndistinct1, ndistinct2, nintersect = 0, 0, 0 for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q)) diff --git a/test/modifiers.jl b/test/modifiers.jl index 1f3b3ba..8ae6ca0 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -23,7 +23,7 @@ using StringDistances, Unicode, Test compare("aüa", "aua", Levenshtein()) compare("aüa", "aua", DamerauLevenshtein()) @test compare("ab", "de", Partial(DamerauLevenshtein())) == 0 - + @test normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0 # Winkler @test compare("martha", "marhta", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.9611 atol = 1e-4 @test compare("dwayne", "duane", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.84 atol = 1e-4