add functors

matthieugomez 2020-02-12 09:41:46 -05:00
parent 1ccef94f1a
commit 4806349088
7 changed files with 65 additions and 47 deletions
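
In short, the change makes each distance type callable (a functor): the computation moves into the call operator `(dist::SomeDistance)(s1, s2, ...)`, and `Distances.evaluate` becomes a one-line forwarder. A minimal, self-contained sketch of the pattern (the `MyDist` type and its Hamming-style count are illustrative only, not part of the package):

```julia
# Sketch of the functor pattern adopted in this commit (illustrative only).
struct MyDist end

# The computation lives in the call operator ...
(dist::MyDist)(s1, s2) = count(((a, b),) -> a != b, zip(s1, s2))

# ... and `evaluate` simply forwards to it, mirroring
# `Distances.evaluate(dist::StringDistance, args...) = dist(args...)` below.
evaluate(dist::MyDist, args...) = dist(args...)

MyDist()("martha", "marhta")            # 2
evaluate(MyDist(), "martha", "marhta")  # 2, via the forwarder
```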


@@ -50,6 +50,12 @@ evaluate(TokenMax(RatcliffObershelp()), "martha", "marhta")
A good distance to match strings composed of multiple words (like addresses) is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).
Alternatively, each distance struct can be used as a callable, which is equivalent to calling `evaluate` with that metric or modified metric, for example:
```julia
Jaro()("martha", "marhta")
Winkler(Jaro())("martha", "marhta")
QGram(2)("martha", "marhta")
```
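With this change, the callable form and `evaluate` are interchangeable; a quick hedged check (expected to hold, not re-run here):
```julia
# Both spellings should return the same value, since `evaluate` now forwards
# to the functor call.
evaluate(Jaro(), "martha", "marhta") == Jaro()("martha", "marhta")  # expected: true
```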
## Compare
The function `compare` is defined as 1 minus the normalized distance between two strings. It always returns a number between 0 and 1: a value of 0 means completely different and a value of 1 means completely similar.
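As a hedged illustration of that definition (the value matches the `compare` docstring further down in this diff):
```julia
# Levenshtein("martha", "marhta") is 2 edits; divided by the longer length (6 here)
# that is a normalized distance of 1/3, so compare returns 1 - 1/3.
compare("martha", "marhta", Levenshtein())  # 0.6666666666666667
```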
```julia


@@ -1,16 +1,15 @@
module StringDistances
using Distances
import Distances: evaluate, result_type
include("utils.jl")
include("edit.jl")
include("qgram.jl")
include("modifiers.jl")
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax}
result_type(dist::StringDistance, s1, s2) = typeof(evaluate(dist, "", ""))
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
Distances.evaluate(dist::StringDistance, args...) = dist(args...)
include("find.jl")
##############################################################################


@@ -16,7 +16,7 @@ isnormalized(::Jaro) = true
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
## accepts any iterator, including AbstractString
function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
function (dist::Jaro)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@@ -85,7 +85,7 @@ struct Levenshtein <: Metric end
# Return max_dist + 1 if the distance is higher than max_dist
# This makes it possible to differentiate a distance equal to max_dist from one strictly higher
# This is important for find_all
function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
function (dist::Levenshtein)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
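The comments above describe an early-exit convention; a hedged sketch of what it implies (values taken from that comment, not re-verified against this revision):
```julia
# "abcd" → "wxyz" needs 4 substitutions; with max_dist = 1 the call is expected
# to stop early and return max_dist + 1 = 2 instead of the full distance.
Levenshtein()("abcd", "wxyz", 1)  # expected: 2
Levenshtein()("abcd", "wxyz")     # expected: 4 (no cap)
```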
@@ -139,7 +139,7 @@ required to change one string into the other.
struct DamerauLevenshtein <: SemiMetric end
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@@ -225,7 +225,7 @@ struct RatcliffObershelp <: SemiMetric end
isnormalized(::RatcliffObershelp) = true
function evaluate(dist::RatcliffObershelp, s1, s2, max_dist = nothing)
function (dist::RatcliffObershelp)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
n_matched = sum(last.(matching_blocks(s1, s2)))
len1, len2 = length(s1), length(s2)


@@ -10,7 +10,7 @@ julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
compare(s1, s2, dist::StringDistance; min_score = 0.0) = 1 - evaluate(normalize(dist), s1, s2, 1 - min_score)
compare(s1, s2, dist::StringDistance; min_score = 0.0) = 1 - normalize(dist)(s1, s2, 1 - min_score)
"""


@@ -14,26 +14,26 @@ isnormalized(dist::SemiMetric) = false
isnormalized(dist::Normalize) = true
function evaluate(dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}}, s1, s2, max_dist = 1.0)
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 == 0 && return 1.0
d = evaluate(dist.dist, s1, s2, ceil(Int, len2 * max_dist))
d = dist.dist(s1, s2, ceil(Int, len2 * max_dist))
out = d / len2
out > max_dist ? 1.0 : out
end
function evaluate(dist::Normalize{<: QGramDistance}, s1, s2, max_dist = 1.0)
function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0)
(ismissing(s1) | ismissing(s2)) && return missing
# When the shorter string has length < q, the normalized q-gram distance reduces to s1 == s2 (0.0 if equal, 1.0 otherwise)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 <= dist.dist.q - 1 && return convert(Float64, s1 != s2)
if typeof(dist.dist) <: QGram
evaluate(dist.dist, s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
dist.dist(s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
else
evaluate(dist.dist, s1, s2)
dist.dist(s1, s2)
end
end
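For reference, the denominator `len1 + len2 - 2 * dist.dist.q + 2` in the `QGram` branch above equals `(len1 - q + 1) + (len2 - q + 1)`, the total number of q-grams in the two strings, which bounds the raw distance and keeps the ratio in [0, 1]. A hedged worked example:
```julia
# "martha" and "marhta" each contain 5 bigrams, so the denominator is 10.
# The raw QGram(2) distance is 6 (bigrams rt, th, ha, rh, ht, ta are unshared),
# so the normalized distance is 6/10 = 0.6 and compare returns 1 - 0.6.
compare("martha", "marhta", QGram(2))  # expected: 0.4
```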
@@ -55,17 +55,16 @@ struct Winkler{S <: SemiMetric} <: SemiMetric
maxlength::Integer # max length of common prefix. Default to 4
Winkler{S}(dist::S, p, threshold, maxlength) where {S <: SemiMetric} = new(dist, p, threshold, maxlength)
end
isnormalized(dist::Winkler) = true
function Winkler(dist::SemiMetric; p = 0.1, threshold = 0.7, maxlength = 4)
p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
Winkler{typeof(normalize(dist))}(normalize(dist), p, threshold, maxlength)
end
isnormalized(dist::Winkler) = true
function evaluate(dist::Winkler, s1, s2, max_dist = 1.0)
function (dist::Winkler)(s1, s2, max_dist = 1.0)
# cannot do min_score because of boosting threshold
score = evaluate(dist.dist, s1, s2)
score = dist.dist(s1, s2)
if score <= 1 - dist.threshold
l = common_prefix(s1, s2)[1]
score -= min(l, dist.maxlength) * dist.p * score
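The boosting step above can be traced by hand for a classic pair (hedged, worked from the formula shown rather than re-run):
```julia
# Jaro distance for "martha"/"marhta" is 1/18 ≈ 0.0556, below 1 - threshold = 0.3,
# so with common prefix "mar" (l = 3): score = 0.0556 - 3 * 0.1 * 0.0556 ≈ 0.0389,
# i.e. a Jaro-Winkler similarity of about 0.9611.
Winkler(Jaro())("martha", "marhta")  # expected: ≈ 0.0389
```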
@@ -97,24 +96,24 @@ end
Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::Partial) = true
function evaluate(dist::Partial, s1, s2, max_dist = 1.0)
function (dist::Partial)(s1, s2, max_dist = 1.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return evaluate(dist.dist, s1, s2, max_dist)
len1 == len2 && return dist.dist(s1, s2, max_dist)
len1 == 0 && return 0.0
out = 1.0
for x in qgrams(s2, len1)
curr = evaluate(dist.dist, s1, x, max_dist)
curr = dist.dist(s1, x, max_dist)
out = min(out, curr)
max_dist = min(out, max_dist)
end
return out
end
function evaluate(dist::Partial{RatcliffObershelp}, s1, s2, max_dist = 1.0)
function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return evaluate(dist.dist, s1, s2)
len1 == len2 && return dist.dist(s1, s2)
out = 1.0
for r in matching_blocks(s1, s2)
# Make sure the substring of s2 has length len1
@@ -127,7 +126,7 @@ function evaluate(dist::Partial{RatcliffObershelp}, s1, s2, max_dist = 1.0)
s2_start += len2 - s2_end
s2_end += len2 - s2_end
end
curr = evaluate(dist.dist, s1, _slice(s2, s2_start - 1, s2_end))
curr = dist.dist(s1, _slice(s2, s2_start - 1, s2_end))
out = min(out, curr)
end
@@ -159,10 +158,10 @@ TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist)
isnormalized(dist::TokenSort) = true
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function evaluate(dist::TokenSort, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
function (dist::TokenSort)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
s1 = join(sort!(split(s1)), " ")
s2 = join(sort!(split(s2)), " ")
evaluate(dist.dist, s1, s2, max_dist)
dist.dist(s1, s2, max_dist)
end
@@ -190,19 +189,19 @@ TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::TokenSet) = true
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function evaluate(dist::TokenSet, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
function (dist::TokenSet)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2)))
v0 = intersect(v1, v2)
s0 = join(v0, " ")
s1 = join(v1, " ")
s2 = join(v2, " ")
isempty(s0) && return evaluate(dist.dist, s1, s2, max_dist)
score_01 = evaluate(dist.dist, s0, s1, max_dist)
isempty(s0) && return dist.dist(s1, s2, max_dist)
score_01 = dist.dist(s0, s1, max_dist)
max_dist = min(max_dist, score_01)
score_02 = evaluate(dist.dist, s0, s2, max_dist)
score_02 = dist.dist(s0, s2, max_dist)
max_dist = min(max_dist, score_02)
score_12 = evaluate(dist.dist, s1, s2, max_dist)
score_12 = dist.dist(s1, s2, max_dist)
min(score_01, score_02, score_12)
end
@@ -232,29 +231,29 @@ end
TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::TokenMax) = true
function evaluate(dist::TokenMax, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
score = evaluate(dist.dist, s1, s2, max_dist)
score = dist.dist(s1, s2, max_dist)
min_score = min(max_dist, score)
unbase_scale = 0.95
# if one string is much shorter than the other, use partial
if length(s2) >= 1.5 * length(s1)
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
score_partial = 1 - partial_scale * (1 - evaluate(Partial(dist.dist), s1, s2, 1 - (1 - max_dist) / partial_scale))
score_partial = 1 - partial_scale * (1 - Partial(dist.dist)(s1, s2, 1 - (1 - max_dist) / partial_scale))
min_score = min(max_dist, score_partial)
score_sort = 1 - unbase_scale * partial_scale *
(1 - evaluate(TokenSort(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
(1 - TokenSort(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
max_dist = min(max_dist, score_sort)
score_set = 1 - unbase_scale * partial_scale *
(1 - evaluate(TokenSet(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
(1 - TokenSet(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
return min(score, score_partial, score_sort, score_set)
else
score_sort = 1 - unbase_scale *
(1 - evaluate(TokenSort(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
(1 - TokenSort(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
max_dist = min(max_dist, score_sort)
score_set = 1 - unbase_scale *
(1 - evaluate(TokenSet(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
(1 - TokenSet(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
return min(score, score_sort, score_set)
end
end


@@ -45,9 +45,6 @@ qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s,
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
abstract type QGramDistance <: SemiMetric end
# For two iterators x1 and x2 that define a length and eltype method,
# this returns a dictionary which, for each element in x1 or x2,
# returns a tuple with the numbers of times it appears in x1 and x2
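A hedged illustration of that comment (key and value types are assumptions; only the counting behaviour is taken from the comment):
```julia
# count_map and qgrams may not be exported, so they are qualified with the module name.
# For bigrams of "abcb" vs "abd", each q-gram should map to (count in x1, count in x2):
StringDistances.count_map(StringDistances.qgrams("abcb", 2), StringDistances.qgrams("abd", 2))
# expected shape: "ab" => (1, 1), "bc" => (1, 0), "cb" => (1, 0), "bd" => (0, 1)
```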
@@ -80,6 +77,9 @@ function count_map(s1, s2)
return d
end
abstract type QGramDistance <: SemiMetric end
"""
QGram(q::Int)
@@ -96,7 +96,7 @@ struct QGram <: QGramDistance
q::Int
end
function evaluate(dist::QGram, s1, s2)
function (dist::QGram)(s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
n = 0
@@ -122,7 +122,7 @@ struct Cosine <: QGramDistance
q::Int
end
function evaluate(dist::Cosine, s1, s2, max_dist = nothing)
function (dist::Cosine)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
norm1, norm2, prodnorm = 0, 0, 0
@@ -149,7 +149,7 @@ struct Jaccard <: QGramDistance
q::Int
end
function evaluate(dist::Jaccard, s1, s2, max_dist = nothing)
function (dist::Jaccard)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
@@ -176,7 +176,7 @@ struct SorensenDice <: QGramDistance
q::Int
end
function evaluate(dist::SorensenDice, s1, s2, max_dist = nothing)
function (dist::SorensenDice)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
@@ -203,7 +203,7 @@ struct Overlap <: QGramDistance
q::Int
end
function evaluate(dist::Overlap, s1, s2, max_dist = nothing)
function (dist::Overlap)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0


@@ -9,11 +9,15 @@ using StringDistances, Unicode, Test
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
@test evaluate(Jaro(), [1, 2, 3], [1,2, 4]) ≈ 0.2222222222222222
@test evaluate(Jaro(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Jaro(), "alborgów", "amoniak")
@test Jaro()(" vs an", "es an ") ≈ 0.2777777777777777
@test result_type(Jaro(), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(Jaro(), "", "")
@test ismissing(evaluate(Jaro(), "", missing))
end
@testset "Levenshtein" begin
@test evaluate(Levenshtein(), "", "") == 0
@test evaluate(Levenshtein(), "abc", "") == 3
@@ -25,6 +29,7 @@ using StringDistances, Unicode, Test
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
@test evaluate(Levenshtein(), [1, 2, 3], [1,2, 4]) == 1
@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak")
@test Levenshtein()("", "abc") == 3
@test result_type(Levenshtein(), "hello", "world") == Int
@inferred Int evaluate(Levenshtein(), "", "")
@test ismissing(evaluate(Levenshtein(), "", missing))
@@ -41,6 +46,7 @@ using StringDistances, Unicode, Test
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
@test evaluate(DamerauLevenshtein(), [1, 2, 3], [1,2, 4]) == 1
@test evaluate(DamerauLevenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(DamerauLevenshtein(), "alborgów", "amoniak")
@test DamerauLevenshtein()("bc", "abc") == 1
@test result_type(DamerauLevenshtein(), "hello", "world") == Int
@inferred Int evaluate(DamerauLevenshtein(), "", "")
@test ismissing(evaluate(DamerauLevenshtein(), "", missing))
@@ -56,6 +62,7 @@ using StringDistances, Unicode, Test
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
@test evaluate(RatcliffObershelp(), [1, 2, 3], [1,2, 4]) ≈ 1/3
@test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak")
@test RatcliffObershelp()("pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
@test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(RatcliffObershelp(), "", "")
@test ismissing(evaluate(RatcliffObershelp(), "", missing))
@@ -68,19 +75,23 @@ using StringDistances, Unicode, Test
@test evaluate(QGram(1), "abc", "cba") == 0
@test evaluate(QGram(1), "abc", "ccc") == 4
@test evaluate(QGram(4), "aü☃", "aüaüafs") == 4
@test evaluate( QGram(2), SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2
@test evaluate(QGram(2), SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2
@test evaluate(QGram(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(QGram(2), "alborgów", "amoniak")
@test QGram(1)("abc", "cba") == 0
@test result_type(QGram(1), "hello", "world") == Int
@test ismissing(evaluate(QGram(1), "", missing))
@inferred Int evaluate(QGram(1), "", "")
end
@testset "Cosine" begin
@test isnan(evaluate(Cosine(2), "", "abc"))
@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
@test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
@test evaluate(Cosine(2), [1, 2, 3], [1, 2, 4]) ≈ 0.5
@test evaluate(Cosine(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Cosine(2), "alborgów", "amoniak")
@test Cosine(2)("leia", "leela") ≈ 0.7113249 atol = 1e-4
@test result_type(Cosine(2), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(Cosine(2), "", "")
@test ismissing(evaluate(Cosine(2), "", missing))
@@ -92,6 +103,7 @@ using StringDistances, Unicode, Test
@test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
@test evaluate(Jaccard(2), [1, 2, 3], [1, 2, 4]) ≈ 2/3 atol = 1e-4
@test evaluate(Jaccard(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Jaccard(2), "alborgów", "amoniak")
@test Jaccard(2)("leia", "leela") ≈ 0.83333 atol = 1e-4
@test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(Jaccard(1), "", "")
@test ismissing(evaluate(Jaccard(1), "", missing))
@@ -101,6 +113,7 @@ using StringDistances, Unicode, Test
@test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4
@test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
@test evaluate(SorensenDice(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(SorensenDice(2), "alborgów", "amoniak")
@test SorensenDice(2)("night", "nacht") ≈ 0.75 atol = 1e-4
@test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(SorensenDice(1), "", "")
@test ismissing(evaluate(SorensenDice(1), "", missing))
@@ -109,6 +122,7 @@ using StringDistances, Unicode, Test
@testset "Overlap" begin
@test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
@test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4
@test Overlap(1)("context", "contact") ≈ .2 atol = 1e-4
@test result_type(Overlap(1), "hello", "world") == typeof(float(1))
@inferred Float64 evaluate(Overlap(1), "", "")
@test ismissing(evaluate(Overlap(1), "", missing))