simplify three arguments form

compathelper/new_version/2020-05-20-12-03-08-092-188304956
matthieugomez 2020-02-13 09:44:27 -05:00
parent 41ccf12e45
commit 093c536377
7 changed files with 43 additions and 40 deletions

View File

@ -8,7 +8,7 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
[compat]
julia = "1"
Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
Distances = "0.8.1"
[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

View File

@ -8,8 +8,11 @@ include("qgram.jl")
include("modifiers.jl")
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
Distances.evaluate(dist::StringDistance, args...) = dist(args...)
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
include("find.jl")
##############################################################################

View File

@ -12,12 +12,11 @@ where ``m`` is the number of matching characters and
``t`` is half the number of transpositions.
"""
struct Jaro <: SemiMetric end
isnormalized(::Jaro) = true
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
## accepts any iterator, including AbstractString
function (dist::Jaro)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
function (dist::Jaro)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
# If both are empty, the formula in Wikipedia gives 0
@ -25,6 +24,7 @@ function (dist::Jaro)(s1, s2, max_dist = nothing)
len2 == 0 && return 0.0
maxdist = max(0, div(len2, 2) - 1)
flag = fill(false, len2)
prevstate1 = firstindex(s1)
ch1_match = Vector{eltype(s1)}(undef, len1)
# m counts number matching characters
m = 0
@ -55,6 +55,7 @@ function (dist::Jaro)(s1, s2, max_dist = nothing)
end
x1 = iterate(s1, state1)
i1 += 1
prevstate1 = state1
end
m == 0 && return 1.0
# t counts number of transpositions
@ -82,11 +83,11 @@ substitutions of a single character) required to change one string into the othe
struct Levenshtein <: Metric end
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
# Return max_dist +1 if distance higher than max_dist
# Return max_dist + 1 if distance higher than max_dist
# This makes it possible to differentiate a distance equal to max_dist from one strictly higher
# This is important for find_all
function (dist::Levenshtein)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
@ -125,8 +126,6 @@ function (dist::Levenshtein)(s1, s2, max_dist = nothing)
return current
end
"""
DamerauLevenshtein()
@ -139,8 +138,9 @@ required to change one string into the other.
struct DamerauLevenshtein <: SemiMetric end
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
# Return max_dist + 1 if distance higher than max_dist
function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
@ -223,10 +223,8 @@ region on either side of the longest common subsequence.
"""
struct RatcliffObershelp <: SemiMetric end
isnormalized(::RatcliffObershelp) = true
function (dist::RatcliffObershelp)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
function (dist::RatcliffObershelp)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
n_matched = sum(last.(matching_blocks(s1, s2)))
len1, len2 = length(s1), length(s2)
len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2)

View File

@ -6,16 +6,12 @@ end
Normalize a metric, so that `evaluate` always returns a Float64 between 0 and 1 (or a `missing` if one element is missing)
"""
function normalize(dist::SemiMetric)
isnormalized(dist) ? dist : Normalize{typeof(dist)}(dist)
end
isnormalized(dist::SemiMetric) = false
isnormalized(dist::Normalize) = true
# Also, a normalized distance always accepts a third argument, max_dist.
normalize(dist::SemiMetric) = Normalize{typeof(dist)}(dist)
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
(ismissing(s1) | ismissing(s2)) && return missing
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 == 0 && return 1.0
@ -25,7 +21,7 @@ function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, ma
end
function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0)
(ismissing(s1) | ismissing(s2)) && return missing
((s1 === missing) | (s2 === missing)) && return missing
# When string length < q for qgram distance, returns s1 == s2
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@ -37,6 +33,9 @@ function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0)
end
end
# Generic fallback for `Normalize`: evaluate the wrapped distance on `(s1, s2)`.
# `max_dist` is accepted so the three-argument call form works, but it is not
# forwarded to the wrapped distance here.
function (dist::Normalize)(s1, s2, max_dist = 1.0)
    wrapped = dist.dist
    return wrapped(s1, s2)
end
"""
Winkler(dist; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
@ -55,15 +54,15 @@ struct Winkler{S <: SemiMetric} <: SemiMetric
maxlength::Integer # max length of common prefix. Default to 4
Winkler{S}(dist::S, p, threshold, maxlength) where {S <: SemiMetric} = new(dist, p, threshold, maxlength)
end
isnormalized(dist::Winkler) = true
# Keyword constructor for `Winkler`.
#
# Arguments:
# - `dist`: the distance to wrap; it is passed through `normalize` first.
# - `p`: scaling factor applied to the common prefix (default 0.1).
# - `threshold`: boosting threshold (default 0.7).
# - `maxlength`: maximum length of the common prefix taken into account (default 4).
#
# Throws if `p * maxlength > 1`. The keyword values are forwarded to the inner
# constructor so that non-default settings actually take effect.
function Winkler(dist::SemiMetric; p = 0.1, threshold = 0.7, maxlength = 4)
    p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
    # Normalize once and reuse the result for both the type parameter and the field.
    inner = normalize(dist)
    return Winkler{typeof(inner)}(inner, p, threshold, maxlength)
end
normalize(dist::Winkler) = dist
function (dist::Winkler)(s1, s2, max_dist = 1.0)
# cannot do min_score because of boosting threshold
# cannot do max_dist because of boosting threshold
score = dist.dist(s1, s2)
if score <= 1 - dist.threshold
l = common_prefix(s1, s2)[1]
@ -94,13 +93,13 @@ struct Partial{S <: SemiMetric} <: SemiMetric
Partial{S}(dist::S) where {S <: SemiMetric} = new(dist)
end
Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::Partial) = true
normalize(dist::Partial) = dist
function (dist::Partial)(s1, s2, max_dist = 1.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return dist.dist(s1, s2, max_dist)
len1 == 0 && return 0.0
len1 == 0 && return 1.0
out = 1.0
for x in qgrams(s2, len1)
curr = dist.dist(s1, x, max_dist)
@ -110,7 +109,7 @@ function (dist::Partial)(s1, s2, max_dist = 1.0)
return out
end
function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0)
function (dist::Partial{Normalize{RatcliffObershelp}})(s1, s2, max_dist = 1.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return dist.dist(s1, s2)
@ -127,7 +126,6 @@ function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0)
s2_end += len2 - s2_end
end
curr = dist.dist(s1, _slice(s2, s2_start - 1, s2_end))
out = min(out, curr)
end
return out
@ -155,7 +153,7 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric
TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist)
end
TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::TokenSort) = true
normalize(dist::TokenSort) = dist
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function (dist::TokenSort)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
@ -187,6 +185,7 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric
end
TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::TokenSet) = true
normalize(dist::TokenSet) = dist
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function (dist::TokenSet)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
@ -229,7 +228,7 @@ struct TokenMax{S <: SemiMetric} <: SemiMetric
end
TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::TokenMax) = true
normalize(dist::TokenMax) = dist
function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
s1, s2 = reorder(s1, s2)

View File

@ -97,7 +97,7 @@ struct QGram <: QGramDistance
end
function (dist::QGram)(s1, s2)
(ismissing(s1) | ismissing(s2)) && return missing
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
n = 0
for (n1, n2) in itr
@ -122,8 +122,8 @@ struct Cosine <: QGramDistance
q::Int
end
function (dist::Cosine)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
function (dist::Cosine)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in itr
@ -149,8 +149,8 @@ struct Jaccard <: QGramDistance
q::Int
end
function (dist::Jaccard)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
function (dist::Jaccard)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
@ -176,8 +176,8 @@ struct SorensenDice <: QGramDistance
q::Int
end
function (dist::SorensenDice)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
function (dist::SorensenDice)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr
@ -203,8 +203,8 @@ struct Overlap <: QGramDistance
q::Int
end
function (dist::Overlap)(s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
function (dist::Overlap)(s1, s2)
((s1 === missing) | (s2 === missing)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in itr

View File

@ -23,6 +23,8 @@ struct StringWithLength{T <: AbstractString} <: AbstractString
l::Int
end
# Wrap `s` in a `StringWithLength`, storing `length(s)` alongside the string.
string_with_length(s::AbstractString) = StringWithLength(s, length(s))
# Not really needed but avoid multi-encapsulation
string_with_length(s::StringWithLength) = s
# Return the length stored in the wrapper instead of recomputing it.
Base.length(s::StringWithLength) = s.l
# Iteration and index arithmetic are delegated to the wrapped string `s.s`.
Base.iterate(s::StringWithLength, i::Integer = firstindex(s.s)) = iterate(s.s, i)
Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n)

View File

@ -37,6 +37,7 @@ using StringDistances, Unicode, Test
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
compare("aüa", "aua", TokenMax(RatcliffObershelp()))
# Partial comparisons: an empty second string scores 0.0, a contained substring
# scores 1.0. Floating-point results are compared with ≈ rather than ==.
@test compare("New York Yankees", "", Partial(Jaro())) ≈ 0.0
@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
@test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0
@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444