return 1 if distance over maxdist

compathelper/new_version/2020-10-08-17-05-17-769-1797568811
matthieugomez 2020-07-19 12:37:49 -07:00
parent 9d4ae1a510
commit fb0a786fd9
4 changed files with 25 additions and 29 deletions

View File

@ -3,12 +3,13 @@ module StringDistances
using Distances
include("utils.jl")
include("edit.jl")
include("qgram.jl")
include("distances/edit.jl")
include("distances/qgram.jl")
include("modifiers.jl")
include("normalize.jl")
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalized}
# Distances API
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
"""

View File

@ -14,20 +14,20 @@ where ``m`` is the number of matching characters and
struct Jaro <: SemiMetric end
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
function (dist::Jaro)(s1, s2)
function (dist::Jaro)(s1, s2, ::Nothing)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
# If both are empty, the formula in Wikipedia gives 0
# Add this line so that this is not the case
len2 == 0 && return 0.0
maxdist = max(0, div(len2, 2) - 1)
d = max(0, div(len2, 2) - 1)
flag = fill(false, len2)
ch1_match = Vector{eltype(s1)}()
for (i1, ch1) in enumerate(s1)
for (i2, ch2) in enumerate(s2)
# greedy alignment
if (i2 <= i1 + maxdist) && (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2]
if (i2 <= i1 + d) && (i2 >= i1 - d) && (ch1 == ch2) && !flag[i2]
flag[i2] = true
push!(ch1_match, ch1)
break
@ -49,7 +49,6 @@ function (dist::Jaro)(s1, s2)
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
end
(dist::Jaro)(s1, s2, ::Nothing) = (dist::Jaro)(s1, s2)
"""
Levenshtein()
@ -143,7 +142,7 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist::Union{Integer, Nothing} =
if i2 <= k
prevch2 = ch2
elseif (max_dist !== nothing) && ((i2 - k - 1 < i2_start) | (i2 - k - 1 >= i2_end))
# no need to look beyond window of lower right diagonal - maxDistance cells
# no need to look beyond window of lower right diagonal - max distance cells
# (lower right diag is i1 - (len2 - len1)) and the upper left diagonal + max_dist cells (upper left is i1)
prevch2 = ch2
else
@ -181,7 +180,7 @@ region on either side of the longest common subsequence.
"""
struct RatcliffObershelp <: SemiMetric end
function (dist::RatcliffObershelp)(s1, s2)
function (dist::RatcliffObershelp)(s1, s2, ::Nothing)
((s1 === missing) | (s2 === missing)) && return missing
s1, s2 = reorder(s1, s2)
n_matched = sum(last.(matching_blocks(s1, s2)))
@ -230,5 +229,3 @@ function longest_common_pattern(s1, s2)
end
return start1, start2, len
end
(dist::RatcliffObershelp)(s1, s2, ::Nothing) = dist(s1, s2)

View File

@ -98,7 +98,7 @@ struct QGram <: QGramDistance
q::Int
end
function (dist::QGram)(s1, s2)
function (dist::QGram)(s1, s2, ::Nothing)
((s1 === missing) | (s2 === missing)) && return missing
n = 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@ -107,7 +107,6 @@ function (dist::QGram)(s1, s2)
n
end
(dist::QGram)(s1, s2, ::Nothing) = dist(s1, s2)
"""
Cosine(q::Int)
@ -125,7 +124,7 @@ struct Cosine <: QGramDistance
q::Int
end
function (dist::Cosine)(s1, s2)
function (dist::Cosine)(s1, s2, ::Nothing)
((s1 === missing) | (s2 === missing)) && return missing
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@ -136,7 +135,6 @@ function (dist::Cosine)(s1, s2)
1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
end
(dist::Cosine)(s1, s2, ::Nothing) = dist(s1, s2)
"""
Jaccard(q::Int)
@ -153,7 +151,7 @@ struct Jaccard <: QGramDistance
q::Int
end
function (dist::Jaccard)(s1, s2)
function (dist::Jaccard)(s1, s2, ::Nothing)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@ -164,7 +162,6 @@ function (dist::Jaccard)(s1, s2)
1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
end
(dist::Jaccard)(s1, s2, ::Nothing) = dist(s1, s2)
"""
SorensenDice(q::Int)
@ -181,7 +178,7 @@ struct SorensenDice <: QGramDistance
q::Int
end
function (dist::SorensenDice)(s1, s2)
function (dist::SorensenDice)(s1, s2, ::Nothing)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@ -192,7 +189,6 @@ function (dist::SorensenDice)(s1, s2)
1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)
end
(dist::SorensenDice)(s1, s2, ::Nothing) = dist(s1, s2)
"""
Overlap(q::Int)
@ -209,7 +205,7 @@ struct Overlap <: QGramDistance
q::Int
end
function (dist::Overlap)(s1, s2)
function (dist::Overlap)(s1, s2, ::Nothing)
((s1 === missing) | (s2 === missing)) && return missing
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
@ -220,4 +216,3 @@ function (dist::Overlap)(s1, s2)
1.0 - nintersect / min(ndistinct1, ndistinct2)
end
(dist::Overlap)(s1, s2, ::Nothing) = dist(s1, s2)

View File

@ -20,14 +20,16 @@ function (dist::Normalized{<: QGramDistance})(s1, s2, max_dist = 1.0)
len1, len2 = length(s1), length(s2)
len1 <= dist.dist.q - 1 && return convert(Float64, s1 != s2)
if typeof(dist.dist) <: QGram
dist.dist(s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
out = dist.dist(s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
else
dist.dist(s1, s2)
out = dist.dist(s1, s2)
end
out > max_dist ? 1.0 : out
end
function (dist::Normalized)(s1, s2, max_dist = 1.0)
dist.dist(s1, s2)
out = dist.dist(s1, s2)
out > max_dist ? 1.0 : out
end
"""
@ -67,12 +69,12 @@ normalize(dist::Winkler) = dist
function (dist::Winkler)(s1, s2, max_dist = 1.0)
# cannot do max_dist because of boosting threshold
score = dist.dist(s1, s2)
if score <= 1 - dist.threshold
out = dist.dist(s1, s2)
if out <= 1 - dist.threshold
l = common_prefix(s1, s2)[1]
score -= min(l, dist.maxlength) * dist.p * score
out -= min(l, dist.maxlength) * dist.p * out
end
return score
out > max_dist ? 1.0 : out
end
@ -121,13 +123,14 @@ function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0
max_dist = min(max_dist, score_sort)
score_set = 1 - unbase_scale * partial_scale *
(1 - TokenSet(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
return min(score, score_partial, score_sort, score_set)
out = min(score, score_partial, score_sort, score_set)
else
score_sort = 1 - unbase_scale *
(1 - TokenSort(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
max_dist = min(max_dist, score_sort)
score_set = 1 - unbase_scale *
(1 - TokenSet(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
return min(score, score_sort, score_set)
out = min(score, score_sort, score_set)
end
out > max_dist ? 1.0 : out
end