return 1 if distance over maxdist
parent
9d4ae1a510
commit
fb0a786fd9
|
@ -3,12 +3,13 @@ module StringDistances
|
|||
using Distances
|
||||
|
||||
include("utils.jl")
|
||||
include("edit.jl")
|
||||
include("qgram.jl")
|
||||
include("distances/edit.jl")
|
||||
include("distances/qgram.jl")
|
||||
include("modifiers.jl")
|
||||
include("normalize.jl")
|
||||
|
||||
# Union of the distance and modifier types that the package-level methods taking a
# `dist::StringDistance` argument dispatch on.
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalized}
|
||||
# Distances API
|
||||
# The result type is obtained by evaluating `dist` on two empty strings and taking the
# type of the returned value; `s1` and `s2` themselves are not inspected.
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
|
||||
|
||||
"""
|
||||
|
|
|
@ -14,20 +14,20 @@ where ``m`` is the number of matching characters and
|
|||
struct Jaro <: SemiMetric end
|
||||
|
||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||
function (dist::Jaro)(s1, s2)
|
||||
function (dist::Jaro)(s1, s2, ::Nothing)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
# If both are empty, the formula in Wikipedia gives 0
|
||||
# Add this early return so that this is not the case
|
||||
len2 == 0 && return 0.0
|
||||
maxdist = max(0, div(len2, 2) - 1)
|
||||
d = max(0, div(len2, 2) - 1)
|
||||
flag = fill(false, len2)
|
||||
ch1_match = Vector{eltype(s1)}()
|
||||
for (i1, ch1) in enumerate(s1)
|
||||
for (i2, ch2) in enumerate(s2)
|
||||
# greedy alignment
|
||||
if (i2 <= i1 + maxdist) && (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2]
|
||||
if (i2 <= i1 + d) && (i2 >= i1 - d) && (ch1 == ch2) && !flag[i2]
|
||||
flag[i2] = true
|
||||
push!(ch1_match, ch1)
|
||||
break
|
||||
|
@ -49,7 +49,6 @@ function (dist::Jaro)(s1, s2)
|
|||
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
|
||||
end
|
||||
|
||||
(dist::Jaro)(s1, s2, ::Nothing) = (dist::Jaro)(s1, s2)
|
||||
"""
|
||||
Levenshtein()
|
||||
|
||||
|
@ -143,7 +142,7 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist::Union{Integer, Nothing} =
|
|||
if i2 <= k
|
||||
prevch2 = ch2
|
||||
elseif (max_dist !== nothing) && ((i2 - k - 1 < i2_start) | (i2 - k - 1 >= i2_end))
|
||||
# no need to look beyond window of lower right diagonal - maxDistance cells
|
||||
# no need to look beyond window of lower right diagonal - max distance cells
|
||||
# lower right diagonal is i1 - (len2 - len1), and the upper left diagonal + max_dist cells (upper left is i1)
|
||||
prevch2 = ch2
|
||||
else
|
||||
|
@ -181,7 +180,7 @@ region on either side of the longest common subsequence.
|
|||
"""
|
||||
struct RatcliffObershelp <: SemiMetric end
|
||||
|
||||
function (dist::RatcliffObershelp)(s1, s2)
|
||||
function (dist::RatcliffObershelp)(s1, s2, ::Nothing)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||
|
@ -230,5 +229,3 @@ function longest_common_pattern(s1, s2)
|
|||
end
|
||||
return start1, start2, len
|
||||
end
|
||||
|
||||
(dist::RatcliffObershelp)(s1, s2, ::Nothing) = dist(s1, s2)
|
|
@ -98,7 +98,7 @@ struct QGram <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function (dist::QGram)(s1, s2)
|
||||
function (dist::QGram)(s1, s2, ::Nothing)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
n = 0
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
|
@ -107,7 +107,6 @@ function (dist::QGram)(s1, s2)
|
|||
n
|
||||
end
|
||||
|
||||
(dist::QGram)(s1, s2, ::Nothing) = dist(s1, s2)
|
||||
|
||||
"""
|
||||
Cosine(q::Int)
|
||||
|
@ -125,7 +124,7 @@ struct Cosine <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function (dist::Cosine)(s1, s2)
|
||||
function (dist::Cosine)(s1, s2, ::Nothing)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
norm1, norm2, prodnorm = 0, 0, 0
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
|
@ -136,7 +135,6 @@ function (dist::Cosine)(s1, s2)
|
|||
1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
|
||||
end
|
||||
|
||||
(dist::Cosine)(s1, s2, ::Nothing) = dist(s1, s2)
|
||||
|
||||
"""
|
||||
Jaccard(q::Int)
|
||||
|
@ -153,7 +151,7 @@ struct Jaccard <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function (dist::Jaccard)(s1, s2)
|
||||
function (dist::Jaccard)(s1, s2, ::Nothing)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
|
@ -164,7 +162,6 @@ function (dist::Jaccard)(s1, s2)
|
|||
1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
|
||||
end
|
||||
|
||||
(dist::Jaccard)(s1, s2, ::Nothing) = dist(s1, s2)
|
||||
|
||||
"""
|
||||
SorensenDice(q::Int)
|
||||
|
@ -181,7 +178,7 @@ struct SorensenDice <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function (dist::SorensenDice)(s1, s2)
|
||||
function (dist::SorensenDice)(s1, s2, ::Nothing)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
|
@ -192,7 +189,6 @@ function (dist::SorensenDice)(s1, s2)
|
|||
1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)
|
||||
end
|
||||
|
||||
(dist::SorensenDice)(s1, s2, ::Nothing) = dist(s1, s2)
|
||||
|
||||
"""
|
||||
Overlap(q::Int)
|
||||
|
@ -209,7 +205,7 @@ struct Overlap <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function (dist::Overlap)(s1, s2)
|
||||
function (dist::Overlap)(s1, s2, ::Nothing)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||
|
@ -220,4 +216,3 @@ function (dist::Overlap)(s1, s2)
|
|||
1.0 - nintersect / min(ndistinct1, ndistinct2)
|
||||
end
|
||||
|
||||
(dist::Overlap)(s1, s2, ::Nothing) = dist(s1, s2)
|
|
@ -20,14 +20,16 @@ function (dist::Normalized{<: QGramDistance})(s1, s2, max_dist = 1.0)
|
|||
len1, len2 = length(s1), length(s2)
|
||||
len1 <= dist.dist.q - 1 && return convert(Float64, s1 != s2)
|
||||
if typeof(dist.dist) <: QGram
|
||||
dist.dist(s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
|
||||
out = dist.dist(s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
|
||||
else
|
||||
dist.dist(s1, s2)
|
||||
out = dist.dist(s1, s2)
|
||||
end
|
||||
out > max_dist ? 1.0 : out
|
||||
end
|
||||
|
||||
# Evaluate the wrapped distance `dist.dist` on `s1` and `s2`.
#
# Arguments:
# - `s1`, `s2`: the values to compare; if either is `missing`, `missing` is returned.
# - `max_dist`: cutoff (default `1.0`); a result strictly greater than it is reported
#   as `1.0`.
#
# Returns the wrapped distance, `1.0` when it exceeds `max_dist`, or `missing`.
function (dist::Normalized)(s1, s2, max_dist = 1.0)
    # Propagate `missing` explicitly: a `missing` score would otherwise reach the
    # comparison below, and a non-Boolean condition in `?:` throws a TypeError.
    ((s1 === missing) | (s2 === missing)) && return missing
    out = dist.dist(s1, s2)
    return out > max_dist ? 1.0 : out
end
|
||||
|
||||
"""
|
||||
|
@ -67,12 +69,12 @@ normalize(dist::Winkler) = dist
|
|||
|
||||
# Evaluate the wrapped distance `dist.dist` and apply the prefix boost.
#
# Arguments:
# - `s1`, `s2`: the values to compare; if either is `missing`, `missing` is returned.
# - `max_dist`: cutoff (default `1.0`); a final score strictly greater than it is
#   reported as `1.0`.
#
# Returns the (possibly boosted) score, `1.0` when it exceeds `max_dist`, or `missing`.
function (dist::Winkler)(s1, s2, max_dist = 1.0)
    # Propagate `missing` explicitly: a `missing` score would otherwise reach the
    # comparisons below, and a non-Boolean condition throws a TypeError.
    ((s1 === missing) | (s2 === missing)) && return missing
    # The wrapped distance is computed without a cutoff: the boost below can lower
    # the score, so `max_dist` can only be applied to the final value.
    out = dist.dist(s1, s2)
    if out <= 1 - dist.threshold
        # Boost only scores at or below `1 - dist.threshold`: reduce the score in
        # proportion to the common prefix length, capped at `dist.maxlength`.
        l = common_prefix(s1, s2)[1]
        out -= min(l, dist.maxlength) * dist.p * out
    end
    return out > max_dist ? 1.0 : out
end
|
||||
|
||||
|
||||
|
@ -121,13 +123,14 @@ function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0
|
|||
max_dist = min(max_dist, score_sort)
|
||||
score_set = 1 - unbase_scale * partial_scale *
|
||||
(1 - TokenSet(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
||||
return min(score, score_partial, score_sort, score_set)
|
||||
out = min(score, score_partial, score_sort, score_set)
|
||||
else
|
||||
score_sort = 1 - unbase_scale *
|
||||
(1 - TokenSort(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
|
||||
max_dist = min(max_dist, score_sort)
|
||||
score_set = 1 - unbase_scale *
|
||||
(1 - TokenSet(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
|
||||
return min(score, score_sort, score_set)
|
||||
out = min(score, score_sort, score_set)
|
||||
end
|
||||
out > max_dist ? 1.0 : out
|
||||
end
|
Loading…
Reference in New Issue