return 1 if distance over maxdist
parent
9d4ae1a510
commit
fb0a786fd9
|
@ -3,12 +3,13 @@ module StringDistances
|
||||||
using Distances
|
using Distances
|
||||||
|
|
||||||
include("utils.jl")
|
include("utils.jl")
|
||||||
include("edit.jl")
|
include("distances/edit.jl")
|
||||||
include("qgram.jl")
|
include("distances/qgram.jl")
|
||||||
include("modifiers.jl")
|
include("modifiers.jl")
|
||||||
include("normalize.jl")
|
include("normalize.jl")
|
||||||
|
|
||||||
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalized}
|
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalized}
|
||||||
|
# Distances API
|
||||||
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
|
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -14,20 +14,20 @@ where ``m`` is the number of matching characters and
|
||||||
struct Jaro <: SemiMetric end
|
struct Jaro <: SemiMetric end
|
||||||
|
|
||||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||||
function (dist::Jaro)(s1, s2)
|
function (dist::Jaro)(s1, s2, ::Nothing)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
# If both are empty, the formula in Wikipedia gives 0
|
# If both are empty, the formula in Wikipedia gives 0
|
||||||
# Add this line so that not the case
|
# Add this line so that not the case
|
||||||
len2 == 0 && return 0.0
|
len2 == 0 && return 0.0
|
||||||
maxdist = max(0, div(len2, 2) - 1)
|
d = max(0, div(len2, 2) - 1)
|
||||||
flag = fill(false, len2)
|
flag = fill(false, len2)
|
||||||
ch1_match = Vector{eltype(s1)}()
|
ch1_match = Vector{eltype(s1)}()
|
||||||
for (i1, ch1) in enumerate(s1)
|
for (i1, ch1) in enumerate(s1)
|
||||||
for (i2, ch2) in enumerate(s2)
|
for (i2, ch2) in enumerate(s2)
|
||||||
# greedy alignement
|
# greedy alignement
|
||||||
if (i2 <= i1 + maxdist) && (i2 >= i1 - maxdist) && (ch1 == ch2) && !flag[i2]
|
if (i2 <= i1 + d) && (i2 >= i1 - d) && (ch1 == ch2) && !flag[i2]
|
||||||
flag[i2] = true
|
flag[i2] = true
|
||||||
push!(ch1_match, ch1)
|
push!(ch1_match, ch1)
|
||||||
break
|
break
|
||||||
|
@ -49,7 +49,6 @@ function (dist::Jaro)(s1, s2)
|
||||||
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
|
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
|
||||||
end
|
end
|
||||||
|
|
||||||
(dist::Jaro)(s1, s2, ::Nothing) = (dist::Jaro)(s1, s2)
|
|
||||||
"""
|
"""
|
||||||
Levenshtein()
|
Levenshtein()
|
||||||
|
|
||||||
|
@ -143,7 +142,7 @@ function (dist::DamerauLevenshtein)(s1, s2, max_dist::Union{Integer, Nothing} =
|
||||||
if i2 <= k
|
if i2 <= k
|
||||||
prevch2 = ch2
|
prevch2 = ch2
|
||||||
elseif (max_dist !== nothing) && ((i2 - k - 1 < i2_start) | (i2 - k - 1 >= i2_end))
|
elseif (max_dist !== nothing) && ((i2 - k - 1 < i2_start) | (i2 - k - 1 >= i2_end))
|
||||||
# no need to look beyond window of lower right diagonal - maxDistance cells
|
# no need to look beyond window of lower right diagonal - max distance cells
|
||||||
#lower right diag is i1 - (len2 - len1)) and the upper left diagonal + max_dist cells (upper left is i1)
|
#lower right diag is i1 - (len2 - len1)) and the upper left diagonal + max_dist cells (upper left is i1)
|
||||||
prevch2 = ch2
|
prevch2 = ch2
|
||||||
else
|
else
|
||||||
|
@ -181,7 +180,7 @@ region on either side of the longest common subsequence.
|
||||||
"""
|
"""
|
||||||
struct RatcliffObershelp <: SemiMetric end
|
struct RatcliffObershelp <: SemiMetric end
|
||||||
|
|
||||||
function (dist::RatcliffObershelp)(s1, s2)
|
function (dist::RatcliffObershelp)(s1, s2, ::Nothing)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||||
|
@ -230,5 +229,3 @@ function longest_common_pattern(s1, s2)
|
||||||
end
|
end
|
||||||
return start1, start2, len
|
return start1, start2, len
|
||||||
end
|
end
|
||||||
|
|
||||||
(dist::RatcliffObershelp)(s1, s2, ::Nothing) = dist(s1, s2)
|
|
|
@ -98,7 +98,7 @@ struct QGram <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::QGram)(s1, s2)
|
function (dist::QGram)(s1, s2, ::Nothing)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
n = 0
|
n = 0
|
||||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||||
|
@ -107,7 +107,6 @@ function (dist::QGram)(s1, s2)
|
||||||
n
|
n
|
||||||
end
|
end
|
||||||
|
|
||||||
(dist::QGram)(s1, s2, ::Nothing) = dist(s1, s2)
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Cosine(q::Int)
|
Cosine(q::Int)
|
||||||
|
@ -125,7 +124,7 @@ struct Cosine <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::Cosine)(s1, s2)
|
function (dist::Cosine)(s1, s2, ::Nothing)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
norm1, norm2, prodnorm = 0, 0, 0
|
norm1, norm2, prodnorm = 0, 0, 0
|
||||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||||
|
@ -136,7 +135,6 @@ function (dist::Cosine)(s1, s2)
|
||||||
1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
|
1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
|
||||||
end
|
end
|
||||||
|
|
||||||
(dist::Cosine)(s1, s2, ::Nothing) = dist(s1, s2)
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Jaccard(q::Int)
|
Jaccard(q::Int)
|
||||||
|
@ -153,7 +151,7 @@ struct Jaccard <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::Jaccard)(s1, s2)
|
function (dist::Jaccard)(s1, s2, ::Nothing)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||||
|
@ -164,7 +162,6 @@ function (dist::Jaccard)(s1, s2)
|
||||||
1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
|
1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
|
||||||
end
|
end
|
||||||
|
|
||||||
(dist::Jaccard)(s1, s2, ::Nothing) = dist(s1, s2)
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
SorensenDice(q::Int)
|
SorensenDice(q::Int)
|
||||||
|
@ -181,7 +178,7 @@ struct SorensenDice <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::SorensenDice)(s1, s2)
|
function (dist::SorensenDice)(s1, s2, ::Nothing)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||||
|
@ -192,7 +189,6 @@ function (dist::SorensenDice)(s1, s2)
|
||||||
1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)
|
1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)
|
||||||
end
|
end
|
||||||
|
|
||||||
(dist::SorensenDice)(s1, s2, ::Nothing) = dist(s1, s2)
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Overlap(q::Int)
|
Overlap(q::Int)
|
||||||
|
@ -209,7 +205,7 @@ struct Overlap <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::Overlap)(s1, s2)
|
function (dist::Overlap)(s1, s2, ::Nothing)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
for (n1, n2) in _count(qgrams(s1, dist.q), qgrams(s2, dist.q))
|
||||||
|
@ -220,4 +216,3 @@ function (dist::Overlap)(s1, s2)
|
||||||
1.0 - nintersect / min(ndistinct1, ndistinct2)
|
1.0 - nintersect / min(ndistinct1, ndistinct2)
|
||||||
end
|
end
|
||||||
|
|
||||||
(dist::Overlap)(s1, s2, ::Nothing) = dist(s1, s2)
|
|
|
@ -20,14 +20,16 @@ function (dist::Normalized{<: QGramDistance})(s1, s2, max_dist = 1.0)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 <= dist.dist.q - 1 && return convert(Float64, s1 != s2)
|
len1 <= dist.dist.q - 1 && return convert(Float64, s1 != s2)
|
||||||
if typeof(dist.dist) <: QGram
|
if typeof(dist.dist) <: QGram
|
||||||
dist.dist(s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
|
out = dist.dist(s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
|
||||||
else
|
else
|
||||||
dist.dist(s1, s2)
|
out = dist.dist(s1, s2)
|
||||||
end
|
end
|
||||||
|
out > max_dist ? 1.0 : out
|
||||||
end
|
end
|
||||||
|
|
||||||
function (dist::Normalized)(s1, s2, max_dist = 1.0)
|
function (dist::Normalized)(s1, s2, max_dist = 1.0)
|
||||||
dist.dist(s1, s2)
|
out = dist.dist(s1, s2)
|
||||||
|
out > max_dist ? 1.0 : out
|
||||||
end
|
end
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
@ -67,12 +69,12 @@ normalize(dist::Winkler) = dist
|
||||||
|
|
||||||
function (dist::Winkler)(s1, s2, max_dist = 1.0)
|
function (dist::Winkler)(s1, s2, max_dist = 1.0)
|
||||||
# cannot do max_dist because of boosting threshold
|
# cannot do max_dist because of boosting threshold
|
||||||
score = dist.dist(s1, s2)
|
out = dist.dist(s1, s2)
|
||||||
if score <= 1 - dist.threshold
|
if out <= 1 - dist.threshold
|
||||||
l = common_prefix(s1, s2)[1]
|
l = common_prefix(s1, s2)[1]
|
||||||
score -= min(l, dist.maxlength) * dist.p * score
|
out -= min(l, dist.maxlength) * dist.p * out
|
||||||
end
|
end
|
||||||
return score
|
out > max_dist ? 1.0 : out
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
@ -121,13 +123,14 @@ function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0
|
||||||
max_dist = min(max_dist, score_sort)
|
max_dist = min(max_dist, score_sort)
|
||||||
score_set = 1 - unbase_scale * partial_scale *
|
score_set = 1 - unbase_scale * partial_scale *
|
||||||
(1 - TokenSet(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
(1 - TokenSet(partial_dist)(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
||||||
return min(score, score_partial, score_sort, score_set)
|
out = min(score, score_partial, score_sort, score_set)
|
||||||
else
|
else
|
||||||
score_sort = 1 - unbase_scale *
|
score_sort = 1 - unbase_scale *
|
||||||
(1 - TokenSort(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
|
(1 - TokenSort(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
|
||||||
max_dist = min(max_dist, score_sort)
|
max_dist = min(max_dist, score_sort)
|
||||||
score_set = 1 - unbase_scale *
|
score_set = 1 - unbase_scale *
|
||||||
(1 - TokenSet(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
|
(1 - TokenSet(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
|
||||||
return min(score, score_sort, score_set)
|
out = min(score, score_sort, score_set)
|
||||||
end
|
end
|
||||||
|
out > max_dist ? 1.0 : out
|
||||||
end
|
end
|
Loading…
Reference in New Issue