Update normalize.jl

pull/57/head
matthieugomez 2021-09-14 14:04:49 -04:00
parent 5c8109833c
commit ac6d315dc9
1 changed files with 6 additions and 5 deletions

View File

@ -3,7 +3,8 @@
"""
Normalized(dist::Union{StringSemiMetric, StringMetric})
Creates a normalized distance. The normalized distance always returns a Float64 between 0.0 and 1.0 (or `missing` if one of the arguments is missing)
Creates a normalized distance. The normalized distance always returns a Float64 between 0.0 and 1.0 (or `missing` if one of the arguments is missing).
A Normalized Distance has a keyword argument `max_dist` that defaults to 1.0. It returns 1.0 if the true distance is higher than `max_dist`.
### Examples
```julia-repl
@ -20,7 +21,7 @@ struct Normalized{T <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetri
end
Normalized(dist::Normalized) = dist
# this basically says that all distances are considered to be normalized by default
# Consider all distances to be normalized by default
function (dist::Normalized)(s1, s2; max_dist = 1.0)
out = dist.dist(s1, s2; max_dist = max_dist)
max_dist !== nothing && out > max_dist && return 1.0
@ -29,9 +30,9 @@ end
function (dist::Normalized{<:Union{Hamming, DamerauLevenshtein}})(s1, s2; max_dist = 1.0)
(s1 === missing) | (s2 === missing) && return missing
isempty(s1) && isempty(s2) && return 0.0
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 == 0 && return 0.0
out = dist.dist(s1, s2) / len2
max_dist !== nothing && out > max_dist && return 1.0
return out
@ -39,9 +40,9 @@ end
function (dist::Normalized{<:Union{Levenshtein, OptimalStringAlignement}})(s1, s2; max_dist = 1.0)
(s1 === missing) | (s2 === missing) && return missing
isempty(s1) && isempty(s2) && return 0.0
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 == 0 && return 0.0
if max_dist == 1.0
d = dist.dist(s1, s2)
else
@ -57,7 +58,7 @@ function (dist::Normalized{<:AbstractQGramDistance})(s1, s2; max_dist = 1.0)
# When string length < q for qgram distance, returns Float64(s1 != s2), i.e. 0.0 if the strings are equal and 1.0 otherwise
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 <= dist.dist.q - 1 && return convert(Float64, s1 != s2)
len1 <= dist.dist.q - 1 && return Float64(s1 != s2)
if dist.dist isa QGram
out = dist.dist(s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
else