simplify three arguments form
parent
41ccf12e45
commit
093c536377
|
@ -8,7 +8,7 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
|||
|
||||
[compat]
|
||||
julia = "1"
|
||||
Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
|
||||
Distances = "0.8.1"
|
||||
|
||||
[extras]
|
||||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||
|
|
|
@ -8,8 +8,11 @@ include("qgram.jl")
|
|||
include("modifiers.jl")
|
||||
|
||||
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
|
||||
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
|
||||
Distances.evaluate(dist::StringDistance, args...) = dist(args...)
|
||||
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
|
||||
|
||||
|
||||
|
||||
|
||||
include("find.jl")
|
||||
|
||||
##############################################################################
|
||||
|
|
22
src/edit.jl
22
src/edit.jl
|
@ -12,12 +12,11 @@ where ``m`` is the number of matching characters and
|
|||
``t`` is half the number of transpositions.
|
||||
"""
|
||||
struct Jaro <: SemiMetric end
|
||||
isnormalized(::Jaro) = true
|
||||
|
||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||
## accepts any iterator, including AbstractString
|
||||
function (dist::Jaro)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
function (dist::Jaro)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
# If both are empty, the formula in Wikipedia gives 0
|
||||
|
@ -25,6 +24,7 @@ function (dist::Jaro)(s1, s2, max_dist = nothing)
|
|||
len2 == 0 && return 0.0
|
||||
maxdist = max(0, div(len2, 2) - 1)
|
||||
flag = fill(false, len2)
|
||||
prevstate1 = firstindex(s1)
|
||||
ch1_match = Vector{eltype(s1)}(undef, len1)
|
||||
# m counts the number of matching characters
|
||||
m = 0
|
||||
|
@ -55,6 +55,7 @@ function (dist::Jaro)(s1, s2, max_dist = nothing)
|
|||
end
|
||||
x1 = iterate(s1, state1)
|
||||
i1 += 1
|
||||
prevstate1 = state1
|
||||
end
|
||||
m == 0 && return 1.0
|
||||
# t counts number of transpositions
|
||||
|
@ -82,11 +83,11 @@ substitutions of a single character) required to change one string into the othe
|
|||
struct Levenshtein <: Metric end
|
||||
|
||||
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
||||
# Return max_dist +1 if distance higher than max_dist
|
||||
# Return max_dist + 1 if distance higher than max_dist
|
||||
# This makes it possible to differentiate a distance equal to max_dist from one strictly higher
|
||||
# This is important for find_all
|
||||
function (dist::Levenshtein)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
||||
|
@ -125,8 +126,6 @@ function (dist::Levenshtein)(s1, s2, max_dist = nothing)
|
|||
return current
|
||||
end
|
||||
|
||||
|
||||
|
||||
"""
|
||||
DamerauLevenshtein()
|
||||
|
||||
|
@ -139,8 +138,9 @@ required to change one string into the other.
|
|||
struct DamerauLevenshtein <: SemiMetric end
|
||||
|
||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
# Return max_dist + 1 if distance higher than max_dist
|
||||
function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
||||
|
@ -223,10 +223,8 @@ region on either side of the longest common subsequence.
|
|||
"""
|
||||
struct RatcliffObershelp <: SemiMetric end
|
||||
|
||||
isnormalized(::RatcliffObershelp) = true
|
||||
|
||||
function (dist::RatcliffObershelp)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
function (dist::RatcliffObershelp)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2)
|
||||
|
|
|
@ -6,16 +6,12 @@ end
|
|||
|
||||
Normalize a metric, so that `evaluate` always returns a Float64 between 0 and 1 (or a `missing` if one element is missing)
|
||||
"""
|
||||
function normalize(dist::SemiMetric)
|
||||
isnormalized(dist) ? dist : Normalize{typeof(dist)}(dist)
|
||||
end
|
||||
|
||||
isnormalized(dist::SemiMetric) = false
|
||||
isnormalized(dist::Normalize) = true
|
||||
# also, a normalized distance always accepts a third argument, max_dist.
|
||||
|
||||
normalize(dist::SemiMetric) = Normalize{typeof(dist)}(dist)
|
||||
|
||||
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len2 == 0 && return 1.0
|
||||
|
@ -25,7 +21,7 @@ function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, ma
|
|||
end
|
||||
|
||||
function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
# When string length < q for qgram distance, returns s1 == s2
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
@ -37,6 +33,9 @@ function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0)
|
|||
end
|
||||
end
|
||||
|
||||
# Generic call method for a `Normalize` wrapper: evaluates the wrapped distance
# `dist.dist` on `s1` and `s2` and returns its result unchanged.
# `max_dist` is accepted (default 1.0) so the three-argument call form works,
# but it is not forwarded to the wrapped distance.
function (dist::Normalize)(s1, s2, max_dist = 1.0)
|
||||
dist.dist(s1, s2)
|
||||
end
|
||||
|
||||
"""
|
||||
Winkler(dist; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
|
||||
|
@ -55,15 +54,15 @@ struct Winkler{S <: SemiMetric} <: SemiMetric
|
|||
maxlength::Integer # max length of common prefix. Default to 4
|
||||
Winkler{S}(dist::S, p, threshold, maxlength) where {S <: SemiMetric} = new(dist, p, threshold, maxlength)
|
||||
end
|
||||
isnormalized(dist::Winkler) = true
|
||||
|
||||
# Outer constructor for the `Winkler` modifier.
#
# Arguments
#   dist      : the distance to wrap; it is passed through `normalize` first.
# Keywords
#   p         : scaling factor (default 0.1).
#   threshold : boosting threshold (default 0.7).
#   maxlength : max length of common prefix (default 4).
#
# Throws (as a plain String, kept as-is for compatibility with existing callers)
# when `p * maxlength` exceeds 1.
function Winkler(dist::SemiMetric; p = 0.1, threshold = 0.7, maxlength = 4)
    p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
    # Normalize once and reuse the result for both the type parameter and the field.
    inner = normalize(dist)
    # Forward the caller-supplied keywords rather than hard-coded defaults,
    # so that non-default `p`, `threshold` and `maxlength` actually take effect.
    return Winkler{typeof(inner)}(inner, p, threshold, maxlength)
end
|
||||
normalize(dist::Winkler) = dist
|
||||
|
||||
function (dist::Winkler)(s1, s2, max_dist = 1.0)
|
||||
# cannot do min_score because of boosting threshold
|
||||
# cannot do max_dist because of boosting threshold
|
||||
score = dist.dist(s1, s2)
|
||||
if score <= 1 - dist.threshold
|
||||
l = common_prefix(s1, s2)[1]
|
||||
|
@ -94,13 +93,13 @@ struct Partial{S <: SemiMetric} <: SemiMetric
|
|||
Partial{S}(dist::S) where {S <: SemiMetric} = new(dist)
|
||||
end
|
||||
Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
|
||||
isnormalized(dist::Partial) = true
|
||||
normalize(dist::Partial) = dist
|
||||
|
||||
function (dist::Partial)(s1, s2, max_dist = 1.0)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return dist.dist(s1, s2, max_dist)
|
||||
len1 == 0 && return 0.0
|
||||
len1 == 0 && return 1.0
|
||||
out = 1.0
|
||||
for x in qgrams(s2, len1)
|
||||
curr = dist.dist(s1, x, max_dist)
|
||||
|
@ -110,7 +109,7 @@ function (dist::Partial)(s1, s2, max_dist = 1.0)
|
|||
return out
|
||||
end
|
||||
|
||||
function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0)
|
||||
function (dist::Partial{Normalize{RatcliffObershelp}})(s1, s2, max_dist = 1.0)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return dist.dist(s1, s2)
|
||||
|
@ -127,7 +126,6 @@ function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0)
|
|||
s2_end += len2 - s2_end
|
||||
end
|
||||
curr = dist.dist(s1, _slice(s2, s2_start - 1, s2_end))
|
||||
|
||||
out = min(out, curr)
|
||||
end
|
||||
return out
|
||||
|
@ -155,7 +153,7 @@ struct TokenSort{S <: SemiMetric} <: SemiMetric
|
|||
TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist)
|
||||
end
|
||||
TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist))
|
||||
isnormalized(dist::TokenSort) = true
|
||||
normalize(dist::TokenSort) = dist
|
||||
|
||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
function (dist::TokenSort)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
|
||||
|
@ -187,6 +185,7 @@ struct TokenSet{S <: SemiMetric} <: SemiMetric
|
|||
end
|
||||
TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
|
||||
isnormalized(dist::TokenSet) = true
|
||||
normalize(dist::TokenSet) = dist
|
||||
|
||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
function (dist::TokenSet)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
|
||||
|
@ -229,7 +228,7 @@ struct TokenMax{S <: SemiMetric} <: SemiMetric
|
|||
end
|
||||
|
||||
TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
|
||||
isnormalized(dist::TokenMax) = true
|
||||
normalize(dist::TokenMax) = dist
|
||||
|
||||
function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
|
|
18
src/qgram.jl
18
src/qgram.jl
|
@ -97,7 +97,7 @@ struct QGram <: QGramDistance
|
|||
end
|
||||
|
||||
function (dist::QGram)(s1, s2)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
n = 0
|
||||
for (n1, n2) in itr
|
||||
|
@ -122,8 +122,8 @@ struct Cosine <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function (dist::Cosine)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
function (dist::Cosine)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
norm1, norm2, prodnorm = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
|
@ -149,8 +149,8 @@ struct Jaccard <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function (dist::Jaccard)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
function (dist::Jaccard)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
|
@ -176,8 +176,8 @@ struct SorensenDice <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function (dist::SorensenDice)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
function (dist::SorensenDice)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
|
@ -203,8 +203,8 @@ struct Overlap <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function (dist::Overlap)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
function (dist::Overlap)(s1, s2)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in itr
|
||||
|
|
|
@ -23,6 +23,8 @@ struct StringWithLength{T <: AbstractString} <: AbstractString
|
|||
l::Int
|
||||
end
|
||||
string_with_length(s::AbstractString) = StringWithLength(s, length(s))
|
||||
# Not really needed but avoid multi-encapsulation
|
||||
string_with_length(s::StringWithLength) = s
|
||||
Base.length(s::StringWithLength) = s.l
|
||||
Base.iterate(s::StringWithLength, i::Integer = firstindex(s.s)) = iterate(s.s, i)
|
||||
Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n)
|
||||
|
|
|
@ -37,6 +37,7 @@ using StringDistances, Unicode, Test
|
|||
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
|
||||
compare("aüa", "aua", TokenMax(RatcliffObershelp()))
|
||||
|
||||
@test compare("New York Yankees", "", Partial(Jaro())) ≈ 0.0
|
||||
@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
|
||||
@test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0
|
||||
@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444
|
||||
|
|
Loading…
Reference in New Issue