188 lines
6.5 KiB
Julia
Executable File
188 lines
6.5 KiB
Julia
Executable File
"""
|
|
Partial(dist)
|
|
|
|
Creates the `Partial{dist}` distance.
|
|
|
|
`Partial{dist}` returns the minimum distance between the shorter string and substrings of the longer string that have a length equal to the shorter string.
|
|
|
|
See http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
|
|
|
### Examples
|
|
```julia-repl
|
|
julia> s1 = "New York Mets vs Atlanta Braves"
|
|
julia> s2 = "Atlanta Braves vs New York Mets"
|
|
julia> Partial(RatcliffObershelp())(s1, s2)
|
|
0.5483870967741935
|
|
```
|
|
"""
|
|
struct Partial{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
|
dist::S
|
|
end
|
|
|
|
function (dist::Partial)(s1, s2; max_dist = nothing)
|
|
(s1 === missing) | (s2 === missing) && return missing
|
|
s1, s2 = reorder(s1, s2)
|
|
len1, len2 = length(s1), length(s2)
|
|
out = dist.dist(s1, s2; max_dist = max_dist)
|
|
max_dist0 = (max_dist !== nothing) ? min(max_dist, out) : out
|
|
((len1 == 0) | (len1 == len2)) && return out
|
|
for x in qgrams(s2, len1)
|
|
curr = dist.dist(s1, x; max_dist = max_dist0)
|
|
out = min(out, curr)
|
|
max_dist0 = min(max_dist0, curr)
|
|
end
|
|
return out
|
|
end
|
|
|
|
# specialized (faster) version for RatcliffObershelp
|
|
function (dist::Partial{<: Union{RatcliffObershelp, Normalized{RatcliffObershelp}}})(s1, s2; max_dist = nothing)
|
|
(s1 === missing) | (s2 === missing) && return missing
|
|
s1, s2 = reorder(s1, s2)
|
|
len1, len2 = length(s1), length(s2)
|
|
len1 == len2 && return dist.dist(s1, s2)
|
|
out = 1.0
|
|
for s2_start in matching_blocks(s1, s2, 1, 1, len1, len2)
|
|
# Make sure the substring of s2 has length len1
|
|
if s2_start < 1
|
|
s2_start = 1
|
|
elseif s2_start + len1 - 1 > len2
|
|
s2_start += len2 - (s2_start + len1 - 1)
|
|
end
|
|
n_matched = length_matching_blocks(s1, s2, 1, s2_start, len1, s2_start + len1 - 1)
|
|
curr = 1 - 2 * n_matched / (len1 + len1)
|
|
out = min(out, curr)
|
|
end
|
|
return out
|
|
end
|
|
|
|
function matching_blocks(s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer)
|
|
x = Set{Int}()
|
|
p = zeros(Int, max(end1 - start1, end2 - start2) + 1)
|
|
matching_blocks!(x, p, s1, s2, start1, start2, end1, end2)
|
|
end
|
|
|
|
function matching_blocks!(x::Set{Int}, p::Vector{Int}, s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer)
|
|
j1, j2, len = longest_common_pattern!(p, s1, s2, start1, start2, end1, end2)
|
|
len == 0 && return x
|
|
push!(x, j2 - j1 + 1)
|
|
matching_blocks!(x, p, s1, s2, start1, start2, j1 - 1, j2 - 1)
|
|
matching_blocks!(x, p, s1, s2, j1 + len, j2 + len, end1, end2)
|
|
return x
|
|
end
|
|
|
|
Normalized(dist::Partial) = Normalized{typeof(Partial(Normalized(dist.dist)))}(Partial(Normalized(dist.dist)))
|
|
"""
|
|
TokenSort(dist)
|
|
|
|
Creates the `TokenSort{dist}` distance.
|
|
|
|
`TokenSort{dist}` returns the distance between strings after reording words alphabetically.
|
|
See http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
|
|
|
It is only defined on AbstractStrings.
|
|
|
|
### Examples
|
|
```julia-repl
|
|
julia> s1 = "New York Mets vs Atlanta Braves"
|
|
julia> s1 = "New York Mets vs Atlanta Braves"
|
|
julia> s2 = "Atlanta Braves vs New York Mets"
|
|
julia> TokenSort(RatcliffObershelp())(s1, s2)
|
|
0.0
|
|
```
|
|
"""
|
|
struct TokenSort{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
|
dist::S
|
|
end
|
|
|
|
function (dist::TokenSort)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing)
|
|
(s1 === missing) | (s2 === missing) && return missing
|
|
f = s -> join(sort!(split(s)), " ")
|
|
dist.dist(f(s1), f(s2); max_dist = max_dist)
|
|
end
|
|
|
|
Normalized(dist::TokenSort) = Normalized{typeof(TokenSort(Normalized(dist.dist)))}(TokenSort(Normalized(dist.dist)))
|
|
"""
|
|
TokenSet(dist)
|
|
|
|
Creates the `TokenSet{dist}` distance, which is only defined on AbstractStrings.
|
|
|
|
`TokenSet{dist}` returns the minimum the distances between:
|
|
[SORTED_INTERSECTION]
|
|
[SORTED_INTERSECTION] + [SORTED_REST_OF_STRING1]
|
|
[SORTED_INTERSECTION] + [SORTED_REST_OF_STRING2]
|
|
See: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
|
|
|
### Examples
|
|
```julia-repl
|
|
julia> s1 = "New York Mets vs Atlanta"
|
|
julia> s2 = "Atlanta Braves vs New York Mets"
|
|
julia> TokenSet(RatcliffObershelp())(s1, s2)
|
|
0.0
|
|
```
|
|
"""
|
|
struct TokenSet{S <: Union{StringSemiMetric, StringMetric}} <: StringSemiMetric
|
|
dist::S
|
|
end
|
|
|
|
function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing)
|
|
(s1 === missing) | (s2 === missing) && return missing
|
|
v1 = unique!(sort!(split(s1)))
|
|
v2 = unique!(sort!(split(s2)))
|
|
v0 = intersect(v1, v2)
|
|
s0 = join(v0, " ")
|
|
s1 = join(v1, " ")
|
|
s2 = join(v2, " ")
|
|
isempty(s0) && return dist.dist(s1, s2; max_dist = max_dist)
|
|
min(dist.dist(s0, s1; max_dist = max_dist),
|
|
dist.dist(s0, s2; max_dist = max_dist),
|
|
dist.dist(s1, s2; max_dist = max_dist))
|
|
end
|
|
|
|
Normalized(dist::TokenSet) = Normalized{typeof(TokenSet(Normalized(dist.dist)))}(TokenSet(Normalized(dist.dist)))
|
|
"""
|
|
TokenMax(dist)
|
|
|
|
Creates the `TokenMax{dist}` distance, which is only defined on AbstractStrings.
|
|
|
|
`TokenMax{dist}` normalizes the distance `dist` and returns the minimum of the distance,
|
|
its [`Partial`](@ref) modifier, its [`TokenSort`](@ref) modifier, and its
|
|
[`TokenSet`](@ref) modifier, with penalty terms depending on the strings lengths.
|
|
|
|
|
|
### Examples
|
|
```julia-repl
|
|
julia> s1 = "New York Mets vs Atlanta"
|
|
julia> s2 = "Atlanta Braves vs New York Mets"
|
|
julia> evaluate(TokenMax(RatcliffObershelp()), s1, s2)
|
|
0.05
|
|
```
|
|
"""
|
|
struct TokenMax{S <: Normalized} <: StringSemiMetric
|
|
dist::S
|
|
end
|
|
TokenMax(dist::Union{StringSemiMetric, StringMetric}) = TokenMax(Normalized(dist))
|
|
|
|
function (dist::TokenMax)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = 1.0)
|
|
(s1 === missing) | (s2 === missing) && return missing
|
|
s1, s2 = reorder(s1, s2)
|
|
len1, len2 = length(s1), length(s2)
|
|
dist0 = dist.dist
|
|
out = dist0(s1, s2; max_dist = max_dist)
|
|
max_dist = min(max_dist, out)
|
|
scale = 0.95
|
|
# if one string is much shorter than the other, use partial
|
|
if len2 >= 1.5 * len1
|
|
dist0 = Partial(dist0)
|
|
pscale = 0.9
|
|
pout = 1 - pscale * (1 - dist0(s1, s2; max_dist = 1 - (1 - max_dist) / pscale))
|
|
out = min(out, pout)
|
|
max_dist = min(max_dist, pout)
|
|
scale *= pscale
|
|
end
|
|
out_sort = 1 - scale * (1 - TokenSort(dist0)(s1, s2; max_dist = 1 - (1 - max_dist) / scale))
|
|
max_dist = min(max_dist, out_sort)
|
|
out_set = 1 - scale * (1 - TokenSet(dist0)(s1, s2; max_dist = 1 - (1 - max_dist) / scale))
|
|
out = min(out, out_sort, out_set)
|
|
out > max_dist ? 1.0 : out
|
|
end
|