parent
acf6623c2d
commit
9fb099e2aa
|
@ -1,6 +1,6 @@
|
|||
name = "StringDistances"
|
||||
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
|
||||
version = "0.11.0"
|
||||
version = "0.11.1"
|
||||
|
||||
[deps]
|
||||
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
||||
|
|
|
@ -2,10 +2,10 @@ module StringDistances
|
|||
|
||||
using Distances
|
||||
import StatsAPI: pairwise, pairwise!
|
||||
# Distances API
|
||||
abstract type StringSemiMetric <: SemiMetric end
|
||||
abstract type StringMetric <: Metric end
|
||||
(dist::Union{StringSemiMetric, StringMetric})(s1, s2; max_dist = nothing) = dist(s1, s2)
|
||||
|
||||
const StringDistance = Union{StringSemiMetric, StringMetric}
|
||||
function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::Type, s2::Type)
|
||||
T = typeof(dist("", ""))
|
||||
if (Missing <: s1) | (Missing <: s2)
|
||||
|
@ -15,6 +15,9 @@ function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::
|
|||
end
|
||||
Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1, s2) = result_type(dist, typeof(s1), typeof(s2))
|
||||
|
||||
|
||||
|
||||
(dist::Union{StringSemiMetric, StringMetric})(s1, s2; max_dist = nothing) = dist(s1, s2)
|
||||
include("utils.jl")
|
||||
include("distances/edit.jl")
|
||||
include("distances/qgram.jl")
|
||||
|
@ -24,7 +27,6 @@ include("find.jl")
|
|||
include("fuzzywuzzy.jl")
|
||||
|
||||
|
||||
const StringDistance = Union{StringSemiMetric, StringMetric}
|
||||
##############################################################################
|
||||
##
|
||||
## Export
|
||||
|
|
|
@ -34,15 +34,15 @@ function (dist::Partial)(s1, s2; max_dist = nothing)
|
|||
return out
|
||||
end
|
||||
|
||||
# specialized (faster) version for RatcliffObershelp
|
||||
function (dist::Partial{<: Union{RatcliffObershelp, Normalized{RatcliffObershelp}}})(s1, s2; max_dist = nothing)
|
||||
(s1 === missing) | (s2 === missing) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return dist.dist(s1, s2)
|
||||
out = 1.0
|
||||
for r in matching_blocks(s1, s2, 1, 1, len1, len2)
|
||||
for s2_start in matching_blocks(s1, s2, 1, 1, len1, len2)
|
||||
# Make sure the substring of s2 has length len1
|
||||
s2_start = r[2] - r[1] + 1
|
||||
if s2_start < 1
|
||||
s2_start = 1
|
||||
elseif s2_start + len1 - 1 > len2
|
||||
|
@ -56,20 +56,16 @@ function (dist::Partial{<: Union{RatcliffObershelp, Normalized{RatcliffObershelp
|
|||
end
|
||||
|
||||
function matching_blocks(s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer)
|
||||
x = Set{Tuple{Int, Int, Int}}()
|
||||
x = Set{Int}()
|
||||
p = zeros(Int, max(end1 - start1, end2 - start2) + 1)
|
||||
matching_blocks!(x, p, s1, s2, start1, start2, end1, end2)
|
||||
end
|
||||
|
||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, p::Vector{Int}, s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer)
|
||||
function matching_blocks!(x::Set{Int}, p::Vector{Int}, s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer)
|
||||
j1, j2, len = longest_common_pattern!(p, s1, s2, start1, start2, end1, end2)
|
||||
# exit if there is no common substring
|
||||
len == 0 && return x
|
||||
# add the info of the common to the existing set
|
||||
push!(x, (j1, j2, len))
|
||||
# add the longest common substring that happens before
|
||||
push!(x, j2 - j1 + 1)
|
||||
matching_blocks!(x, p, s1, s2, start1, start2, j1 - 1, j2 - 1)
|
||||
# add the longest common substring that happens after
|
||||
matching_blocks!(x, p, s1, s2, j1 + len, j2 + len, end1, end2)
|
||||
return x
|
||||
end
|
||||
|
@ -137,10 +133,9 @@ function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{Abstract
|
|||
s1 = join(v1, " ")
|
||||
s2 = join(v2, " ")
|
||||
isempty(s0) && return dist.dist(s1, s2; max_dist = max_dist)
|
||||
out_01 = dist.dist(s0, s1; max_dist = max_dist)
|
||||
out_02 = dist.dist(s0, s2; max_dist = max_dist)
|
||||
out_12 = dist.dist(s1, s2; max_dist = max_dist)
|
||||
min(out_01, out_02, out_12)
|
||||
min(dist.dist(s0, s1; max_dist = max_dist),
|
||||
dist.dist(s0, s2; max_dist = max_dist),
|
||||
dist.dist(s1, s2; max_dist = max_dist))
|
||||
end
|
||||
|
||||
Normalized(dist::TokenSet) = Normalized{typeof(TokenSet(Normalized(dist.dist)))}(TokenSet(Normalized(dist.dist)))
|
||||
|
|
|
@ -26,12 +26,11 @@ string_with_length(s::AbstractString) = StringWithLength(s, length(s))
|
|||
# Not really needed but avoid multi-encapsulation
|
||||
string_with_length(s::StringWithLength) = s
|
||||
Base.length(s::StringWithLength) = s.l
|
||||
Base.iterate(s::StringWithLength, i::Integer = firstindex(s.s)) = iterate(s.s, i)
|
||||
Base.iterate(s::StringWithLength) = iterate(s.s)
|
||||
Base.iterate(s::StringWithLength, i::Integer) = iterate(s.s, i)
|
||||
Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n)
|
||||
Base.ncodeunits(s::StringWithLength) = ncodeunits(s.s)
|
||||
Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i)
|
||||
|
||||
|
||||
function reorder(s1::AbstractString, s2::AbstractString)
|
||||
s1 = string_with_length(s1)
|
||||
s2 = string_with_length(s2)
|
||||
|
|
Loading…
Reference in New Issue