tag new version

pull/57/head v0.11.1
matthieugomez 2021-09-26 10:47:30 -04:00
parent acf6623c2d
commit 9fb099e2aa
4 changed files with 16 additions and 20 deletions

View File

@ -1,6 +1,6 @@
name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.11.0"
version = "0.11.1"
[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

View File

@ -2,10 +2,10 @@ module StringDistances
using Distances
import StatsAPI: pairwise, pairwise!
# Distances API
abstract type StringSemiMetric <: SemiMetric end
abstract type StringMetric <: Metric end
(dist::Union{StringSemiMetric, StringMetric})(s1, s2; max_dist = nothing) = dist(s1, s2)
const StringDistance = Union{StringSemiMetric, StringMetric}
function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::Type, s2::Type)
T = typeof(dist("", ""))
if (Missing <: s1) | (Missing <: s2)
@ -15,6 +15,9 @@ function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1::
end
# Value-based convenience method: take the runtime types of the two arguments
# and call `result_type` again with those types.
function Distances.result_type(dist::Union{StringSemiMetric, StringMetric}, s1, s2)
    T1 = typeof(s1)
    T2 = typeof(s2)
    return result_type(dist, T1, T2)
end
# Generic call method for any string distance: the `max_dist` keyword is
# accepted but not used; the call is forwarded to `dist(s1, s2)` without it.
function (dist::Union{StringSemiMetric, StringMetric})(s1, s2; max_dist = nothing)
    return dist(s1, s2)
end
include("utils.jl")
include("distances/edit.jl")
include("distances/qgram.jl")
@ -24,7 +27,6 @@ include("find.jl")
include("fuzzywuzzy.jl")
const StringDistance = Union{StringSemiMetric, StringMetric}
##############################################################################
##
## Export

View File

@ -34,15 +34,15 @@ function (dist::Partial)(s1, s2; max_dist = nothing)
return out
end
# specialized (faster) version for RatcliffObershelp
function (dist::Partial{<: Union{RatcliffObershelp, Normalized{RatcliffObershelp}}})(s1, s2; max_dist = nothing)
(s1 === missing) | (s2 === missing) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return dist.dist(s1, s2)
out = 1.0
for r in matching_blocks(s1, s2, 1, 1, len1, len2)
for s2_start in matching_blocks(s1, s2, 1, 1, len1, len2)
# Make sure the substring of s2 has length len1
s2_start = r[2] - r[1] + 1
if s2_start < 1
s2_start = 1
elseif s2_start + len1 - 1 > len2
@ -56,20 +56,16 @@ function (dist::Partial{<: Union{RatcliffObershelp, Normalized{RatcliffObershelp
end
# Allocate the working containers and delegate to `matching_blocks!`,
# returning whatever that call returns.
# `x` is the (initially empty) set handed to `matching_blocks!`.
# `p` is an Int vector of max(end1 - start1, end2 - start2) + 1 zeros that is
# passed along as a scratch buffer.
function matching_blocks(s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer)
# NOTE(review): `x` is assigned twice in a row, so only the second value
# (`Set{Int}`) is ever used. Presumably the first assignment is the
# pre-change side of this commit view; confirm against the repository.
x = Set{Tuple{Int, Int, Int}}()
x = Set{Int}()
p = zeros(Int, max(end1 - start1, end2 - start2) + 1)
matching_blocks!(x, p, s1, s2, start1, start2, end1, end2)
end
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, p::Vector{Int}, s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer)
function matching_blocks!(x::Set{Int}, p::Vector{Int}, s1, s2, start1::Integer, start2::Integer, end1::Integer, end2::Integer)
j1, j2, len = longest_common_pattern!(p, s1, s2, start1, start2, end1, end2)
# exit if there is no common substring
len == 0 && return x
# add the info of the common to the existing set
push!(x, (j1, j2, len))
# add the longest common substring that happens before
push!(x, j2 - j1 + 1)
matching_blocks!(x, p, s1, s2, start1, start2, j1 - 1, j2 - 1)
# add the longest common substring that happens after
matching_blocks!(x, p, s1, s2, j1 + len, j2 + len, end1, end2)
return x
end
@ -137,10 +133,9 @@ function (dist::TokenSet)(s1::Union{AbstractString, Missing}, s2::Union{Abstract
s1 = join(v1, " ")
s2 = join(v2, " ")
isempty(s0) && return dist.dist(s1, s2; max_dist = max_dist)
out_01 = dist.dist(s0, s1; max_dist = max_dist)
out_02 = dist.dist(s0, s2; max_dist = max_dist)
out_12 = dist.dist(s1, s2; max_dist = max_dist)
min(out_01, out_02, out_12)
min(dist.dist(s0, s1; max_dist = max_dist),
dist.dist(s0, s2; max_dist = max_dist),
dist.dist(s1, s2; max_dist = max_dist))
end
# Normalizing a TokenSet pushes the normalization down to the wrapped distance:
# build `TokenSet(Normalized(dist.dist))` and wrap it in a `Normalized` whose
# type parameter is the type of that TokenSet.
function Normalized(dist::TokenSet)
    inner = TokenSet(Normalized(dist.dist))
    return Normalized{typeof(inner)}(inner)
end

View File

@ -26,12 +26,11 @@ string_with_length(s::AbstractString) = StringWithLength(s, length(s))
# Already a StringWithLength: hand it back untouched so it is never wrapped
# a second time (not strictly required, but avoids nested wrappers).
function string_with_length(s::StringWithLength)
    return s
end
# The length is read from the `l` field of the wrapper.
function Base.length(s::StringWithLength)
    return s.l
end
Base.iterate(s::StringWithLength, i::Integer = firstindex(s.s)) = iterate(s.s, i)
Base.iterate(s::StringWithLength) = iterate(s.s)
Base.iterate(s::StringWithLength, i::Integer) = iterate(s.s, i)
# Forward `nextind` to the wrapped string `s.s`; `n` defaults to 1.
function Base.nextind(s::StringWithLength, i::Int, n::Int = 1)
    return nextind(s.s, i, n)
end
# Forward `ncodeunits` to the wrapped string `s.s`.
function Base.ncodeunits(s::StringWithLength)
    return ncodeunits(s.s)
end
# Forward the index-validity check to the wrapped string `s.s`.
function Base.isvalid(s::StringWithLength, i::Int)
    return isvalid(s.s, i)
end
function reorder(s1::AbstractString, s2::AbstractString)
s1 = string_with_length(s1)
s2 = string_with_length(s2)