pull/17/head
matthieugomez 2019-08-17 13:12:55 -04:00
parent efcace4f03
commit a6ef80daf8
4 changed files with 11 additions and 24 deletions

View File

@ -1,6 +1,6 @@
name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.3.3"
version = "0.4.0"
[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

View File

@ -18,7 +18,7 @@ end
function compare(s1::AbstractString, s2::AbstractString,
dist::AbstractQGramDistance)
# When string length < q for qgram distance, returns s1 == s2
len1 = length(s1) ; len2 = length(s2)
len1, len2 = length(s1), length(s2)
min(len1, len2) <= (dist.N - 1) && return convert(Float64, s1 == s2)
if typeof(dist) <: QGram
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.N + 2)
@ -69,14 +69,10 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
s2, len2, s1, len1 = reorder(s1, s2)
len1 == len2 && return compare(s1, s2, dist.dist)
len1 == 0 && return compare("", "", dist.dist)
iter = QGramIterator(s2, len2, len1)
out = 0.0
x = iterate(iter)
while x !== nothing
s, state = x
curr = compare(s1, s, dist.dist)
for x in qgram_iterator(s2, len1)
curr = compare(s1, x, dist.dist)
out = max(out, curr)
x = iterate(iter, state)
end
return out
end
@ -87,8 +83,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffO
s2, len2, s1, len1 = reorder(s1, s2)
len1 == len2 && return compare(s1, s2, dist.dist)
out = 0.0
result = matching_blocks(s1, s2)
for r in result
for r in matching_blocks(s1, s2)
# here I differ from fuzz.py by making sure the substring of s2 has length len1
s2_start = r[2] - r[1] + 1
s2_end = s2_start + len1 - 1
@ -138,18 +133,15 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet)
s0 = join(v0, " ")
s1 = join(Iterators.flatten((v0, v1)), " ")
s2 = join(Iterators.flatten((v0, v2)), " ")
if isempty(s0)
# otherwise compare("", "a", dist)== 1.0
compare(s1, s2, dist.dist)
else
max(compare(s0, s1, dist.dist),
# otherwise compare("", "a", dist)== 1.0
isempty(s0) && return compare(s1, s2, dist.dist)
max(compare(s0, s1, dist.dist),
compare(s1, s2, dist.dist),
compare(s0, s2, dist.dist))
end
compare(s0, s2, dist.dist))
end
# separate 2 vectors into intersection, setdiff1, setdiff2 (all sorted)
function _separate!(v1::Vector, v2::Vector)
function _separate!(v1::AbstractVector, v2::AbstractVector)
sort!(v1)
sort!(v2)
out = eltype(v1)[]

View File

@ -219,6 +219,3 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
return x
end

View File

@ -24,11 +24,9 @@ function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1
return l, x1, x2
end
# Return start of common substring in s1, start of common substring in s2, and length of substring
# Indexes refer to character number, not index
function longest_common_substring(s1::AbstractString, s2::AbstractString, len1::Int, len2::Int)
function longest_common_substring(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
if len1 > len2
start2, start1, len = longest_common_substring(s2, s1, len2, len1)
else