update to 0.7
parent
ba5a54fa84
commit
69a008fcf8
|
@ -67,13 +67,13 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString)
|
|||
len1 == len2 && return compare(dist.dist, s1, s2)
|
||||
len1 == 0 && return compare(dist.dist, "", "")
|
||||
iter = QGramIterator(s2, len2, len1)
|
||||
state = iterate(iter)
|
||||
s, state = iterate(iter, state)
|
||||
out = compare(dist.dist, s1, s)
|
||||
while state != nothing
|
||||
s, state = iterate(iter, state)
|
||||
out = 0.0
|
||||
x = iterate(iter)
|
||||
while x != nothing
|
||||
s, state = x
|
||||
curr = compare(dist.dist, s1, s)
|
||||
out = max(out, curr)
|
||||
x = iterate(iter, state)
|
||||
end
|
||||
return out
|
||||
end
|
||||
|
@ -97,7 +97,7 @@ function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::Abstr
|
|||
s2_end += len2 - s2_end
|
||||
end
|
||||
i2_start = nextind(s2, 0, s2_start)
|
||||
i2_end = s2_end == len2 ? endof(s2) : (nextind(s2, 0, s2_end + 1) - 1)
|
||||
i2_end = s2_end == len2 ? lastindex(s2) : (nextind(s2, 0, s2_end + 1) - 1)
|
||||
curr = compare(RatcliffObershelp(), s1, SubString(s2, i2_start, i2_end))
|
||||
out = max(out, curr)
|
||||
end
|
||||
|
@ -115,12 +115,8 @@ struct TokenSort{T <: PreMetric} <: PreMetric
|
|||
end
|
||||
|
||||
function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString)
|
||||
if search(s1, Base._default_delims) > 0
|
||||
s1 = join(sort!(split(s1)), " ")
|
||||
end
|
||||
if search(s2, Base._default_delims) > 0
|
||||
s2 = join(sort!(split(s2)), " ")
|
||||
end
|
||||
compare(dist.dist, s1, s2)
|
||||
end
|
||||
|
||||
|
@ -137,8 +133,8 @@ end
|
|||
function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString)
|
||||
v0, v1, v2 = _separate!(split(s1), split(s2))
|
||||
s0 = join(v0, " ")
|
||||
s1 = join(chain(v0, v1), " ")
|
||||
s2 = join(chain(v0, v2), " ")
|
||||
s1 = join(Iterators.flatten((v0, v1)), " ")
|
||||
s2 = join(Iterators.flatten((v0, v2)), " ")
|
||||
if isempty(s0)
|
||||
# otherwise compare(dist, "", "a")== 1.0
|
||||
compare(dist.dist, s1, s2)
|
||||
|
|
|
@ -32,12 +32,12 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
|
|||
a = longest_common_substring(s1, s2)
|
||||
if a[3] > 0
|
||||
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
||||
s1before = SubString(s1, iterate(s1), chr2ind(s1, a[1]) - 1)
|
||||
s2before = SubString(s2, iterate(s2), chr2ind(s2, a[2]) - 1)
|
||||
s1before = SubString(s1, firstindex(s1), nextind(s1, 0, a[1]) - 1)
|
||||
s2before = SubString(s2, firstindex(s2), nextind(s2, 0, a[2]) - 1)
|
||||
matching_blocks!(x, s1before, s2before, start1, start2)
|
||||
if (a[1] + a[3]) <= endof(s1) && (a[2] + a[3]) <= endof(s2)
|
||||
s1after = SubString(s1, chr2ind(s1, a[1] + a[3]), endof(s1))
|
||||
s2after = SubString(s2, chr2ind(s2, a[2] + a[3]), endof(s2))
|
||||
if (a[1] + a[3]) <= lastindex(s1) && (a[2] + a[3]) <= lastindex(s2)
|
||||
s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
|
||||
s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))
|
||||
matching_blocks!(x, s1after, s2after, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
|
||||
end
|
||||
end
|
||||
|
|
|
@ -4,19 +4,23 @@
|
|||
##############################################################################
|
||||
|
||||
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
|
||||
ncu1 = ncodeunits(s1)
|
||||
ncu2 = ncodeunits(s2)
|
||||
state1 = firstindex(s1)
|
||||
state2 = firstindex(s2)
|
||||
# in case this loop never happens
|
||||
out1 = firstindex(s1)
|
||||
out2 = firstindex(s2)
|
||||
x1 = iterate(s1)
|
||||
x2 = iterate(s2)
|
||||
l = 0
|
||||
while (state1 <= ncu1) && (state2 <= ncu2) && (l < lim || lim < 0)
|
||||
ch1, nextstate1 = iterate(s1, state1)
|
||||
ch2, nextstate2 = iterate(s2, state2)
|
||||
while (x1 != nothing) && (x2 != nothing) && (l < lim || lim < 0)
|
||||
ch1, state1 = x1
|
||||
ch2, state2 = x2
|
||||
ch1 != ch2 && break
|
||||
out1 = state1
|
||||
out2 = state2
|
||||
x1 = iterate(s1, state1)
|
||||
x2 = iterate(s2, state2)
|
||||
l += 1
|
||||
state1, state2 = nextstate1, nextstate2
|
||||
end
|
||||
return l, state1, state2
|
||||
return l, out1, out2
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
|
@ -48,28 +52,26 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
|
|||
# prefix common to both strings can be ignored
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
k, start1, start2 = common_prefix(s1, s2)
|
||||
ncu1 = ncodeunits(s1)
|
||||
ncu2 = ncodeunits(s2)
|
||||
(start1 > ncu1) && return len2 - k
|
||||
x1 = iterate(s1, start1)
|
||||
(x1 == nothing) && return len2 - k
|
||||
# distance initialized to first row of matrix
|
||||
# => distance between "" and s2[1:i]
|
||||
v0 = Array{Int}(undef, len2 - k)
|
||||
for i2 in 1:(len2 - k)
|
||||
v0[i2] = i2
|
||||
end
|
||||
current = zero(0)
|
||||
state1 = start1
|
||||
current = 0
|
||||
i1 = 0
|
||||
while state1 <= ncu1
|
||||
while x1 != nothing
|
||||
i1 += 1
|
||||
ch1, state1 = iterate(s1, i1)
|
||||
ch1, state1 = x1
|
||||
left = (i1 - 1)
|
||||
current = (i1 - 1)
|
||||
state2 = start2
|
||||
i2 = 0
|
||||
while state2 <= ncu2
|
||||
x2 = iterate(s2, start2)
|
||||
while x2 != nothing
|
||||
i2 += 1
|
||||
ch2, state2 = iterate(s2, state2)
|
||||
ch2, state2 = x2
|
||||
# update
|
||||
above, current, left = current, left, v0[i2]
|
||||
if ch1 != ch2
|
||||
|
@ -79,7 +81,9 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
|
|||
left + 1)
|
||||
end
|
||||
v0[i2] = current
|
||||
x2 = iterate(s2, state2)
|
||||
end
|
||||
x1 = iterate(s1, state1)
|
||||
end
|
||||
return current
|
||||
end
|
||||
|
@ -95,36 +99,32 @@ struct DamerauLevenshtein <: SemiMetric end
|
|||
|
||||
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
ncu1 = ncodeunits(s1)
|
||||
ncu2 = ncodeunits(s2)
|
||||
# prefix common to both strings can be ignored
|
||||
k, start1, start2 = common_prefix(s1, s2)
|
||||
(start1 > ncu1) && return len2 - k
|
||||
|
||||
k, state1, start2 = common_prefix(s1, s2)
|
||||
x1 = iterate(s1, state1)
|
||||
(x1 == nothing) && return len2 - k
|
||||
v0 = Array{Int}(undef, len2 - k)
|
||||
@inbounds for i2 in 1:(len2 - k)
|
||||
v0[i2] = i2
|
||||
end
|
||||
v2 = Array{Int}(undef, len2 - k)
|
||||
|
||||
ch1, = iterate(s1, start1)
|
||||
current = 0
|
||||
state1 = start1
|
||||
i1 = 0
|
||||
while state1 <= ncu1
|
||||
ch1 = first(s1)
|
||||
while (x1 != nothing)
|
||||
i1 += 1
|
||||
prevch1 = ch1
|
||||
ch1, state1 = iterate(s1, i1)
|
||||
ch2, = iterate(s2, start2)
|
||||
ch1, state1 = x1
|
||||
x2 = iterate(s2, start2)
|
||||
left = (i1 - 1)
|
||||
current = i1
|
||||
nextTransCost = 0
|
||||
state2 = start2
|
||||
ch2, = x2
|
||||
i2 = 0
|
||||
while state2 <= ncu2
|
||||
while (x2 != nothing)
|
||||
i2 += 1
|
||||
prevch2 = ch2
|
||||
ch2, state2 = iterate(s2, state2)
|
||||
ch2, state2 = x2
|
||||
above = current
|
||||
thisTransCost = nextTransCost
|
||||
nextTransCost = v2[i2]
|
||||
|
@ -150,7 +150,9 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
|||
end
|
||||
end
|
||||
v0[i2] = current
|
||||
x2 = iterate(s2, state2)
|
||||
end
|
||||
x1 = iterate(s1, state1)
|
||||
end
|
||||
return current
|
||||
end
|
||||
|
@ -165,8 +167,6 @@ struct Jaro <: SemiMetric end
|
|||
|
||||
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
ncu1 = ncodeunits(s1)
|
||||
ncu2 = ncodeunits(s2)
|
||||
# if both are empty, m = 0, so the result should be 1.0 according to Wikipedia; this line returns 0.0 instead
|
||||
len2 == 0 && return 0.0
|
||||
maxdist = max(0, div(len2, 2) - 1)
|
||||
|
@ -174,21 +174,22 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
|||
m = 0
|
||||
flag = fill(false, len2)
|
||||
i1 = 0
|
||||
state1 = firstindex(s1)
|
||||
startstate2 = firstindex(s2)
|
||||
starti2 = 0
|
||||
i1_match = fill!(Array{typeof(state1)}(undef, len1), state1)
|
||||
while state1 <= ncu1
|
||||
ch1, newstate1 = iterate(s1, i1)
|
||||
state1 = firstindex(s1)
|
||||
i1_match = fill!(Array{Int}(undef, len1), state1)
|
||||
x1 = iterate(s1)
|
||||
while (x1 != nothing)
|
||||
ch1, newstate1 = x1
|
||||
i1 += 1
|
||||
if starti2 < i1 - maxdist - 1
|
||||
startstate2 = iterate(s2, startstate2)
|
||||
startstate2 = nextind(s2, startstate2)
|
||||
starti2 += 1
|
||||
end
|
||||
i2 = starti2
|
||||
state2 = startstate2
|
||||
while state2 <= len2 && i2 <= i1 + maxdist
|
||||
ch2, state2 = iterate(s2, state2)
|
||||
x2 = iterate(s2, startstate2)
|
||||
while (x2 != nothing) && i2 <= i1 + maxdist
|
||||
ch2, state2 = x2
|
||||
i2 += 1
|
||||
if ch1 == ch2 && !flag[i2]
|
||||
m += 1
|
||||
|
@ -196,8 +197,10 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
|||
i1_match[m] = state1
|
||||
break
|
||||
end
|
||||
x2 = iterate(s2, state2)
|
||||
end
|
||||
state1 = newstate1
|
||||
x1 = iterate(s1, state1)
|
||||
end
|
||||
# count t transpositions
|
||||
t = 0
|
||||
|
|
|
@ -11,12 +11,10 @@ struct QGramIterator{S <: AbstractString, T <: Integer}
|
|||
end
|
||||
|
||||
|
||||
function Base.iterate(qgram::QGramIterator)
|
||||
(1, qgram.l < qgram.q ? endof(qgram.s) + 1 : chr2ind(qgram.s, qgram.q))
|
||||
end
|
||||
function Base.iterate(qgram::QGramIterator, state)
|
||||
function Base.iterate(qgram::QGramIterator,
|
||||
state = (1, qgram.l < qgram.q ? lastindex(qgram.s) + 1 : nextind(qgram.s, 0, qgram.q)))
|
||||
istart, iend = state
|
||||
iend > nchodeunits(qgram.s) && return nothing
|
||||
iend > ncodeunits(qgram.s) && return nothing
|
||||
element = SubString(qgram.s, istart, iend)
|
||||
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
||||
element, nextstate
|
||||
|
@ -25,7 +23,7 @@ Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S
|
|||
Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S}
|
||||
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
||||
function Base.collect(qgram::QGramIterator)
|
||||
x = Array{eltype(qgram)}(length(qgram))
|
||||
x = Array{eltype(qgram)}(undef, length(qgram))
|
||||
i = 0
|
||||
for q in qgram
|
||||
i += 1
|
||||
|
@ -46,11 +44,10 @@ struct CountIterator{T1 <: AbstractVector, T2 <: AbstractVector}
|
|||
v1::T1
|
||||
v2::T2
|
||||
end
|
||||
Base.iterate(s::CountIterator) = (1, 1)
|
||||
|
||||
function Base.iterate(s::CountIterator, state)
|
||||
function Base.iterate(s::CountIterator, state = (1, 1))
|
||||
state1, state2 = state
|
||||
state2 > s.v2 && state1 > s.v1 && nothing
|
||||
state2 > length(s.v2) && state1 > length(s.v1) && return nothing
|
||||
iter1 = state2 > length(s.v2)
|
||||
iter2 = state1 > length(s.v1)
|
||||
if iter1
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
|
||||
using StringDistances, Test
|
||||
|
||||
|
||||
@test evaluate(Levenshtein(), "", "") == 0
|
||||
@test evaluate(Levenshtein(), "abc", "") == 3
|
||||
@test evaluate(Levenshtein(), "", "abc") == 3
|
||||
@test evaluate(Levenshtein(), "bc", "abc") == 1
|
||||
@test evaluate(Levenshtein(), "kitten", "sitting") == 3
|
||||
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
|
||||
|
||||
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
|
||||
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
|
||||
|
||||
@test evaluate(DamerauLevenshtein(), "", "") == 0
|
||||
@test evaluate(DamerauLevenshtein(), "abc", "") == 3
|
||||
@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
|
||||
@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1
|
|
@ -30,6 +30,7 @@ using StringDistances, Test
|
|||
@test evaluate(Hamming(), "testing", "this is a test") == 13
|
||||
@test evaluate(Hamming(), "saturday", "sunday") == 7
|
||||
|
||||
@test evaluate(QGram(1), "abc", "abc") == 0
|
||||
@test evaluate(QGram(1), "", "abc") == 3
|
||||
@test evaluate(QGram(1), "abc", "cba") == 0
|
||||
@test evaluate(QGram(1), "abc", "ccc") == 4
|
||||
|
@ -54,6 +55,7 @@ using StringDistances, Test
|
|||
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
|
||||
|
||||
|
||||
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
||||
|
||||
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
|
||||
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
|
||||
|
|
Loading…
Reference in New Issue