update to 0.7
parent
ba5a54fa84
commit
69a008fcf8
|
@ -67,13 +67,13 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString)
|
||||||
len1 == len2 && return compare(dist.dist, s1, s2)
|
len1 == len2 && return compare(dist.dist, s1, s2)
|
||||||
len1 == 0 && return compare(dist.dist, "", "")
|
len1 == 0 && return compare(dist.dist, "", "")
|
||||||
iter = QGramIterator(s2, len2, len1)
|
iter = QGramIterator(s2, len2, len1)
|
||||||
state = iterate(iter)
|
out = 0.0
|
||||||
s, state = iterate(iter, state)
|
x = iterate(iter)
|
||||||
out = compare(dist.dist, s1, s)
|
while x != nothing
|
||||||
while state != nothing
|
s, state = x
|
||||||
s, state = iterate(iter, state)
|
|
||||||
curr = compare(dist.dist, s1, s)
|
curr = compare(dist.dist, s1, s)
|
||||||
out = max(out, curr)
|
out = max(out, curr)
|
||||||
|
x = iterate(iter, state)
|
||||||
end
|
end
|
||||||
return out
|
return out
|
||||||
end
|
end
|
||||||
|
@ -97,7 +97,7 @@ function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::Abstr
|
||||||
s2_end += len2 - s2_end
|
s2_end += len2 - s2_end
|
||||||
end
|
end
|
||||||
i2_start = nextind(s2, 0, s2_start)
|
i2_start = nextind(s2, 0, s2_start)
|
||||||
i2_end = s2_end == len2 ? endof(s2) : (nextind(s2, 0, s2_end + 1) - 1)
|
i2_end = s2_end == len2 ? lastindex(s2) : (nextind(s2, 0, s2_end + 1) - 1)
|
||||||
curr = compare(RatcliffObershelp(), s1, SubString(s2, i2_start, i2_end))
|
curr = compare(RatcliffObershelp(), s1, SubString(s2, i2_start, i2_end))
|
||||||
out = max(out, curr)
|
out = max(out, curr)
|
||||||
end
|
end
|
||||||
|
@ -115,12 +115,8 @@ struct TokenSort{T <: PreMetric} <: PreMetric
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString)
|
function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString)
|
||||||
if search(s1, Base._default_delims) > 0
|
s1 = join(sort!(split(s1)), " ")
|
||||||
s1 = join(sort!(split(s1)), " ")
|
s2 = join(sort!(split(s2)), " ")
|
||||||
end
|
|
||||||
if search(s2, Base._default_delims) > 0
|
|
||||||
s2 = join(sort!(split(s2)), " ")
|
|
||||||
end
|
|
||||||
compare(dist.dist, s1, s2)
|
compare(dist.dist, s1, s2)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -137,8 +133,8 @@ end
|
||||||
function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString)
|
function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString)
|
||||||
v0, v1, v2 = _separate!(split(s1), split(s2))
|
v0, v1, v2 = _separate!(split(s1), split(s2))
|
||||||
s0 = join(v0, " ")
|
s0 = join(v0, " ")
|
||||||
s1 = join(chain(v0, v1), " ")
|
s1 = join(Iterators.flatten((v0, v1)), " ")
|
||||||
s2 = join(chain(v0, v2), " ")
|
s2 = join(Iterators.flatten((v0, v2)), " ")
|
||||||
if isempty(s0)
|
if isempty(s0)
|
||||||
# otherwise compare(dist, "", "a")== 1.0
|
# otherwise compare(dist, "", "a")== 1.0
|
||||||
compare(dist.dist, s1, s2)
|
compare(dist.dist, s1, s2)
|
||||||
|
|
|
@ -32,12 +32,12 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
|
||||||
a = longest_common_substring(s1, s2)
|
a = longest_common_substring(s1, s2)
|
||||||
if a[3] > 0
|
if a[3] > 0
|
||||||
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
||||||
s1before = SubString(s1, iterate(s1), chr2ind(s1, a[1]) - 1)
|
s1before = SubString(s1, firstindex(s1), nextind(s1, 0, a[1]) - 1)
|
||||||
s2before = SubString(s2, iterate(s2), chr2ind(s2, a[2]) - 1)
|
s2before = SubString(s2, firstindex(s2), nextind(s2, 0, a[2]) - 1)
|
||||||
matching_blocks!(x, s1before, s2before, start1, start2)
|
matching_blocks!(x, s1before, s2before, start1, start2)
|
||||||
if (a[1] + a[3]) <= endof(s1) && (a[2] + a[3]) <= endof(s2)
|
if (a[1] + a[3]) <= lastindex(s1) && (a[2] + a[3]) <= lastindex(s2)
|
||||||
s1after = SubString(s1, chr2ind(s1, a[1] + a[3]), endof(s1))
|
s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
|
||||||
s2after = SubString(s2, chr2ind(s2, a[2] + a[3]), endof(s2))
|
s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))
|
||||||
matching_blocks!(x, s1after, s2after, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
|
matching_blocks!(x, s1after, s2after, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -4,19 +4,23 @@
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
|
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
|
||||||
ncu1 = ncodeunits(s1)
|
# in case this loop never happens
|
||||||
ncu2 = ncodeunits(s2)
|
out1 = firstindex(s1)
|
||||||
state1 = firstindex(s1)
|
out2 = firstindex(s2)
|
||||||
state2 = firstindex(s2)
|
x1 = iterate(s1)
|
||||||
|
x2 = iterate(s2)
|
||||||
l = 0
|
l = 0
|
||||||
while (state1 <= ncu1) && (state2 <= ncu2) && (l < lim || lim < 0)
|
while (x1 != nothing) && (x2 != nothing) && (l < lim || lim < 0)
|
||||||
ch1, nextstate1 = iterate(s1, state1)
|
ch1, state1 = x1
|
||||||
ch2, nextstate2 = iterate(s2, state2)
|
ch2, state2 = x2
|
||||||
ch1 != ch2 && break
|
ch1 != ch2 && break
|
||||||
|
out1 = state1
|
||||||
|
out2 = state2
|
||||||
|
x1 = iterate(s1, state1)
|
||||||
|
x2 = iterate(s2, state2)
|
||||||
l += 1
|
l += 1
|
||||||
state1, state2 = nextstate1, nextstate2
|
|
||||||
end
|
end
|
||||||
return l, state1, state2
|
return l, out1, out2
|
||||||
end
|
end
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
@ -48,28 +52,26 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
|
||||||
# prefix common to both strings can be ignored
|
# prefix common to both strings can be ignored
|
||||||
s2, len2, s1, len1 = reorder(s1, s2)
|
s2, len2, s1, len1 = reorder(s1, s2)
|
||||||
k, start1, start2 = common_prefix(s1, s2)
|
k, start1, start2 = common_prefix(s1, s2)
|
||||||
ncu1 = ncodeunits(s1)
|
x1 = iterate(s1, start1)
|
||||||
ncu2 = ncodeunits(s2)
|
(x1 == nothing) && return len2 - k
|
||||||
(start1 > ncu1) && return len2 - k
|
|
||||||
# distance initialized to first row of matrix
|
# distance initialized to first row of matrix
|
||||||
# => distance between "" and s2[1:i]
|
# => distance between "" and s2[1:i]
|
||||||
v0 = Array{Int}(undef, len2 - k)
|
v0 = Array{Int}(undef, len2 - k)
|
||||||
for i2 in 1:(len2 - k)
|
for i2 in 1:(len2 - k)
|
||||||
v0[i2] = i2
|
v0[i2] = i2
|
||||||
end
|
end
|
||||||
current = zero(0)
|
current = 0
|
||||||
state1 = start1
|
|
||||||
i1 = 0
|
i1 = 0
|
||||||
while state1 <= ncu1
|
while x1 != nothing
|
||||||
i1 += 1
|
i1 += 1
|
||||||
ch1, state1 = iterate(s1, i1)
|
ch1, state1 = x1
|
||||||
left = (i1 - 1)
|
left = (i1 - 1)
|
||||||
current = (i1 - 1)
|
current = (i1 - 1)
|
||||||
state2 = start2
|
|
||||||
i2 = 0
|
i2 = 0
|
||||||
while state2 <= ncu2
|
x2 = iterate(s2, start2)
|
||||||
|
while x2 != nothing
|
||||||
i2 += 1
|
i2 += 1
|
||||||
ch2, state2 = iterate(s2, state2)
|
ch2, state2 = x2
|
||||||
# update
|
# update
|
||||||
above, current, left = current, left, v0[i2]
|
above, current, left = current, left, v0[i2]
|
||||||
if ch1 != ch2
|
if ch1 != ch2
|
||||||
|
@ -79,7 +81,9 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
|
||||||
left + 1)
|
left + 1)
|
||||||
end
|
end
|
||||||
v0[i2] = current
|
v0[i2] = current
|
||||||
|
x2 = iterate(s2, state2)
|
||||||
end
|
end
|
||||||
|
x1 = iterate(s1, state1)
|
||||||
end
|
end
|
||||||
return current
|
return current
|
||||||
end
|
end
|
||||||
|
@ -95,36 +99,32 @@ struct DamerauLevenshtein <: SemiMetric end
|
||||||
|
|
||||||
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
|
||||||
s2, len2, s1, len1 = reorder(s1, s2)
|
s2, len2, s1, len1 = reorder(s1, s2)
|
||||||
ncu1 = ncodeunits(s1)
|
|
||||||
ncu2 = ncodeunits(s2)
|
|
||||||
# prefix common to both strings can be ignored
|
# prefix common to both strings can be ignored
|
||||||
k, start1, start2 = common_prefix(s1, s2)
|
k, state1, start2 = common_prefix(s1, s2)
|
||||||
(start1 > ncu1) && return len2 - k
|
x1 = iterate(s1, state1)
|
||||||
|
(x1 == nothing) && return len2 - k
|
||||||
v0 = Array{Int}(undef, len2 - k)
|
v0 = Array{Int}(undef, len2 - k)
|
||||||
@inbounds for i2 in 1:(len2 - k)
|
@inbounds for i2 in 1:(len2 - k)
|
||||||
v0[i2] = i2
|
v0[i2] = i2
|
||||||
end
|
end
|
||||||
v2 = Array{Int}(undef, len2 - k)
|
v2 = Array{Int}(undef, len2 - k)
|
||||||
|
|
||||||
ch1, = iterate(s1, start1)
|
|
||||||
current = 0
|
current = 0
|
||||||
state1 = start1
|
|
||||||
i1 = 0
|
i1 = 0
|
||||||
while state1 <= ncu1
|
ch1 = first(s1)
|
||||||
|
while (x1 != nothing)
|
||||||
i1 += 1
|
i1 += 1
|
||||||
prevch1 = ch1
|
prevch1 = ch1
|
||||||
ch1, state1 = iterate(s1, i1)
|
ch1, state1 = x1
|
||||||
ch2, = iterate(s2, start2)
|
x2 = iterate(s2, start2)
|
||||||
left = (i1 - 1)
|
left = (i1 - 1)
|
||||||
current = i1
|
current = i1
|
||||||
nextTransCost = 0
|
nextTransCost = 0
|
||||||
state2 = start2
|
ch2, = x2
|
||||||
i2 = 0
|
i2 = 0
|
||||||
while state2 <= ncu2
|
while (x2 != nothing)
|
||||||
i2 += 1
|
i2 += 1
|
||||||
prevch2 = ch2
|
prevch2 = ch2
|
||||||
ch2, state2 = iterate(s2, state2)
|
ch2, state2 = x2
|
||||||
above = current
|
above = current
|
||||||
thisTransCost = nextTransCost
|
thisTransCost = nextTransCost
|
||||||
nextTransCost = v2[i2]
|
nextTransCost = v2[i2]
|
||||||
|
@ -150,7 +150,9 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
v0[i2] = current
|
v0[i2] = current
|
||||||
|
x2 = iterate(s2, state2)
|
||||||
end
|
end
|
||||||
|
x1 = iterate(s1, state1)
|
||||||
end
|
end
|
||||||
return current
|
return current
|
||||||
end
|
end
|
||||||
|
@ -165,8 +167,6 @@ struct Jaro <: SemiMetric end
|
||||||
|
|
||||||
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
||||||
s2, len2, s1, len1 = reorder(s1, s2)
|
s2, len2, s1, len1 = reorder(s1, s2)
|
||||||
ncu1 = ncodeunits(s1)
|
|
||||||
ncu2 = ncodeunits(s2)
|
|
||||||
# If both strings are empty, m = 0, so the result would be 1.0 according to Wikipedia. This early return is added so that this is not the case.
|
# If both strings are empty, m = 0, so the result would be 1.0 according to Wikipedia. This early return is added so that this is not the case.
|
||||||
len2 == 0 && return 0.0
|
len2 == 0 && return 0.0
|
||||||
maxdist = max(0, div(len2, 2) - 1)
|
maxdist = max(0, div(len2, 2) - 1)
|
||||||
|
@ -174,21 +174,22 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
||||||
m = 0
|
m = 0
|
||||||
flag = fill(false, len2)
|
flag = fill(false, len2)
|
||||||
i1 = 0
|
i1 = 0
|
||||||
state1 = firstindex(s1)
|
|
||||||
startstate2 = firstindex(s2)
|
startstate2 = firstindex(s2)
|
||||||
starti2 = 0
|
starti2 = 0
|
||||||
i1_match = fill!(Array{typeof(state1)}(undef, len1), state1)
|
state1 = firstindex(s1)
|
||||||
while state1 <= ncu1
|
i1_match = fill!(Array{Int}(undef, len1), state1)
|
||||||
ch1, newstate1 = iterate(s1, i1)
|
x1 = iterate(s1)
|
||||||
|
while (x1 != nothing)
|
||||||
|
ch1, newstate1 = x1
|
||||||
i1 += 1
|
i1 += 1
|
||||||
if starti2 < i1 - maxdist - 1
|
if starti2 < i1 - maxdist - 1
|
||||||
startstate2 = iterate(s2, startstate2)
|
startstate2 = nextind(s2, startstate2)
|
||||||
starti2 += 1
|
starti2 += 1
|
||||||
end
|
end
|
||||||
i2 = starti2
|
i2 = starti2
|
||||||
state2 = startstate2
|
x2 = iterate(s2, startstate2)
|
||||||
while state2 <= len2 && i2 <= i1 + maxdist
|
while (x2 != nothing) && i2 <= i1 + maxdist
|
||||||
ch2, state2 = iterate(s2, state2)
|
ch2, state2 = x2
|
||||||
i2 += 1
|
i2 += 1
|
||||||
if ch1 == ch2 && !flag[i2]
|
if ch1 == ch2 && !flag[i2]
|
||||||
m += 1
|
m += 1
|
||||||
|
@ -196,8 +197,10 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
||||||
i1_match[m] = state1
|
i1_match[m] = state1
|
||||||
break
|
break
|
||||||
end
|
end
|
||||||
|
x2 = iterate(s2, state2)
|
||||||
end
|
end
|
||||||
state1 = newstate1
|
state1 = newstate1
|
||||||
|
x1 = iterate(s1, state1)
|
||||||
end
|
end
|
||||||
# count t transpositions
|
# count t transpositions
|
||||||
t = 0
|
t = 0
|
||||||
|
|
|
@ -11,12 +11,10 @@ struct QGramIterator{S <: AbstractString, T <: Integer}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
function Base.iterate(qgram::QGramIterator)
|
function Base.iterate(qgram::QGramIterator,
|
||||||
(1, qgram.l < qgram.q ? endof(qgram.s) + 1 : chr2ind(qgram.s, qgram.q))
|
state = (1, qgram.l < qgram.q ? lastindex(qgram.s) + 1 : nextind(qgram.s, 0, qgram.q)))
|
||||||
end
|
|
||||||
function Base.iterate(qgram::QGramIterator, state)
|
|
||||||
istart, iend = state
|
istart, iend = state
|
||||||
iend > nchodeunits(qgram.s) && return nothing
|
iend > ncodeunits(qgram.s) && return nothing
|
||||||
element = SubString(qgram.s, istart, iend)
|
element = SubString(qgram.s, istart, iend)
|
||||||
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
||||||
element, nextstate
|
element, nextstate
|
||||||
|
@ -25,7 +23,7 @@ Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S
|
||||||
Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S}
|
Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S}
|
||||||
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
||||||
function Base.collect(qgram::QGramIterator)
|
function Base.collect(qgram::QGramIterator)
|
||||||
x = Array{eltype(qgram)}(length(qgram))
|
x = Array{eltype(qgram)}(undef, length(qgram))
|
||||||
i = 0
|
i = 0
|
||||||
for q in qgram
|
for q in qgram
|
||||||
i += 1
|
i += 1
|
||||||
|
@ -46,11 +44,10 @@ struct CountIterator{T1 <: AbstractVector, T2 <: AbstractVector}
|
||||||
v1::T1
|
v1::T1
|
||||||
v2::T2
|
v2::T2
|
||||||
end
|
end
|
||||||
Base.iterate(s::CountIterator) = (1, 1)
|
|
||||||
|
|
||||||
function Base.iterate(s::CountIterator, state)
|
function Base.iterate(s::CountIterator, state = (1, 1))
|
||||||
state1, state2 = state
|
state1, state2 = state
|
||||||
state2 > s.v2 && state1 > s.v1 && nothing
|
state2 > length(s.v2) && state1 > length(s.v1) && return nothing
|
||||||
iter1 = state2 > length(s.v2)
|
iter1 = state2 > length(s.v2)
|
||||||
iter2 = state1 > length(s.v1)
|
iter2 = state1 > length(s.v1)
|
||||||
if iter1
|
if iter1
|
||||||
|
|
|
@ -0,0 +1,18 @@
|
||||||
|
|
||||||
|
using StringDistances, Test
|
||||||
|
|
||||||
|
|
||||||
|
@test evaluate(Levenshtein(), "", "") == 0
|
||||||
|
@test evaluate(Levenshtein(), "abc", "") == 3
|
||||||
|
@test evaluate(Levenshtein(), "", "abc") == 3
|
||||||
|
@test evaluate(Levenshtein(), "bc", "abc") == 1
|
||||||
|
@test evaluate(Levenshtein(), "kitten", "sitting") == 3
|
||||||
|
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
|
||||||
|
|
||||||
|
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
|
||||||
|
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
|
||||||
|
|
||||||
|
@test evaluate(DamerauLevenshtein(), "", "") == 0
|
||||||
|
@test evaluate(DamerauLevenshtein(), "abc", "") == 3
|
||||||
|
@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
|
||||||
|
@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1
|
|
@ -30,6 +30,7 @@ using StringDistances, Test
|
||||||
@test evaluate(Hamming(), "testing", "this is a test") == 13
|
@test evaluate(Hamming(), "testing", "this is a test") == 13
|
||||||
@test evaluate(Hamming(), "saturday", "sunday") == 7
|
@test evaluate(Hamming(), "saturday", "sunday") == 7
|
||||||
|
|
||||||
|
@test evaluate(QGram(1), "abc", "abc") == 0
|
||||||
@test evaluate(QGram(1), "", "abc") == 3
|
@test evaluate(QGram(1), "", "abc") == 3
|
||||||
@test evaluate(QGram(1), "abc", "cba") == 0
|
@test evaluate(QGram(1), "abc", "cba") == 0
|
||||||
@test evaluate(QGram(1), "abc", "ccc") == 4
|
@test evaluate(QGram(1), "abc", "ccc") == 4
|
||||||
|
@ -54,6 +55,7 @@ using StringDistances, Test
|
||||||
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
|
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
|
||||||
|
|
||||||
|
|
||||||
|
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
||||||
|
|
||||||
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
|
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
|
||||||
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
|
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
|
||||||
|
|
Loading…
Reference in New Issue