From 69a008fcf860490ab6ac80439fbe3d432e068b49 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Wed, 4 Jul 2018 14:02:50 -0400 Subject: [PATCH] update to 0.7 --- src/compare.jl | 24 ++++---- src/distances/RatcliffObershelp.jl | 10 ++-- src/distances/edit.jl | 89 +++++++++++++++--------------- src/distances/qgram.jl | 15 ++--- test/.sublime2Terminal.jl | 18 ++++++ test/distances.jl | 2 + 6 files changed, 87 insertions(+), 71 deletions(-) create mode 100644 test/.sublime2Terminal.jl diff --git a/src/compare.jl b/src/compare.jl index c99a8e9..6abd4ea 100644 --- a/src/compare.jl +++ b/src/compare.jl @@ -67,13 +67,13 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString) len1 == len2 && return compare(dist.dist, s1, s2) len1 == 0 && return compare(dist.dist, "", "") iter = QGramIterator(s2, len2, len1) - state = iterate(iter) - s, state = iterate(iter, state) - out = compare(dist.dist, s1, s) - while state != nothing - s, state = iterate(iter, state) + out = 0.0 + x = iterate(iter) + while x != nothing + s, state = x curr = compare(dist.dist, s1, s) out = max(out, curr) + x = iterate(iter, state) end return out end @@ -97,7 +97,7 @@ function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::Abstr s2_end += len2 - s2_end end i2_start = nextind(s2, 0, s2_start) - i2_end = s2_end == len2 ? endof(s2) : (nextind(s2, 0, s2_end + 1) - 1) + i2_end = s2_end == len2 ? lastindex(s2) : (nextind(s2, 0, s2_end + 1) - 1) curr = compare(RatcliffObershelp(), s1, SubString(s2, i2_start, i2_end)) out = max(out, curr) end @@ -115,12 +115,8 @@ struct TokenSort{T <: PreMetric} <: PreMetric end function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString) - if search(s1, Base._default_delims) > 0 - s1 = join(sort!(split(s1)), " ") - end - if search(s2, Base._default_delims) > 0 - s2 = join(sort!(split(s2)), " ") - end + s1 = join(sort!(split(s1)), " ") + s2 = join(sort!(split(s2)), " ") compare(dist.dist, s1, s2) end @@ -137,8 +133,8 @@ end function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString) v0, v1, v2 = _separate!(split(s1), split(s2)) s0 = join(v0, " ") - s1 = join(chain(v0, v1), " ") - s2 = join(chain(v0, v2), " ") + s1 = join(Iterators.flatten((v0, v1)), " ") + s2 = join(Iterators.flatten((v0, v2)), " ") if isempty(s0) # otherwise compare(dist, "", "a")== 1.0 compare(dist.dist, s1, s2) diff --git a/src/distances/RatcliffObershelp.jl b/src/distances/RatcliffObershelp.jl index f6db9b7..c9082f5 100644 --- a/src/distances/RatcliffObershelp.jl +++ b/src/distances/RatcliffObershelp.jl @@ -32,12 +32,12 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2:: a = longest_common_substring(s1, s2) if a[3] > 0 push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3])) - s1before = SubString(s1, iterate(s1), chr2ind(s1, a[1]) - 1) - s2before = SubString(s2, iterate(s2), chr2ind(s2, a[2]) - 1) + s1before = SubString(s1, firstindex(s1), nextind(s1, 0, a[1]) - 1) + s2before = SubString(s2, firstindex(s2), nextind(s2, 0, a[2]) - 1) matching_blocks!(x, s1before, s2before, start1, start2) - if (a[1] + a[3]) <= endof(s1) && (a[2] + a[3]) <= endof(s2) - s1after = SubString(s1, chr2ind(s1, a[1] + a[3]), endof(s1)) - s2after = SubString(s2, chr2ind(s2, a[2] + a[3]), endof(s2)) + if (a[1] + a[3]) <= lastindex(s1) && (a[2] + a[3]) <= lastindex(s2) + s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1)) + s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2)) matching_blocks!(x, s1after, s2after, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1) end end diff --git a/src/distances/edit.jl b/src/distances/edit.jl index 6dbfe75..2150aea 100644 --- a/src/distances/edit.jl +++ b/src/distances/edit.jl @@ -4,19 +4,23 @@ ############################################################################## function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1) - ncu1 = ncodeunits(s1) - ncu2 = ncodeunits(s2) - state1 = firstindex(s1) - state2 = firstindex(s2) + # in case this loop never happens + out1 = firstindex(s1) + out2 = firstindex(s2) + x1 = iterate(s1) + x2 = iterate(s2) l = 0 - while (state1 <= ncu1) && (state2 <= ncu2) && (l < lim || lim < 0) - ch1, nextstate1 = iterate(s1, state1) - ch2, nextstate2 = iterate(s2, state2) + while (x1 != nothing) && (x2 != nothing) && (l < lim || lim < 0) + ch1, state1 = x1 + ch2, state2 = x2 ch1 != ch2 && break + out1 = state1 + out2 = state2 + x1 = iterate(s1, state1) + x2 = iterate(s2, state2) l += 1 - state1, state2 = nextstate1, nextstate2 end - return l, state1, state2 + return l, out1, out2 end ############################################################################## @@ -48,28 +52,26 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString) # prefix common to both strings can be ignored s2, len2, s1, len1 = reorder(s1, s2) k, start1, start2 = common_prefix(s1, s2) - ncu1 = ncodeunits(s1) - ncu2 = ncodeunits(s2) - (start1 > ncu1) && return len2 - k + x1 = iterate(s1, start1) + (x1 == nothing) && return len2 - k # distance initialized to first row of matrix # => distance between "" and s2[1:i} v0 = Array{Int}(undef, len2 - k) for i2 in 1:(len2 - k) v0[i2] = i2 end - current = zero(0) - state1 = start1 + current = 0 i1 = 0 - while state1 <= ncu1 + while x1 != nothing i1 += 1 - ch1, state1 = iterate(s1, i1) + ch1, state1 = x1 left = (i1 - 1) current = (i1 - 1) - state2 = start2 i2 = 0 - while state2 <= ncu2 + x2 = iterate(s2, start2) + while x2 != nothing i2 += 1 - ch2, state2 = iterate(s2, state2) + ch2, state2 = x2 # update above, current, left = current, left, v0[i2] if ch1 != ch2 @@ -79,7 +81,9 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString) left + 1) end v0[i2] = current + x2 = iterate(s2, state2) end + x1 = iterate(s1, state1) end return current end @@ -95,36 +99,32 @@ struct DamerauLevenshtein <: SemiMetric end function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString) s2, len2, s1, len1 = reorder(s1, s2) - ncu1 = ncodeunits(s1) - ncu2 = ncodeunits(s2) # prefix common to both strings can be ignored - k, start1, start2 = common_prefix(s1, s2) - (start1 > ncu1) && return len2 - k - + k, state1, start2 = common_prefix(s1, s2) + x1 = iterate(s1, state1) + (x1 == nothing) && return len2 - k v0 = Array{Int}(undef, len2 - k) @inbounds for i2 in 1:(len2 - k) v0[i2] = i2 end v2 = Array{Int}(undef, len2 - k) - - ch1, = iterate(s1, start1) current = 0 - state1 = start1 i1 = 0 - while state1 <= ncu1 + ch1 = first(s1) + while (x1 != nothing) i1 += 1 prevch1 = ch1 - ch1, state1 = iterate(s1, i1) - ch2, = iterate(s2, start2) + ch1, state1 = x1 + x2 = iterate(s2, start2) left = (i1 - 1) current = i1 nextTransCost = 0 - state2 = start2 + ch2, = x2 i2 = 0 - while state2 <= ncu2 + while (x2 != nothing) i2 += 1 prevch2 = ch2 - ch2, state2 = iterate(s2, state2) + ch2, state2 = x2 above = current thisTransCost = nextTransCost nextTransCost = v2[i2] @@ -150,7 +150,9 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri end end v0[i2] = current + x2 = iterate(s2, state2) end + x1 = iterate(s1, state1) end return current end @@ -165,8 +167,6 @@ struct Jaro <: SemiMetric end function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) s2, len2, s1, len1 = reorder(s1, s2) - ncu1 = ncodeunits(s1) - ncu2 = ncodeunits(s2) # if both are empty, m = 0 so should be 1.0 according to wikipedia. Add this line so that not the case len2 == 0 && return 0.0 maxdist = max(0, div(len2, 2) - 1) @@ -174,21 +174,22 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) m = 0 flag = fill(false, len2) i1 = 0 - state1 = firstindex(s1) startstate2 = firstindex(s2) starti2 = 0 - i1_match = fill!(Array{typeof(state1)}(undef, len1), state1) - while state1 <= ncu1 - ch1, newstate1 = iterate(s1, i1) + state1 = firstindex(s1) + i1_match = fill!(Array{Int}(undef, len1), state1) + x1 = iterate(s1) + while (x1 != nothing) + ch1, newstate1 = x1 i1 += 1 if starti2 < i1 - maxdist - 1 - startstate2 = iterate(s2, startstate2) + startstate2 = nextind(s2, startstate2) starti2 += 1 end i2 = starti2 - state2 = startstate2 - while state2 <= len2 && i2 <= i1 + maxdist - ch2, state2 = iterate(s2, state2) + x2 = iterate(s2, startstate2) + while (x2 != nothing) && i2 <= i1 + maxdist + ch2, state2 = x2 i2 += 1 if ch1 == ch2 && !flag[i2] m += 1 @@ -196,8 +197,10 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) i1_match[m] = state1 break end + x2 = iterate(s2, state2) end state1 = newstate1 + x1 = iterate(s1, state1) end # count t transpotsitions t = 0 diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index 140ab44..0adc777 100644 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -11,12 +11,10 @@ struct QGramIterator{S <: AbstractString, T <: Integer} end -function Base.iterate(qgram::QGramIterator) - (1, qgram.l < qgram.q ? endof(qgram.s) + 1 : chr2ind(qgram.s, qgram.q)) -end -function Base.iterate(qgram::QGramIterator, state) +function Base.iterate(qgram::QGramIterator, + state = (1, qgram.l < qgram.q ? lastindex(qgram.s) + 1 : nextind(qgram.s, 0, qgram.q))) istart, iend = state - iend > nchodeunits(qgram.s) && return nothing + iend > ncodeunits(qgram.s) && return nothing element = SubString(qgram.s, istart, iend) nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend) element, nextstate @@ -25,7 +23,7 @@ Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S} Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0) function Base.collect(qgram::QGramIterator) - x = Array{eltype(qgram)}(length(qgram)) + x = Array{eltype(qgram)}(undef, length(qgram)) i = 0 for q in qgram i += 1 @@ -46,11 +44,10 @@ struct CountIterator{T1 <: AbstractVector, T2 <: AbstractVector} v1::T1 v2::T2 end -Base.iterate(s::CountIterator) = (1, 1) -function Base.iterate(s::CountIterator, state) +function Base.iterate(s::CountIterator, state = (1, 1)) state1, state2 = state - state2 > s.v2 && state1 > s.v1 && nothing + state2 > length(s.v2) && state1 > length(s.v1) && return nothing iter1 = state2 > length(s.v2) iter2 = state1 > length(s.v1) if iter1 diff --git a/test/.sublime2Terminal.jl b/test/.sublime2Terminal.jl new file mode 100644 index 0000000..e2121ee --- /dev/null +++ b/test/.sublime2Terminal.jl @@ -0,0 +1,18 @@ + +using StringDistances, Test + + +@test evaluate(Levenshtein(), "", "") == 0 +@test evaluate(Levenshtein(), "abc", "") == 3 +@test evaluate(Levenshtein(), "", "abc") == 3 +@test evaluate(Levenshtein(), "bc", "abc") == 1 +@test evaluate(Levenshtein(), "kitten", "sitting") == 3 +@test evaluate(Levenshtein(), "saturday", "sunday") == 3 + +@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4 +@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6 + +@test evaluate(DamerauLevenshtein(), "", "") == 0 +@test evaluate(DamerauLevenshtein(), "abc", "") == 3 +@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1 +@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1 \ No newline at end of file diff --git a/test/distances.jl b/test/distances.jl index e1e26d5..93ee4a8 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -30,6 +30,7 @@ using StringDistances, Test @test evaluate(Hamming(), "testing", "this is a test") == 13 @test evaluate(Hamming(), "saturday", "sunday") == 7 +@test evaluate(QGram(1), "abc", "abc") == 0 @test evaluate(QGram(1), "", "abc") == 3 @test evaluate(QGram(1), "abc", "cba") == 0 @test evaluate(QGram(1), "abc", "ccc") == 4 @@ -54,6 +55,7 @@ using StringDistances, Test @test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762 +@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547 @test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777 @test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777