diff --git a/REQUIRE b/REQUIRE index 407771b..850e509 100755 --- a/REQUIRE +++ b/REQUIRE @@ -1,3 +1,3 @@ julia 0.7 Distances -IterTools +IterTools \ No newline at end of file diff --git a/src/StringDistances.jl b/src/StringDistances.jl index bca589d..1cfca77 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -1,4 +1,3 @@ - module StringDistances ############################################################################## @@ -28,16 +27,11 @@ TokenSort, TokenSet, TokenMax - - - - ############################################################################## ## ## include ## ############################################################################## - include("utils.jl") include("distances/edit.jl") include("distances/qgram.jl") diff --git a/src/compare.jl b/src/compare.jl index 2101319..f7284cb 100755 --- a/src/compare.jl +++ b/src/compare.jl @@ -1,6 +1,3 @@ -\ - - ############################################################################## ## ## compare @@ -21,15 +18,14 @@ end function compare(dist::AbstractQGram, s1::AbstractString, s2::AbstractString) # When string length < q for qgram distance, returns s1 == s2 len1 = length(s1) ; len2 = length(s2) - min(len1, len2) <= (dist.q - 1) && return convert(Float64, s1 == s2) + min(len1, len2) <= (param(dist) - 1) && return convert(Float64, s1 == s2) if typeof(dist) <: QGram - 1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2) + 1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * param(dist) + 2) else 1 - evaluate(dist, s1, s2) end end - ############################################################################## ## ## Winkler @@ -70,7 +66,7 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString) s2, len2, s1, len1 = reorder(s1, s2) len1 == len2 && return compare(dist.dist, s1, s2) len1 == 0 && return compare(dist.dist, "", "") - iter = QGramIterator(s2, len2, len1) + iter = QGramIterator{typeof(s2), len1}(s2, len2) out = 0.0 x = iterate(iter) while x !== nothing diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index 643f37d..ae8fc7a 100755 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -4,73 +4,22 @@ ## ############################################################################## -struct QGramIterator{S <: AbstractString, T <: Integer} +struct QGramIterator{S <: AbstractString, N} s::S # grapheme l::Int # length of string - q::T # length of q-grams end +param(x::QGramIterator{S, N}) where {S, N} = N - -function Base.iterate(qgram::QGramIterator, - state = (1, qgram.l < qgram.q ? lastindex(qgram.s) + 1 : nextind(qgram.s, 0, qgram.q))) +function Base.iterate(qgram::QGramIterator{S, N}, + state = (1, qgram.l < N ? lastindex(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N} istart, iend = state iend > ncodeunits(qgram.s) && return nothing - element = SubString(qgram.s, istart, iend) + element = qgram.s[istart:iend] nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend) element, nextstate end -Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S} -Base.eltype(qgram::QGramIterator{S}) where {S <: SubString} = S -Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0) - -############################################################################## -## -## CountedIterator that use Binary Search -## -## For each element in union{v1, v2}, this iterator output numbers of times it appears in v1 and the number of times it appears in v2 -## v1 and v2 must be sorted vectors -## -############################################################################## -struct CountIteratorBinary{T1, T2} - v1::Vector{T1} - v2::Vector{T2} -end - -function Base.collect(qgram::QGramIterator) - x = Array{eltype(qgram)}(undef, length(qgram)) - i = 0 - for q in qgram - i += 1 - @inbounds x[i] = q - end - x -end -Base.sort(qgram::QGramIterator) = sort!(collect(qgram)) - - -function CountIteratorBinary(s1::QGramIterator, s2::QGramIterator) - CountIteratorBinary(sort(s1), sort(s2)) -end - -function Base.iterate(s::CountIteratorBinary, state = (1, 1)) - state1, state2 = state - iter1 = state2 > length(s.v2) - iter2 = state1 > length(s.v1) - iter2 && iter1 && return nothing - if iter1 - x1 = s.v1[state1] - elseif iter2 - x2 = s.v2[state2] - else - x1 = s.v1[state1] - x2 = s.v2[state2] - iter1 = x1 <= x2 - iter2 = x2 <= x1 - end - nextstate1 = iter1 ? searchsortedlast(s.v1, x1, state1, length(s.v1), Base.Forward) + 1 : state1 - nextstate2 = iter2 ? searchsortedlast(s.v2, x2, state2, length(s.v2), Base.Forward) + 1 : state2 - ((nextstate1 - state1, nextstate2 - state2), (nextstate1, nextstate2)) -end +Base.eltype(qgram::QGramIterator{S}) where {S} = S +Base.length(qgram::QGramIterator{S, N}) where {S, N} = max(qgram.l - N + 1, 0) ############################################################################## @@ -85,22 +34,37 @@ struct CountIteratorDictionary{T} d::T end -function CountIteratorDictionary(s1::QGramIterator, s2::QGramIterator) - d = Dict{eltype(s1), Tuple{Int, Int}}() - for ch1 in s1 - if haskey(d, ch1) - t = d[ch1] - d[ch1] = (t[1] + 1, 0) +# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380 +function CountIteratorDictionary(s1::QGramIterator{S1, N}, s2::QGramIterator{S2, N}) where {S1, S2, N} + K = eltype(s1) + d = Dict{K, NTuple{2, UInt8}}() + sizehint!(d, length(s1)) + for ch10 in s1 + ch1 = convert(K, ch10) + if !isequal(ch1, ch10) + throw(ArgumentError("$(limitrepr(ch10)) is not a valid key for type $K")) + end + index = Base.ht_keyindex2!(d, ch1) + if index > 0 + d.age += 1 + @inbounds d.keys[index] = ch1 + @inbounds d.vals[index] = (d.vals[index][1] + UInt8(1), UInt8(0)) else - d[ch1] = (1, 0) + Base._setindex!(d, (UInt8(1), UInt8(0)), ch1, -index) end end - for ch2 in s2 - if haskey(d, ch2) - t = d[ch2] - d[ch2] = (t[1], t[2] + 1) + for ch20 in s2 + ch2 = convert(K, ch20) + if !isequal(ch2, ch20) + throw(ArgumentError("$(limitrepr(ch20)) is not a valid key for type $K")) + end + index = Base.ht_keyindex2!(d, ch2) + if index > 0 + d.age += 1 + @inbounds d.keys[index] = ch2 + @inbounds d.vals[index] = (d.vals[index][1], d.vals[index][2] + UInt8(1)) else - d[ch2] = (0, 1) + Base._setindex!(d, (UInt8(0), UInt8(1)), ch2, -index) end end return values(d) @@ -113,12 +77,14 @@ end ## Distance on strings is computed by set distance on qgram sets ## ############################################################################## -abstract type AbstractQGram <: SemiMetric end +abstract type AbstractQGram{N} <: SemiMetric end +param(x::AbstractQGram{N}) where N = N + function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString) evaluate(dist, - CountIteratorBinary(QGramIterator(s1, length(s1), dist.q), - QGramIterator(s2, length(s2), dist.q))) + CountIteratorDictionary(QGramIterator{typeof(s1), param(dist)}(s1, length(s1)), + QGramIterator{typeof(s2), param(dist)}(s2, length(s2)))) end ############################################################################## @@ -130,15 +96,14 @@ end ## ############################################################################## -struct QGram{T <: Integer} <: AbstractQGram - q::T -end -QGram() = QGram(2) +struct QGram{N} <: AbstractQGram{N} end + +QGram(x::Integer) = QGram{x}() function evaluate(dist::QGram, countiterator) n = 0 for (n1, n2) in countiterator - n += abs(n1 - n2) + n += abs(Int(n1) - Int(n2)) end n end @@ -150,10 +115,9 @@ end ## 1 - v(s1, p).v(s2, p) / ||v(s1, p)|| * ||v(s2, p)|| ############################################################################## -struct Cosine{T <: Integer} <: AbstractQGram - q::T -end -Cosine() = Cosine(2) +struct Cosine{N} <: AbstractQGram{N} end + +Cosine(n::Integer = 2) = Cosine{n}() function evaluate(dist::Cosine, countiterator) norm1, norm2, prodnorm = 0, 0, 0 @@ -174,10 +138,9 @@ end ## ############################################################################## -struct Jaccard{T <: Integer} <: AbstractQGram - q::T -end -Jaccard() = Jaccard(2) +struct Jaccard{N} <: AbstractQGram{N} end + +Jaccard(n::Integer = 2) = Jaccard{n}() function evaluate(dist::Jaccard, countiterator) ndistinct1, ndistinct2, nintersect = 0, 0, 0 @@ -196,10 +159,9 @@ end ## 1 - 2 * |intersect(Q(s1, q), Q(s2, q))| / (|Q(s1, q)| + |Q(s2, q))|) ############################################################################## -struct SorensenDice{T <: Integer} <: AbstractQGram - q::T -end -SorensenDice() = SorensenDice(2) +struct SorensenDice{N} <: AbstractQGram{N} end + +SorensenDice(n::Integer = 2) = SorensenDice{n}() function evaluate(dist::SorensenDice, countiterator) ndistinct1, ndistinct2, nintersect = 0, 0, 0 @@ -218,10 +180,9 @@ end ## 1 - |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q))) ############################################################################## -struct Overlap{T <: Integer} <: AbstractQGram - q::T -end -Overlap() = Overlap(2) +struct Overlap{N} <: AbstractQGram{N} end + +Overlap(n::Integer = 2) = Overlap{n}() function evaluate(dist::Overlap, countiterator) ndistinct1, ndistinct2, nintersect = 0, 0, 0 diff --git a/test/.sublime2Terminal.jl b/test/.sublime2Terminal.jl deleted file mode 100644 index e2121ee..0000000 --- a/test/.sublime2Terminal.jl +++ /dev/null @@ -1,18 +0,0 @@ - -using StringDistances, Test - - -@test evaluate(Levenshtein(), "", "") == 0 -@test evaluate(Levenshtein(), "abc", "") == 3 -@test evaluate(Levenshtein(), "", "abc") == 3 -@test evaluate(Levenshtein(), "bc", "abc") == 1 -@test evaluate(Levenshtein(), "kitten", "sitting") == 3 -@test evaluate(Levenshtein(), "saturday", "sunday") == 3 - -@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4 -@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6 - -@test evaluate(DamerauLevenshtein(), "", "") == 0 -@test evaluate(DamerauLevenshtein(), "abc", "") == 3 -@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1 -@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1 \ No newline at end of file diff --git a/test/utf8.jl b/test/utf8.jl index 4762da0..f246c94 100644 --- a/test/utf8.jl +++ b/test/utf8.jl @@ -1,4 +1,3 @@ - using StringDistances, Test # check with weird utf8 strings