diff --git a/README.md b/README.md index 4193b71..f750605 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl) [![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master) -This Julia package computes various distances between strings (ASCII) +This Julia package computes various distances between strings (UTF-8 encoding) ## Syntax The function `compare` returns a similarity score between two strings. The function always returns a score between 0 and 1, with a value of 0 being completely different and a value of 1 being completely similar. diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 9e7fe3f..bca589d 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -1,4 +1,3 @@ -__precompile__(true) module StringDistances @@ -29,11 +28,16 @@ TokenSort, TokenSet, TokenMax + + + + ############################################################################## ## ## include ## ############################################################################## + include("utils.jl") include("distances/edit.jl") include("distances/qgram.jl") diff --git a/src/compare.jl b/src/compare.jl index d14322d..2101319 100755 --- a/src/compare.jl +++ b/src/compare.jl @@ -1,3 +1,6 @@ + + + ############################################################################## ## ## compare @@ -26,6 +29,7 @@ function compare(dist::AbstractQGram, s1::AbstractString, s2::AbstractString) end end + ############################################################################## ## ## Winkler diff --git a/src/distances/RatcliffObershelp.jl b/src/distances/RatcliffObershelp.jl index 669c41d..68d057c 100755 --- a/src/distances/RatcliffObershelp.jl +++ b/src/distances/RatcliffObershelp.jl @@ -32,8 +32,8 @@ function 
matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2:: a = longest_common_substring(s1, s2) if a[3] > 0 push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3])) - s1before = SubString(s1, firstindex(s1), nextind(s1, 0, a[1]) - 1) - s2before = SubString(s2, firstindex(s2), nextind(s2, 0, a[2]) - 1) + s1before = SubString(s1, firstindex(s1), nextind(s1, 0, a[1] - 1)) + s2before = SubString(s2, firstindex(s2), nextind(s2, 0, a[2] - 1)) matching_blocks!(x, s1before, s2before, start1, start2) if ((a[1] + a[3]) <= lastindex(s1)) & ((a[2] + a[3]) <= lastindex(s2)) s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1)) diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index bd61572..643f37d 100755 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -19,8 +19,8 @@ function Base.iterate(qgram::QGramIterator, nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend) element, nextstate end -Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S -Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S} +Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S} +Base.eltype(qgram::QGramIterator{S}) where {S <: SubString} = S Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0) ############################################################################## @@ -31,6 +31,11 @@ Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0) ## v1 and v2 must be sorted vectors ## ############################################################################## +struct CountIteratorBinary{T1, T2} + v1::Vector{T1} + v2::Vector{T2} +end + function Base.collect(qgram::QGramIterator) x = Array{eltype(qgram)}(undef, length(qgram)) i = 0 @@ -43,11 +48,6 @@ end Base.sort(qgram::QGramIterator) = sort!(collect(qgram)) -struct CountIteratorBinary{T1, T2} - v1::Vector{T1} - v2::Vector{T2} -end - function CountIteratorBinary(s1::QGramIterator, s2::QGramIterator) CountIteratorBinary(sort(s1), sort(s2)) end 
diff --git a/test/runtests.jl b/test/runtests.jl index 05e72a4..47cc30c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,6 @@ using StringDistances -tests = ["distances.jl", "modifiers.jl"] +tests = ["distances.jl", "modifiers.jl", "utf8.jl"] println("Running tests:") diff --git a/test/utf8.jl b/test/utf8.jl new file mode 100644 index 0000000..4762da0 --- /dev/null +++ b/test/utf8.jl @@ -0,0 +1,10 @@ + +using StringDistances, Test + +# multi-byte UTF-8 input must not throw and must score within [0, 1] +@test 0 <= compare(TokenMax(RatcliffObershelp()), "aüa", "aua") <= 1 +@test 0 <= compare(TokenMax(QGram(2)), "aüa", "aua") <= 1 +@test 0 <= compare(DamerauLevenshtein(), "aüa", "aua") <= 1 +@test 0 <= compare(Hamming(), "aüa", "aua") <= 1 +@test 0 <= compare(Jaro(), "aüa", "aua") <= 1 +@test 0 <= compare(Levenshtein(), "aüa", "aua") <= 1 \ No newline at end of file