pull/17/head v0.3.1
matthieugomez 2018-11-21 14:15:28 -05:00
parent ca09c2b6fa
commit 2834265e96
7 changed files with 30 additions and 12 deletions

View File

@ -2,7 +2,7 @@
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)
This Julia package computes various distances between strings (ASCII)
This Julia package computes various distances between strings (UTF-8 encoding)
## Syntax
The function `compare` returns a similarity score between two strings. The score always lies between 0 and 1: a value of 0 means the strings are completely different, and a value of 1 means they are completely similar.

View File

@ -1,4 +1,3 @@
__precompile__(true)
module StringDistances
@ -29,11 +28,16 @@ TokenSort,
TokenSet,
TokenMax
##############################################################################
##
## include
##
##############################################################################
include("utils.jl")
include("distances/edit.jl")
include("distances/qgram.jl")

View File

@ -1,3 +1,6 @@
\
##############################################################################
##
## compare
@ -26,6 +29,7 @@ function compare(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
end
end
##############################################################################
##
## Winkler

View File

@ -32,8 +32,8 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
a = longest_common_substring(s1, s2)
if a[3] > 0
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
s1before = SubString(s1, firstindex(s1), nextind(s1, 0, a[1]) - 1)
s2before = SubString(s2, firstindex(s2), nextind(s2, 0, a[2]) - 1)
s1before = SubString(s1, firstindex(s1), nextind(s1, 0, a[1] - 1))
s2before = SubString(s2, firstindex(s2), nextind(s2, 0, a[2] - 1))
matching_blocks!(x, s1before, s2before, start1, start2)
if ((a[1] + a[3]) <= lastindex(s1)) & ((a[2] + a[3]) <= lastindex(s2))
s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))

View File

@ -19,8 +19,8 @@ function Base.iterate(qgram::QGramIterator,
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
element, nextstate
end
# eltype of a QGramIterator: when the first type parameter S is itself a
# SubString type, the element type is S; otherwise it is SubString{S}.
# NOTE(review): two spellings of the same pair of methods appear here (with
# type parameters {S, T} and with {S} only) — confirm which one matches the
# QGramIterator definition and keep only that pair.
Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S
Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S}
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
Base.eltype(qgram::QGramIterator{S}) where {S <: SubString} = S
# Number of elements produced by the iterator: l - q + 1, clamped at 0 so the
# result is never negative.
# NOTE(review): presumably l is the character count of the string and q the
# q-gram size — confirm against the QGramIterator definition.
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
##############################################################################
@ -31,6 +31,11 @@ Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
## v1 and v2 must be sorted vectors
##
##############################################################################
# Holds two vectors, with independent element types T1 and T2.
# As stated in the section header, v1 and v2 must be sorted vectors.
struct CountIteratorBinary{T1, T2}
v1::Vector{T1}  # first sorted vector
v2::Vector{T2}  # second sorted vector
end
function Base.collect(qgram::QGramIterator)
x = Array{eltype(qgram)}(undef, length(qgram))
i = 0
@ -43,11 +48,6 @@ end
# Materialize the iterator with `collect`, then sort the resulting array in
# place and return it.
function Base.sort(qgram::QGramIterator)
    grams = collect(qgram)
    return sort!(grams)
end
# Holds two vectors, with independent element types T1 and T2.
# As stated in the section header, v1 and v2 must be sorted vectors.
struct CountIteratorBinary{T1, T2}
v1::Vector{T1}  # first sorted vector
v2::Vector{T2}  # second sorted vector
end
# Build a CountIteratorBinary from two q-gram iterators by sorting each one
# first, so that both stored vectors are sorted.
function CountIteratorBinary(s1::QGramIterator, s2::QGramIterator)
    sorted1 = sort(s1)
    sorted2 = sort(s2)
    return CountIteratorBinary(sorted1, sorted2)
end

View File

@ -1,6 +1,6 @@
using StringDistances
tests = ["distances.jl", "modifiers.jl"]
tests = ["distances.jl", "modifiers.jl", "utf8.jl"]
println("Running tests:")

10
test/utf8.jl Normal file
View File

@ -0,0 +1,10 @@
using StringDistances, Test

# Check that every distance handles strings containing non-ASCII characters,
# where byte indices and character positions can differ. The original script
# only called `compare` and discarded the result, so it could fail only by
# throwing; here each score is also asserted to be a valid similarity, i.e. to
# lie in the documented [0, 1] range.
@testset "UTF-8 strings" begin
    s1, s2 = "aüa", "aua"
    for dist in (
        TokenMax(RatcliffObershelp()),
        TokenMax(QGram(2)),
        DamerauLevenshtein(),
        Hamming(),
        Jaro(),
        Levenshtein(),
    )
        @test 0 <= compare(dist, s1, s2) <= 1
    end
end