parent
ca09c2b6fa
commit
2834265e96
|
@ -2,7 +2,7 @@
|
|||
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
|
||||
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)
|
||||
|
||||
This Julia package computes various distances between strings (ASCII)
|
||||
This Julia package computes various distances between UTF-8 encoded strings.
|
||||
|
||||
## Syntax
|
||||
The function `compare` returns a similarity score between two strings. The score always lies between 0 and 1: a value of 0 means the strings are completely different, and a value of 1 means they are identical.
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
__precompile__(true)
|
||||
|
||||
module StringDistances
|
||||
|
||||
|
@ -29,11 +28,16 @@ TokenSort,
|
|||
TokenSet,
|
||||
TokenMax
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## include
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
include("utils.jl")
|
||||
include("distances/edit.jl")
|
||||
include("distances/qgram.jl")
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
\
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## compare
|
||||
|
@ -26,6 +29,7 @@ function compare(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
|
|||
end
|
||||
end
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Winkler
|
||||
|
|
|
@ -32,8 +32,8 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
|
|||
a = longest_common_substring(s1, s2)
|
||||
if a[3] > 0
|
||||
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
||||
s1before = SubString(s1, firstindex(s1), nextind(s1, 0, a[1]) - 1)
|
||||
s2before = SubString(s2, firstindex(s2), nextind(s2, 0, a[2]) - 1)
|
||||
s1before = SubString(s1, firstindex(s1), nextind(s1, 0, a[1] - 1))
|
||||
s2before = SubString(s2, firstindex(s2), nextind(s2, 0, a[2] - 1))
|
||||
matching_blocks!(x, s1before, s2before, start1, start2)
|
||||
if ((a[1] + a[3]) <= lastindex(s1)) & ((a[2] + a[3]) <= lastindex(s2))
|
||||
s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
|
||||
|
|
|
@ -19,8 +19,8 @@ function Base.iterate(qgram::QGramIterator,
|
|||
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
||||
element, nextstate
|
||||
end
|
||||
Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S
|
||||
Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S}
|
||||
# Element type yielded by a q-gram iterator parameterized on S: SubString{S}.
# NOTE(review): S is presumably the type of the wrapped string (qgram.s) — confirm against the struct definition.
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
|
||||
# More specific method: when S is itself a SubString type, the element type is S
# rather than SubString{S}.
Base.eltype(qgram::QGramIterator{S}) where {S <: SubString} = S
|
||||
# Number of elements: qgram.l - qgram.q + 1, floored at 0 so the length is never negative.
# NOTE(review): presumably l is the character count of the string and q the q-gram size — confirm.
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
||||
|
||||
##############################################################################
|
||||
|
@ -31,6 +31,11 @@ Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
|||
## v1 and v2 must be sorted vectors
|
||||
##
|
||||
##############################################################################
|
||||
# Holds two vectors, v1 (elements of type T1) and v2 (elements of type T2).
struct CountIteratorBinary{T1, T2}
|
||||
v1::Vector{T1}
|
||||
v2::Vector{T2}
|
||||
end
|
||||
|
||||
function Base.collect(qgram::QGramIterator)
|
||||
x = Array{eltype(qgram)}(undef, length(qgram))
|
||||
i = 0
|
||||
|
@ -43,11 +48,6 @@ end
|
|||
# Materialize the iterator with collect, then sort the resulting array in place and return it.
Base.sort(qgram::QGramIterator) = sort!(collect(qgram))
|
||||
|
||||
|
||||
struct CountIteratorBinary{T1, T2}
|
||||
v1::Vector{T1}
|
||||
v2::Vector{T2}
|
||||
end
|
||||
|
||||
# Convenience constructor: sorts each q-gram iterator and stores the two sorted
# results as the v1 and v2 fields.
function CountIteratorBinary(s1::QGramIterator, s2::QGramIterator)
|
||||
CountIteratorBinary(sort(s1), sort(s2))
|
||||
end
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
using StringDistances
|
||||
|
||||
tests = ["distances.jl", "modifiers.jl"]
|
||||
tests = ["distances.jl", "modifiers.jl", "utf8.jl"]
|
||||
|
||||
println("Running tests:")
|
||||
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
|
||||
using StringDistances, Test
|
||||
|
||||
# Check that each distance runs without error on strings containing multi-byte UTF-8 characters
|
||||
compare(TokenMax(RatcliffObershelp()), "aüa", "aua")
|
||||
compare(TokenMax(QGram(2)), "aüa", "aua")
|
||||
compare(DamerauLevenshtein(), "aüa", "aua")
|
||||
compare(Hamming(), "aüa", "aua")
|
||||
compare(Jaro(), "aüa", "aua")
|
||||
compare(Levenshtein(), "aüa", "aua")
|
Loading…
Reference in New Issue