parent
ca09c2b6fa
commit
2834265e96
|
@ -2,7 +2,7 @@
|
||||||
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
|
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
|
||||||
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)
|
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)
|
||||||
|
|
||||||
This Julia package computes various distances between strings (ASCII)
|
This Julia package computes various distances between strings (UTF-8 encoding)
|
||||||
|
|
||||||
## Syntax
|
## Syntax
|
||||||
The function `compare` returns a similarity score between two strings. The function always returns a score between 0 and 1, with a value of 0 being completely different and a value of 1 being completely similar.
|
The function `compare` returns a similarity score between two strings. The function always returns a score between 0 and 1, with a value of 0 being completely different and a value of 1 being completely similar.
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
__precompile__(true)
|
|
||||||
|
|
||||||
module StringDistances
|
module StringDistances
|
||||||
|
|
||||||
|
@ -29,11 +28,16 @@ TokenSort,
|
||||||
TokenSet,
|
TokenSet,
|
||||||
TokenMax
|
TokenMax
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## include
|
## include
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
include("utils.jl")
|
include("utils.jl")
|
||||||
include("distances/edit.jl")
|
include("distances/edit.jl")
|
||||||
include("distances/qgram.jl")
|
include("distances/qgram.jl")
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
\
|
||||||
|
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## compare
|
## compare
|
||||||
|
@ -26,6 +29,7 @@ function compare(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## Winkler
|
## Winkler
|
||||||
|
|
|
@ -32,8 +32,8 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
|
||||||
a = longest_common_substring(s1, s2)
|
a = longest_common_substring(s1, s2)
|
||||||
if a[3] > 0
|
if a[3] > 0
|
||||||
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
||||||
s1before = SubString(s1, firstindex(s1), nextind(s1, 0, a[1]) - 1)
|
s1before = SubString(s1, firstindex(s1), nextind(s1, 0, a[1] - 1))
|
||||||
s2before = SubString(s2, firstindex(s2), nextind(s2, 0, a[2]) - 1)
|
s2before = SubString(s2, firstindex(s2), nextind(s2, 0, a[2] - 1))
|
||||||
matching_blocks!(x, s1before, s2before, start1, start2)
|
matching_blocks!(x, s1before, s2before, start1, start2)
|
||||||
if ((a[1] + a[3]) <= lastindex(s1)) & ((a[2] + a[3]) <= lastindex(s2))
|
if ((a[1] + a[3]) <= lastindex(s1)) & ((a[2] + a[3]) <= lastindex(s2))
|
||||||
s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
|
s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
|
||||||
|
|
|
@ -19,8 +19,8 @@ function Base.iterate(qgram::QGramIterator,
|
||||||
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
||||||
element, nextstate
|
element, nextstate
|
||||||
end
|
end
|
||||||
Base.eltype(qgram::QGramIterator{S, T}) where {S <: SubString, T} = S
|
Base.eltype(qgram::QGramIterator{S}) where {S} = SubString{S}
|
||||||
Base.eltype(qgram::QGramIterator{S, T}) where {S, T} = SubString{S}
|
Base.eltype(qgram::QGramIterator{S}) where {S <: SubString} = S
|
||||||
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
@ -31,6 +31,11 @@ Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
||||||
## v1 and v2 must be sorted vectors
|
## v1 and v2 must be sorted vectors
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
struct CountIteratorBinary{T1, T2}
|
||||||
|
v1::Vector{T1}
|
||||||
|
v2::Vector{T2}
|
||||||
|
end
|
||||||
|
|
||||||
function Base.collect(qgram::QGramIterator)
|
function Base.collect(qgram::QGramIterator)
|
||||||
x = Array{eltype(qgram)}(undef, length(qgram))
|
x = Array{eltype(qgram)}(undef, length(qgram))
|
||||||
i = 0
|
i = 0
|
||||||
|
@ -43,11 +48,6 @@ end
|
||||||
Base.sort(qgram::QGramIterator) = sort!(collect(qgram))
|
Base.sort(qgram::QGramIterator) = sort!(collect(qgram))
|
||||||
|
|
||||||
|
|
||||||
struct CountIteratorBinary{T1, T2}
|
|
||||||
v1::Vector{T1}
|
|
||||||
v2::Vector{T2}
|
|
||||||
end
|
|
||||||
|
|
||||||
function CountIteratorBinary(s1::QGramIterator, s2::QGramIterator)
|
function CountIteratorBinary(s1::QGramIterator, s2::QGramIterator)
|
||||||
CountIteratorBinary(sort(s1), sort(s2))
|
CountIteratorBinary(sort(s1), sort(s2))
|
||||||
end
|
end
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
using StringDistances
|
using StringDistances
|
||||||
|
|
||||||
tests = ["distances.jl", "modifiers.jl"]
|
tests = ["distances.jl", "modifiers.jl", "utf8.jl"]
|
||||||
|
|
||||||
println("Running tests:")
|
println("Running tests:")
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
|
||||||
|
using StringDistances, Test
|
||||||
|
|
||||||
|
# check with weird utf8 strings
|
||||||
|
compare(TokenMax(RatcliffObershelp()), "aüa", "aua")
|
||||||
|
compare(TokenMax(QGram(2)), "aüa", "aua")
|
||||||
|
compare(DamerauLevenshtein(), "aüa", "aua")
|
||||||
|
compare(Hamming(), "aüa", "aua")
|
||||||
|
compare(Jaro(), "aüa", "aua")
|
||||||
|
compare(Levenshtein(), "aüa", "aua")
|
Loading…
Reference in New Issue