Allow any AbstractString
parent
bb385cc1fb
commit
1650ec41f5
|
@ -1,7 +1,7 @@
|
|||
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
|
||||
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)
|
||||
|
||||
This Julia package computes various distances between strings (UTF-8 encoding)
|
||||
This Julia package computes various distances between `AbstractString`s
|
||||
|
||||
## Syntax
|
||||
The function `compare` returns a similarity score between two strings. The function always returns a score between 0 and 1, with a value of 0 being completely different and a value of 1 being completely similar.
|
||||
|
|
|
@ -20,12 +20,12 @@ function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
|
|||
len == 0 ? 1.0 : 1.0 - evaluate(dist, s1, s2) / len
|
||||
end
|
||||
|
||||
function compare(dist::AbstractQGram{N}, s1::AbstractString, s2::AbstractString) where {N}
|
||||
function compare(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
|
||||
# When string length < q for qgram distance, returns s1 == s2
|
||||
len1 = length(s1) ; len2 = length(s2)
|
||||
min(len1, len2) <= (N - 1) && return convert(Float64, s1 == s2)
|
||||
min(len1, len2) <= (dist.N - 1) && return convert(Float64, s1 == s2)
|
||||
if typeof(dist) <: QGram
|
||||
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * N + 2)
|
||||
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.N + 2)
|
||||
else
|
||||
1 - evaluate(dist, s1, s2)
|
||||
end
|
||||
|
@ -71,7 +71,7 @@ function compare(dist::Partial, s1::AbstractString, s2::AbstractString)
|
|||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
len1 == len2 && return compare(dist.dist, s1, s2)
|
||||
len1 == 0 && return compare(dist.dist, "", "")
|
||||
iter = QGramIterator{typeof(s2), len1}(s2, len2)
|
||||
iter = QGramIterator(s2, len2, len1)
|
||||
out = 0.0
|
||||
x = iterate(iter)
|
||||
while x !== nothing
|
||||
|
|
|
@ -5,21 +5,28 @@
|
|||
##
|
||||
##############################################################################
|
||||
# N is the number of characters in the QGram
|
||||
struct QGramIterator{S <: AbstractString, N}
|
||||
struct QGramIterator{S <: AbstractString}
|
||||
s::S # grapheme
|
||||
l::Int # length of string
|
||||
N::Int # Length of Qgram
|
||||
end
|
||||
|
||||
function Base.iterate(qgram::QGramIterator{S, N},
|
||||
state = (1, qgram.l < N ? ncodeunits(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N}
|
||||
function qgram_iterator(s::AbstractString, n::Int)
|
||||
QGramIterator{typeof(s)}(s, length(s), n)
|
||||
end
|
||||
|
||||
function Base.iterate(qgram::QGramIterator,
|
||||
state = (1, qgram.l < qgram.N ? ncodeunits(qgram.s) + 1 : nextind(qgram.s, 0, qgram.N)))
|
||||
istart, iend = state
|
||||
iend > ncodeunits(qgram.s) && return nothing
|
||||
element = qgram.s[istart:iend]
|
||||
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
||||
element, nextstate
|
||||
end
|
||||
Base.length(qgram::QGramIterator{S, N}) where {S, N} = max(qgram.l - N + 1, 0)
|
||||
Base.eltype(qgram::QGramIterator) = String
|
||||
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.N + 1, 0)
|
||||
|
||||
Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = S
|
||||
Base.eltype(qgram::QGramIterator{S}) where {S} = S
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
|
@ -31,7 +38,7 @@ Base.eltype(qgram::QGramIterator) = String
|
|||
|
||||
# I use a faster way to change a dictionary key
|
||||
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
|
||||
function count_map(s1, s2) where {S1, S2, N}
|
||||
function count_map(s1, s2)
|
||||
K = Union{eltype(s1), eltype(s2)}
|
||||
d = Dict{K, NTuple{2, Int}}()
|
||||
sizehint!(d, length(s1) + length(s2))
|
||||
|
@ -63,14 +70,10 @@ end
|
|||
## Distance on strings is computed by set distance on qgram sets
|
||||
##
|
||||
##############################################################################
|
||||
abstract type AbstractQGram{N} <: SemiMetric end
|
||||
|
||||
function qgram_iterator(dist::AbstractQGram{N}, s::AbstractString) where {N}
|
||||
QGramIterator{typeof(s), N}(s, length(s))
|
||||
end
|
||||
abstract type AbstractQGram <: SemiMetric end
|
||||
|
||||
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
|
||||
evaluate(dist, count_map(qgram_iterator(dist, s1), qgram_iterator(dist, s2)))
|
||||
evaluate(dist, count_map(qgram_iterator(s1, dist.N), qgram_iterator(s2, dist.N)))
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
|
@ -82,9 +85,9 @@ end
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
struct QGram{N} <: AbstractQGram{N} end
|
||||
|
||||
QGram(x::Integer) = QGram{x}()
|
||||
struct QGram <: AbstractQGram
|
||||
N::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::QGram, countiterator)
|
||||
n = 0
|
||||
|
@ -101,9 +104,9 @@ end
|
|||
## 1 - v(s1, p).v(s2, p) / ||v(s1, p)|| * ||v(s2, p)||
|
||||
##############################################################################
|
||||
|
||||
struct Cosine{N} <: AbstractQGram{N} end
|
||||
|
||||
Cosine(n::Integer = 2) = Cosine{n}()
|
||||
struct Cosine <: AbstractQGram
|
||||
N::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Cosine, countiterator)
|
||||
norm1, norm2, prodnorm = 0, 0, 0
|
||||
|
@ -124,9 +127,9 @@ end
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
struct Jaccard{N} <: AbstractQGram{N} end
|
||||
|
||||
Jaccard(n::Integer = 2) = Jaccard{n}()
|
||||
struct Jaccard <: AbstractQGram
|
||||
N::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Jaccard, countiterator)
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
|
@ -145,9 +148,9 @@ end
|
|||
## 1 - 2 * |intersect(Q(s1, q), Q(s2, q))| / (|Q(s1, q)| + |Q(s2, q))|)
|
||||
##############################################################################
|
||||
|
||||
struct SorensenDice{N} <: AbstractQGram{N} end
|
||||
|
||||
SorensenDice(n::Integer = 2) = SorensenDice{n}()
|
||||
struct SorensenDice <: AbstractQGram
|
||||
N::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::SorensenDice, countiterator)
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
|
@ -166,9 +169,9 @@ end
|
|||
## 1 - |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q)))
|
||||
##############################################################################
|
||||
|
||||
struct Overlap{N} <: AbstractQGram{N} end
|
||||
|
||||
Overlap(n::Integer = 2) = Overlap{n}()
|
||||
struct Overlap <: AbstractQGram
|
||||
N::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Overlap, countiterator)
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
|
|
Loading…
Reference in New Issue