countiterator
parent
e0f5c4f711
commit
2b41b1fcfa
|
@ -24,7 +24,7 @@ function Base.next(qgram::QGramIterator, state)
|
||||||
istart, iend = state
|
istart, iend = state
|
||||||
element = SubString(qgram.s, istart, iend)
|
element = SubString(qgram.s, istart, iend)
|
||||||
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
nextstate = nextind(qgram.s, istart), nextind(qgram.s, iend)
|
||||||
return element, nextstate
|
element, nextstate
|
||||||
end
|
end
|
||||||
function Base.done(qgram::QGramIterator, state)
|
function Base.done(qgram::QGramIterator, state)
|
||||||
istart, idend = state
|
istart, idend = state
|
||||||
|
@ -39,24 +39,24 @@ function Base.collect(qgram::QGramIterator)
|
||||||
i += 1
|
i += 1
|
||||||
@inbounds x[i] = q
|
@inbounds x[i] = q
|
||||||
end
|
end
|
||||||
return x
|
x
|
||||||
end
|
end
|
||||||
Base.sort(qgram::QGramIterator) = sort!(collect(qgram))
|
Base.sort(qgram::QGramIterator) = sort!(collect(qgram))
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## Define a type that iterates through a pair of sorted vectors
|
## For each element in union{v1, v2}, this iterator outputs the number of times it appears in v1 and the number of times it appears in v2
|
||||||
## For each element in either v1 or v2, output the number of times it appears in v1 and the number of times it appears in v2
|
## v1 and v2 must be sorted vectors
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
type PairIterator{T1 <: AbstractVector, T2 <: AbstractVector}
|
type CountInterator{T1 <: AbstractVector, T2 <: AbstractVector}
|
||||||
v1::T1
|
v1::T1
|
||||||
v2::T2
|
v2::T2
|
||||||
end
|
end
|
||||||
Base.start(s::PairIterator) = (1, 1)
|
Base.start(s::CountInterator) = (1, 1)
|
||||||
|
|
||||||
function Base.next(s::PairIterator, state)
|
function Base.next(s::CountInterator, state)
|
||||||
state1, state2 = state
|
state1, state2 = state
|
||||||
iter1 = done(s.v2, state2)
|
iter1 = done(s.v2, state2)
|
||||||
iter2 = done(s.v1, state1)
|
iter2 = done(s.v1, state1)
|
||||||
|
@ -72,28 +72,24 @@ function Base.next(s::PairIterator, state)
|
||||||
end
|
end
|
||||||
nextstate1 = iter1 ? searchsortedlast(s.v1, x1, state1, length(s.v1), Base.Forward) + 1 : state1
|
nextstate1 = iter1 ? searchsortedlast(s.v1, x1, state1, length(s.v1), Base.Forward) + 1 : state1
|
||||||
nextstate2 = iter2 ? searchsortedlast(s.v2, x2, state2, length(s.v2), Base.Forward) + 1 : state2
|
nextstate2 = iter2 ? searchsortedlast(s.v2, x2, state2, length(s.v2), Base.Forward) + 1 : state2
|
||||||
return ((nextstate1 - state1, nextstate2 - state2), (nextstate1, nextstate2))
|
((nextstate1 - state1, nextstate2 - state2), (nextstate1, nextstate2))
|
||||||
end
|
end
|
||||||
|
|
||||||
function Base.done(s::PairIterator, state)
|
function Base.done(s::CountInterator, state)
|
||||||
state1, state2 = state
|
state1, state2 = state
|
||||||
done(s.v2, state2) && done(s.v1, state1)
|
done(s.v2, state2) && done(s.v1, state1)
|
||||||
end
|
end
|
||||||
|
|
||||||
function PairIterator(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer, q::Integer)
|
|
||||||
sort1 = sort(QGramIterator(s1, len1, q))
|
|
||||||
sort2 = sort(QGramIterator(s2, len2, q))
|
|
||||||
PairIterator(sort1, sort2)
|
|
||||||
end
|
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## Evaluating a QGram distance on strings calls evaluate on the space of qgrams
|
## Distance on strings is computed by set distance on qgram sets
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||||
evaluate(dist, PairIterator(s1, s2, len1, len2, dist.q))
|
sort1 = sort(QGramIterator(s1, len1, dist.q))
|
||||||
|
sort2 = sort(QGramIterator(s2, len2, dist.q))
|
||||||
|
evaluate(dist, CountInterator(sort1, sort2))
|
||||||
end
|
end
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
@ -110,12 +106,12 @@ immutable QGram{T <: Integer} <: AbstractQGram
|
||||||
end
|
end
|
||||||
QGram() = QGram(2)
|
QGram() = QGram(2)
|
||||||
|
|
||||||
function evaluate(dist::QGram, setiterator)
|
function evaluate(dist::QGram, countiterator)
|
||||||
n = 0
|
n = 0
|
||||||
for (n1, n2) in setiterator
|
for (n1, n2) in countiterator
|
||||||
n += abs(n1 - n2)
|
n += abs(n1 - n2)
|
||||||
end
|
end
|
||||||
return n
|
n
|
||||||
end
|
end
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
@ -130,18 +126,16 @@ immutable Cosine{T <: Integer} <: AbstractQGram
|
||||||
end
|
end
|
||||||
Cosine() = Cosine(2)
|
Cosine() = Cosine(2)
|
||||||
|
|
||||||
function evaluate(dist::Cosine, setiterator)
|
function evaluate(dist::Cosine, countiterator)
|
||||||
norm1, norm2, prodnorm = 0, 0, 0
|
norm1, norm2, prodnorm = 0, 0, 0
|
||||||
for (n1, n2) in setiterator
|
for (n1, n2) in countiterator
|
||||||
norm1 += n1^2
|
norm1 += n1^2
|
||||||
norm2 += n2^2
|
norm2 += n2^2
|
||||||
prodnorm += n1 * n2
|
prodnorm += n1 * n2
|
||||||
end
|
end
|
||||||
return 1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
|
1.0 - prodnorm / (sqrt(norm1) * sqrt(norm2))
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## Jaccard
|
## Jaccard
|
||||||
|
@ -156,14 +150,14 @@ immutable Jaccard{T <: Integer} <: AbstractQGram
|
||||||
end
|
end
|
||||||
Jaccard() = Jaccard(2)
|
Jaccard() = Jaccard(2)
|
||||||
|
|
||||||
function evaluate(dist::Jaccard, setiterator)
|
function evaluate(dist::Jaccard, countiterator)
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in setiterator
|
for (n1, n2) in countiterator
|
||||||
ndistinct1 += n1 > 0
|
ndistinct1 += n1 > 0
|
||||||
ndistinct2 += n2 > 0
|
ndistinct2 += n2 > 0
|
||||||
nintersect += (n1 > 0) & (n2 > 0)
|
nintersect += (n1 > 0) & (n2 > 0)
|
||||||
end
|
end
|
||||||
return 1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
|
1.0 - nintersect / (ndistinct1 + ndistinct2 - nintersect)
|
||||||
end
|
end
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
@ -178,14 +172,14 @@ immutable SorensenDice{T <: Integer} <: AbstractQGram
|
||||||
end
|
end
|
||||||
SorensenDice() = SorensenDice(2)
|
SorensenDice() = SorensenDice(2)
|
||||||
|
|
||||||
function evaluate(dist::SorensenDice, setiterator)
|
function evaluate(dist::SorensenDice, countiterator)
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in setiterator
|
for (n1, n2) in countiterator
|
||||||
ndistinct1 += n1 > 0
|
ndistinct1 += n1 > 0
|
||||||
ndistinct2 += n2 > 0
|
ndistinct2 += n2 > 0
|
||||||
nintersect += (n1 > 0) & (n2 > 0)
|
nintersect += (n1 > 0) & (n2 > 0)
|
||||||
end
|
end
|
||||||
return 1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)
|
1.0 - 2.0 * nintersect / (ndistinct1 + ndistinct2)
|
||||||
end
|
end
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
@ -200,13 +194,13 @@ immutable Overlap{T <: Integer} <: AbstractQGram
|
||||||
end
|
end
|
||||||
Overlap() = Overlap(2)
|
Overlap() = Overlap(2)
|
||||||
|
|
||||||
function evaluate(dist::Overlap, setiterator)
|
function evaluate(dist::Overlap, countiterator)
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
for (n1, n2) in setiterator
|
for (n1, n2) in countiterator
|
||||||
ndistinct1 += n1 > 0
|
ndistinct1 += n1 > 0
|
||||||
ndistinct2 += n2 > 0
|
ndistinct2 += n2 > 0
|
||||||
nintersect += (n1 > 0) & (n2 > 0)
|
nintersect += (n1 > 0) & (n2 > 0)
|
||||||
end
|
end
|
||||||
return 1.0 - nintersect / min(ndistinct1, ndistinct2)
|
1.0 - nintersect / min(ndistinct1, ndistinct2)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -9,11 +9,9 @@ using StringDistances, Base.Test
|
||||||
@test evaluate(Levenshtein(), "kitten", "sitting") == 3
|
@test evaluate(Levenshtein(), "kitten", "sitting") == 3
|
||||||
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
|
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
|
||||||
|
|
||||||
|
|
||||||
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
|
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
|
||||||
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
|
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
|
||||||
|
|
||||||
|
|
||||||
@test evaluate(DamerauLevenshtein(), "", "") == 0
|
@test evaluate(DamerauLevenshtein(), "", "") == 0
|
||||||
@test evaluate(DamerauLevenshtein(), "abc", "") == 3
|
@test evaluate(DamerauLevenshtein(), "abc", "") == 3
|
||||||
@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
|
@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
|
||||||
|
@ -23,7 +21,6 @@ using StringDistances, Base.Test
|
||||||
@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
|
@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
|
||||||
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
|
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
|
||||||
|
|
||||||
|
|
||||||
@test evaluate(Hamming(), "", "") == 0
|
@test evaluate(Hamming(), "", "") == 0
|
||||||
@test evaluate(Hamming(), "", "abc") == 3
|
@test evaluate(Hamming(), "", "abc") == 3
|
||||||
@test evaluate(Hamming(), "abc", "abc") == 0
|
@test evaluate(Hamming(), "abc", "abc") == 0
|
||||||
|
@ -33,7 +30,6 @@ using StringDistances, Base.Test
|
||||||
@test evaluate(Hamming(), "testing", "this is a test") == 13
|
@test evaluate(Hamming(), "testing", "this is a test") == 13
|
||||||
@test evaluate(Hamming(), "saturday", "sunday") == 7
|
@test evaluate(Hamming(), "saturday", "sunday") == 7
|
||||||
|
|
||||||
|
|
||||||
@test evaluate(QGram(1), "", "abc") == 3
|
@test evaluate(QGram(1), "", "abc") == 3
|
||||||
@test evaluate(QGram(1), "abc", "cba") == 0
|
@test evaluate(QGram(1), "abc", "cba") == 0
|
||||||
@test evaluate(QGram(1), "abc", "ccc") == 4
|
@test evaluate(QGram(1), "abc", "ccc") == 4
|
||||||
|
@ -48,8 +44,6 @@ using StringDistances, Base.Test
|
||||||
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
|
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
|
||||||
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
|
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Set([(1,1,3) (4,5,1) (6,6,1)])
|
Set([(1,1,3) (4,5,1) (6,6,1)])
|
||||||
@test matching_blocks("dwayne", "duane") ==
|
@test matching_blocks("dwayne", "duane") ==
|
||||||
Set([(5,4,2) (1,1,1) (3,3,1)])
|
Set([(5,4,2) (1,1,1) (3,3,1)])
|
||||||
|
|
Loading…
Reference in New Issue