Add NMD and fix bug in MorisitaOverlap (#40)
parent
f4185fbfe0
commit
ed6c2f650f
|
@ -23,6 +23,7 @@ The available distances are:
|
|||
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
|
||||
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
|
||||
- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)`
|
||||
- [NormalizedMultisetDistance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NormalizedMultisetDistance(q::Int)` or `NMD(q::Int)`
|
||||
- Distance "modifiers" that can be applied to any distance:
|
||||
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string.
|
||||
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the normalized distance of the two strings, after re-ordering words alphabetically.
|
||||
|
|
|
@ -36,6 +36,8 @@ Jaccard,
|
|||
SorensenDice,
|
||||
Overlap,
|
||||
MorisitaOverlap,
|
||||
NormalizedMultisetDistance,
|
||||
NMD,
|
||||
QGramDict,
|
||||
QGramSortedVector,
|
||||
Winkler,
|
||||
|
|
|
@ -424,4 +424,49 @@ end
|
|||
c.shared += (n1 * n2)
|
||||
|
||||
calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap}) =
|
||||
(2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum)
|
||||
1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum))
|
||||
|
||||
"""
|
||||
NormalizedMultisetDistance(q::Int)
|
||||
NMD(q::Int)
|
||||
|
||||
Creates a NormalizedMultisetDistance (NMD) distance as introduced by Besiris and
|
||||
Zigouris 2013. The goal with this distance is to behave similarly to a normalized
|
||||
compression distance without having to do any actual compression (and thus being
|
||||
faster to compute).
|
||||
|
||||
The distance corresponds to
|
||||
|
||||
``(sum(max.(m(s1), m(s2))) - min(M(s1), M(s2))) / max(M(s1), M(s2))``
|
||||
|
||||
where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
|
||||
sum of those counts.
|
||||
|
||||
For details see:
|
||||
https://www.sciencedirect.com/science/article/pii/S1047320313001417
|
||||
"""
|
||||
struct NormalizedMultisetDistance <: QGramDistance
|
||||
q::Int # presumably the q-gram length used when counting; confirm against QGramDistance
|
||||
end
|
||||
const NMD = NormalizedMultisetDistance # frequently used acronym; alias for the same type
|
||||
|
||||
# Build the counter used for an NMD comparison, with all three tallies set to zero.
function newcounter(d::NMD)
    return ThreeCounters{Int, NMD}(0, 0, 0)
end
|
||||
|
||||
# Accumulate a left-hand count `n1`: it is added to the left total and, because
# max(n1, 0) == n1, it is also added unchanged to the running sum of maxima.
@inline function countleft!(c::ThreeCounters{Int, NMD}, n1::Integer)
    c.left = c.left + n1
    c.shared = c.shared + n1
end
|
||||
|
||||
# Accumulate a right-hand count `n2`: it is added to the right total and, because
# max(n2, 0) == n2, it is also added unchanged to the running sum of maxima.
@inline function countright!(c::ThreeCounters{Int, NMD}, n2::Integer)
    c.right = c.right + n2
    c.shared = c.shared + n2
end
|
||||
|
||||
# Accumulate a pair of counts `n1` / `n2`: each side's total grows by its own
# count, while the running sum of maxima grows by the larger of the two.
@inline function countboth!(c::ThreeCounters{Int, NMD}, n1::Integer, n2::Integer)
    larger = max(n1, n2)
    c.left = c.left + n1
    c.right = c.right + n2
    c.shared = c.shared + larger
end
|
||||
|
||||
# Turn the accumulated tallies into the NMD value:
# (sum of maxima - smaller total) / larger total.
function calculate(d::NMD, c::ThreeCounters{Int, NMD})
    smaller, larger = minmax(c.left, c.right)
    return (c.shared - smaller) / larger
end
|
||||
|
|
|
@ -132,26 +132,44 @@ using StringDistances, Unicode, Test, Random
|
|||
|
||||
@testset "MorisitaOverlap" begin
|
||||
# overlap for 'n', 'h', and 't' and 5 q-grams per string:
|
||||
@test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.6 # ((2*3)/(5*5/5 + 5*5/5))
|
||||
@test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.4 # 1.0-((2*3)/(5*5/5 + 5*5/5))
|
||||
|
||||
# overlap for 'o', 'n', 2-overlap for 'c' and 't' and 7 unique q-grams in total so multiplicity vectors
|
||||
# ms1 = [1, 1, 1, 2, 1, 1, 0]
|
||||
# ms2 = [2, 1, 1, 2, 0, 0, 1]
|
||||
# sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7
|
||||
@test evaluate(MorisitaOverlap(1), "context", "contact") == 0.8 # ((2*8)/(9*7/7 + 11*7/7)) = 16/20
|
||||
@test MorisitaOverlap(1)("context", "contact") == 0.8
|
||||
@test evaluate(MorisitaOverlap(1), "context", "contact") ≈ .2 atol = 1e-4 # 1.0-((2*8)/(9*7/7 + 11*7/7)) = 1 - 16/20
|
||||
@test MorisitaOverlap(1)("context", "contact") ≈ .2 atol = 1e-4
|
||||
|
||||
# Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct"
|
||||
# ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
|
||||
# ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
|
||||
# sum(ms1 .* ms2) = 3, sum(ms1 .^ 2) = 6, sum(ms2 .^ 2) = 6, sum(ms1) = 6, sum(ms2) = 6
|
||||
@test MorisitaOverlap(2)("context", "contact") == 0.5 # ((2*3)/(6*6/6 + 6*6/6))
|
||||
@test MorisitaOverlap(2)("context", "contact") == 0.5 # 1.0-((2*3)/(6*6/6 + 6*6/6))
|
||||
|
||||
@test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(MorisitaOverlap(1), "", "")
|
||||
@test ismissing(evaluate(MorisitaOverlap(1), "", missing))
|
||||
end
|
||||
|
||||
@testset "NMD" begin
    # Unigram counts over (n, i, g, h, t, a, c):
    #   m("night") = [1, 1, 1, 1, 1, 0, 0], m("nacht") = [1, 0, 0, 1, 1, 1, 1]
    # Sum of element-wise maxima is 7 and each string has 5 unigrams: (7 - 5) / 5.
    @test evaluate(NMD(1), "night", "nacht") == 0.4

    # Unigram counts over (c, o, n, t, e, x, a):
    #   m("context") = [1, 1, 1, 2, 1, 1, 0], m("contact") = [2, 1, 1, 2, 0, 0, 1]
    # Sum of element-wise maxima is 9 and each string has 7 unigrams: (9 - 7) / 7.
    for observed in (evaluate(NMD(1), "context", "contact"), NMD(1)("context", "contact"))
        @test observed ≈ 0.2857 atol = 1e-4
    end

    # Bigram counts:
    #   m("context") = [1, 1, 1, 1, 1, 1, 0, 0, 0], m("contact") = [1, 1, 1, 0, 0, 0, 1, 1, 1]
    # Sum of element-wise maxima is 9 and each string has 6 bigrams: (9 - 6) / 6.
    @test NMD(2)("context", "contact") == 0.5

    # Result type, inferability and `missing` propagation.
    @test result_type(NMD(1), "hello", "world") == typeof(float(1))
    @inferred evaluate(NMD(1), "", "")
    @test ismissing(evaluate(NMD(1), "", missing))
end
|
||||
|
||||
@testset "QGramDict and QGramSortedVector counts qgrams" begin
|
||||
# To get something we can more easily compare to:
|
||||
stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
|
||||
|
@ -234,7 +252,7 @@ using StringDistances, Unicode, Test, Random
|
|||
end
|
||||
|
||||
@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
|
||||
for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap]
|
||||
for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap, NMD]
|
||||
for _ in 1:100
|
||||
qlen = rand(2:9)
|
||||
dist = D(qlen)
|
||||
|
|
Loading…
Reference in New Issue