Add NMD and fix bug in MorisitaOverlap (#40)

pull/44/head
Robert Feldt 2020-11-10 19:55:05 +01:00 committed by GitHub
parent f4185fbfe0
commit ed6c2f650f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 72 additions and 6 deletions

View File

@ -23,6 +23,7 @@ The available distances are:
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)`
- [Normalized Multiset Distance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NormalizedMultisetDistance(q::Int)` or `NMD(q::Int)`
- Distance "modifiers" that can be applied to any distance:
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string.
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the normalized distance of the two strings, after re-ordering words alphabetically.

View File

@ -36,6 +36,8 @@ Jaccard,
SorensenDice,
Overlap,
MorisitaOverlap,
NormalizedMultisetDistance,
NMD,
QGramDict,
QGramSortedVector,
Winkler,

View File

@ -424,4 +424,49 @@ end
c.shared += (n1 * n2)
# Morisita overlap expressed as a distance: the overlap index built from the five
# counters is subtracted from 1.0, so the result is a dissimilarity rather than a
# similarity (e.g. 0.4 instead of 0.6 for "night" vs "nacht" with q = 1).
function calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap})
    denominator = c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum
    return 1.0 - (2 * c.shared) / denominator
end
"""
    NormalizedMultisetDistance(q::Int)
    NMD(q::Int)

Creates a NormalizedMultisetDistance (NMD) distance as introduced by Besiris and
Zigouris 2013. The goal with this distance is to behave similarly to a normalized
compression distance without having to do any actual compression (and thus being
faster to compute).

The distance corresponds to

``(sum(max.(m(s1), m(s2))) - min(M(s1), M(s2))) / max(M(s1), M(s2))``

where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
sum of those counts.

For details see:
https://www.sciencedirect.com/science/article/pii/S1047320313001417
"""
struct NormalizedMultisetDistance <: QGramDistance
    q::Int
end

const NMD = NormalizedMultisetDistance # frequently used acronym
# Build the accumulator used while comparing q-gram counts for NMD: a
# ThreeCounters whose three integer tallies all start at zero.
function newcounter(d::NMD)
    return ThreeCounters{Int, NMD}(0, 0, 0)
end
# Record a q-gram count `n1` that is reported for the left side only.
@inline function countleft!(c::ThreeCounters{Int, NMD}, n1::Integer)
    c.left = c.left + n1
    # With nothing to compare against on the right, max(n1, 0) == n1, so the
    # running sum of per-q-gram maxima grows by n1 as well.
    c.shared = c.shared + n1
end
# Record a q-gram count `n2` that is reported for the right side only.
@inline function countright!(c::ThreeCounters{Int, NMD}, n2::Integer)
    c.right = c.right + n2
    # With nothing to compare against on the left, max(n2, 0) == n2, so the
    # running sum of per-q-gram maxima grows by n2 as well.
    c.shared = c.shared + n2
end
# Record a q-gram reported with count `n1` on the left and `n2` on the right.
@inline function countboth!(c::ThreeCounters{Int, NMD}, n1::Integer, n2::Integer)
    c.left = c.left + n1
    c.right = c.right + n2
    # `shared` accumulates the element-wise maximum of the two counts.
    larger = max(n1, n2)
    c.shared = c.shared + larger
end
# Final NMD value: the accumulated sum of per-q-gram maxima (`c.shared`) minus the
# smaller of the two totals, divided by the larger total.
function calculate(d::NMD, c::ThreeCounters{Int, NMD})
    smaller, larger = minmax(c.left, c.right)
    return (c.shared - smaller) / larger
end

View File

@ -132,26 +132,44 @@ using StringDistances, Unicode, Test, Random
@testset "MorisitaOverlap" begin
    # MorisitaOverlap is a distance: 1.0 minus the Morisita overlap index.
    # overlap for 'n', 'h', and 't' and 5 q-grams per string:
    # 1.0 - ((2*3)/(5*5/5 + 5*5/5)) = 1.0 - 6/10
    @test evaluate(MorisitaOverlap(1), "night", "nacht") ≈ 0.4

    # overlap for 'o', 'n', 2-overlap for 'c' and 't' and 7 unique q-grams in total
    # so multiplicity vectors
    # ms1 = [1, 1, 1, 2, 1, 1, 0]
    # ms2 = [2, 1, 1, 2, 0, 0, 1]
    # sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7
    # 1.0 - ((2*8)/(9*7/7 + 11*7/7)) = 1.0 - 16/20
    @test evaluate(MorisitaOverlap(1), "context", "contact") ≈ 0.2 atol = 1e-4
    @test MorisitaOverlap(1)("context", "contact") ≈ 0.2 atol = 1e-4

    # Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct"
    # ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
    # ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
    # sum(ms1 .* ms2) = 3, sum(ms1 .^ 2) = 6, sum(ms2 .^ 2) = 6, sum(ms1) = 6, sum(ms2) = 6
    # 1.0 - ((2*3)/(6*6/6 + 6*6/6)) = 1.0 - 6/12
    @test MorisitaOverlap(2)("context", "contact") ≈ 0.5

    @test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))
    @inferred evaluate(MorisitaOverlap(1), "", "")
    @test ismissing(evaluate(MorisitaOverlap(1), "", missing))
end
@testset "NMD" begin
    # NMD = (sum of element-wise max of the q-gram counts - smaller total) / larger total
    # m(s1) = [1, 1, 1, 1, 1, 0, 0], m(s2) = [1, 0, 0, 1, 1, 1, 1]
    @test evaluate(NMD(1), "night", "nacht") ≈ 0.4 # (7-5)/5

    # ms1 = [1, 1, 1, 2, 1, 1, 0]
    # ms2 = [2, 1, 1, 2, 0, 0, 1]
    # ((2+1+1+2+1+1+1)-7)/7 = 2/7
    @test evaluate(NMD(1), "context", "contact") ≈ 0.2857 atol = 1e-4
    @test NMD(1)("context", "contact") ≈ 0.2857 atol = 1e-4

    # ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
    # ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
    @test NMD(2)("context", "contact") ≈ 0.5 # ((1+1+1+1+1+1+1+1+1)-6)/6

    @test result_type(NMD(1), "hello", "world") == typeof(float(1))
    @inferred evaluate(NMD(1), "", "")
    @test ismissing(evaluate(NMD(1), "", missing))
end
@testset "QGramDict and QGramSortedVector counts qgrams" begin
# To get something we can more easily compare to:
stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
@ -234,7 +252,7 @@ using StringDistances, Unicode, Test, Random
end
@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap]
for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap, NMD]
for _ in 1:100
qlen = rand(2:9)
dist = D(qlen)