Add NMD and fix bug in MorisitaOverlap (#40)

2020-11-10 19:55:05 +01:00 · 2020-11-10 19:55:05 +01:00 · ed6c2f650f
parent f4185fbfe0
commit ed6c2f650f
4 changed files with 72 additions and 6 deletions
--- a/README.md
+++ b/README.md
@ -23,6 +23,7 @@ The available distances are:
 	- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
 	- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
 	- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)`
+	- [Normalized Multiset Distance](https://www.sciencedirect.com/science/article/pii/S1047320313001417) `NormalizedMultisetDistance(q::Int)` or `NMD(q::Int)`
 - Distance "modifiers" that can be applied to any distance:
 	- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string.
 	- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the normalized distance of the two strings, after re-ordering words alphabetically. 
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -36,6 +36,8 @@ Jaccard,
 SorensenDice,
 Overlap,
 MorisitaOverlap,
+NormalizedMultisetDistance,
+NMD,
 QGramDict,
 QGramSortedVector,
 Winkler,
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -424,4 +424,49 @@ end
 	c.shared += (n1 * n2)

 calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap}) =
-	(2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum)
+	1.0 - ((2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum)) # distance = 1 - overlap index (the index itself is a similarity)
+
+"""
+	NormalizedMultisetDistance(q::Int)
+	NMD(q::Int)
+
+Creates a NormalizedMultisetDistance (NMD) distance as introduced by Besiris and
+Zigouris 2013. The goal with this distance is to behave similarly to a normalized
+compression distance without having to do any actual compression (and thus being
+faster to compute).
+
+The distance corresponds to
+
+``(sum(max.(m(s1), m(s2))) - min(M(s1), M(s2))) / max(M(s1), M(s2))``
+
+where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
+sum of those counts.
+
+For details see:
+https://www.sciencedirect.com/science/article/pii/S1047320313001417
+"""
+struct NormalizedMultisetDistance <: QGramDistance
+	q::Int
+end
+const NMD = NormalizedMultisetDistance # short alias for the same type (frequently used acronym)
+
+newcounter(d::NMD) = ThreeCounters{Int, NMD}(0, 0, 0) # all three counters start at zero
+
+@inline function countleft!(c::ThreeCounters{Int, NMD}, n1::Integer)
+	c.left += n1 # running total of the left-hand counts
+	c.shared += n1 # the other side's count is 0 here, so max(n1, 0) == n1
+end
+
+@inline function countright!(c::ThreeCounters{Int, NMD}, n2::Integer)
+	c.right += n2 # running total of the right-hand counts
+	c.shared += n2 # the other side's count is 0 here, so max(0, n2) == n2
+end
+
+@inline function countboth!(c::ThreeCounters{Int, NMD}, n1::Integer, n2::Integer)
+	c.left += n1
+	c.right += n2
+	c.shared += max(n1, n2) # despite its name, `shared` accumulates the per-q-gram maxima for NMD
+end
+
+calculate(d::NMD, c::ThreeCounters{Int, NMD}) =
+	(c.shared - min(c.left, c.right)) / max(c.left, c.right) # (sum of maxima - smaller total) / larger total
--- a/test/distances.jl
+++ b/test/distances.jl
@ -132,26 +132,44 @@ using StringDistances, Unicode, Test, Random

 	@testset "MorisitaOverlap" begin
 		# overlap for 'n', 'h', and 't' and 5 q-grams per string:
-		@test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.6 # ((2*3)/(5*5/5 + 5*5/5))
+		@test evaluate(MorisitaOverlap(1), "night", "nacht") == 0.4 # 1.0-((2*3)/(5*5/5 + 5*5/5)) = 1 - 6/10

 		# overlap for 'o', 'n', 2-overlap for 'c' and 't' and 7 unique q-grams in total so multiplicity vectors
 		# ms1 = [1, 1, 1, 2, 1, 1, 0]
 		# ms2 = [2, 1, 1, 2, 0, 0, 1]
 		# sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7
-		@test evaluate(MorisitaOverlap(1), "context", "contact") == 0.8 # ((2*8)/(9*7/7 + 11*7/7)) = 16/20
-		@test MorisitaOverlap(1)("context", "contact") == 0.8
+		@test evaluate(MorisitaOverlap(1), "context", "contact") ≈ .2 atol = 1e-4 # 1.0-((2*8)/(9*7/7 + 11*7/7)) = 1 - 16/20
+		@test MorisitaOverlap(1)("context", "contact") ≈ .2 atol = 1e-4

 		# Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct"
 		# ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
 		# ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
 		# sum(ms1 .* ms2) = 3, sum(ms1 .^ 2) = 6, sum(ms2 .^ 2) = 6, sum(ms1) = 6, sum(ms2) = 6
-		@test MorisitaOverlap(2)("context", "contact") == 0.5 # ((2*3)/(6*6/6 + 6*6/6))
+		@test MorisitaOverlap(2)("context", "contact") == 0.5 # 1.0-((2*3)/(6*6/6 + 6*6/6)) = 1 - 6/12

 		@test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))
 		@inferred evaluate(MorisitaOverlap(1), "", "")
 		@test ismissing(evaluate(MorisitaOverlap(1), "", missing))
 	end

+	@testset "NMD" begin
+		# 1-grams in order n, i, g, h, t, a, c: m(s1) = [1, 1, 1, 1, 1, 0, 0], m(s2) = [1, 0, 0, 1, 1, 1, 1]
+		@test evaluate(NMD(1), "night", "nacht") == 0.4 # sum of maxima 7, both totals 5: (7-5)/5
+
+		# ms1 = [1, 1, 1, 2, 1, 1, 0]
+		# ms2 = [2, 1, 1, 2, 0, 0, 1]
+		@test evaluate(NMD(1), "context", "contact") ≈ 0.2857 atol = 1e-4 # ((2+1+1+2+1+1+1)-7)/(7) = 2/7
+		@test NMD(1)("context", "contact") ≈ 0.2857 atol = 1e-4
+
+		# ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
+		# ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
+		@test NMD(2)("context", "contact") == 0.5 # ((1+1+1+1+1+1+1+1+1)-6)/6 = 3/6
+
+		@test result_type(NMD(1), "hello", "world") == typeof(float(1))
+		@inferred evaluate(NMD(1), "", "")
+		@test ismissing(evaluate(NMD(1), "", missing))
+	end
+
 	@testset "QGramDict and QGramSortedVector counts qgrams" begin
 		# To get something we can more easily compare to:
 		stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
@ -234,7 +252,7 @@ using StringDistances, Unicode, Test, Random
 	end

 	@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
-		for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap]
+		for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap, NMD]
 			for _ in 1:100
 				qlen = rand(2:9)
 				dist = D(qlen)