diff --git a/README.md b/README.md
index 22f63ac..1b977e9 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ The available distances are:
     - [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int)`
     - [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
     - [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
+    - [Morisita Overlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)`
 - Distance "modifiers" that can be applied to any distance:
     - [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string.
     - [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by returning the normalized distance of the two strings, after re-ordering words alphabetically.
diff --git a/src/StringDistances.jl b/src/StringDistances.jl
index 89efc47..ffc8b84 100755
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@@ -35,6 +35,7 @@ Cosine,
 Jaccard,
 SorensenDice,
 Overlap,
+MorisitaOverlap,
 QGramDict,
 QGramSortedVector,
 Winkler,
diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl
index a420877..7d335d8 100755
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@@ -377,4 +377,51 @@ newcounter(d::IntersectionDist) = ThreeCounters{Int, typeof(d)}(0, 0, 0)
     c.shared += (n1 > 0) & (n2 > 0)
 
 calculate(d::Overlap, c::ThreeCounters{Int, Overlap}) =
-    1.0 - c.shared / min(c.left, c.right)
\ No newline at end of file
+    1.0 - c.shared / min(c.left, c.right)
+
+"""
+    MorisitaOverlap(q::Int)
+
+Creates a MorisitaOverlap distance, based on Morisita's overlap index, a general
+statistical measure of dispersion which can also be used on dictionaries such as
+those created from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index
+This is more fine-grained than many of the other QGramDistances since
+it is based on the counts per q-gram rather than only which q-grams are
+in the strings.
+
+The distance is one minus the overlap index, i.e. it corresponds to
+
+``1 - 2 * sum(m(s1) .* m(s2)) / (sum(m(s1).^2)*M(s2)/M(s1) + sum(m(s2).^2)*M(s1)/M(s2))``
+
+where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
+sum of those counts.
+"""
+struct MorisitaOverlap <: QGramDistance
+    q::Int
+end
+
+mutable struct FiveCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
+    leftsum::T    # sum(m(s1))
+    rightsum::T   # sum(m(s2))
+    leftsq::T     # sum(m(s1).^2)
+    rightsq::T    # sum(m(s2).^2)
+    shared::T     # sum(m(s1) .* m(s2))
+end
+
+newcounter(d::MorisitaOverlap) = FiveCounters{Int, MorisitaOverlap}(0, 0, 0, 0, 0)
+
+@inline function countleft!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer)
+    c.leftsum += n1
+    c.leftsq += (n1^2)
+end
+
+@inline function countright!(c::FiveCounters{Int, MorisitaOverlap}, n2::Integer)
+    c.rightsum += n2
+    c.rightsq += (n2^2)
+end
+
+@inline countshared!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer, n2::Integer) =
+    c.shared += (n1 * n2)
+
+calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap}) = # 1 - index: the index itself is a similarity
+    1.0 - (2 * c.shared) / (c.leftsq*c.rightsum/c.leftsum + c.rightsq*c.leftsum/c.rightsum)
diff --git a/src/pairwise.jl b/src/pairwise.jl
index 3a91118..fe9d72f 100644
--- a/src/pairwise.jl
+++ b/src/pairwise.jl
@@ -104,4 +104,4 @@ function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abst
         end
     end
     return R
-end
\ No newline at end of file
+end
diff --git a/test/distances.jl b/test/distances.jl
index 7aaa8da..4d06064 100644
--- a/test/distances.jl
+++ b/test/distances.jl
@@ -130,6 +130,29 @@ using StringDistances, Unicode, Test, Random
         @test ismissing(evaluate(Overlap(1), "", missing))
     end
 
+    @testset "MorisitaOverlap" begin
+        # overlap for 'n', 'h', and 't' and 5 q-grams per string:
+        @test evaluate(MorisitaOverlap(1), "night", "nacht") ≈ 0.4 # 1 - ((2*3)/(5*5/5 + 5*5/5))
+        @test evaluate(MorisitaOverlap(1), "night", "night") ≈ 0.0 atol = 1e-12 # identical strings are at distance zero
+
+        # overlap for 'o', 'n', 2-overlap for 'c' and 't' and 7 unique q-grams in total so multiplicity vectors
+        # ms1 = [1, 1, 1, 2, 1, 1, 0]
+        # ms2 = [2, 1, 1, 2, 0, 0, 1]
+        # sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7
+        @test evaluate(MorisitaOverlap(1), "context", "contact") ≈ 0.2 # 1 - ((2*8)/(9*7/7 + 11*7/7)) = 1 - 16/20
+        @test MorisitaOverlap(1)("context", "contact") ≈ 0.2
+
+        # Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct"
+        # ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
+        # ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
+        # sum(ms1 .* ms2) = 3, sum(ms1 .^ 2) = 6, sum(ms2 .^ 2) = 6, sum(ms1) = 6, sum(ms2) = 6
+        @test MorisitaOverlap(2)("context", "contact") ≈ 0.5 # 1 - ((2*3)/(6*6/6 + 6*6/6))
+
+        @test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))
+        @inferred evaluate(MorisitaOverlap(1), "", "")
+        @test ismissing(evaluate(MorisitaOverlap(1), "", missing))
+    end
+
     @testset "QGramDict and QGramSortedVector counts qgrams" begin
         # To get something we can more easily compare to:
         stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
@@ -212,7 +235,7 @@ using StringDistances, Unicode, Test, Random
     end
 
     @testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
-        for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap]
+        for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap]
             for _ in 1:100
                 qlen = rand(2:9)
                 dist = D(qlen)