Adds MorisitaOverlap distance (#39)

pull/40/head^2
Robert Feldt 2020-11-10 16:12:28 +01:00 committed by GitHub
parent e4095682b4
commit 0c57f62319
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 74 additions and 3 deletions

View File

@ -21,6 +21,7 @@ The available distances are:
- [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int)`
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)`
- Distance "modifiers" that can be applied to any distance:
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string.
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word order by returning the normalized distance of the two strings, after re-ordering words alphabetically.

View File

@ -35,6 +35,7 @@ Cosine,
Jaccard,
SorensenDice,
Overlap,
MorisitaOverlap,
QGramDict,
QGramSortedVector,
Winkler,

View File

@ -377,4 +377,51 @@ newcounter(d::IntersectionDist) = ThreeCounters{Int, typeof(d)}(0, 0, 0)
c.shared += (n1 > 0) & (n2 > 0)
# Overlap distance from the accumulated counters: one minus the shared count
# divided by the smaller of the left and right counts.
calculate(d::Overlap, c::ThreeCounters{Int, Overlap}) =
1.0 - c.shared / min(c.left, c.right)
1.0 - c.shared / min(c.left, c.right)
"""
    MorisitaOverlap(q::Int)

Creates a MorisitaOverlap distance, a general, statistical measure of
dispersion which can also be used on dictionaries such as created
from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index

This is more fine-grained than many of the other QGramDistances since
it is based on the counts per q-gram rather than only which q-grams are
in the strings.

The distance corresponds to

``(2 * sum(m(s1) .* m(s2))) / (sum(m(s1).^2)*M(s2)/M(s1) + sum(m(s2).^2)*M(s1)/M(s2))``

where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
sum of those counts.
"""
struct MorisitaOverlap <: QGramDistance
q::Int
end
# Mutable accumulator holding the five running sums used by the
# MorisitaOverlap calculation. `T` is the type of every sum. `QD` does not
# appear in any field; the methods defined for this counter use it only in
# their signatures (e.g. `FiveCounters{Int, MorisitaOverlap}`) to select the
# distance-specific method.
mutable struct FiveCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
leftsum::T # sum(m(s1))
rightsum::T # sum(m(s2))
leftsq::T # sum(m(s1).^2)
rightsq::T # sum(m(s2).^2)
shared::T # sum(m(s1) .* m(s2))
end
# Build a fresh counter for a MorisitaOverlap distance with all five
# running sums set to zero.
function newcounter(d::MorisitaOverlap)
    return FiveCounters{Int, MorisitaOverlap}(0, 0, 0, 0, 0)
end
# Record a q-gram count `n1` for the left string: add it to the running
# sum of counts and add its square to the running sum of squared counts.
# The value of the last assignment (the updated sum of squares) is returned.
@inline function countleft!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer)
    c.leftsum = c.leftsum + n1
    c.leftsq = c.leftsq + n1 * n1
end
# Record a q-gram count `n2` for the right string: add it to the running
# sum of counts and add its square to the running sum of squared counts.
# The value of the last assignment (the updated sum of squares) is returned.
@inline function countright!(c::FiveCounters{Int, MorisitaOverlap}, n2::Integer)
    c.rightsum = c.rightsum + n2
    c.rightsq = c.rightsq + n2 * n2
end
# Record a pair of counts `n1` and `n2`: their product is added to the
# running sum of products.
@inline function countshared!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer, n2::Integer)
    product = n1 * n2
    return c.shared += product
end
# Combine the five accumulated sums into the Morisita overlap value:
#
#     2 * shared / (leftsq * rightsum / leftsum + rightsq * leftsum / rightsum)
#
# NOTE(review): this expression is 1 when both strings have identical q-gram
# counts and 0 when the sum of products is zero, i.e. it behaves like a
# similarity, whereas the Overlap method returns `1.0 - ...`. Confirm whether
# `1.0 - value` was intended; the existing tests pin the current values.
# NOTE(review): if both sums of counts are zero the divisions are 0/0, which
# yields NaN in Julia.
function calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap})
    numerator = 2 * c.shared
    leftterm = c.leftsq * c.rightsum / c.leftsum
    rightterm = c.rightsq * c.leftsum / c.rightsum
    return numerator / (leftterm + rightterm)
end

View File

@ -104,4 +104,4 @@ function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abst
end
end
return R
end
end

View File

@ -130,6 +130,28 @@ using StringDistances, Unicode, Test, Random
@test ismissing(evaluate(Overlap(1), "", missing))
end
# Checks MorisitaOverlap against hand-computed values. The results are
# computed in floating point, so they are compared with `≈` rather than exact
# `==`, which would depend on the precise rounding of each division.
@testset "MorisitaOverlap" begin
    # 1-grams: 'n', 'h' and 't' are shared (once each) and each string has 5 q-grams.
    @test evaluate(MorisitaOverlap(1), "night", "nacht") ≈ 0.6 # (2*3)/(5*5/5 + 5*5/5)

    # 1-grams: 'o' and 'n' are shared once, 'c' and 't' contribute a product of 2 and 4;
    # there are 7 unique q-grams in total, so the multiplicity vectors are
    # ms1 = [1, 1, 1, 2, 1, 1, 0]
    # ms2 = [2, 1, 1, 2, 0, 0, 1]
    # sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7
    @test evaluate(MorisitaOverlap(1), "context", "contact") ≈ 0.8 # (2*8)/(9*7/7 + 11*7/7) = 16/20
    @test MorisitaOverlap(1)("context", "contact") ≈ 0.8

    # Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct"
    # ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
    # ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
    # sum(ms1 .* ms2) = 3, sum(ms1 .^ 2) = 6, sum(ms2 .^ 2) = 6, sum(ms1) = 6, sum(ms2) = 6
    @test MorisitaOverlap(2)("context", "contact") ≈ 0.5 # (2*3)/(6*6/6 + 6*6/6)

    @test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))
    # Only the inferred return type is checked here, not the value.
    @inferred evaluate(MorisitaOverlap(1), "", "")
    @test ismissing(evaluate(MorisitaOverlap(1), "", missing))
end
@testset "QGramDict and QGramSortedVector counts qgrams" begin
# To get something we can more easily compare to:
stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
@ -212,7 +234,7 @@ using StringDistances, Unicode, Test, Random
end
@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap]
for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap]
for _ in 1:100
qlen = rand(2:9)
dist = D(qlen)