Adds MorisitaOverlap distance (#39)
parent
e4095682b4
commit
0c57f62319
|
@ -21,6 +21,7 @@ The available distances are:
|
|||
- [Jaccard Distance](https://en.wikipedia.org/wiki/Jaccard_index) `Jaccard(q::Int)`
|
||||
- [Overlap Distance](https://en.wikipedia.org/wiki/Overlap_coefficient) `Overlap(q::Int)`
|
||||
- [Sorensen-Dice Distance](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) `SorensenDice(q::Int)`
|
||||
- [MorisitaOverlap Distance](https://en.wikipedia.org/wiki/Morisita%27s_overlap_index) `MorisitaOverlap(q::Int)`
|
||||
- Distance "modifiers" that can be applied to any distance:
|
||||
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the minimum of the normalized distance between the shorter string and substrings of the longer string.
|
||||
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word order by returning the normalized distance of the two strings, after re-ordering words alphabetically.
|
||||
|
|
|
@ -35,6 +35,7 @@ Cosine,
|
|||
Jaccard,
|
||||
SorensenDice,
|
||||
Overlap,
|
||||
MorisitaOverlap,
|
||||
QGramDict,
|
||||
QGramSortedVector,
|
||||
Winkler,
|
||||
|
|
|
@ -377,4 +377,51 @@ newcounter(d::IntersectionDist) = ThreeCounters{Int, typeof(d)}(0, 0, 0)
|
|||
c.shared += (n1 > 0) & (n2 > 0)
|
||||
|
||||
# Overlap distance: one minus the ratio of `c.shared` to the smaller of
# `c.left` and `c.right`.
# NOTE(review): `c.left` / `c.right` are presumably the numbers of distinct
# q-grams in each string — confirm against the ThreeCounters counting methods.
function calculate(d::Overlap, c::ThreeCounters{Int, Overlap})
    smaller = min(c.left, c.right)
    return 1.0 - c.shared / smaller
end
|
||||
|
||||
"""
|
||||
MorisitaOverlap(q::Int)
|
||||
|
||||
Creates a MorisitaOverlap distance, a general, statistical measure of
|
||||
dispersion which can also be used on dictionaries such as created
|
||||
from q-grams. See https://en.wikipedia.org/wiki/Morisita%27s_overlap_index
|
||||
This is more fine-grained than many of the other QGramDistances since
|
||||
it is based on the counts per q-gram rather than only which q-grams are
|
||||
in the strings.
|
||||
|
||||
The distance corresponds to
|
||||
|
||||
``2 * sum(m(s1) .* m(s2)) / (sum(m(s1).^2)*M(s2)/M(s1) + sum(m(s2).^2)*M(s1)/M(s2))``
|
||||
|
||||
where ``m(s)`` is the vector of q-gram counts for string ``s`` and ``M(s)`` is the
|
||||
sum of those counts.
|
||||
"""
|
||||
struct MorisitaOverlap <: QGramDistance
    q::Int  # length of the q-grams to compare (e.g. `2` compares 2-grams)
end
|
||||
|
||||
"""
    FiveCounters{T, QD<:QGramDistance}

Mutable accumulator of five running sums over the q-gram count vectors
`m(s1)` and `m(s2)` of two strings. The `QD` parameter is not used by any
field; it only tags the counter with a distance type so that methods can
dispatch on it.
"""
mutable struct FiveCounters{T, QD<:QGramDistance} <: AbstractQGramMatchCounter
    leftsum::T   # sum(m(s1)): total of the q-gram counts of the first string
    rightsum::T  # sum(m(s2)): total of the q-gram counts of the second string
    leftsq::T    # sum(m(s1).^2): sum of squared counts of the first string
    rightsq::T   # sum(m(s2).^2): sum of squared counts of the second string
    shared::T    # sum(m(s1) .* m(s2)): sum of the products of matching counts
end
|
||||
|
||||
# Build a fresh counter for a MorisitaOverlap evaluation with all five
# running sums initialised to zero.
function newcounter(d::MorisitaOverlap)
    return FiveCounters{Int, MorisitaOverlap}(0, 0, 0, 0, 0)
end
|
||||
|
||||
# Fold the count `n1` into the first string's totals: its plain sum and its
# sum of squares.
# NOTE(review): presumably invoked once per distinct q-gram of the first
# string — confirm against the counting loop that calls it.
@inline function countleft!(c::FiveCounters{Int, MorisitaOverlap}, n1::Integer)
    c.leftsum = c.leftsum + n1
    # Kept as the final expression so the value of the update is still returned.
    c.leftsq = c.leftsq + n1^2
end
|
||||
|
||||
# Fold the count `n2` into the second string's totals: its plain sum and its
# sum of squares.
# NOTE(review): presumably invoked once per distinct q-gram of the second
# string — confirm against the counting loop that calls it.
@inline function countright!(c::FiveCounters{Int, MorisitaOverlap}, n2::Integer)
    c.rightsum = c.rightsum + n2
    # Kept as the final expression so the value of the update is still returned.
    c.rightsq = c.rightsq + n2^2
end
|
||||
|
||||
# Add the product of the two counts `n1` and `n2` to the running
# sum(m(s1) .* m(s2)) stored in `c.shared`.
@inline function countshared!(
    c::FiveCounters{Int, MorisitaOverlap}, n1::Integer, n2::Integer
)
    c.shared = c.shared + n1 * n2
end
|
||||
|
||||
# Combine the five accumulated sums into Morisita's overlap index:
#
#     2 * shared / (leftsq * rightsum / leftsum + rightsq * leftsum / rightsum)
#
# A zero `leftsum` or `rightsum` turns one of the denominator terms into 0/0,
# so the result is NaN in that case (assuming the counts are non-negative).
#
# NOTE(review): the value returned is the index itself — 1.0 when both count
# vectors are identical and 0.0 when no q-gram is shared — so it grows with
# similarity, whereas `calculate(::Overlap, ...)` returns `1.0 - ratio`.
# Confirm this orientation is intended for a distance.
function calculate(d::MorisitaOverlap, c::FiveCounters{Int, MorisitaOverlap})
    numerator = 2 * c.shared
    leftterm = c.leftsq * c.rightsum / c.leftsum
    rightterm = c.rightsq * c.leftsum / c.rightsum
    return numerator / (leftterm + rightterm)
end
|
||||
|
|
|
@ -104,4 +104,4 @@ function _asymmetric_pairwise!(R::AbstractMatrix, dist::StringDistance, xs::Abst
|
|||
end
|
||||
end
|
||||
return R
|
||||
end
|
||||
end
|
||||
|
|
|
@ -130,6 +130,28 @@ using StringDistances, Unicode, Test, Random
|
|||
@test ismissing(evaluate(Overlap(1), "", missing))
|
||||
end
|
||||
|
||||
@testset "MorisitaOverlap" begin
    # Expected values are computed Float64 results, so they are compared with
    # `≈` rather than `==` to stay independent of the order of operations.

    # 1-grams: 'n', 'h' and 't' are shared and each string has 5 distinct
    # q-grams, so sum(ms1 .* ms2) = 3 and the four remaining sums all equal 5.
    @test evaluate(MorisitaOverlap(1), "night", "nacht") ≈ 0.6 # (2*3)/(5*5/5 + 5*5/5)

    # Multiplicity vectors over the 7 distinct 1-grams 'c', 'o', 'n', 't', 'e', 'x', 'a'
    # ('o' and 'n' occur once in both strings, 't' twice in both, 'c' once vs. twice):
    # ms1 = [1, 1, 1, 2, 1, 1, 0]
    # ms2 = [2, 1, 1, 2, 0, 0, 1]
    # sum(ms1 .* ms2) = 8, sum(ms1 .^ 2) = 9, sum(ms2 .^ 2) = 11, sum(ms1) = 7, sum(ms2) = 7
    @test evaluate(MorisitaOverlap(1), "context", "contact") ≈ 0.8 # (2*8)/(9*7/7 + 11*7/7) = 16/20
    @test MorisitaOverlap(1)("context", "contact") ≈ 0.8

    # Multiplicity vectors for 2-grams "co", "on", "nt", "te", "ex", "xt", "ta", "ac", "ct"
    # ms1 = [1, 1, 1, 1, 1, 1, 0, 0, 0]
    # ms2 = [1, 1, 1, 0, 0, 0, 1, 1, 1]
    # sum(ms1 .* ms2) = 3, sum(ms1 .^ 2) = 6, sum(ms2 .^ 2) = 6, sum(ms1) = 6, sum(ms2) = 6
    @test MorisitaOverlap(2)("context", "contact") ≈ 0.5 # (2*3)/(6*6/6 + 6*6/6)

    # Type-level checks: floating-point result type, inferrable return type,
    # and `missing` propagation.
    @test result_type(MorisitaOverlap(1), "hello", "world") == typeof(float(1))
    @inferred evaluate(MorisitaOverlap(1), "", "")
    @test ismissing(evaluate(MorisitaOverlap(1), "", missing))
end
|
||||
|
||||
@testset "QGramDict and QGramSortedVector counts qgrams" begin
|
||||
# To get something we can more easily compare to:
|
||||
stringify(p::Pair{<:AbstractString, <:Integer}) = (string(first(p)), last(p))
|
||||
|
@ -212,7 +234,7 @@ using StringDistances, Unicode, Test, Random
|
|||
end
|
||||
|
||||
@testset "Differential testing of String, QGramDict, and QGramSortedVector" begin
|
||||
for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap]
|
||||
for D in [QGram, Cosine, Jaccard, SorensenDice, Overlap, MorisitaOverlap]
|
||||
for _ in 1:100
|
||||
qlen = rand(2:9)
|
||||
dist = D(qlen)
|
||||
|
|
Loading…
Reference in New Issue