add comments

2019-08-17 19:45:31 -04:00 · 2019-08-17 19:45:31 -04:00 · 5c2b690c4f
parent 57466ae4e0
commit 5c2b690c4f
3 changed files with 114 additions and 13 deletions
--- a/src/compare.jl
+++ b/src/compare.jl
@ -34,11 +34,15 @@ end
 ## Winkler
 ##
 ##############################################################################
+"""
+    Winkler(dist::PreMetric, scaling_factor::Real = 0.1, boosting_threshold::Real = 0.7)

+Winkler is a `PreMetric` modifier that boosts the similarity score between two strings when they share a common prefix. The boost is proportional to `scaling_factor` and to the length of the common prefix (up to 4 characters), and it is only applied when the similarity score is at least `boosting_threshold`.
+"""
 struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
    dist::T1
    scaling_factor::T2      # scaling factor. Default to 0.1
-    boosting_limit::T3      # boost threshold. Default to 0.7
+    boosting_threshold::T3      # boost threshold. Default to 0.7
 end

 # restrict to distance between 0 and 1
@ -48,7 +52,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
    score = compare(s1, s2, dist.dist)
    l = common_prefix(s1, s2, 4)[1]
    # common prefix adjustment
-    if score >= dist.boosting_limit
+    if score >= dist.boosting_threshold
        score += l * dist.scaling_factor * (1 - score)
    end
    return score
@ -60,6 +64,11 @@ end
 ## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
 ##
 ##############################################################################
+"""
+    Partial(dist::PreMetric)
+
+Partial is a `PreMetric` modifier that returns the maximal similarity score between the shorter string and substrings of the longer string.
+"""
 struct Partial{T <: PreMetric} <: PreMetric
    dist::T
 end
@ -108,6 +117,11 @@ end
 ## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
 ##
 ##############################################################################
+"""
+    TokenSort(dist::PreMetric)
+
+TokenSort is a `PreMetric` modifier that adjusts for differences in word order by reordering words alphabetically.
+"""
 struct TokenSort{T <: PreMetric} <: PreMetric
    dist::T
 end
@ -124,6 +138,11 @@ end
 ## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
 ##
 ##############################################################################
+"""
+    TokenSet(dist::PreMetric)
+
+TokenSet is a `PreMetric` modifier that adjusts for differences in word order and number of words by comparing the intersection of two strings with each string.
+"""
 struct TokenSet{T <: PreMetric} <: PreMetric
    dist::T
 end
@ -147,6 +166,11 @@ end
 ## TokenMax
 ##
 ##############################################################################
+"""
+    TokenMax(dist::PreMetric)
+
+TokenMax is a `PreMetric` modifier that combines similarity scores using the base distance and its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on string lengths.
+"""
 struct TokenMax{T <: PreMetric} <: PreMetric
    dist::T
 end
--- a/src/distances/edit.jl
+++ b/src/distances/edit.jl
@ -4,7 +4,6 @@
 ## Hamming
 ##
 ##############################################################################
-
 function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
    current = abs(length(s2) - length(s1))
    for (ch1, ch2) in zip(s1, s2)
@ -19,7 +18,13 @@ end
 ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
 ##
 ##############################################################################
+"""
+    Levenshtein()

+Creates the Levenshtein metric
+
+The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, substitutions of a single character) required to change one string into the other.
+"""
 struct Levenshtein <: SemiMetric end

 function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
@ -59,10 +64,16 @@ end
 ##############################################################################
 ##
 ## Damerau Levenshtein
-## Source: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
+## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
 ##
 ##############################################################################
+"""
+    DamerauLevenshtein()

+Creates the DamerauLevenshtein metric
+
+The DamerauLevenshtein distance is the minimum number of operations (consisting of insertions, deletions or substitutions of a single character, or transposition of two adjacent characters) required to change one string into the other.
+"""
 struct DamerauLevenshtein <: SemiMetric end

 function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
@ -127,7 +138,19 @@ end
 ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
 ##
 ##############################################################################
+"""
+    Jaro()

+Creates the Jaro metric
+
+The Jaro distance is defined as
+
+
+``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``
+
+where ``m`` is the number of matching characters and 
+``t`` is half the number of transpositions.
+"""
 struct Jaro <: SemiMetric end

 function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
@ -189,7 +212,13 @@ end
 ## Ratcliff/Obershelp
 ##
 ##############################################################################
+"""
+    RatcliffObershelp()

+Creates the RatcliffObershelp metric
+
+The distance between two strings is defined as one minus the number of matching characters divided by the total number of characters in the two strings. Matching characters are those in the longest common subsequence plus, recursively, matching characters in the unmatched region on either side of the longest common subsequence.
+"""
 struct RatcliffObershelp <: PreMetric end

 function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -4,7 +4,7 @@
 ## Define a type that iterates through q-grams of a string
 ##
 ##############################################################################
-# N is the number of characters in the QGram
+# N is the number of characters for the QGram
 struct QGramIterator{S <: AbstractString}
 	s::S # grapheme
 	l::Int # length of string
@ -92,12 +92,24 @@ end
 ##############################################################################
 ##
 ## q-gram 
-## Define v(s) a vector on the space of q-uple which contains number of times it appears in s
-## For instance v("leila")["il"] =1 
-## q-gram is ∑ |v(s1, p) - v(s2, p)|
 ##
 ##############################################################################
+"""
+For an AbstractString s, let v(s) denote the vector on the space of q-grams of length N that contains the number of times each q-gram appears in s.
+The q-gram distance is ||v(s1) - v(s2)||.
+"""

+"""
+	QGram(n::Int)
+
+Creates a QGram metric.
+
+The distance corresponds to the L1 norm
+
+``||v(s1, n) - v(s2, n)||₁``
+
+where ``v(s, n)`` denotes the vector on the space of q-grams of length n, that contains the number of times a q-gram appears for the string s
+"""
 struct QGram <: AbstractQGramDistance
 	N::Int
 end
@ -114,9 +126,19 @@ end
 ##
 ## cosine 
 ##
-## 1 - v(s1, p).v(s2, p)  / ||v(s1, p)|| * ||v(s2, p)||
+## 
 ##############################################################################
+"""
+	Cosine(n::Int)

+Creates a Cosine metric.
+
+The distance corresponds to
+
+``1 - v(s1, n) ⋅ v(s2, n) / (||v(s1, n)|| * ||v(s2, n)||)``
+
+where ``v(s, n)`` denotes the vector on the space of q-grams of length n, that contains the number of times a q-gram appears for the string s
+"""
 struct Cosine <: AbstractQGramDistance
 	N::Int
 end
@ -135,11 +157,18 @@ end
 ##
 ## Jaccard
 ##
-## Denote Q(s, q) the set of tuple of length q in s
-## 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))|
-##
 ##############################################################################
+"""
+	Jaccard(n::Int)

+Creates a Jaccard metric.
+
+The distance corresponds to 
+
+``1 - |Q(s1, n) ∩ Q(s2, n)| / |Q(s1, n) ∪ Q(s2, n)|``
+
+where ``Q(s, n)``  denotes the set of q-grams of length n for the string s
+"""
 struct Jaccard <: AbstractQGramDistance
 	N::Int
 end
@ -158,9 +187,18 @@ end
 ##
 ## SorensenDice
 ##
-## 1 - 2 * |intersect(Q(s1, q), Q(s2, q))| / (|Q(s1, q)| + |Q(s2, q))|)
 ##############################################################################
+"""
+	SorensenDice(n::Int)

+Creates a SorensenDice metric
+
+The distance corresponds to  
+
+``1 - 2 * |Q(s1, n) ∩ Q(s2, n)| / (|Q(s1, n)| + |Q(s2, n)|)``
+
+where ``Q(s, n)``  denotes the set of q-grams of length n for the string s
+"""
 struct SorensenDice <: AbstractQGramDistance
 	N::Int
 end
@ -181,7 +219,17 @@ end
 ##
 ## 1 -  |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q)))
 ##############################################################################
+"""
+	Overlap(n::Int)

+Creates an Overlap metric
+
+The distance corresponds to  
+
+``1 - |Q(s1, n) ∩ Q(s2, n)|  / min(|Q(s1, n)|, |Q(s2, n)|)``
+
+where ``Q(s, n)``  denotes the set of q-grams of length n for the string s
+"""
 struct Overlap <: AbstractQGramDistance
 	N::Int
 end