add comments

pull/17/head
matthieugomez 2019-08-17 19:45:31 -04:00
parent 57466ae4e0
commit 5c2b690c4f
3 changed files with 114 additions and 13 deletions

View File

@ -34,11 +34,15 @@ end
## Winkler
##
##############################################################################
"""
    Winkler(dist::PreMetric, scaling_factor::Real = 0.1, boosting_threshold::Real = 0.7)

`Winkler` is a `PreMetric` modifier that boosts the similarity score between two strings by a scale `scaling_factor` when the strings share a common prefix (the boost is only applied when the similarity score is at least `boosting_threshold`).
"""
struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
dist::T1
scaling_factor::T2 # scaling factor. Default to 0.1
boosting_limit::T3 # boost threshold. Default to 0.7
boosting_threshold::T3 # boost threshold. Default to 0.7
end
# restrict to distance between 0 and 1
@ -48,7 +52,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
score = compare(s1, s2, dist.dist)
l = common_prefix(s1, s2, 4)[1]
# common prefix adjustment
if score >= dist.boosting_limit
if score >= dist.boosting_threshold
score += l * dist.scaling_factor * (1 - score)
end
return score
@ -60,6 +64,11 @@ end
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
##
##############################################################################
"""
    Partial(dist::PreMetric)

`Partial` is a `PreMetric` modifier that returns the maximal similarity score between the
shorter string and substrings of the longer string.
"""
struct Partial{T <: PreMetric} <: PreMetric
dist::T # distance wrapped by this modifier
end
@ -108,6 +117,11 @@ end
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
##
##############################################################################
"""
    TokenSort(dist::PreMetric)

`TokenSort` is a `PreMetric` modifier that adjusts for differences in word orders by
reordering words alphabetically.
"""
struct TokenSort{T <: PreMetric} <: PreMetric
dist::T # distance wrapped by this modifier
end
@ -124,6 +138,11 @@ end
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
##
##############################################################################
"""
    TokenSet(dist::PreMetric)

`TokenSet` is a `PreMetric` modifier that adjusts for differences in word orders and word
numbers by comparing the intersection of two strings with each string.
"""
struct TokenSet{T <: PreMetric} <: PreMetric
dist::T # distance wrapped by this modifier
end
@ -147,6 +166,11 @@ end
## TokenMax
##
##############################################################################
"""
    TokenMax(dist::PreMetric)

`TokenMax` is a `PreMetric` modifier that combines similarity scores using the base
distance, its `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending
on string lengths.
"""
struct TokenMax{T <: PreMetric} <: PreMetric
dist::T # distance wrapped by this modifier
end

View File

@ -4,7 +4,6 @@
## Hamming
##
##############################################################################
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
current = abs(length(s2) - length(s1))
for (ch1, ch2) in zip(s1, s2)
@ -19,7 +18,13 @@ end
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
##
##############################################################################
"""
    Levenshtein()

Creates the Levenshtein metric.

The Levenshtein distance is the minimum number of operations (consisting of insertions,
deletions, substitutions of a single character) required to change one string into the
other.
"""
struct Levenshtein <: SemiMetric end
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
@ -59,10 +64,16 @@ end
##############################################################################
##
## Damerau Levenshtein
## Source: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
##
##############################################################################
"""
    DamerauLevenshtein()

Creates the DamerauLevenshtein metric.

The DamerauLevenshtein distance is the minimum number of operations (consisting of
insertions, deletions or substitutions of a single character, or transposition of two
adjacent characters) required to change one string into the other.
"""
struct DamerauLevenshtein <: SemiMetric end
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
@ -127,7 +138,19 @@ end
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
##
##############################################################################
"""
    Jaro()

Creates the Jaro metric.

The Jaro distance is defined as

``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``

where ``m`` is the number of matching characters and
``t`` is half the number of transpositions.
"""
struct Jaro <: SemiMetric end
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
@ -189,7 +212,13 @@ end
## Ratcliff/Obershelp
##
##############################################################################
# NOTE(review): the Ratcliff/Obershelp algorithm is usually described in terms of the
# longest common *substring* (a contiguous run), not a subsequence — confirm the wording
# below against the `evaluate` implementation, which is not visible from here.
"""
    RatcliffObershelp()

Creates the RatcliffObershelp metric.

The distance between two strings is defined as one minus the number of matching characters
divided by the total number of characters in the two strings. Matching characters are those
in the longest common subsequence plus, recursively, matching characters in the unmatched
region on either side of the longest common subsequence.
"""
struct RatcliffObershelp <: PreMetric end
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)

View File

@ -4,7 +4,7 @@
## Define a type that iterates through q-grams of a string
##
##############################################################################
# N is the number of characters in the QGram
# N is the number of characters for the QGram
struct QGramIterator{S <: AbstractString}
s::S # grapheme
l::Int # length of string
@ -92,12 +92,24 @@ end
##############################################################################
##
## q-gram
## Define v(s) a vector on the space of q-uple which contains number of times it appears in s
## For instance v("leila")["il"] =1
## q-gram is ∑ |v(s1, p) - v(s2, p)|
##
##############################################################################
"""
For an AbstractString s, denote v(s) the vector on the space of q-grams of length N, that contains the number of times a q-gram appears in s
The q-gram distance is ||v(s1) - v(s2)||
"""
"""
    QGram(n::Int)

Creates a QGram metric.

The distance corresponds to

``||v(s1, n) - v(s2, n)||``

where ``v(s, n)`` denotes the vector on the space of q-grams of length n, that contains the
number of times a q-gram appears for the string s, and ``||.||`` is the sum of the absolute
values of the coordinates (i.e. the distance is ``∑ |v(s1, n) - v(s2, n)|``).
"""
struct QGram <: AbstractQGramDistance
N::Int # length of the q-grams
end
@ -114,9 +126,19 @@ end
##
## cosine
##
## 1 - v(s1, p).v(s2, p) / (||v(s1, p)|| * ||v(s2, p)||)
##
##############################################################################
"""
    Cosine(n::Int)

Creates a Cosine metric.

The distance corresponds to

``1 - v(s1, n).v(s2, n) / (||v(s1, n)|| * ||v(s2, n)||)``

where ``v(s, n)`` denotes the vector on the space of q-grams of length n, that contains the
number of times a q-gram appears for the string s.
"""
struct Cosine <: AbstractQGramDistance
N::Int # length of the q-grams
end
@ -135,11 +157,18 @@ end
##
## Jaccard
##
## Denote Q(s, q) the set of tuple of length q in s
## 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))|
##
##############################################################################
"""
    Jaccard(n::Int)

Creates a Jaccard metric.

The distance corresponds to

``1 - |Q(s1, n) ∩ Q(s2, n)| / |Q(s1, n) ∪ Q(s2, n)|``

where ``Q(s, n)`` denotes the set of q-grams of length n for the string s.
"""
struct Jaccard <: AbstractQGramDistance
N::Int # length of the q-grams
end
@ -158,9 +187,18 @@ end
##
## SorensenDice
##
## 1 - 2 * |intersect(Q(s1, q), Q(s2, q))| / (|Q(s1, q)| + |Q(s2, q)|)
##############################################################################
"""
    SorensenDice(n::Int)

Creates a SorensenDice metric.

The distance corresponds to

``1 - 2 * |Q(s1, n) ∩ Q(s2, n)| / (|Q(s1, n)| + |Q(s2, n)|)``

where ``Q(s, n)`` denotes the set of q-grams of length n for the string s.
"""
struct SorensenDice <: AbstractQGramDistance
N::Int # length of the q-grams
end
@ -181,7 +219,17 @@ end
##
## 1 - |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q)|)
##############################################################################
"""
    Overlap(n::Int)

Creates an Overlap metric.

The distance corresponds to

``1 - |Q(s1, n) ∩ Q(s2, n)| / min(|Q(s1, n)|, |Q(s2, n)|)``

where ``Q(s, n)`` denotes the set of q-grams of length n for the string s.
"""
struct Overlap <: AbstractQGramDistance
N::Int # length of the q-grams
end