StringDistance is now just a union

pull/23/head
matthieugomez 2020-02-08 11:49:53 -05:00
parent e1b8aa6500
commit 154f1465fd
4 changed files with 23 additions and 27 deletions

View File

@ -4,29 +4,24 @@ using Distances
import Distances: evaluate, result_type import Distances: evaluate, result_type
abstract type StringDistance <: SemiMetric end
include("utils.jl") include("utils.jl")
include("edit.jl") include("edit.jl")
include("qgram.jl") include("qgram.jl")
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance}
include("compare.jl") include("compare.jl")
include("find.jl") include("find.jl")
############################################################################## ##############################################################################
## ##
## Handle missing values ## Distances API
## ##
############################################################################## ##############################################################################
function result_type(dist::StringDistance, s1, s2)
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing typeof(evaluate(dist, "", ""))
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
function result_type(dist::StringDistance, s1::AbstractString, s2::AbstractString)
typeof(evaluate(dist, oneunit(s1), oneunit(s2)))
end end
############################################################################## ##############################################################################
## ##
## Export ## Export

View File

@ -47,7 +47,7 @@ similarity score between two strings, when their original similarity score is a
The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the
length of their common prefix and `score` denotes the original score length of their common prefix and `score` denotes the original score
""" """
struct Winkler{S <: StringDistance} <: StringDistance struct Winkler{S <: SemiMetric} <: SemiMetric
dist::S dist::S
p::Float64 # scaling factor. Default to 0.1 p::Float64 # scaling factor. Default to 0.1
threshold::Float64 # boost threshold. Default to 0.7 threshold::Float64 # boost threshold. Default to 0.7
@ -86,7 +86,7 @@ julia> compare(s1, s2, Partial(RatcliffObershelp()))
0.4516129032258065 0.4516129032258065
``` ```
""" """
struct Partial{S <: StringDistance} <: StringDistance struct Partial{S <: SemiMetric} <: SemiMetric
dist::S dist::S
end end
@ -104,7 +104,7 @@ function compare(s1, s2, dist::Partial; min_score = 0.0)
return out return out
end end
function compare(s1, s2, dist::Partial{RatcliffObershelp}; min_score = 0.0) function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; min_score = 0.0)
s1, s2 = reorder(s1, s2) s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2) len1, len2 = length(s1), length(s2)
len1 == len2 && return compare(s1, s2, dist.dist) len1 == len2 && return compare(s1, s2, dist.dist)
@ -145,8 +145,8 @@ julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
1.0 1.0
``` ```
""" """
struct TokenSort{T <: StringDistance} <: StringDistance struct TokenSort{S <: SemiMetric} <: SemiMetric
dist::T dist::S
end end
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
@ -173,8 +173,8 @@ julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
1.0 1.0
``` ```
""" """
struct TokenSet{T <: StringDistance} <: StringDistance struct TokenSet{S <: SemiMetric} <: SemiMetric
dist::T dist::S
end end
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
@ -212,7 +212,7 @@ julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
0.95 0.95
``` ```
""" """
struct TokenMax{S <: StringDistance} <: StringDistance struct TokenMax{S <: SemiMetric} <: SemiMetric
dist::S dist::S
end end

View File

@ -11,8 +11,7 @@ The Jaro distance is defined as
where ``m`` is the number of matching characters and where ``m`` is the number of matching characters and
``t`` is half the number of transpositions. ``t`` is half the number of transpositions.
""" """
struct Jaro <: StringDistance end struct Jaro <: SemiMetric end
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
## accepts any iterator, including AbstractString ## accepts any iterator, including AbstractString
@ -81,7 +80,7 @@ Creates the Levenshtein metric
The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions,
substitutions of a single character) required to change one string into the other. substitutions of a single character) required to change one string into the other.
""" """
struct Levenshtein <: StringDistance end struct Levenshtein <: Metric end
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
# Return max_dist +1 if distance higher than max_dist # Return max_dist +1 if distance higher than max_dist
@ -139,7 +138,7 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
deletions or substitutions of a single character, or transposition of two adjacent characters) deletions or substitutions of a single character, or transposition of two adjacent characters)
required to change one string into the other. required to change one string into the other.
""" """
struct DamerauLevenshtein <: StringDistance end struct DamerauLevenshtein <: SemiMetric end
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
## accepts any iterator, including AbstractString ## accepts any iterator, including AbstractString
@ -225,7 +224,10 @@ divided by the total number of characters in the two strings. Matching character
in the longest common subsequence plus, recursively, matching characters in the unmatched in the longest common subsequence plus, recursively, matching characters in the unmatched
region on either side of the longest common subsequence. region on either side of the longest common subsequence.
""" """
struct RatcliffObershelp <: StringDistance end struct RatcliffObershelp <: SemiMetric end
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString) function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
n_matched = sum(last.(matching_blocks(s1, s2))) n_matched = sum(last.(matching_blocks(s1, s2)))

View File

@ -17,7 +17,7 @@ Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S} Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
#q-grams of AbstractVector
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s)) function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
state + qgram.q - 1 > lastindex(qgram.s) && return nothing state + qgram.q - 1 > lastindex(qgram.s) && return nothing
view(qgram.s, state:(state + qgram.q - 1)), state + 1 view(qgram.s, state:(state + qgram.q - 1)), state + 1
@ -38,13 +38,12 @@ for x in qgrams("hello", 2)
end end
``` ```
""" """
qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q) qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s, q)
qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q)
qgrams(s, q::Integer) = QGramIterator(collect(s), q) qgrams(s, q::Integer) = QGramIterator(collect(s), q)
abstract type QGramDistance <: StringDistance end abstract type QGramDistance <: SemiMetric end
# For two iterators x1 and x2, that define a length and eltype method, # For two iterators x1 and x2, that define a length and eltype method,
# this returns a dictionary which, for each element in x1 or x2, # this returns a dictionary which, for each element in x1 or x2,