StringDistance is now just a union

pull/23/head
matthieugomez 2020-02-08 11:49:53 -05:00
parent e1b8aa6500
commit 154f1465fd
4 changed files with 23 additions and 27 deletions

View File

@ -4,29 +4,24 @@ using Distances
import Distances: evaluate, result_type
abstract type StringDistance <: SemiMetric end
include("utils.jl")
include("edit.jl")
include("qgram.jl")
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance}
include("compare.jl")
include("find.jl")
##############################################################################
##
## Handle missing values
## Distances API
##
##############################################################################
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
function result_type(dist::StringDistance, s1::AbstractString, s2::AbstractString)
typeof(evaluate(dist, oneunit(s1), oneunit(s2)))
function result_type(dist::StringDistance, s1, s2)
typeof(evaluate(dist, "", ""))
end
##############################################################################
##
## Export

View File

@ -47,7 +47,7 @@ similarity score between two strings, when their original similarity score is a
The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the
length of their common prefix and `score` denotes the original score.
"""
struct Winkler{S <: StringDistance} <: StringDistance
struct Winkler{S <: SemiMetric} <: SemiMetric
dist::S
p::Float64 # scaling factor. Default to 0.1
threshold::Float64 # boost threshold. Default to 0.7
@ -86,7 +86,7 @@ julia> compare(s1, s2, Partial(RatcliffObershelp()))
0.4516129032258065
```
"""
struct Partial{S <: StringDistance} <: StringDistance
struct Partial{S <: SemiMetric} <: SemiMetric
dist::S
end
@ -104,7 +104,7 @@ function compare(s1, s2, dist::Partial; min_score = 0.0)
return out
end
function compare(s1, s2, dist::Partial{RatcliffObershelp}; min_score = 0.0)
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; min_score = 0.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return compare(s1, s2, dist.dist)
@ -145,8 +145,8 @@ julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
1.0
```
"""
struct TokenSort{T <: StringDistance} <: StringDistance
dist::T
struct TokenSort{S <: SemiMetric} <: SemiMetric
dist::S
end
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
@ -173,8 +173,8 @@ julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
1.0
```
"""
struct TokenSet{T <: StringDistance} <: StringDistance
dist::T
struct TokenSet{S <: SemiMetric} <: SemiMetric
dist::S
end
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
@ -212,7 +212,7 @@ julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
0.95
```
"""
struct TokenMax{S <: StringDistance} <: StringDistance
struct TokenMax{S <: SemiMetric} <: SemiMetric
dist::S
end

View File

@ -11,8 +11,7 @@ The Jaro distance is defined as
where ``m`` is the number of matching characters and
``t`` is half the number of transpositions.
"""
struct Jaro <: StringDistance end
struct Jaro <: SemiMetric end
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
## accepts any iterator, including AbstractString
@ -81,7 +80,7 @@ Creates the Levenshtein metric
The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions,
substitutions of a single character) required to change one string into the other.
"""
struct Levenshtein <: StringDistance end
struct Levenshtein <: Metric end
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
# Return max_dist +1 if distance higher than max_dist
@ -139,7 +138,7 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
deletions or substitutions of a single character, or transposition of two adjacent characters)
required to change one string into the other.
"""
struct DamerauLevenshtein <: StringDistance end
struct DamerauLevenshtein <: SemiMetric end
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
## accepts any iterator, including AbstractString
@ -225,7 +224,10 @@ divided by the total number of characters in the two strings. Matching character
in the longest common subsequence plus, recursively, matching characters in the unmatched
region on either side of the longest common subsequence.
"""
struct RatcliffObershelp <: StringDistance end
struct RatcliffObershelp <: SemiMetric end
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
n_matched = sum(last.(matching_blocks(s1, s2)))

View File

@ -17,7 +17,7 @@ Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
#q-grams of AbstractVector
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
state + qgram.q - 1 > lastindex(qgram.s) && return nothing
view(qgram.s, state:(state + qgram.q - 1)), state + 1
@ -38,13 +38,12 @@ for x in qgrams("hello", 2)
end
```
"""
qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)
qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q)
qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s, q)
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
abstract type QGramDistance <: StringDistance end
abstract type QGramDistance <: SemiMetric end
# For two iterators x1 and x2, that define a length and eltype method,
# this returns a dictionary which, for each element in x1 or x2,