StringDistance is now just a union
parent
e1b8aa6500
commit
154f1465fd
|
@ -4,29 +4,24 @@ using Distances
|
|||
import Distances: evaluate, result_type
|
||||
|
||||
|
||||
abstract type StringDistance <: SemiMetric end
|
||||
include("utils.jl")
|
||||
include("edit.jl")
|
||||
include("qgram.jl")
|
||||
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance}
|
||||
include("compare.jl")
|
||||
include("find.jl")
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Handle missing values
|
||||
## Distances API
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
|
||||
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
|
||||
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
|
||||
|
||||
|
||||
|
||||
function result_type(dist::StringDistance, s1::AbstractString, s2::AbstractString)
|
||||
typeof(evaluate(dist, oneunit(s1), oneunit(s2)))
|
||||
function result_type(dist::StringDistance, s1, s2)
|
||||
typeof(evaluate(dist, "", ""))
|
||||
end
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Export
|
||||
|
|
|
@ -47,7 +47,7 @@ similarity score between two strings, when their original similarity score is a
|
|||
The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the
|
||||
length of their common prefix and `score` denotes the original score
|
||||
"""
|
||||
struct Winkler{S <: StringDistance} <: StringDistance
|
||||
struct Winkler{S <: SemiMetric} <: SemiMetric
|
||||
dist::S
|
||||
p::Float64 # scaling factor. Default to 0.1
|
||||
threshold::Float64 # boost threshold. Default to 0.7
|
||||
|
@ -86,7 +86,7 @@ julia> compare(s1, s2, Partial(RatcliffObershelp()))
|
|||
0.4516129032258065
|
||||
```
|
||||
"""
|
||||
struct Partial{S <: StringDistance} <: StringDistance
|
||||
struct Partial{S <: SemiMetric} <: SemiMetric
|
||||
dist::S
|
||||
end
|
||||
|
||||
|
@ -104,7 +104,7 @@ function compare(s1, s2, dist::Partial; min_score = 0.0)
|
|||
return out
|
||||
end
|
||||
|
||||
function compare(s1, s2, dist::Partial{RatcliffObershelp}; min_score = 0.0)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; min_score = 0.0)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||
|
@ -145,8 +145,8 @@ julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
|
|||
1.0
|
||||
```
|
||||
"""
|
||||
struct TokenSort{T <: StringDistance} <: StringDistance
|
||||
dist::T
|
||||
struct TokenSort{S <: SemiMetric} <: SemiMetric
|
||||
dist::S
|
||||
end
|
||||
|
||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
|
@ -173,8 +173,8 @@ julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
|
|||
1.0
|
||||
```
|
||||
"""
|
||||
struct TokenSet{T <: StringDistance} <: StringDistance
|
||||
dist::T
|
||||
struct TokenSet{S <: SemiMetric} <: SemiMetric
|
||||
dist::S
|
||||
end
|
||||
|
||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
|
@ -212,7 +212,7 @@ julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
|
|||
0.95
|
||||
```
|
||||
"""
|
||||
struct TokenMax{S <: StringDistance} <: StringDistance
|
||||
struct TokenMax{S <: SemiMetric} <: SemiMetric
|
||||
dist::S
|
||||
end
|
||||
|
||||
|
|
12
src/edit.jl
12
src/edit.jl
|
@ -11,8 +11,7 @@ The Jaro distance is defined as
|
|||
where ``m`` is the number of matching characters and
|
||||
``t`` is half the number of transpositions.
|
||||
"""
|
||||
struct Jaro <: StringDistance end
|
||||
|
||||
struct Jaro <: SemiMetric end
|
||||
|
||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||
## accepts any iterator, including AbstractString
|
||||
|
@ -81,7 +80,7 @@ Creates the Levenshtein metric
|
|||
The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions,
|
||||
substitutions of a single character) required to change one string into the other.
|
||||
"""
|
||||
struct Levenshtein <: StringDistance end
|
||||
struct Levenshtein <: Metric end
|
||||
|
||||
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
||||
# Return max_dist +1 if distance higher than max_dist
|
||||
|
@ -139,7 +138,7 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
|
|||
deletions or substitutions of a single character, or transposition of two adjacent characters)
|
||||
required to change one string into the other.
|
||||
"""
|
||||
struct DamerauLevenshtein <: StringDistance end
|
||||
struct DamerauLevenshtein <: SemiMetric end
|
||||
|
||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
## accepts any iterator, including AbstractString
|
||||
|
@ -225,7 +224,10 @@ divided by the total number of characters in the two strings. Matching character
|
|||
in the longest common subsequence plus, recursively, matching characters in the unmatched
|
||||
region on either side of the longest common subsequence.
|
||||
"""
|
||||
struct RatcliffObershelp <: StringDistance end
|
||||
struct RatcliffObershelp <: SemiMetric end
|
||||
|
||||
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
|
||||
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
|
||||
|
||||
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
|
||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||
|
|
|
@ -17,7 +17,7 @@ Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
|
|||
Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
|
||||
|
||||
|
||||
|
||||
#q-grams of AbstractVector
|
||||
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
|
||||
state + qgram.q - 1 > lastindex(qgram.s) && return nothing
|
||||
view(qgram.s, state:(state + qgram.q - 1)), state + 1
|
||||
|
@ -38,13 +38,12 @@ for x in qgrams("hello", 2)
|
|||
end
|
||||
```
|
||||
"""
|
||||
qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)
|
||||
qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q)
|
||||
qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s, q)
|
||||
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
|
||||
|
||||
|
||||
|
||||
abstract type QGramDistance <: StringDistance end
|
||||
abstract type QGramDistance <: SemiMetric end
|
||||
|
||||
# For two iterators x1 and x2, that define a length and eltype method,
|
||||
# this returns a dictionary which, for each element in x1 or x2,
|
||||
|
|
Loading…
Reference in New Issue