StringDistance is now just a union
parent
e1b8aa6500
commit
154f1465fd
|
@ -4,29 +4,24 @@ using Distances
|
||||||
import Distances: evaluate, result_type
|
import Distances: evaluate, result_type
|
||||||
|
|
||||||
|
|
||||||
abstract type StringDistance <: SemiMetric end
|
|
||||||
include("utils.jl")
|
include("utils.jl")
|
||||||
include("edit.jl")
|
include("edit.jl")
|
||||||
include("qgram.jl")
|
include("qgram.jl")
|
||||||
|
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance}
|
||||||
include("compare.jl")
|
include("compare.jl")
|
||||||
include("find.jl")
|
include("find.jl")
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## Handle missing values
|
## Distances API
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
|
function result_type(dist::StringDistance, s1, s2)
|
||||||
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
|
typeof(evaluate(dist, "", ""))
|
||||||
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
function result_type(dist::StringDistance, s1::AbstractString, s2::AbstractString)
|
|
||||||
typeof(evaluate(dist, oneunit(s1), oneunit(s2)))
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## Export
|
## Export
|
||||||
|
|
|
@ -47,7 +47,7 @@ similarity score between two strings, when their original similarity score is a
|
||||||
The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the
|
The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the
|
||||||
length of their common prefix and `score` denotes the original score
|
length of their common prefix and `score` denotes the original score
|
||||||
"""
|
"""
|
||||||
struct Winkler{S <: StringDistance} <: StringDistance
|
struct Winkler{S <: SemiMetric} <: SemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
p::Float64 # scaling factor. Default to 0.1
|
p::Float64 # scaling factor. Default to 0.1
|
||||||
threshold::Float64 # boost threshold. Default to 0.7
|
threshold::Float64 # boost threshold. Default to 0.7
|
||||||
|
@ -86,7 +86,7 @@ julia> compare(s1, s2, Partial(RatcliffObershelp()))
|
||||||
0.4516129032258065
|
0.4516129032258065
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
struct Partial{S <: StringDistance} <: StringDistance
|
struct Partial{S <: SemiMetric} <: SemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -104,7 +104,7 @@ function compare(s1, s2, dist::Partial; min_score = 0.0)
|
||||||
return out
|
return out
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1, s2, dist::Partial{RatcliffObershelp}; min_score = 0.0)
|
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; min_score = 0.0)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||||
|
@ -145,8 +145,8 @@ julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
|
||||||
1.0
|
1.0
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
struct TokenSort{T <: StringDistance} <: StringDistance
|
struct TokenSort{S <: SemiMetric} <: SemiMetric
|
||||||
dist::T
|
dist::S
|
||||||
end
|
end
|
||||||
|
|
||||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||||
|
@ -173,8 +173,8 @@ julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
|
||||||
1.0
|
1.0
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
struct TokenSet{T <: StringDistance} <: StringDistance
|
struct TokenSet{S <: SemiMetric} <: SemiMetric
|
||||||
dist::T
|
dist::S
|
||||||
end
|
end
|
||||||
|
|
||||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||||
|
@ -212,7 +212,7 @@ julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
|
||||||
0.95
|
0.95
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
struct TokenMax{S <: StringDistance} <: StringDistance
|
struct TokenMax{S <: SemiMetric} <: SemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
12
src/edit.jl
12
src/edit.jl
|
@ -11,8 +11,7 @@ The Jaro distance is defined as
|
||||||
where ``m`` is the number of matching characters and
|
where ``m`` is the number of matching characters and
|
||||||
``t`` is half the number of transpositions.
|
``t`` is half the number of transpositions.
|
||||||
"""
|
"""
|
||||||
struct Jaro <: StringDistance end
|
struct Jaro <: SemiMetric end
|
||||||
|
|
||||||
|
|
||||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||||
## accepts any iterator, including AbstractString
|
## accepts any iterator, including AbstractString
|
||||||
|
@ -81,7 +80,7 @@ Creates the Levenshtein metric
|
||||||
The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, or
|
The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, or
|
||||||
substitutions of a single character) required to change one string into the other.
|
substitutions of a single character) required to change one string into the other.
|
||||||
"""
|
"""
|
||||||
struct Levenshtein <: StringDistance end
|
struct Levenshtein <: Metric end
|
||||||
|
|
||||||
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
||||||
# Return max_dist + 1 if the distance is higher than max_dist
|
# Return max_dist + 1 if the distance is higher than max_dist
|
||||||
|
@ -139,7 +138,7 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
|
||||||
deletions or substitutions of a single character, or transposition of two adjacent characters)
|
deletions or substitutions of a single character, or transposition of two adjacent characters)
|
||||||
required to change one string into the other.
|
required to change one string into the other.
|
||||||
"""
|
"""
|
||||||
struct DamerauLevenshtein <: StringDistance end
|
struct DamerauLevenshtein <: SemiMetric end
|
||||||
|
|
||||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||||
## accepts any iterator, including AbstractString
|
## accepts any iterator, including AbstractString
|
||||||
|
@ -225,7 +224,10 @@ divided by the total number of characters in the two strings. Matching character
|
||||||
in the longest common subsequence plus, recursively, matching characters in the unmatched
|
in the longest common subsequence plus, recursively, matching characters in the unmatched
|
||||||
region on either side of the longest common subsequence.
|
region on either side of the longest common subsequence.
|
||||||
"""
|
"""
|
||||||
struct RatcliffObershelp <: StringDistance end
|
struct RatcliffObershelp <: SemiMetric end
|
||||||
|
|
||||||
|
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
|
||||||
|
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
|
||||||
|
|
||||||
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
|
||||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||||
|
|
|
@ -17,7 +17,7 @@ Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
|
||||||
Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
|
Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
|
||||||
|
|
||||||
|
|
||||||
|
#q-grams of AbstractVector
|
||||||
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
|
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
|
||||||
state + qgram.q - 1 > lastindex(qgram.s) && return nothing
|
state + qgram.q - 1 > lastindex(qgram.s) && return nothing
|
||||||
view(qgram.s, state:(state + qgram.q - 1)), state + 1
|
view(qgram.s, state:(state + qgram.q - 1)), state + 1
|
||||||
|
@ -38,13 +38,12 @@ for x in qgrams("hello", 2)
|
||||||
end
|
end
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)
|
qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s, q)
|
||||||
qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q)
|
|
||||||
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
|
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
abstract type QGramDistance <: StringDistance end
|
abstract type QGramDistance <: SemiMetric end
|
||||||
|
|
||||||
# For two iterators x1 and x2, that define a length and eltype method,
|
# For two iterators x1 and x2, that define a length and eltype method,
|
||||||
# this returns a dictionary which, for each element in x1 or x2,
|
# this returns a dictionary which, for each element in x1 or x2,
|
||||||
|
|
Loading…
Reference in New Issue