From 154f1465fd5e7a472a0222a4446ef00ece294b89 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Sat, 8 Feb 2020 11:49:53 -0500 Subject: [PATCH] StringDistance is now just an union --- src/StringDistances.jl | 15 +++++---------- src/compare.jl | 16 ++++++++-------- src/edit.jl | 12 +++++++----- src/qgram.jl | 7 +++---- 4 files changed, 23 insertions(+), 27 deletions(-) diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 7939d30..d8efcdf 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -4,29 +4,24 @@ using Distances import Distances: evaluate, result_type -abstract type StringDistance <: SemiMetric end include("utils.jl") include("edit.jl") include("qgram.jl") +const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance} include("compare.jl") include("find.jl") ############################################################################## ## -## Handle missing values +## Distances API ## ############################################################################## - -evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing -evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing - - - -function result_type(dist::StringDistance, s1::AbstractString, s2::AbstractString) - typeof(evaluate(dist, oneunit(s1), oneunit(s2))) +function result_type(dist::StringDistance, s1, s2) + typeof(evaluate(dist, "", "")) end + ############################################################################## ## ## Export diff --git a/src/compare.jl b/src/compare.jl index 56f7da4..ac3e8c4 100755 --- a/src/compare.jl +++ b/src/compare.jl @@ -47,7 +47,7 @@ similarity score between two strings, when their original similarity score is a The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the length of their common prefix and `score` denotes the original score """ -struct Winkler{S <: StringDistance} <: StringDistance +struct Winkler{S <: SemiMetric} <: SemiMetric dist::S p::Float64 # scaling factor. Default to 0.1 threshold::Float64 # boost threshold. Default to 0.7 @@ -86,7 +86,7 @@ julia> compare(s1, s2, Partial(RatcliffObershelp())) 0.4516129032258065 ``` """ -struct Partial{S <: StringDistance} <: StringDistance +struct Partial{S <: SemiMetric} <: SemiMetric dist::S end @@ -104,7 +104,7 @@ function compare(s1, s2, dist::Partial; min_score = 0.0) return out end -function compare(s1, s2, dist::Partial{RatcliffObershelp}; min_score = 0.0) +function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; min_score = 0.0) s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len1 == len2 && return compare(s1, s2, dist.dist) @@ -145,8 +145,8 @@ julia> compare(s1, s2, TokenSort(RatcliffObershelp())) 1.0 ``` """ -struct TokenSort{T <: StringDistance} <: StringDistance - dist::T +struct TokenSort{S <: SemiMetric} <: SemiMetric + dist::S end # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ @@ -173,8 +173,8 @@ julia> compare(s1, s2, TokenSet(RatcliffObershelp())) 1.0 ``` """ -struct TokenSet{T <: StringDistance} <: StringDistance - dist::T +struct TokenSet{S <: SemiMetric} <: SemiMetric + dist::S end # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/ @@ -212,7 +212,7 @@ julia> compare(s1, s2, TokenMax(RatcliffObershelp())) 0.95 ``` """ -struct TokenMax{S <: StringDistance} <: StringDistance +struct TokenMax{S <: SemiMetric} <: SemiMetric dist::S end diff --git a/src/edit.jl b/src/edit.jl index 116af1e..9f7aea6 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -11,8 +11,7 @@ The Jaro distance is defined as where ``m`` is the number of matching characters and ``t`` is half the number of transpositions. """ -struct Jaro <: StringDistance end - +struct Jaro <: SemiMetric end ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html ## accepts any iterator, including AbstractString @@ -81,7 +80,7 @@ Creates the Levenshtein metric The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, substitutions of a single character) required to change one string into the other. """ -struct Levenshtein <: StringDistance end +struct Levenshtein <: Metric end ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html # Return max_dist +1 if distance higher than max_dist @@ -139,7 +138,7 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting deletions or substitutions of a single character, or transposition of two adjacent characters) required to change one string into the other. """ -struct DamerauLevenshtein <: StringDistance end +struct DamerauLevenshtein <: SemiMetric end ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html ## accepts any iterator, including AbstractString @@ -225,7 +224,10 @@ divided by the total number of characters in the two strings. Matching character in the longest common subsequence plus, recursively, matching characters in the unmatched region on either side of the longest common subsequence. """ -struct RatcliffObershelp <: StringDistance end +struct RatcliffObershelp <: SemiMetric end + +evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing +evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString) n_matched = sum(last.(matching_blocks(s1, s2))) diff --git a/src/qgram.jl b/src/qgram.jl index 7c1e659..9f8bfec 100755 --- a/src/qgram.jl +++ b/src/qgram.jl @@ -17,7 +17,7 @@ Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S} Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S} - +#q-grams of AbstractVector function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s)) state + qgram.q - 1 > lastindex(qgram.s) && return nothing view(qgram.s, state:(state + qgram.q - 1)), state + 1 @@ -38,13 +38,12 @@ for x in qgrams("hello", 2) end ``` """ -qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q) -qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q) +qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s, q) qgrams(s, q::Integer) = QGramIterator(collect(s), q) -abstract type QGramDistance <: StringDistance end +abstract type QGramDistance <: SemiMetric end # For two iterators x1 and x2, that define a length and eltype method, # this returns a dictionary which, for each element in x1 or x2,