StringDistance is now just a union

2020-02-08 11:49:53 -05:00 · 2020-02-08 11:49:53 -05:00 · 154f1465fd
parent e1b8aa6500
commit 154f1465fd
4 changed files with 23 additions and 27 deletions
--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -4,29 +4,24 @@ using Distances
 import Distances: evaluate, result_type


-abstract type StringDistance <: SemiMetric end
 include("utils.jl")
 include("edit.jl")
 include("qgram.jl")
+const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance}
 include("compare.jl")
 include("find.jl")

 ##############################################################################
 ##
-## Handle missing values
+## Distances API
 ##
 ##############################################################################

-
-evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
-evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
-
-
-
-function result_type(dist::StringDistance, s1::AbstractString, s2::AbstractString)
-    typeof(evaluate(dist, oneunit(s1), oneunit(s2)))
+function result_type(dist::StringDistance, s1, s2)
+    typeof(evaluate(dist, "", ""))
 end

+
 ##############################################################################
 ##
 ## Export
--- a/src/compare.jl
+++ b/src/compare.jl
@ -47,7 +47,7 @@ similarity score between  two strings, when their original similarity score is a
 The boost is equal to `min(l,  maxlength) * p * (1 - score)` where `l` denotes the 
 length of their common prefix and `score` denotes the original score
 """
-struct Winkler{S <: StringDistance} <: StringDistance
+struct Winkler{S <: SemiMetric} <: SemiMetric
    dist::S
    p::Float64          # scaling factor. Default to 0.1
    threshold::Float64  # boost threshold. Default to 0.7
@ -86,7 +86,7 @@ julia> compare(s1, s2, Partial(RatcliffObershelp()))
 0.4516129032258065
 ```
 """
-struct Partial{S <: StringDistance} <: StringDistance
+struct Partial{S <: SemiMetric} <: SemiMetric
    dist::S
 end

@ -104,7 +104,7 @@ function compare(s1, s2, dist::Partial; min_score = 0.0)
    return out
 end

-function compare(s1, s2, dist::Partial{RatcliffObershelp}; min_score = 0.0)
+function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; min_score = 0.0)
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    len1 == len2 && return compare(s1, s2, dist.dist)
@ -145,8 +145,8 @@ julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
 1.0
 ```
 """
-struct TokenSort{T <: StringDistance} <: StringDistance
-    dist::T
+struct TokenSort{S <: SemiMetric} <: SemiMetric
+    dist::S
 end

 # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
@ -173,8 +173,8 @@ julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
 1.0
 ```
 """
-struct TokenSet{T <: StringDistance} <: StringDistance
-    dist::T
+struct TokenSet{S <: SemiMetric} <: SemiMetric
+    dist::S
 end

 # http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
@ -212,7 +212,7 @@ julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
 0.95
 ```
 """
-struct TokenMax{S <: StringDistance} <: StringDistance
+struct TokenMax{S <: SemiMetric} <: SemiMetric
    dist::S
 end

--- a/src/edit.jl
+++ b/src/edit.jl
@ -11,8 +11,7 @@ The Jaro distance is defined as
 where ``m`` is the number of matching characters and 
 ``t`` is half the number of transpositions.
 """
-struct Jaro <: StringDistance end
-
+struct Jaro <: SemiMetric end

 ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
 ## accepts any iterator, including AbstractString
@ -81,7 +80,7 @@ Creates the Levenshtein metric
 The Levenshtein distance is the minimum number of operations (consisting of insertions, deletions, 
 substitutions of a single character) required to change one string into the other.
 """
-struct Levenshtein  <: StringDistance end
+struct Levenshtein <: Metric end

 ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
 # Return max_dist +1 if distance higher than max_dist
@ -139,7 +138,7 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
 deletions or substitutions of a single character, or transposition of two adjacent characters) 
 required to change one string into the other.
 """
-struct DamerauLevenshtein <: StringDistance end
+struct DamerauLevenshtein <: SemiMetric end

 ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
 ## accepts any iterator, including AbstractString
@ -225,7 +224,10 @@ divided by the total number of characters in the two strings. Matching character
 in the longest common subsequence plus, recursively, matching characters in the unmatched 
 region on either side of the longest common subsequence.
 """
-struct RatcliffObershelp <: StringDistance end
+struct RatcliffObershelp <: SemiMetric end
+
+evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
+evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing

 function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
    n_matched = sum(last.(matching_blocks(s1, s2)))
--- a/src/qgram.jl
+++ b/src/qgram.jl
@ -17,7 +17,7 @@ Base.eltype(qgram::QGramIterator{SubString{S}}) where {S} = SubString{S}
 Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}


-
+#q-grams of AbstractVector
 function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
 	state + qgram.q - 1 > lastindex(qgram.s) && return nothing
 	view(qgram.s, state:(state + qgram.q - 1)), state + 1
@ -38,13 +38,12 @@ for x in qgrams("hello", 2)
 end
 ```
 """
-qgrams(s::AbstractString, q::Integer) = QGramIterator(s, q)
-qgrams(s::AbstractVector, q::Integer) = QGramIterator(s, q)
+qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s, q)
 qgrams(s, q::Integer) = QGramIterator(collect(s), q)



-abstract type QGramDistance <: StringDistance end
+abstract type QGramDistance <: SemiMetric end

 # For two iterators x1 and x2, that define a length and eltype method,
 # this returns a dictionary which, for each element in x1 or x2,