diff --git a/README.md b/README.md index 47712a7..9882369 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,15 @@ The package includes distance modifiers: #> 0.855 ``` +## Unicode +To iterate on graphemes rather than characters, use `graphemeiterator`: + +```julia +evaluate(Hamming(), "b\u0300", "a") +#> 2 +evaluate(Hamming(), graphemeiterator("b\u0300"), graphemeiterator("a")) +#> 1 +``` ## References - [The stringdist Package for Approximate String Matching](https://journal.r-project.org/archive/2014-1/loo.pdf) Mark P.J. van der Loo - [fuzzywuzzy blog post](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) diff --git a/src/StringDistances.jl b/src/StringDistances.jl index db482f0..2f54ced 100644 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -32,54 +32,26 @@ TokenSort, TokenSet, TokenMax, graphemeiterator -############################################################################## -## -## Define GraphemeIterator as AbstractString -## -## Argument for AbstractString inheritance: -## (i) prevind, nextind, chr2ind, are defined once start, next, done, isvalid, endof are defined -## (ii) SubString(x::GraphemeIterator, i, j) works -## (ii) I can define functions with AbstractString signature in this package (but I could also just define a union type) -## Argument for non inheritance: -## (i) All existing types <: AbstractString gives char as individual, which is important for print_escaped & search. -## (ii) How to make split return GraphemeIterator rather than strings? How to join multiple GraphemeIterator w/o rewriting join? -## -############################################################################## -# from Base. I redefine it because I want AbstractStringinheritance -immutable GraphemeIterator{S<:AbstractString} <: AbstractString - s::S # original string (for generation of SubStrings) -end -graphemeiterator(s::AbstractString) = GraphemeIterator{typeof(s)}(s) -eltype{S}(::Type{GraphemeIterator{S}}) = SubString{S} -function length(g::GraphemeIterator) - c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this) - n = 0 - for c in g.s - n += isgraphemebreak(c0, c) - c0 = c - end - return n -end -start(g::GraphemeIterator) = start(g.s) -done(g::GraphemeIterator, i::Int) = done(g.s, i) -function next(g::GraphemeIterator, i::Int) - s = g.s - j = i - c0, k = next(s, i) - while !done(s, k) # loop until next grapheme is s[i:j] - c, ℓ = next(s, k) - isgraphemebreak(c0, c) && break - j = k - k = ℓ - c0 = c - end - return (SubString(s, i, j), k) -end -==(g1::GraphemeIterator, g2::GraphemeIterator) = g1.s == g2.s -hash(g::GraphemeIterator, h::UInt) = hash(g.s, h) -isless(g1::GraphemeIterator, g2::GraphemeIterator) = isless(g1.s, g2.s) -show{S}(io::IO, g::GraphemeIterator{S}) = print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"") +############################################################################## +## +## TypeAlias +## +############################################################################## + +typealias GraphemeIterator Base.UTF8proc.GraphemeIterator +typealias AbstractStringorGraphemeIterator Union{AbstractString, Base.UTF8proc.GraphemeIterator} + +############################################################################## +## +## GraphemeIterator iterates on Grapheme +## +############################################################################## +Base.prevind(x::GraphemeIterator, i::Integer) = prevind(x.s, i) +Base.nextind(x::GraphemeIterator, i::Integer) = nextind(x.s, i) +Base.chr2ind(x::GraphemeIterator, i::Integer) = chr2ind(x.s, i) +Base.SubString(x::GraphemeIterator, i::Integer, j::Integer) = graphemeiterator(SubString(x.s, i::Integer, j::Integer)) +graphemeiterator(s::AbstractString) = GraphemeIterator{typeof(s)}(s) # added #these 2 functions allow to define prevind nextind, chr2ind, prevind etc @@ -91,7 +63,7 @@ function Base.isvalid(s::GraphemeIterator, i::Integer) return i0 < start(s.s) || isgraphemebreak(s.s[i0], s.s[i]) end end -function endof(s::GraphemeIterator) +function Base.endof(s::GraphemeIterator) c0 = Char(0x00ad) i = endof(s.s) i0 = start(s.s) @@ -128,7 +100,7 @@ include("modifiers/fuzzywuzzy.jl") ############################################################################## for x in (:evaluate, :compare) @eval begin - function $x(dist::PreMetric, s1::AbstractString, s2::AbstractString) + function $x(dist::PreMetric, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator) len1, len2 = length(s1), length(s2) if len1 > len2 return $x(dist, s2, s1, len2, len1) @@ -139,19 +111,20 @@ for x in (:evaluate, :compare) end end + ############################################################################## ## ## compare ## ############################################################################## -function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString, +function compare(dist::PreMetric, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) 1.0 - evaluate(dist, s1, s2, len1, len2) end function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein}, - s1::AbstractString, s2::AbstractString, + s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) distance = evaluate(dist, s1, s2, len1, len2) len2 == 0 ? 1.0 : 1.0 - distance / len2 @@ -160,14 +133,14 @@ end # compare always return a value between 0 and 1. # When string length < q for qgram distance, returns s1 == s2 function compare(dist::AbstractQGram, - s1::AbstractString, s2::AbstractString, + s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) len1 <= (dist.q - 1) && return convert(Float64, s1 == s2) evaluate(dist, s1, s2, len1, len2) end function compare(dist::QGram, - s1::AbstractString, s2::AbstractString, + s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) len1 <= (dist.q - 1) && return convert(Float64, s1 == s2) distance = evaluate(dist, s1, s2, len1, len2) diff --git a/src/distances/RatcliffObershelp.jl b/src/distances/RatcliffObershelp.jl index ed8644d..c2e107d 100644 --- a/src/distances/RatcliffObershelp.jl +++ b/src/distances/RatcliffObershelp.jl @@ -1,6 +1,6 @@ # Return start of commn substring in s1, start of common substring in s2, and length of substring # Indexes refer to character number, not index (differ for Unicode strings) -function longest_common_substring(s1::AbstractString, s2::AbstractString) +function longest_common_substring(s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator) if length(s1) > length(s2) start2, start1, size= longest_common_substring(s2, s1) else @@ -28,7 +28,7 @@ function longest_common_substring(s1::AbstractString, s2::AbstractString) return start1, start2, size end -function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, start1::Integer, start2::Integer) +function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, start1::Integer, start2::Integer) a = longest_common_substring(s1, s2) if a[3] > 0 push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3])) @@ -43,14 +43,14 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2:: end end -function matching_blocks(s1::AbstractString, s2::AbstractString) +function matching_blocks(s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator) x = Set{Tuple{Int, Int, Int}}() matching_blocks!(x, s1, s2, 1, 1) return x end type RatcliffObershelp <: PreMetric end -function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function evaluate(dist::RatcliffObershelp, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) result = matching_blocks(s1, s2) matched = 0 for x in result diff --git a/src/distances/edit.jl b/src/distances/edit.jl index d30a3e9..fd475fd 100644 --- a/src/distances/edit.jl +++ b/src/distances/edit.jl @@ -4,7 +4,7 @@ ## Assumes length(s1) <= length(s2) ############################################################################## -function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1) +function common_prefix(s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, lim::Integer = -1) start1 = start(s1) start2 = start(s2) l = 0 @@ -24,7 +24,7 @@ end ## ############################################################################## -function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString, len1::Integer, len2:: Integer) +function evaluate(dist::Hamming, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2:: Integer) count = 0 for (ch1, ch2) in zip(s1, s2) count += ch1 != ch2 @@ -42,7 +42,7 @@ end type Levenshtein <: SemiMetric end -function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function evaluate(dist::Levenshtein, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) # prefix common to both strings can be ignored k, start1, start2 = common_prefix(s1, s2) @@ -90,7 +90,7 @@ end type DamerauLevenshtein <: SemiMetric end -function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function evaluate(dist::DamerauLevenshtein, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) # prefix common to both strings can be ignored k, start1, start2 = common_prefix(s1, s2) @@ -158,7 +158,7 @@ end type Jaro <: SemiMetric end -function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function evaluate(dist::Jaro, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) # if len2 == 0, m = 0 so should be 1.0 according to wikipedia. Nope. len2 == 0 && return 0.0 @@ -199,4 +199,4 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Inte return 1.0 - score end -jaro(s1::AbstractString, s2::AbstractString) = evaluate(Jaro(), s1, s2) +jaro(s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator) = evaluate(Jaro(), s1, s2) diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index 83935a5..9fc0aa2 100644 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -4,8 +4,8 @@ ## ############################################################################## -immutable QGramIterator{S <: AbstractString, T <: Integer} - s::S # grapheorstring +immutable QGramIterator{S <: AbstractStringorGraphemeIterator, T <: Integer} + s::S # grapheme l::Int # length of string q::T # length of q-grams end @@ -23,7 +23,8 @@ function Base.done(qgram::QGramIterator, state) istart, idend = state done(qgram.s, idend) end -Base.eltype(qgram::QGramIterator) = SubString{typeof(qgram.s)} +Base.eltype{S <: AbstractString, T}(qgram::QGramIterator{S, T}) = SubString{typeof(qgram.s)} +Base.eltype{S <: GraphemeIterator, T}(qgram::QGramIterator{S, T}) = SubString{typeof(qgram.s.s)} Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0) function Base.collect(qgram::QGramIterator) x = Array(eltype(qgram), length(qgram)) @@ -80,7 +81,7 @@ end ############################################################################## abstract AbstractQGram <: SemiMetric -function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function evaluate(dist::AbstractQGram, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) sort1 = sort(QGramIterator(s1, len1, dist.q)) sort2 = sort(QGramIterator(s2, len2, dist.q)) evaluate(dist, CountInterator(sort1, sort2)) diff --git a/src/modifiers/fuzzywuzzy.jl b/src/modifiers/fuzzywuzzy.jl index 191dbc9..983e38b 100644 --- a/src/modifiers/fuzzywuzzy.jl +++ b/src/modifiers/fuzzywuzzy.jl @@ -9,7 +9,7 @@ type Partial{T <: PreMetric} <: PreMetric end # general -function compare(dist::Partial, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function compare(dist::Partial, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) len1 == len2 && return compare(dist.dist, s1, s2, len1, len2) len1 == 0 && return compare(dist.dist, "", "", 0, 0) iter = QGramIterator(s2, len2, len1) @@ -26,7 +26,7 @@ end # Specialization for RatcliffObershelp distance # Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py -function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function compare(dist::Partial{RatcliffObershelp}, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) len1 == len2 && return compare(dist.dist, s1, s2, len1, len2) out = 0.0 result = matching_blocks(s1, s2) @@ -59,7 +59,7 @@ type TokenSort{T <: PreMetric} <: PreMetric dist::T end -function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString, +function compare(dist::TokenSort, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) if search(s1, Base._default_delims) > 0 s1 = iterator(typeof(s1), join(sort!(split(s1)), " ")) @@ -81,7 +81,7 @@ type TokenSet{T <: PreMetric} <: PreMetric dist::T end -function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString, +function compare(dist::TokenSet, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) v0, v1, v2 = _separate!(split(s1), split(s2)) s0 = iterator(typeof(s1), join(v0, " ")) @@ -129,7 +129,7 @@ type TokenMax{T <: PreMetric} <: PreMetric dist::T end -function compare(dist::TokenMax, s1::AbstractString, s2::AbstractString, +function compare(dist::TokenMax, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) base = compare(dist.dist, s1, s2, len1, len2) unbase_scale = 0.95 diff --git a/src/modifiers/winkler.jl b/src/modifiers/winkler.jl index 08025a0..74b5097 100644 --- a/src/modifiers/winkler.jl +++ b/src/modifiers/winkler.jl @@ -13,7 +13,7 @@ end # restrict to distance between 0 and 1 Winkler(x) = Winkler(x, 0.1, 0.7) -function compare(dist::Winkler, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function compare(dist::Winkler, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer) score = compare(dist.dist, s1, s2, len1, len2) l = common_prefix(s1, s2, 4)[1] # common prefix adjustment