remove grapheme <: AbstractString

don’t
pull/3/head
matthieugomez 2016-06-28 10:52:42 -04:00
parent 0f6c814b26
commit bed352eef7
7 changed files with 56 additions and 73 deletions

View File

@ -96,6 +96,15 @@ The package includes distance modifiers:
#> 0.855
```
## Unicode
To iterate on graphemes rather than characters, use `graphemeiterator`:
```julia
evaluate(Hamming(), "b\u0300", "a")
#> 2
evaluate(Hamming(), graphemeiterator("b\u0300"), graphemeiterator("a"))
#> 1
```
## References
- [The stringdist Package for Approximate String Matching](https://journal.r-project.org/archive/2014-1/loo.pdf) Mark P.J. van der Loo
- [fuzzywuzzy blog post](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)

View File

@ -32,54 +32,26 @@ TokenSort,
TokenSet,
TokenMax,
graphemeiterator
##############################################################################
##
## Define GraphemeIterator as AbstractString
##
## Arguments for AbstractString inheritance:
## (i) prevind, nextind, chr2ind are defined once start, next, done, isvalid, endof are defined
## (ii) SubString(x::GraphemeIterator, i, j) works
## (iii) I can define functions with AbstractString signature in this package (but I could also just define a union type)
## Arguments against inheritance:
## (i) All existing types <: AbstractString yield a Char as the individual element, which is important for print_escaped & search.
## (ii) How to make split return GraphemeIterator rather than strings? How to join multiple GraphemeIterator w/o rewriting join?
##
##############################################################################
# from Base. I redefine it because I want AbstractString inheritance
# Immutable wrapper around a string `s`, declared as a subtype of AbstractString.
# The type parameter `S` records the concrete type of the wrapped string.
immutable GraphemeIterator{S<:AbstractString} <: AbstractString
s::S # original string (for generation of SubStrings)
end
# Wrap `s` in a GraphemeIterator parameterized by the concrete type of `s`.
graphemeiterator(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
# Elements produced by a GraphemeIterator{S} are declared to be SubString{S}.
eltype{S}(::Type{GraphemeIterator{S}}) = SubString{S}
# Count the elements of `g` by walking the characters of the wrapped string
# and adding one each time `isgraphemebreak` reports a break between the
# previous character and the current one.
function length(g::GraphemeIterator)
    # Seed with the soft hyphen (grapheme break always allowed after this),
    # so the first character of the string is checked against it.
    previous = Char(0x00ad)
    total = 0
    for current in g.s
        if isgraphemebreak(previous, current)
            total += 1
        end
        previous = current
    end
    return total
end
# The iteration state is an index into the wrapped string `g.s`, so both the
# initial state and the termination test are forwarded to that string.
start(g::GraphemeIterator) = start(g.s)
done(g::GraphemeIterator, i::Int) = done(g.s, i)
# Return `(grapheme, state)` where `grapheme` is the SubString of the wrapped
# string that starts at index `i` and runs up to (but not including) the next
# grapheme break, and `state` is the index at which the following element
# starts.
#
# The second value returned by `next(s, k)` must be kept and assigned to `k`
# on every pass; without it the scan index never advances to the next
# character index.
function next(g::GraphemeIterator, i::Int)
    s = g.s
    j = i                        # index of the last character accepted so far
    c0, k = next(s, i)           # first character and the index after it
    while !done(s, k) # loop until next grapheme is s[i:j]
        c, knext = next(s, k)
        isgraphemebreak(c0, c) && break
        j = k                    # character at k belongs to the current element
        k = knext                # advance the scan to the following character
        c0 = c
    end
    return (SubString(s, i, j), k)
end
# Equality, hashing and ordering are all forwarded to the wrapped strings, so
# `==` and `hash` are computed from the same underlying value.
==(g1::GraphemeIterator, g2::GraphemeIterator) = g1.s == g2.s
hash(g::GraphemeIterator, h::UInt) = hash(g.s, h)
isless(g1::GraphemeIterator, g2::GraphemeIterator) = isless(g1.s, g2.s)
# Print a one-line summary: element count, wrapped string type, and the string itself.
show{S}(io::IO, g::GraphemeIterator{S}) = print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
##############################################################################
##
## TypeAlias
##
##############################################################################
# Short local name for the grapheme iterator type that lives in Base.UTF8proc.
typealias GraphemeIterator Base.UTF8proc.GraphemeIterator
# Union used in method signatures that accept either a plain string or a
# grapheme iterator.
typealias AbstractStringorGraphemeIterator Union{AbstractString, Base.UTF8proc.GraphemeIterator}
##############################################################################
##
## GraphemeIterator iterates on Grapheme
##
##############################################################################
# Index navigation on a GraphemeIterator is forwarded to the wrapped string `x.s`.
Base.prevind(x::GraphemeIterator, i::Integer) = prevind(x.s, i)
Base.nextind(x::GraphemeIterator, i::Integer) = nextind(x.s, i)
Base.chr2ind(x::GraphemeIterator, i::Integer) = chr2ind(x.s, i)
# Taking a SubString slices the wrapped string and wraps the result again with
# `graphemeiterator`.
Base.SubString(x::GraphemeIterator, i::Integer, j::Integer) = graphemeiterator(SubString(x.s, i::Integer, j::Integer))
# Wrap `s` in a GraphemeIterator parameterized by the concrete type of `s`.
graphemeiterator(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
# added
# these 2 functions allow prevind, nextind, chr2ind, etc. to be defined
@ -91,7 +63,7 @@ function Base.isvalid(s::GraphemeIterator, i::Integer)
return i0 < start(s.s) || isgraphemebreak(s.s[i0], s.s[i])
end
end
function endof(s::GraphemeIterator)
function Base.endof(s::GraphemeIterator)
c0 = Char(0x00ad)
i = endof(s.s)
i0 = start(s.s)
@ -128,7 +100,7 @@ include("modifiers/fuzzywuzzy.jl")
##############################################################################
for x in (:evaluate, :compare)
@eval begin
function $x(dist::PreMetric, s1::AbstractString, s2::AbstractString)
function $x(dist::PreMetric, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator)
len1, len2 = length(s1), length(s2)
if len1 > len2
return $x(dist, s2, s1, len2, len1)
@ -139,19 +111,20 @@ for x in (:evaluate, :compare)
end
end
##############################################################################
##
## compare
##
##############################################################################
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
function compare(dist::PreMetric, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
len1::Integer, len2::Integer)
1.0 - evaluate(dist, s1, s2, len1, len2)
end
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
s1::AbstractString, s2::AbstractString,
s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
len1::Integer, len2::Integer)
distance = evaluate(dist, s1, s2, len1, len2)
len2 == 0 ? 1.0 : 1.0 - distance / len2
@ -160,14 +133,14 @@ end
# compare always return a value between 0 and 1.
# When string length < q for qgram distance, returns s1 == s2
function compare(dist::AbstractQGram,
s1::AbstractString, s2::AbstractString,
s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
evaluate(dist, s1, s2, len1, len2)
end
function compare(dist::QGram,
s1::AbstractString, s2::AbstractString,
s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
distance = evaluate(dist, s1, s2, len1, len2)

View File

@ -1,6 +1,6 @@
# Return start of common substring in s1, start of common substring in s2, and length of substring
# Indexes refer to character number, not index (differ for Unicode strings)
function longest_common_substring(s1::AbstractString, s2::AbstractString)
function longest_common_substring(s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator)
if length(s1) > length(s2)
start2, start1, size= longest_common_substring(s2, s1)
else
@ -28,7 +28,7 @@ function longest_common_substring(s1::AbstractString, s2::AbstractString)
return start1, start2, size
end
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, start1::Integer, start2::Integer)
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, start1::Integer, start2::Integer)
a = longest_common_substring(s1, s2)
if a[3] > 0
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
@ -43,14 +43,14 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
end
end
function matching_blocks(s1::AbstractString, s2::AbstractString)
function matching_blocks(s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator)
x = Set{Tuple{Int, Int, Int}}()
matching_blocks!(x, s1, s2, 1, 1)
return x
end
type RatcliffObershelp <: PreMetric end
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function evaluate(dist::RatcliffObershelp, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
result = matching_blocks(s1, s2)
matched = 0
for x in result

View File

@ -4,7 +4,7 @@
## Assumes length(s1) <= length(s2)
##############################################################################
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
function common_prefix(s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, lim::Integer = -1)
start1 = start(s1)
start2 = start(s2)
l = 0
@ -24,7 +24,7 @@ end
##
##############################################################################
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString, len1::Integer, len2:: Integer)
function evaluate(dist::Hamming, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2:: Integer)
count = 0
for (ch1, ch2) in zip(s1, s2)
count += ch1 != ch2
@ -42,7 +42,7 @@ end
type Levenshtein <: SemiMetric end
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function evaluate(dist::Levenshtein, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
@ -90,7 +90,7 @@ end
type DamerauLevenshtein <: SemiMetric end
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function evaluate(dist::DamerauLevenshtein, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
@ -158,7 +158,7 @@ end
type Jaro <: SemiMetric end
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function evaluate(dist::Jaro, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
# if len2 == 0, m = 0 so should be 1.0 according to wikipedia. Nope.
len2 == 0 && return 0.0
@ -199,4 +199,4 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Inte
return 1.0 - score
end
jaro(s1::AbstractString, s2::AbstractString) = evaluate(Jaro(), s1, s2)
jaro(s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator) = evaluate(Jaro(), s1, s2)

View File

@ -4,8 +4,8 @@
##
##############################################################################
immutable QGramIterator{S <: AbstractString, T <: Integer}
s::S # grapheorstring
immutable QGramIterator{S <: AbstractStringorGraphemeIterator, T <: Integer}
s::S # grapheme
l::Int # length of string
q::T # length of q-grams
end
@ -23,7 +23,8 @@ function Base.done(qgram::QGramIterator, state)
istart, idend = state
done(qgram.s, idend)
end
Base.eltype(qgram::QGramIterator) = SubString{typeof(qgram.s)}
Base.eltype{S <: AbstractString, T}(qgram::QGramIterator{S, T}) = SubString{typeof(qgram.s)}
Base.eltype{S <: GraphemeIterator, T}(qgram::QGramIterator{S, T}) = SubString{typeof(qgram.s.s)}
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
function Base.collect(qgram::QGramIterator)
x = Array(eltype(qgram), length(qgram))
@ -80,7 +81,7 @@ end
##############################################################################
abstract AbstractQGram <: SemiMetric
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function evaluate(dist::AbstractQGram, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
sort1 = sort(QGramIterator(s1, len1, dist.q))
sort2 = sort(QGramIterator(s2, len2, dist.q))
evaluate(dist, CountInterator(sort1, sort2))

View File

@ -9,7 +9,7 @@ type Partial{T <: PreMetric} <: PreMetric
end
# general
function compare(dist::Partial, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function compare(dist::Partial, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
len1 == len2 && return compare(dist.dist, s1, s2, len1, len2)
len1 == 0 && return compare(dist.dist, "", "", 0, 0)
iter = QGramIterator(s2, len2, len1)
@ -26,7 +26,7 @@ end
# Specialization for RatcliffObershelp distance
# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function compare(dist::Partial{RatcliffObershelp}, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
len1 == len2 && return compare(dist.dist, s1, s2, len1, len2)
out = 0.0
result = matching_blocks(s1, s2)
@ -59,7 +59,7 @@ type TokenSort{T <: PreMetric} <: PreMetric
dist::T
end
function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString,
function compare(dist::TokenSort, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
len1::Integer, len2::Integer)
if search(s1, Base._default_delims) > 0
s1 = iterator(typeof(s1), join(sort!(split(s1)), " "))
@ -81,7 +81,7 @@ type TokenSet{T <: PreMetric} <: PreMetric
dist::T
end
function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString,
function compare(dist::TokenSet, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
len1::Integer, len2::Integer)
v0, v1, v2 = _separate!(split(s1), split(s2))
s0 = iterator(typeof(s1), join(v0, " "))
@ -129,7 +129,7 @@ type TokenMax{T <: PreMetric} <: PreMetric
dist::T
end
function compare(dist::TokenMax, s1::AbstractString, s2::AbstractString,
function compare(dist::TokenMax, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
len1::Integer, len2::Integer)
base = compare(dist.dist, s1, s2, len1, len2)
unbase_scale = 0.95

View File

@ -13,7 +13,7 @@ end
# restrict to distance between 0 and 1
Winkler(x) = Winkler(x, 0.1, 0.7)
function compare(dist::Winkler, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function compare(dist::Winkler, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
score = compare(dist.dist, s1, s2, len1, len2)
l = common_prefix(s1, s2, 4)[1]
# common prefix adjustment