redefine graphemeiterator

pull/3/head
matthieugomez 2015-11-10 09:47:12 -05:00
parent d8ac44be23
commit 0a2d96a848
8 changed files with 124 additions and 99 deletions

View File

@ -7,7 +7,8 @@ module StringDistances
## Export
##
##############################################################################
import Base.UTF8proc.GraphemeIterator
import Base: eltype, length, start, done, next, ==, hash, isless, convert, show, endof
import Base.UTF8proc: isgraphemebreak
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
import Iterators: chain
export
@ -29,57 +30,78 @@ Winkler,
Partial,
TokenSort,
TokenSet,
TokenMax
TokenMax,
graphemeiterator
##############################################################################
##
## Define 2 iterators. One on character one on Grapheme.
## Define GraphemeIterator as AbstractString
##
## Argument against inheritance: an AbstractString is generally expected to yield a Char as its individual element.
## Argument for inheritance: prevind, nextind and chr2ind are available once start, next, done, isvalid and endof are defined. It also allows functions to be written with an AbstractString signature and keeps SubString working.
##############################################################################
immutable CharacterIterator{T <: AbstractString}
s::T
# Copied from Base. It is redefined here so that the type can be declared as a
# subtype of AbstractString.
# Wraps a single string `s`; the type parameter `S` is the type of that string.
immutable GraphemeIterator{S<:AbstractString} <: AbstractString
s::S # original string (for generation of SubStrings)
end
# Iteration protocol and length for CharacterIterator: every call is forwarded
# unchanged to the wrapped string `x.s`.
function Base.start(x::CharacterIterator)
    return start(x.s)
end
function Base.next(x::CharacterIterator, i::Integer)
    return next(x.s, i)
end
function Base.done(x::CharacterIterator, i::Integer)
    return done(x.s, i)
end
function Base.length(x::CharacterIterator)
    return length(x.s)
end
# Wrap `s` in a GraphemeIterator parameterised by the concrete type of `s`.
function graphemeiterator(s::AbstractString)
    S = typeof(s)
    return GraphemeIterator{S}(s)
end
# Element type of a GraphemeIterator over strings of type `S`: SubString{S}.
function eltype{S}(::Type{GraphemeIterator{S}})
    return SubString{S}
end
# Count the graphemes of `g` by walking the characters of the wrapped string and
# counting every position where `isgraphemebreak` reports a break between the
# previous character and the current one.
function length(g::GraphemeIterator)
    previous = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
    total = 0
    for current in g.s
        if isgraphemebreak(previous, current)
            total += 1
        end
        previous = current
    end
    return total
end
# The iteration state of a GraphemeIterator is an index into the wrapped string,
# so the initial state and the termination test are those of `g.s`.
function start(g::GraphemeIterator)
    return start(g.s)
end
function done(g::GraphemeIterator, i::Int)
    return done(g.s, i)
end
# Return `(grapheme, state)` where `grapheme` is the SubString of the wrapped
# string that starts at index `i` and runs up to (not including) the next
# grapheme break, and `state` is the index at which the following grapheme
# starts.
#
# Arguments:
#   g : the grapheme iterator
#   i : index in `g.s` of the first character of the grapheme to extract
function next(g::GraphemeIterator, i::Int)
    s = g.s
    j = i                      # index of the last character known to belong to the grapheme
    c0, k = next(s, i)         # c0: previous character, k: index of the candidate character
    while !done(s, k) # loop until next grapheme is s[i:j]
        # `l` is the index following the candidate character; it must be kept
        # so that the scan can advance past `k`.
        c, l = next(s, k)
        isgraphemebreak(c0, c) && break
        j = k
        k = l
        c0 = c
    end
    return (SubString(s, i, j), k)
end
# Equality, hashing and ordering of GraphemeIterators are those of the wrapped
# strings, which keeps `==` and `hash` consistent with each other.
function ==(g1::GraphemeIterator, g2::GraphemeIterator)
    return g1.s == g2.s
end
function hash(g::GraphemeIterator, h::UInt)
    return hash(g.s, h)
end
function isless(g1::GraphemeIterator, g2::GraphemeIterator)
    return isless(g1.s, g2.s)
end
# Print a one-line summary: grapheme count, wrapped string type and the string.
function show{S}(io::IO, g::GraphemeIterator{S})
    return print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
end
# Index arithmetic on a CharacterIterator is forwarded to the wrapped string.
function Base.nextind(x::CharacterIterator, i::Integer)
    return nextind(x.s, i)
end
function Base.chr2ind(x::CharacterIterator, i::Integer)
    return chr2ind(x.s, i::Integer)
end
# Map any concrete CharacterIterator{...} type to the unparameterised wrapper.
function iteratortype{T <: CharacterIterator}(::Type{T})
    return CharacterIterator
end
# Converting a string of type `T` to CharacterIterator{T} wraps it.
function Base.convert{T}(::Type{CharacterIterator{T}}, x::T)
    return CharacterIterator(x)
end
# Additional string-interface methods.
# The index following `state` is the iteration state that `next` returns
# alongside the grapheme starting at `state`.
function Base.nextind(g::GraphemeIterator, state::Integer)
    step = next(g, state)
    return step[2]
end
function Base.chr2ind(s::GraphemeIterator, i::Integer)
i < start(s) && throw(BoundsError(s.s, i))
j = 1
k = start(s)
while true
c, l = next(s,k)
if i == j
return k
end
j += 1
k = l
# Added for the AbstractString interface (used by prevind and nextind).
# An index is valid for the grapheme iterator when it is a valid index of the
# wrapped string and either nothing precedes it or there is a grapheme break
# between the preceding character and the character at `i`.
function Base.isvalid(s::GraphemeIterator, i::Integer)
    isvalid(s.s, i) || return false
    before = prevind(s.s, i)
    before < start(s.s) && return true
    return isgraphemebreak(s.s[before], s.s[i])
end
# Map any concrete GraphemeIterator{...} type to the unparameterised wrapper.
function iteratortype{T <: GraphemeIterator}(::Type{T})
    return GraphemeIterator
end
# Converting a string of type `T` to GraphemeIterator{T} wraps it.
function Base.convert{T}(::Type{GraphemeIterator{T}}, x::T)
    return GraphemeIterator(x)
end
# Return the index in the wrapped string at which the last grapheme starts.
#
# Starting from the last character of `s.s`, step backwards for as long as
# there is no grapheme break between the preceding character and the current
# one. For an empty wrapped string the result is `endof(s.s)` unchanged.
function endof(s::GraphemeIterator)
    str = s.s
    i = endof(str)
    first = start(str)
    while i > first
        j = prevind(str, i)
        # A break between str[j] and str[i] means the last grapheme starts at i.
        isgraphemebreak(str[j], str[i]) && break
        i = j
    end
    return i
end
# Either of the two wrappers around a string of type `T`.
typealias StringIterator{T} Union{GraphemeIterator{T}, CharacterIterator{T}}

# Queries answered directly by the wrapped string.
function Base.endof(g::StringIterator)
    return endof(g.s)
end
function Base.string(x::StringIterator)
    return x.s
end
function Base.isempty(x::StringIterator)
    return isempty(x.s)
end
function Base.eltype{T}(x::StringIterator{T})
    return T
end
function Base.isless(x::StringIterator, y::StringIterator)
    return isless(x.s, y.s)
end

# A SubString of a wrapper is the same kind of wrapper around the SubString of
# the wrapped string.
function Base.SubString(x::StringIterator, i, j)
    wrapper = iteratortype(x)
    return wrapper(SubString(x.s, i, j))
end

# Searching is forwarded to the wrapped string with the extra arguments as given.
function Base.search(x::StringIterator, args...)
    return search(x.s, args...)
end
function Base.searchsortedfirst(x::StringIterator, args...)
    return searchsortedfirst(x.s, args...)
end
function Base.searchsortedlast(x::StringIterator, args...)
    return searchsortedlast(x.s, args...)
end
function Base.searchsorted(x::StringIterator, args...)
    return searchsorted(x.s, args...)
end

# The wrapper of a value is the wrapper of its type.
function iteratortype(x::StringIterator)
    return iteratortype(typeof(x))
end
# 1. For functions such as search or print_escaped it matters whether iteration
#    is by character or by grapheme, so for now they are given the original
#    string.
function Base.search(x::GraphemeIterator, s::Vector{Char})
    return search(x.s, s)
end
# 2. Functions such as split and join do not keep the iterator wrapper. For now
#    the wrapper is dropped when splitting and added back after joining
#    (SubString, for instance, does not lose it).
function Base.split(x::GraphemeIterator, args...)
    return split(x.s, args...)
end
# Re-wrap `x` according to the requested type: a GraphemeIterator type yields a
# grapheme iterator over `x`, any other AbstractString type yields `x` itself.
function iterator{T <: GraphemeIterator}(::Type{T}, x::AbstractString)
    return graphemeiterator(x)
end
function iterator{T <: AbstractString}(::Type{T}, x::AbstractString)
    return x
end
##############################################################################
##
@ -101,14 +123,6 @@ include("modifiers/fuzzywuzzy.jl")
for x in (:evaluate, :compare)
@eval begin
function $x(dist::PreMetric, s1::AbstractString, s2::AbstractString)
$x(dist, CharacterIterator(s1), CharacterIterator(s2))
end
function $x(dist::PreMetric, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
$x(dist, CharacterIterator(s1), CharacterIterator(s2), len1, len2)
end
function $x(dist::PreMetric, s1::StringIterator, s2::StringIterator)
len1, len2 = length(s1), length(s2)
if len1 > len2
return $x(dist, s2, s1, len2, len1)
@ -125,13 +139,13 @@ end
##
##############################################################################
function compare(dist::PreMetric, s1::StringIterator, s2::StringIterator,
# Generic similarity score: one minus the value `evaluate` returns for the same
# arguments.
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    distance = evaluate(dist, s1, s2, len1, len2)
    return 1.0 - distance
end
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
s1::StringIterator, s2::StringIterator,
s1::AbstractString, s2::AbstractString,
len1::Integer, len2::Integer)
distance = evaluate(dist, s1, s2, len1, len2)
len2 == 0 ? 1.0 : 1.0 - distance / len2
@ -140,14 +154,14 @@ end
# compare always return a value between 0 and 1.
# When string length < q for qgram distance, returns s1 == s2
function compare(dist::AbstractQGram,
s1::StringIterator, s2::StringIterator,
s1::AbstractString, s2::AbstractString,
len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
evaluate(dist, s1, s2, len1, len2)
end
function compare(dist::QGram,
s1::StringIterator, s2::StringIterator,
s1::AbstractString, s2::AbstractString,
len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
distance = evaluate(dist, s1, s2, len1, len2)

View File

@ -1,6 +1,6 @@
# Return the start of the common substring in s1, the start of the common substring in s2, and the length of the substring
# Positions refer to character numbers, not string indices (the two differ for Unicode strings)
function longest_common_substring(s1::StringIterator, s2::StringIterator)
function longest_common_substring(s1::AbstractString, s2::AbstractString)
if length(s1) > length(s2)
start2, start1, size= longest_common_substring(s2, s1)
else
@ -28,7 +28,7 @@ function longest_common_substring(s1::StringIterator, s2::StringIterator)
return start1, start2, size
end
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::StringIterator, s2::StringIterator, start1::Integer, start2::Integer)
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, start1::Integer, start2::Integer)
a = longest_common_substring(s1, s2)
if a[3] > 0
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
@ -43,14 +43,14 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::StringIterator, s2::
end
end
function matching_blocks(s1::StringIterator, s2::StringIterator)
# Collect the matching blocks of `s1` and `s2` into a fresh set of
# (Int, Int, Int) tuples, filled in by `matching_blocks!` with both start
# offsets set to 1.
function matching_blocks(s1::AbstractString, s2::AbstractString)
    blocks = Set{Tuple{Int, Int, Int}}()
    matching_blocks!(blocks, s1, s2, 1, 1)
    return blocks
end
# Field-less PreMetric subtype; it selects the RatcliffObershelp method of `evaluate`.
type RatcliffObershelp <: PreMetric end
function evaluate(dist::RatcliffObershelp, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
result = matching_blocks(s1, s2)
matched = 0
for x in result
@ -59,7 +59,7 @@ function evaluate(dist::RatcliffObershelp, s1::StringIterator, s2::StringIterato
1.0 - 2 * matched / (len1 + len2)
end
#function buildref(s::StringIterator, len)
#function buildref(s::AbstractString, len)
# ref = Array(Int, len)
# state = start(s)
# i = 0

View File

@ -4,7 +4,7 @@
## Assumes length(s1) <= length(s2)
##############################################################################
function common_prefix(s1::StringIterator, s2::StringIterator, lim::Integer = -1)
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
start1 = start(s1)
start2 = start(s2)
l = 0
@ -24,7 +24,7 @@ end
##
##############################################################################
function evaluate(dist::Hamming, s1::StringIterator, s2::StringIterator, len1::Integer, len2:: Integer)
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString, len1::Integer, len2:: Integer)
count = 0
for (ch1, ch2) in zip(s1, s2)
count += ch1 != ch2
@ -42,7 +42,7 @@ end
# Field-less SemiMetric subtype; it selects the Levenshtein method of `evaluate`.
type Levenshtein <: SemiMetric end
function evaluate(dist::Levenshtein, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
@ -90,7 +90,7 @@ end
# Field-less SemiMetric subtype; it selects the DamerauLevenshtein method of `evaluate`.
type DamerauLevenshtein <: SemiMetric end
function evaluate(dist::DamerauLevenshtein, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
@ -158,7 +158,7 @@ end
# Field-less SemiMetric subtype; it selects the Jaro method of `evaluate`.
type Jaro <: SemiMetric end
function evaluate(dist::Jaro, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
# if len2 == 0, m = 0 so should be 1.0 according to wikipedia. Nope.
len2 == 0 && return 0.0
@ -199,4 +199,4 @@ function evaluate(dist::Jaro, s1::StringIterator, s2::StringIterator, len1::Inte
return 1.0 - score
end
jaro(s1::StringIterator, s2::StringIterator) = evaluate(Jaro(), s1, s2)
# Convenience wrapper: evaluate a fresh `Jaro()` distance on the two strings.
function jaro(s1::AbstractString, s2::AbstractString)
    return evaluate(Jaro(), s1, s2)
end

View File

@ -4,7 +4,7 @@
##
##############################################################################
immutable QGramIterator{S <: StringIterator, T <: Integer}
immutable QGramIterator{S <: AbstractString, T <: Integer}
s::S # grapheorstring
l::Int # length of string
q::T # length of q-grams
@ -23,7 +23,7 @@ function Base.done(qgram::QGramIterator, state)
istart, idend = state
done(qgram.s, idend)
end
Base.eltype(qgram::QGramIterator) = iteratortype(qgram.s){SubString{eltype(qgram.s)}}
# Elements are SubStrings of the type of the wrapped string.
function Base.eltype(qgram::QGramIterator)
    return SubString{typeof(qgram.s)}
end
# Number of q-grams: `l - q + 1`, floored at zero.
function Base.length(qgram::QGramIterator)
    return max(qgram.l - qgram.q + 1, 0)
end
function Base.collect(qgram::QGramIterator)
x = Array(eltype(qgram), length(qgram))
@ -80,7 +80,7 @@ end
##############################################################################
# Abstract supertype of the q-gram distances; `evaluate` and `compare` have methods that dispatch on it.
abstract AbstractQGram <: SemiMetric
function evaluate(dist::AbstractQGram, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
sort1 = sort(QGramIterator(s1, len1, dist.q))
sort2 = sort(QGramIterator(s2, len2, dist.q))
evaluate(dist, CountInterator(sort1, sort2))

View File

@ -9,7 +9,7 @@ type Partial{T <: PreMetric} <: PreMetric
end
# general
function compare(dist::Partial, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
function compare(dist::Partial, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
len1 == len2 && return compare(dist.dist, s1, s2, len1, len2)
len1 == 0 && return compare(dist.dist, "", "", 0, 0)
iter = QGramIterator(s2, len2, len1)
@ -26,7 +26,7 @@ end
# Specialization for RatcliffObershelp distance
# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
function compare(dist::Partial{RatcliffObershelp}, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
len1 == len2 && return compare(dist.dist, s1, s2, len1, len2)
out = 0.0
result = matching_blocks(s1, s2)
@ -59,14 +59,15 @@ type TokenSort{T <: PreMetric} <: PreMetric
dist::T
end
# Compare two strings after sorting their tokens.
#
# Each string that contains one of Base's default delimiter characters is split
# into tokens, the tokens are sorted and re-joined with single spaces, and the
# result is converted back (via `iterator`) to the kind of string that was
# passed in. The wrapped distance `dist.dist` is then compared on the
# normalised strings.
#
# `len1` and `len2` are accepted for signature compatibility; the final call
# uses the three-argument `compare`, without passing lengths.
function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    if search(s1, Base._default_delims) > 0
        s1 = iterator(typeof(s1), join(sort!(split(s1)), " "))
    end
    if search(s2, Base._default_delims) > 0
        s2 = iterator(typeof(s2), join(sort!(split(s2)), " "))
    end
    return compare(dist.dist, s1, s2)
end
@ -80,12 +81,12 @@ type TokenSet{T <: PreMetric} <: PreMetric
dist::T
end
function compare(dist::TokenSet, s1::StringIterator, s2::StringIterator,
function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString,
len1::Integer, len2::Integer)
v0, v1, v2 = _separate!(split(s1.s), split(s2.s))
s0 = iteratortype(s1)(join(v0, " "))
s1 = iteratortype(s1)(join(chain(v0, v1), " "))
s2 = iteratortype(s1)(join(chain(v0, v2), " "))
v0, v1, v2 = _separate!(split(s1), split(s2))
s0 = iterator(typeof(s1), join(v0, " "))
s1 = iterator(typeof(s1), join(chain(v0, v1), " "))
s2 = iterator(typeof(s2), join(chain(v0, v2), " "))
if isempty(s0)
# otherwise compare(dist, "", "a")== 1.0
compare(dist.dist, s1, s2)
@ -128,7 +129,7 @@ type TokenMax{T <: PreMetric} <: PreMetric
dist::T
end
function compare(dist::TokenMax, s1::StringIterator, s2::StringIterator,
function compare(dist::TokenMax, s1::AbstractString, s2::AbstractString,
len1::Integer, len2::Integer)
base = compare(dist.dist, s1, s2, len1, len2)
unbase_scale = 0.95

View File

@ -13,7 +13,7 @@ end
# restrict to distance between 0 and 1
# One-argument constructor: fills in 0.1 and 0.7 for the two remaining arguments.
function Winkler(x)
    return Winkler(x, 0.1, 0.7)
end
function compare(dist::Winkler, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
function compare(dist::Winkler, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
score = compare(dist.dist, s1, s2, len1, len2)
l = common_prefix(s1, s2, 4)[1]
# common prefix adjustment

View File

@ -122,22 +122,22 @@ stringdist(strings[1,], strings[2,], method = "qgram", q = 1)
# grapheme
strings = [
(graphemes("martha"), graphemes("marhta")),
(graphemes("dwayne"), graphemes("duane") ),
(graphemes("dixon"), graphemes("dicksonx")),
(graphemes("william"), graphemes("williams")),
(graphemes(""), graphemes("foo")),
(graphemes("a"), graphemes("a")),
(graphemes("abc"), graphemes("xyz")),
(graphemes("abc"), graphemes("ccc")),
(graphemes("kitten"), graphemes("sitting")),
(graphemes("saturday"), graphemes("sunday")),
(graphemes("hi, my name is"), graphemes("my name is")),
(graphemes("alborgów"), graphemes("amoniak")),
(graphemes("cape sand recycling "), graphemes("edith ann graham")),
(graphemes( "jellyifhs"), graphemes("jellyfish")),
(graphemes("ifhs"), graphemes("fish")),
(graphemes("leia"), graphemes("leela")),
(graphemeiterator("martha"), graphemeiterator("marhta")),
(graphemeiterator("dwayne"), graphemeiterator("duane") ),
(graphemeiterator("dixon"), graphemeiterator("dicksonx")),
(graphemeiterator("william"), graphemeiterator("williams")),
(graphemeiterator(""), graphemeiterator("foo")),
(graphemeiterator("a"), graphemeiterator("a")),
(graphemeiterator("abc"), graphemeiterator("xyz")),
(graphemeiterator("abc"), graphemeiterator("ccc")),
(graphemeiterator("kitten"), graphemeiterator("sitting")),
(graphemeiterator("saturday"), graphemeiterator("sunday")),
(graphemeiterator("hi, my name is"), graphemeiterator("my name is")),
(graphemeiterator("alborgów"), graphemeiterator("amoniak")),
(graphemeiterator("cape sand recycling "), graphemeiterator("edith ann graham")),
(graphemeiterator( "jellyifhs"), graphemeiterator("jellyfish")),
(graphemeiterator("ifhs"), graphemeiterator("fish")),
(graphemeiterator("leia"), graphemeiterator("leela")),
]
@ -149,4 +149,4 @@ for x in solutions
end
@test evaluate(Hamming(), "b\u0300", "a") == 2
@test evaluate(Hamming(), graphemes("b\u0300"), graphemes("a")) == 1
@test evaluate(Hamming(), graphemeiterator("b\u0300"), graphemeiterator("a")) == 1

View File

@ -70,3 +70,13 @@ s = "HSINCHUANG"
@test_approx_eq compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "") 0.0
@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0
@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094
@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("")) 0.0
@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("")) 0.0