diff --git a/README.md b/README.md index 9882369..0bebb77 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ +[![StringDistances](http://pkg.julialang.org/badges/StringDistances_0.5.svg)](http://pkg.julialang.org/?pkg=StringDistances) [![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl) [![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master) -[![StringDistances](http://pkg.julialang.org/badges/StringDistances_0.4.svg)](http://pkg.julialang.org/?pkg=StringDistances) This Julia package computes various distances between strings. @@ -96,14 +96,6 @@ The package includes distance modifiers: #> 0.855 ``` -## Unicode -To iterate on graphemes rather than characters, use `graphemeiterator`: - -```julia -evaluate(Hamming(), "b\u0300", "a") -#> 2 -evaluate(Hamming(), graphemeiterator("b\u0300"), graphemeiterator("a")) -#> 1 ``` ## References - [The stringdist Package for Approximate String Matching](https://journal.r-project.org/archive/2014-1/loo.pdf) Mark P.J. van der Loo diff --git a/documentation.txt b/documentation.txt deleted file mode 100644 index e69de29..0000000 diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 2f54ced..4c76e7b 100644 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -40,47 +40,52 @@ graphemeiterator ############################################################################## typealias GraphemeIterator Base.UTF8proc.GraphemeIterator -typealias AbstractStringorGraphemeIterator Union{AbstractString, Base.UTF8proc.GraphemeIterator} +#typealias AbstractStringorGraphemeIterator Union{AbstractString, Base.UTF8proc.GraphemeIterator} +typealias AbstractStringorGraphemeIterator AbstractString ############################################################################## ## -## GraphemeIterator iterates on Grapheme +## GraphemeIterator. Unicode iteration broken because Unicode 9 has different iteration properties. ## ############################################################################## -Base.prevind(x::GraphemeIterator, i::Integer) = prevind(x.s, i) -Base.nextind(x::GraphemeIterator, i::Integer) = nextind(x.s, i) -Base.chr2ind(x::GraphemeIterator, i::Integer) = chr2ind(x.s, i) -Base.SubString(x::GraphemeIterator, i::Integer, j::Integer) = graphemeiterator(SubString(x.s, i::Integer, j::Integer)) -graphemeiterator(s::AbstractString) = GraphemeIterator{typeof(s)}(s) - -# added -#these 2 functions allow to define prevind nextind, chr2ind, prevind etc -function Base.isvalid(s::GraphemeIterator, i::Integer) - if !isvalid(s.s, i) - return false - else - i0 = prevind(s.s, i) - return i0 < start(s.s) || isgraphemebreak(s.s[i0], s.s[i]) - end -end -function Base.endof(s::GraphemeIterator) - c0 = Char(0x00ad) - i = endof(s.s) - i0 = start(s.s) - while i >= i0 && !isgraphemebreak(s.s[i], c0) - i = prevind(s.s, i) - c0 = s.s[i] - end - i -end - -# 1. issues with stuff like search or print_escaped where character iteration vs string iteration matters. I need to pass the original string for now -Base.search(x::GraphemeIterator, s::Vector{Char}) = search(x.s, s) -# 2. issue with keeping iterator property for stuff like split, join. for now, I decide to loose the enumerator property but add it back after join. But SubString for instance does not loose the property -Base.split(x::GraphemeIterator, args...) = split(x.s, args...) -iterator{T <: GraphemeIterator}(::Type{T}, x::AbstractString) = graphemeiterator(x) -iterator{T <: AbstractString}(::Type{T}, x::AbstractString) = x - +#Base.prevind(x::GraphemeIterator, i::Integer) = prevind(x.s, i) +#Base.nextind(x::GraphemeIterator, i::Integer) = nextind(x.s, i) +#Base.chr2ind(x::GraphemeIterator, i::Integer) = chr2ind(x.s, i) +#Base.SubString(x::GraphemeIterator, i::Integer, j::Integer) = graphemeiterator(SubString(x.s, i::Integer, j:#:Integer)) +#graphemeiterator(s::AbstractString) = GraphemeIterator{typeof(s)}(s) +# +## added +##these 2 functions allow to define prevind nextind, chr2ind, prevind etc +#function Base.isvalid(s::GraphemeIterator, i::Integer) +# if !isvalid(s.s, i) +# return false +# else +# k = start(s) +# while !done(s, k) +# j = k[1] +# if j == i +# return true +# end +# end +# return false +# end +#end +#function Base.endof(s::GraphemeIterator) +# k = start(s) +# while !done(s, k) +# i = k[1] +# c, k = next(s, k) +# end +# return i +#end +# +## 1. issues with stuff like search or print_escaped where character iteration vs string iteration matters. #I need to pass the original string for now +#Base.search(x::GraphemeIterator, s::Vector{Char}) = search(x.s, s) +## 2. issue with keeping iterator property for stuff like split, join. for now, I decide to loose the #enumerator property but add it back after join. But SubString for instance does not loose the property +#Base.split(x::GraphemeIterator, args...) = split(x.s, args...) +#iterator{T <: GraphemeIterator}(::Type{T}, x::AbstractString) = graphemeiterator(x) +#iterator{T <: AbstractString}(::Type{T}, x::AbstractString) = x +# ############################################################################## ## ## include diff --git a/test/distances.jl b/test/distances.jl index 8aa3464..33dc095 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -149,4 +149,4 @@ for x in solutions end @test evaluate(Hamming(), "b\u0300", "a") == 2 -@test evaluate(Hamming(), graphemeiterator("b\u0300"), graphemeiterator("a")) == 1 \ No newline at end of file +#@test evaluate(Hamming(), graphemeiterator("b\u0300"), graphemeiterator("a")) == 1 \ No newline at end of file diff --git a/test/modifiers.jl b/test/modifiers.jl index 0af44fa..236ee6d 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -73,10 +73,10 @@ s = "HSINCHUANG" -@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0 -@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094 -@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("")) 0.0 -@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("")) 0.0 +#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0 +#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094 +#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("")) 0.0 +#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("")) 0.0