rmv grapheme iterator
parent
bed352eef7
commit
241c5d7631
10
README.md
10
README.md
|
@ -1,6 +1,6 @@
|
||||||
|
[![StringDistances](http://pkg.julialang.org/badges/StringDistances_0.5.svg)](http://pkg.julialang.org/?pkg=StringDistances)
|
||||||
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
|
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
|
||||||
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)
|
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)
|
||||||
[![StringDistances](http://pkg.julialang.org/badges/StringDistances_0.4.svg)](http://pkg.julialang.org/?pkg=StringDistances)
|
|
||||||
|
|
||||||
This Julia package computes various distances between strings.
|
This Julia package computes various distances between strings.
|
||||||
|
|
||||||
|
@ -96,14 +96,6 @@ The package includes distance modifiers:
|
||||||
#> 0.855
|
#> 0.855
|
||||||
```
|
```
|
||||||
|
|
||||||
## Unicode
|
|
||||||
To iterate on graphemes rather than characters, use `graphemeiterator`:
|
|
||||||
|
|
||||||
```julia
|
|
||||||
evaluate(Hamming(), "b\u0300", "a")
|
|
||||||
#> 2
|
|
||||||
evaluate(Hamming(), graphemeiterator("b\u0300"), graphemeiterator("a"))
|
|
||||||
#> 1
|
|
||||||
```
|
```
|
||||||
## References
|
## References
|
||||||
- [The stringdist Package for Approximate String Matching](https://journal.r-project.org/archive/2014-1/loo.pdf) Mark P.J. van der Loo
|
- [The stringdist Package for Approximate String Matching](https://journal.r-project.org/archive/2014-1/loo.pdf) Mark P.J. van der Loo
|
||||||
|
|
|
@ -40,47 +40,52 @@ graphemeiterator
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
||||||
typealias GraphemeIterator Base.UTF8proc.GraphemeIterator
|
typealias GraphemeIterator Base.UTF8proc.GraphemeIterator
|
||||||
typealias AbstractStringorGraphemeIterator Union{AbstractString, Base.UTF8proc.GraphemeIterator}
|
#typealias AbstractStringorGraphemeIterator Union{AbstractString, Base.UTF8proc.GraphemeIterator}
|
||||||
|
typealias AbstractStringorGraphemeIterator AbstractString
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## GraphemeIterator iterates on Grapheme
|
## GraphemeIterator. Unicode iteration broken because Unicode 9 has different iteration properties.
|
||||||
##
|
##
|
||||||
##############################################################################
|
##############################################################################
|
||||||
Base.prevind(x::GraphemeIterator, i::Integer) = prevind(x.s, i)
|
#Base.prevind(x::GraphemeIterator, i::Integer) = prevind(x.s, i)
|
||||||
Base.nextind(x::GraphemeIterator, i::Integer) = nextind(x.s, i)
|
#Base.nextind(x::GraphemeIterator, i::Integer) = nextind(x.s, i)
|
||||||
Base.chr2ind(x::GraphemeIterator, i::Integer) = chr2ind(x.s, i)
|
#Base.chr2ind(x::GraphemeIterator, i::Integer) = chr2ind(x.s, i)
|
||||||
Base.SubString(x::GraphemeIterator, i::Integer, j::Integer) = graphemeiterator(SubString(x.s, i::Integer, j::Integer))
|
#Base.SubString(x::GraphemeIterator, i::Integer, j::Integer) = graphemeiterator(SubString(x.s, i::Integer, j::Integer))
|
||||||
graphemeiterator(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
|
#graphemeiterator(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
|
||||||
|
#
|
||||||
# added
|
## added
|
||||||
#these 2 functions allow to define prevind nextind, chr2ind, prevind etc
|
##these 2 functions allow defining prevind, nextind, chr2ind, etc.
|
||||||
function Base.isvalid(s::GraphemeIterator, i::Integer)
|
#function Base.isvalid(s::GraphemeIterator, i::Integer)
|
||||||
if !isvalid(s.s, i)
|
# if !isvalid(s.s, i)
|
||||||
return false
|
# return false
|
||||||
else
|
# else
|
||||||
i0 = prevind(s.s, i)
|
# k = start(s)
|
||||||
return i0 < start(s.s) || isgraphemebreak(s.s[i0], s.s[i])
|
# while !done(s, k)
|
||||||
end
|
# j = k[1]
|
||||||
end
|
# if j == i
|
||||||
function Base.endof(s::GraphemeIterator)
|
# return true
|
||||||
c0 = Char(0x00ad)
|
# end
|
||||||
i = endof(s.s)
|
# end
|
||||||
i0 = start(s.s)
|
# return false
|
||||||
while i >= i0 && !isgraphemebreak(s.s[i], c0)
|
# end
|
||||||
i = prevind(s.s, i)
|
#end
|
||||||
c0 = s.s[i]
|
#function Base.endof(s::GraphemeIterator)
|
||||||
end
|
# k = start(s)
|
||||||
i
|
# while !done(s, k)
|
||||||
end
|
# i = k[1]
|
||||||
|
# c, k = next(s, k)
|
||||||
# 1. issues with stuff like search or print_escaped where character iteration vs string iteration matters. I need to pass the original string for now
|
# end
|
||||||
Base.search(x::GraphemeIterator, s::Vector{Char}) = search(x.s, s)
|
# return i
|
||||||
# 2. issue with keeping the iterator property for stuff like split, join. For now, I decided to lose the enumerator property but add it back after join. But SubString, for instance, does not lose the property
|
#end
|
||||||
Base.split(x::GraphemeIterator, args...) = split(x.s, args...)
|
#
|
||||||
iterator{T <: GraphemeIterator}(::Type{T}, x::AbstractString) = graphemeiterator(x)
|
## 1. issues with stuff like search or print_escaped where character iteration vs string iteration matters. I need to pass the original string for now
|
||||||
iterator{T <: AbstractString}(::Type{T}, x::AbstractString) = x
|
#Base.search(x::GraphemeIterator, s::Vector{Char}) = search(x.s, s)
|
||||||
|
## 2. issue with keeping the iterator property for stuff like split, join. For now, I decided to lose the enumerator property but add it back after join. But SubString, for instance, does not lose the property
|
||||||
|
#Base.split(x::GraphemeIterator, args...) = split(x.s, args...)
|
||||||
|
#iterator{T <: GraphemeIterator}(::Type{T}, x::AbstractString) = graphemeiterator(x)
|
||||||
|
#iterator{T <: AbstractString}(::Type{T}, x::AbstractString) = x
|
||||||
|
#
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
## include
|
## include
|
||||||
|
|
|
@ -149,4 +149,4 @@ for x in solutions
|
||||||
end
|
end
|
||||||
|
|
||||||
@test evaluate(Hamming(), "b\u0300", "a") == 2
|
@test evaluate(Hamming(), "b\u0300", "a") == 2
|
||||||
@test evaluate(Hamming(), graphemeiterator("b\u0300"), graphemeiterator("a")) == 1
|
#@test evaluate(Hamming(), graphemeiterator("b\u0300"), graphemeiterator("a")) == 1
|
|
@ -73,10 +73,10 @@ s = "HSINCHUANG"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0
|
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0
|
||||||
@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094
|
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094
|
||||||
@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("")) 0.0
|
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("")) 0.0
|
||||||
@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("")) 0.0
|
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("")) 0.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue