add graphemes2

pull/3/head
matthieugomez 2015-11-06 14:43:04 -05:00
parent 5ea0624150
commit 5621eb42fb
5 changed files with 134 additions and 27 deletions

View File

@ -7,7 +7,6 @@ module StringDistances
## Export
##
##############################################################################
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
import Iterators: chain
export
@ -29,7 +28,72 @@ Winkler,
Partial,
TokenSort,
TokenSet,
TokenMax
TokenMax,
graphemes2
##############################################################################
##
## Iterator
##
##############################################################################
# Ask utf8proc whether a grapheme break is permitted between the two
# consecutive characters `c1` and `c2`; the answer is returned as a Bool.
function isgraphemebreak(c1::Char, c2::Char)
    return ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)
end
# Wrapper around a string, built by `graphemes2`, whose iteration yields the
# graphemes of that string as SubStrings.
immutable GraphemeIterator2{S<:AbstractString}
s::S # original string (for generation of SubStrings)
end
# Wrap `s` in a GraphemeIterator2 parameterized on the concrete type of `s`.
function graphemes2(s::AbstractString)
    return GraphemeIterator2{typeof(s)}(s)
end
# Element type of the iterator: a SubString of the wrapped string type `S`.
Base.eltype{S}(::Type{GraphemeIterator2{S}}) = SubString{S}
# Number of graphemes in the wrapped string: one is counted for every
# character that a grapheme break may precede.
function Base.length(g::GraphemeIterator2)
    # Seed with a soft hyphen (grapheme break always allowed after this).
    prev = Char(0x00ad)
    total = 0
    for ch in g.s
        if isgraphemebreak(prev, ch)
            total += 1
        end
        prev = ch
    end
    return total
end
# The initial iteration state is the initial state of the wrapped string.
function Base.start(g::GraphemeIterator2)
    return start(g.s)
end

# Iteration is finished exactly when the wrapped string is done at state `i`.
function Base.done(g::GraphemeIterator2, i)
    return done(g.s, i)
end
# Return the grapheme that starts at state `i`, as a SubString of the wrapped
# string, together with the state at which the following grapheme starts.
function Base.next(g::GraphemeIterator2, i)
    s = g.s
    j = i              # index of the last character known to be in the grapheme
    c0, k = next(s, i) # k is the index of the character following c0
    while !done(s, k) # loop until next grapheme is s[i:j]
        # Keep the index that follows `c`: it becomes the new `k` only when
        # `c` turns out to belong to the current grapheme.
        c, knext = next(s, k)
        isgraphemebreak(c0, c) && break
        j = k
        k = knext
        c0 = c
    end
    return (SubString(s, i, j), k)
end
# functions not defined in base
# State that follows `state`: the second value returned by `next`.
function Base.nextind(g::GraphemeIterator2, state::Integer)
    grapheme, following = next(g, state)
    return following
end
# Convert the 1-based grapheme position `idx` into the iteration state at which
# that grapheme starts. Throws a BoundsError when `idx` is not a valid grapheme
# position in `g`, rather than falling off the loop and returning `nothing`.
function Base.chr2ind(g::GraphemeIterator2, idx::Integer)
    state = start(g)
    i = 0
    while !done(g, state)
        i += 1
        i == idx && return state
        # Only the following state is needed; the grapheme itself is discarded.
        state = next(g, state)[2]
    end
    throw(BoundsError(g, idx))
end
# The last index of the iterator is the last index of the wrapped string.
function Base.endof(g::GraphemeIterator2)
    return endof(g.s)
end
# Either a plain string or a grapheme iterator; used as the argument type of
# the `evaluate` and `compare` methods.
typealias GraphemeOrString Union{GraphemeIterator2, AbstractString}
# A substring of a grapheme iterator is taken from the wrapped string; `i` and
# `j` are passed through unchanged.
function Base.SubString(x::GraphemeIterator2, i, j)
    return SubString(x.s, i, j)
end
##############################################################################
##
## include
##
##############################################################################
include("distances/edit.jl")
include("distances/qgram.jl")
@ -44,7 +108,8 @@ include("modifiers/fuzzywuzzy.jl")
##
##############################################################################
function evaluate(dist::PreMetric, s1::AbstractString, s2::AbstractString)
function evaluate(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString)
len1, len2 = length(s1), length(s2)
if len1 > len2
return evaluate(dist, s2, s1, len2, len1)
@ -59,7 +124,7 @@ end
##
##############################################################################
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
function compare(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString)
len1, len2 = length(s1), length(s2)
if len1 > len2
return compare(dist, s2, s1, len2, len1)
@ -68,13 +133,13 @@ function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
end
end
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
function compare(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString,
len1::Integer, len2::Integer)
1.0 - evaluate(dist, s1, s2, len1, len2)
end
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
s1::AbstractString, s2::AbstractString,
s1::GraphemeOrString, s2::GraphemeOrString,
len1::Integer, len2::Integer)
distance = evaluate(dist, s1, s2, len1, len2)
len2 == 0 ? 1.0 : 1.0 - distance / len2
@ -83,14 +148,14 @@ end
# compare always return a value between 0 and 1.
# When string length < q for qgram distance, returns s1 == s2
function compare(dist::AbstractQGram,
s1::AbstractString, s2::AbstractString,
s1::GraphemeOrString, s2::GraphemeOrString,
len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
evaluate(dist, s1, s2, len1, len2)
end
function compare(dist::QGram,
s1::AbstractString, s2::AbstractString,
s1::GraphemeOrString, s2::GraphemeOrString,
len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
distance = evaluate(dist, s1, s2, len1, len2)
@ -98,4 +163,7 @@ function compare(dist::QGram,
end
end

View File

@ -28,7 +28,7 @@ function longest_common_substring(s1::AbstractString, s2::AbstractString)
return start1, start2, size
end
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, start1::Integer, start2::Integer)
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::GraphemeOrString, s2::GraphemeOrString, start1::Integer, start2::Integer)
a = longest_common_substring(s1, s2)
if a[3] > 0
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
@ -43,14 +43,14 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
end
end
function matching_blocks(s1::AbstractString, s2::AbstractString)
function matching_blocks(s1::GraphemeOrString, s2::GraphemeOrString)
x = Set{Tuple{Int, Int, Int}}()
matching_blocks!(x, s1, s2, 1, 1)
return x
end
type RatcliffObershelp <: PreMetric end
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function evaluate(dist::RatcliffObershelp, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
result = matching_blocks(s1, s2)
matched = 0
for x in result
@ -58,3 +58,16 @@ function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractStrin
end
1.0 - 2 * matched / (len1 + len2)
end
#function buildref(s::GraphemeOrString, len)
# ref = Array(Int, len)
# state = start(s)
# i = 0
# while !done(s, state)
# i += 1
# ref[i] = state
# ch, state = next(s, state)
# end
# return ref
#end

View File

@ -4,7 +4,7 @@
## Assumes length(s1) <= length(s2)
##############################################################################
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
function common_prefix(s1::GraphemeOrString, s2::GraphemeOrString, lim::Integer = -1)
start1 = start(s1)
start2 = start(s2)
l = 0
@ -24,7 +24,7 @@ end
##
##############################################################################
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString, len1::Integer, len2:: Integer)
function evaluate(dist::Hamming, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2:: Integer)
count = 0
for (ch1, ch2) in zip(s1, s2)
count += ch1 != ch2
@ -42,7 +42,7 @@ end
type Levenshtein <: SemiMetric end
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function evaluate(dist::Levenshtein, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
@ -90,7 +90,7 @@ end
type DamerauLevenshtein <: SemiMetric end
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function evaluate(dist::DamerauLevenshtein, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
@ -158,7 +158,7 @@ end
type Jaro <: SemiMetric end
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function evaluate(dist::Jaro, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
# if len2 == 0, m = 0 so should be 1.0 according to wikipedia. Nope.
len2 == 0 && return 0.0
@ -199,4 +199,4 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Inte
return 1.0 - score
end
jaro(s1::AbstractString, s2::AbstractString) = evaluate(Jaro(), s1, s2)
jaro(s1::GraphemeOrString, s2::GraphemeOrString) = evaluate(Jaro(), s1, s2)

View File

@ -4,14 +4,14 @@
##
##############################################################################
type QGramIterator{S <: AbstractString, T <: Integer}
s::S # string
immutable QGramIterator{S, T <: Integer}
s::S # grapheorstring
l::Int # length of string
q::T # length of q-grams
end
function Base.start(qgram::QGramIterator)
len = length(qgram.s)
(1, len < qgram.q ? endof(qgram.s) + 1 : chr2ind(qgram.s, qgram.q))
(1, qgram.l < qgram.q ? endof(qgram.s) + 1 : chr2ind(qgram.s, qgram.q))
end
function Base.next(qgram::QGramIterator, state)
istart, iend = state
@ -23,7 +23,8 @@ function Base.done(qgram::QGramIterator, state)
istart, idend = state
done(qgram.s, idend)
end
Base.eltype(qgram::QGramIterator) = SubString{typeof(qgram.s)}
Base.eltype{S <: AbstractString, T}(qgram::QGramIterator{S, T}) = SubString{S}
Base.eltype{S, T}(qgram::QGramIterator{GraphemeIterator2{S}, T}) = SubString{S}
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
function Base.collect(qgram::QGramIterator)
x = Array(eltype(qgram), length(qgram))
@ -80,7 +81,7 @@ end
##############################################################################
abstract AbstractQGram <: SemiMetric
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function evaluate(dist::AbstractQGram, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
sort1 = sort(QGramIterator(s1, len1, dist.q))
sort2 = sort(QGramIterator(s2, len2, dist.q))
evaluate(dist, CountInterator(sort1, sort2))

View File

@ -44,7 +44,6 @@ using StringDistances, Base.Test
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
Set([(1,1,3) (4,5,1) (6,6,1)])
@test matching_blocks("dwayne", "duane") ==
Set([(5,4,2) (1,1,1) (3,3,1)])
@test matching_blocks("dixon", "dicksonx") ==
@ -79,9 +78,7 @@ strings = [
("leia", "leela"),
]
# Test with R package StringDist
for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
(DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.24722222 0.16190476 0.48809524 0.49166667 0.07407407 0.16666667 0.21666667]),
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
@ -89,6 +86,8 @@ for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
(Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]),
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
# Test with R package StringDist
for x in solutions
t, solution = x
for i in 1:length(solution)
@test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4
@ -124,4 +123,30 @@ stringdist(strings[1,], strings[2,], method = "qgram", q = 1)
=#
# Grapheme iterators: string pairs wrapped in graphemes2, checked below against `solutions`
strings = [
(graphemes2("martha"), graphemes2("marhta")),
(graphemes2("dwayne"), graphemes2("duane") ),
(graphemes2("dixon"), graphemes2("dicksonx")),
(graphemes2("william"), graphemes2("williams")),
(graphemes2(""), graphemes2("foo")),
(graphemes2("a"), graphemes2("a")),
(graphemes2("abc"), graphemes2("xyz")),
(graphemes2("abc"), graphemes2("ccc")),
(graphemes2("kitten"), graphemes2("sitting")),
(graphemes2("saturday"), graphemes2("sunday")),
(graphemes2("hi, my name is"), graphemes2("my name is")),
(graphemes2("alborgów"), graphemes2("amoniak")),
(graphemes2("cape sand recycling "), graphemes2("edith ann graham")),
(graphemes2( "jellyifhs"), graphemes2("jellyfish")),
(graphemes2("ifhs"), graphemes2("fish")),
(graphemes2("leia"), graphemes2("leela")),
]
# Evaluate every distance in `solutions` on the i-th pair of `strings` and
# compare with the i-th reference value, to within 1e-4.
for (t, solution) in solutions, i in 1:length(solution)
    @test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4
end