add graphemes2
parent
5ea0624150
commit
5621eb42fb
|
@ -7,7 +7,6 @@ module StringDistances
|
|||
## Export
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
|
||||
import Iterators: chain
|
||||
export
|
||||
|
@ -29,7 +28,72 @@ Winkler,
|
|||
Partial,
|
||||
TokenSort,
|
||||
TokenSet,
|
||||
TokenMax
|
||||
TokenMax,
|
||||
graphemes2
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Iterator
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
# Ask utf8proc whether a grapheme boundary is allowed between the adjacent
# characters `c1` and `c2`. Both are passed to the C routine as UInt32.
function isgraphemebreak(c1::Char, c2::Char)
    return ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)
end
|
||||
|
||||
# Iterator over the graphemes of a string. Iterating yields `SubString`s taken
# from the wrapped string.
immutable GraphemeIterator2{S<:AbstractString}
    s::S  # wrapped string; every grapheme is returned as a SubString of it
end

# Wrap `s` so that it is iterated grapheme by grapheme.
function graphemes2(s::AbstractString)
    return GraphemeIterator2{typeof(s)}(s)
end
|
||||
|
||||
# Elements produced by the iterator are SubStrings of the wrapped string type `S`.
function Base.eltype{S}(::Type{GraphemeIterator2{S}})
    return SubString{S}
end
|
||||
|
||||
# Count the graphemes of the wrapped string: one per character at which
# `isgraphemebreak` reports a boundary relative to the preceding character.
function Base.length(g::GraphemeIterator2)
    # Seed with a soft hyphen (a grapheme break is always allowed after it),
    # so the first character of the string is always counted.
    prev = Char(0x00ad)
    total = 0
    for cur in g.s
        if isgraphemebreak(prev, cur)
            total += 1
        end
        prev = cur
    end
    return total
end
|
||||
|
||||
# The iteration state of a GraphemeIterator2 is the iteration state of the
# wrapped string, so `start` and `done` simply defer to it.
function Base.start(g::GraphemeIterator2)
    return start(g.s)
end

function Base.done(g::GraphemeIterator2, i)
    return done(g.s, i)
end
|
||||
|
||||
# Return the grapheme that begins at state `i` together with the state at
# which the following grapheme begins.
function Base.next(g::GraphemeIterator2, i)
    str = g.s
    stop = i                    # state of the last character kept in the grapheme
    prev, pos = next(str, i)    # first character and the state just past it
    # Extend the grapheme until a break is allowed or the string is exhausted;
    # on exit the grapheme is str[i:stop].
    while !done(str, pos)
        cur, after = next(str, pos)
        if isgraphemebreak(prev, cur)
            break
        end
        stop = pos
        pos = after
        prev = cur
    end
    return (SubString(str, i, stop), pos)
end
|
||||
|
||||
# functions not defined in base
|
||||
# State of the grapheme that follows the one beginning at `state`.
function Base.nextind(g::GraphemeIterator2, state::Integer)
    grapheme, following = next(g, state)
    return following
end
|
||||
# Return the iteration state at which the `idx`-th grapheme of `g` begins.
#
# Throws a `BoundsError` when `idx` is smaller than 1 or larger than the number
# of graphemes. Previously the loop fell through and the function returned
# `nothing`, which only failed later when the result was used as an index.
function Base.chr2ind(g::GraphemeIterator2, idx::Integer)
    state = start(g)
    i = 0
    while !done(g, state)
        i += 1
        i == idx && return state
        grapheme, state = next(g, state)
    end
    throw(BoundsError(g, idx))
end
|
||||
# `endof` of a grapheme iterator is that of the wrapped string.
function Base.endof(g::GraphemeIterator2)
    return endof(g.s)
end

# Input accepted by the distance functions: a plain string or a grapheme iterator.
typealias GraphemeOrString Union{GraphemeIterator2, AbstractString}

# Substrings of a grapheme iterator are taken from the wrapped string.
function Base.SubString(x::GraphemeIterator2, i, j)
    return SubString(x.s, i, j)
end
|
||||
##############################################################################
|
||||
##
|
||||
## include
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
include("distances/edit.jl")
|
||||
include("distances/qgram.jl")
|
||||
|
@ -44,7 +108,8 @@ include("modifiers/fuzzywuzzy.jl")
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
function evaluate(dist::PreMetric, s1::AbstractString, s2::AbstractString)
|
||||
|
||||
function evaluate(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
if len1 > len2
|
||||
return evaluate(dist, s2, s1, len2, len1)
|
||||
|
@ -59,7 +124,7 @@ end
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
|
||||
function compare(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
if len1 > len2
|
||||
return compare(dist, s2, s1, len2, len1)
|
||||
|
@ -68,13 +133,13 @@ function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
|
|||
end
|
||||
end
|
||||
|
||||
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
|
||||
function compare(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString,
|
||||
len1::Integer, len2::Integer)
|
||||
1.0 - evaluate(dist, s1, s2, len1, len2)
|
||||
end
|
||||
|
||||
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
|
||||
s1::AbstractString, s2::AbstractString,
|
||||
s1::GraphemeOrString, s2::GraphemeOrString,
|
||||
len1::Integer, len2::Integer)
|
||||
distance = evaluate(dist, s1, s2, len1, len2)
|
||||
len2 == 0 ? 1.0 : 1.0 - distance / len2
|
||||
|
@ -83,14 +148,14 @@ end
|
|||
# compare always returns a value between 0 and 1.
|
||||
# When string length < q for qgram distance, returns s1 == s2
|
||||
function compare(dist::AbstractQGram,
|
||||
s1::AbstractString, s2::AbstractString,
|
||||
s1::GraphemeOrString, s2::GraphemeOrString,
|
||||
len1::Integer, len2::Integer)
|
||||
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
||||
evaluate(dist, s1, s2, len1, len2)
|
||||
end
|
||||
|
||||
function compare(dist::QGram,
|
||||
s1::AbstractString, s2::AbstractString,
|
||||
s1::GraphemeOrString, s2::GraphemeOrString,
|
||||
len1::Integer, len2::Integer)
|
||||
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
||||
distance = evaluate(dist, s1, s2, len1, len2)
|
||||
|
@ -98,4 +163,7 @@ function compare(dist::QGram,
|
|||
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
end
|
|
@ -28,7 +28,7 @@ function longest_common_substring(s1::AbstractString, s2::AbstractString)
|
|||
return start1, start2, size
|
||||
end
|
||||
|
||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, start1::Integer, start2::Integer)
|
||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::GraphemeOrString, s2::GraphemeOrString, start1::Integer, start2::Integer)
|
||||
a = longest_common_substring(s1, s2)
|
||||
if a[3] > 0
|
||||
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
||||
|
@ -43,14 +43,14 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
|
|||
end
|
||||
end
|
||||
|
||||
function matching_blocks(s1::AbstractString, s2::AbstractString)
|
||||
function matching_blocks(s1::GraphemeOrString, s2::GraphemeOrString)
|
||||
x = Set{Tuple{Int, Int, Int}}()
|
||||
matching_blocks!(x, s1, s2, 1, 1)
|
||||
return x
|
||||
end
|
||||
|
||||
type RatcliffObershelp <: PreMetric end
|
||||
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::RatcliffObershelp, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
|
||||
result = matching_blocks(s1, s2)
|
||||
matched = 0
|
||||
for x in result
|
||||
|
@ -58,3 +58,16 @@ function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractStrin
|
|||
end
|
||||
1.0 - 2 * matched / (len1 + len2)
|
||||
end
|
||||
|
||||
#function buildref(s::GraphemeOrString, len)
|
||||
# ref = Array(Int, len)
|
||||
# state = start(s)
|
||||
# i = 0
|
||||
# while !done(s, state)
|
||||
# i += 1
|
||||
# ref[i] = state
|
||||
# ch, state = next(s, state)
|
||||
# end
|
||||
# return ref
|
||||
#end
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
## Assumes length(s1) <= length(s2)
|
||||
##############################################################################
|
||||
|
||||
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
|
||||
function common_prefix(s1::GraphemeOrString, s2::GraphemeOrString, lim::Integer = -1)
|
||||
start1 = start(s1)
|
||||
start2 = start(s2)
|
||||
l = 0
|
||||
|
@ -24,7 +24,7 @@ end
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString, len1::Integer, len2:: Integer)
|
||||
function evaluate(dist::Hamming, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2:: Integer)
|
||||
count = 0
|
||||
for (ch1, ch2) in zip(s1, s2)
|
||||
count += ch1 != ch2
|
||||
|
@ -42,7 +42,7 @@ end
|
|||
|
||||
|
||||
type Levenshtein <: SemiMetric end
|
||||
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::Levenshtein, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
|
||||
|
||||
# prefix common to both strings can be ignored
|
||||
k, start1, start2 = common_prefix(s1, s2)
|
||||
|
@ -90,7 +90,7 @@ end
|
|||
|
||||
type DamerauLevenshtein <: SemiMetric end
|
||||
|
||||
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::DamerauLevenshtein, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
|
||||
|
||||
# prefix common to both strings can be ignored
|
||||
k, start1, start2 = common_prefix(s1, s2)
|
||||
|
@ -158,7 +158,7 @@ end
|
|||
|
||||
type Jaro <: SemiMetric end
|
||||
|
||||
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::Jaro, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
|
||||
# If len2 == 0 then m = 0, so the distance should be 1.0 according to Wikipedia; we return 0.0 instead.
|
||||
len2 == 0 && return 0.0
|
||||
|
||||
|
@ -199,4 +199,4 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Inte
|
|||
return 1.0 - score
|
||||
end
|
||||
|
||||
jaro(s1::AbstractString, s2::AbstractString) = evaluate(Jaro(), s1, s2)
|
||||
jaro(s1::GraphemeOrString, s2::GraphemeOrString) = evaluate(Jaro(), s1, s2)
|
||||
|
|
|
@ -4,14 +4,14 @@
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
type QGramIterator{S <: AbstractString, T <: Integer}
|
||||
s::S # string
|
||||
immutable QGramIterator{S, T <: Integer}
|
||||
s::S # grapheorstring
|
||||
l::Int # length of string
|
||||
q::T # length of q-grams
|
||||
end
|
||||
|
||||
function Base.start(qgram::QGramIterator)
|
||||
len = length(qgram.s)
|
||||
(1, len < qgram.q ? endof(qgram.s) + 1 : chr2ind(qgram.s, qgram.q))
|
||||
(1, qgram.l < qgram.q ? endof(qgram.s) + 1 : chr2ind(qgram.s, qgram.q))
|
||||
end
|
||||
function Base.next(qgram::QGramIterator, state)
|
||||
istart, iend = state
|
||||
|
@ -23,7 +23,8 @@ function Base.done(qgram::QGramIterator, state)
|
|||
istart, idend = state
|
||||
done(qgram.s, idend)
|
||||
end
|
||||
Base.eltype(qgram::QGramIterator) = SubString{typeof(qgram.s)}
|
||||
Base.eltype{S <: AbstractString, T}(qgram::QGramIterator{S, T}) = SubString{S}
|
||||
Base.eltype{S, T}(qgram::QGramIterator{GraphemeIterator2{S}, T}) = SubString{S}
|
||||
# Number of q-grams produced: `l - q + 1`, clamped at zero when `l < q - 1`.
function Base.length(qgram::QGramIterator)
    return max(qgram.l - qgram.q + 1, 0)
end
|
||||
function Base.collect(qgram::QGramIterator)
|
||||
x = Array(eltype(qgram), length(qgram))
|
||||
|
@ -80,7 +81,7 @@ end
|
|||
##############################################################################
|
||||
abstract AbstractQGram <: SemiMetric
|
||||
|
||||
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::AbstractQGram, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
|
||||
sort1 = sort(QGramIterator(s1, len1, dist.q))
|
||||
sort2 = sort(QGramIterator(s2, len2, dist.q))
|
||||
evaluate(dist, CountInterator(sort1, sort2))
|
||||
|
|
|
@ -44,7 +44,6 @@ using StringDistances, Base.Test
|
|||
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
|
||||
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
|
||||
|
||||
Set([(1,1,3) (4,5,1) (6,6,1)])
|
||||
@test matching_blocks("dwayne", "duane") ==
|
||||
Set([(5,4,2) (1,1,1) (3,3,1)])
|
||||
@test matching_blocks("dixon", "dicksonx") ==
|
||||
|
@ -79,9 +78,7 @@ strings = [
|
|||
("leia", "leela"),
|
||||
]
|
||||
|
||||
|
||||
# Test with R package StringDist
|
||||
for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
|
||||
solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
|
||||
(DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
|
||||
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.24722222 0.16190476 0.48809524 0.49166667 0.07407407 0.16666667 0.21666667]),
|
||||
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
|
||||
|
@ -89,6 +86,8 @@ for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
|
|||
(Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]),
|
||||
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
|
||||
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
|
||||
# Test with R package StringDist
|
||||
for x in solutions
|
||||
t, solution = x
|
||||
for i in 1:length(solution)
|
||||
@test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4
|
||||
|
@ -124,4 +123,30 @@ stringdist(strings[1,], strings[2,], method = "qgram", q = 1)
|
|||
=#
|
||||
|
||||
|
||||
# grapheme
|
||||
strings = [
|
||||
(graphemes2("martha"), graphemes2("marhta")),
|
||||
(graphemes2("dwayne"), graphemes2("duane") ),
|
||||
(graphemes2("dixon"), graphemes2("dicksonx")),
|
||||
(graphemes2("william"), graphemes2("williams")),
|
||||
(graphemes2(""), graphemes2("foo")),
|
||||
(graphemes2("a"), graphemes2("a")),
|
||||
(graphemes2("abc"), graphemes2("xyz")),
|
||||
(graphemes2("abc"), graphemes2("ccc")),
|
||||
(graphemes2("kitten"), graphemes2("sitting")),
|
||||
(graphemes2("saturday"), graphemes2("sunday")),
|
||||
(graphemes2("hi, my name is"), graphemes2("my name is")),
|
||||
(graphemes2("alborgów"), graphemes2("amoniak")),
|
||||
(graphemes2("cape sand recycling "), graphemes2("edith ann graham")),
|
||||
(graphemes2( "jellyifhs"), graphemes2("jellyfish")),
|
||||
(graphemes2("ifhs"), graphemes2("fish")),
|
||||
(graphemes2("leia"), graphemes2("leela")),
|
||||
]
|
||||
|
||||
|
||||
# Grapheme-wrapped inputs are checked against the same reference values in
# `solutions` that are used for the plain-string pairs.
for (t, solution) in solutions
    for i in 1:length(solution)
        @test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4
    end
end
|
Loading…
Reference in New Issue