diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 6cb8481..a7d337a 100644 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -7,7 +7,6 @@ module StringDistances ## Export ## ############################################################################## - import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric import Iterators: chain export @@ -29,7 +28,72 @@ Winkler, Partial, TokenSort, TokenSet, -TokenMax +TokenMax, +graphemes2 + +############################################################################## +## +## Iterator +## +############################################################################## + +isgraphemebreak(c1::Char, c2::Char) = + ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2) + +immutable GraphemeIterator2{S<:AbstractString} + s::S # original string (for generation of SubStrings) +end +graphemes2(s::AbstractString) = GraphemeIterator2{typeof(s)}(s) + +Base.eltype{S}(::Type{GraphemeIterator2{S}}) = SubString{S} + +function Base.length(g::GraphemeIterator2) + c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this) + n = 0 + for c in g.s + n += isgraphemebreak(c0, c) + c0 = c + end + return n +end + +Base.start(g::GraphemeIterator2) = start(g.s) +Base.done(g::GraphemeIterator2, i) = done(g.s, i) + +function Base.next(g::GraphemeIterator2, i) + s = g.s + j = i + c0, k = next(s, i) + while !done(s, k) # loop until next grapheme is s[i:j] + c, ℓ = next(s, k) + isgraphemebreak(c0, c) && break + j = k + k = ℓ + c0 = c + end + return (SubString(s, i, j), k) +end + +# functions not defined in base +Base.nextind(g::GraphemeIterator2, state::Integer) = next(g, state)[2] +function Base.chr2ind(g::GraphemeIterator2, idx::Integer) + state = start(g) + i = 0 + while !done(g, state) + i += 1 + i == idx && return state + ch, state = next(g, state) + end +end +Base.endof(g::GraphemeIterator2) = endof(g.s) + +typealias GraphemeOrString Union{GraphemeIterator2, AbstractString} +Base.SubString(x::GraphemeIterator2, i, j) = SubString(x.s, i, j) +############################################################################## +## +## include +## +############################################################################## include("distances/edit.jl") include("distances/qgram.jl") @@ -44,7 +108,8 @@ include("modifiers/fuzzywuzzy.jl") ## ############################################################################## -function evaluate(dist::PreMetric, s1::AbstractString, s2::AbstractString) + +function evaluate(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString) len1, len2 = length(s1), length(s2) if len1 > len2 return evaluate(dist, s2, s1, len2, len1) @@ -59,7 +124,7 @@ end ## ############################################################################## -function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) +function compare(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString) len1, len2 = length(s1), length(s2) if len1 > len2 return compare(dist, s2, s1, len2, len1) @@ -68,13 +133,13 @@ function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) end end -function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString, +function compare(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) 1.0 - evaluate(dist, s1, s2, len1, len2) end function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein}, - s1::AbstractString, s2::AbstractString, + s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) distance = evaluate(dist, s1, s2, len1, len2) len2 == 0 ? 1.0 : 1.0 - distance / len2 @@ -83,14 +148,14 @@ end # compare always return a value between 0 and 1. # When string length < q for qgram distance, returns s1 == s2 function compare(dist::AbstractQGram, - s1::AbstractString, s2::AbstractString, + s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) len1 <= (dist.q - 1) && return convert(Float64, s1 == s2) evaluate(dist, s1, s2, len1, len2) end function compare(dist::QGram, - s1::AbstractString, s2::AbstractString, + s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) len1 <= (dist.q - 1) && return convert(Float64, s1 == s2) distance = evaluate(dist, s1, s2, len1, len2) @@ -98,4 +163,7 @@ function compare(dist::QGram, end + + + end \ No newline at end of file diff --git a/src/distances/RatcliffObershelp.jl b/src/distances/RatcliffObershelp.jl index ed8644d..79169d2 100644 --- a/src/distances/RatcliffObershelp.jl +++ b/src/distances/RatcliffObershelp.jl @@ -28,7 +28,7 @@ function longest_common_substring(s1::AbstractString, s2::AbstractString) return start1, start2, size end -function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, start1::Integer, start2::Integer) +function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::GraphemeOrString, s2::GraphemeOrString, start1::Integer, start2::Integer) a = longest_common_substring(s1, s2) if a[3] > 0 push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3])) @@ -43,14 +43,14 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2:: end end -function matching_blocks(s1::AbstractString, s2::AbstractString) +function matching_blocks(s1::GraphemeOrString, s2::GraphemeOrString) x = Set{Tuple{Int, Int, Int}}() matching_blocks!(x, s1, s2, 1, 1) return x end type RatcliffObershelp <: PreMetric end -function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function evaluate(dist::RatcliffObershelp, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) result = matching_blocks(s1, s2) matched = 0 for x in result @@ -58,3 +58,16 @@ function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractStrin end 1.0 - 2 * matched / (len1 + len2) end + +#function buildref(s::GraphemeOrString, len) +# ref = Array(Int, len) +# state = start(s) +# i = 0 +# while !done(s, state) +# i += 1 +# ref[i] = state +# ch, state = next(s, state) +# end +# return ref +#end + diff --git a/src/distances/edit.jl b/src/distances/edit.jl index d30a3e9..3695b33 100644 --- a/src/distances/edit.jl +++ b/src/distances/edit.jl @@ -4,7 +4,7 @@ ## Assumes length(s1) <= length(s2) ############################################################################## -function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1) +function common_prefix(s1::GraphemeOrString, s2::GraphemeOrString, lim::Integer = -1) start1 = start(s1) start2 = start(s2) l = 0 @@ -24,7 +24,7 @@ end ## ############################################################################## -function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString, len1::Integer, len2:: Integer) +function evaluate(dist::Hamming, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2:: Integer) count = 0 for (ch1, ch2) in zip(s1, s2) count += ch1 != ch2 @@ -42,7 +42,7 @@ end type Levenshtein <: SemiMetric end -function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function evaluate(dist::Levenshtein, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) # prefix common to both strings can be ignored k, start1, start2 = common_prefix(s1, s2) @@ -90,7 +90,7 @@ end type DamerauLevenshtein <: SemiMetric end -function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function evaluate(dist::DamerauLevenshtein, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) # prefix common to both strings can be ignored k, start1, start2 = common_prefix(s1, s2) @@ -158,7 +158,7 @@ end type Jaro <: SemiMetric end -function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function evaluate(dist::Jaro, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) # if len2 == 0, m = 0 so should be 1.0 according to wikipedia. Nope. len2 == 0 && return 0.0 @@ -199,4 +199,4 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Inte return 1.0 - score end -jaro(s1::AbstractString, s2::AbstractString) = evaluate(Jaro(), s1, s2) +jaro(s1::GraphemeOrString, s2::GraphemeOrString) = evaluate(Jaro(), s1, s2) diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index f8fb1dc..fa168d7 100644 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -4,14 +4,14 @@ ## ############################################################################## -type QGramIterator{S <: AbstractString, T <: Integer} - s::S # string +immutable QGramIterator{S, T <: Integer} + s::S # grapheorstring l::Int # length of string q::T # length of q-grams end + function Base.start(qgram::QGramIterator) - len = length(qgram.s) - (1, len < qgram.q ? endof(qgram.s) + 1 : chr2ind(qgram.s, qgram.q)) + (1, qgram.l < qgram.q ? endof(qgram.s) + 1 : chr2ind(qgram.s, qgram.q)) end function Base.next(qgram::QGramIterator, state) istart, iend = state @@ -23,7 +23,8 @@ function Base.done(qgram::QGramIterator, state) istart, idend = state done(qgram.s, idend) end -Base.eltype(qgram::QGramIterator) = SubString{typeof(qgram.s)} +Base.eltype{S <: AbstractString, T}(qgram::QGramIterator{S, T}) = SubString{S} +Base.eltype{S, T}(qgram::QGramIterator{GraphemeIterator2{S}, T}) = SubString{S} Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0) function Base.collect(qgram::QGramIterator) x = Array(eltype(qgram), length(qgram)) @@ -80,7 +81,7 @@ end ############################################################################## abstract AbstractQGram <: SemiMetric -function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function evaluate(dist::AbstractQGram, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) sort1 = sort(QGramIterator(s1, len1, dist.q)) sort2 = sort(QGramIterator(s2, len2, dist.q)) evaluate(dist, CountInterator(sort1, sort2)) diff --git a/test/distances.jl b/test/distances.jl index 6b00048..9e28a28 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -44,7 +44,6 @@ using StringDistances, Base.Test @test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4 @test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4 -Set([(1,1,3) (4,5,1) (6,6,1)]) @test matching_blocks("dwayne", "duane") == Set([(5,4,2) (1,1,1) (3,3,1)]) @test matching_blocks("dixon", "dicksonx") == @@ -79,9 +78,7 @@ strings = [ ("leia", "leela"), ] - -# Test with R package StringDist -for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]), +solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]), (DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]), (Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.24722222 0.16190476 0.48809524 0.49166667 0.07407407 0.16666667 0.21666667]), (QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]), @@ -89,6 +86,8 @@ for x in ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]), (Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]), (Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]), (Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249])) +# Test with R package StringDist +for x in solutions t, solution = x for i in 1:length(solution) @test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4 @@ -124,4 +123,30 @@ stringdist(strings[1,], strings[2,], method = "qgram", q = 1) =# +# grapheme +strings = [ +(graphemes2("martha"), graphemes2("marhta")), +(graphemes2("dwayne"), graphemes2("duane") ), +(graphemes2("dixon"), graphemes2("dicksonx")), +(graphemes2("william"), graphemes2("williams")), +(graphemes2(""), graphemes2("foo")), +(graphemes2("a"), graphemes2("a")), +(graphemes2("abc"), graphemes2("xyz")), +(graphemes2("abc"), graphemes2("ccc")), +(graphemes2("kitten"), graphemes2("sitting")), +(graphemes2("saturday"), graphemes2("sunday")), +(graphemes2("hi, my name is"), graphemes2("my name is")), +(graphemes2("alborgów"), graphemes2("amoniak")), +(graphemes2("cape sand recycling "), graphemes2("edith ann graham")), +(graphemes2( "jellyifhs"), graphemes2("jellyfish")), +(graphemes2("ifhs"), graphemes2("fish")), +(graphemes2("leia"), graphemes2("leela")), +] + +for x in solutions + t, solution = x + for i in 1:length(solution) + @test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4 + end +end \ No newline at end of file