diff --git a/documentation.txt b/documentation.txt new file mode 100644 index 0000000..e69de29 diff --git a/src/StringDistances.jl b/src/StringDistances.jl index 2707cf5..50b3cc0 100644 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -32,10 +32,22 @@ TokenSet, TokenMax ############################################################################## ## -## Extend methods of GraphemeIterator +## Define 2 iterators. One on character one on Grapheme. ## ############################################################################## -typealias GraphemeOrString Union{GraphemeIterator, AbstractString} + +immutable CharacterIterator{T <: AbstractString} + s::T +end +Base.start(x::CharacterIterator) = start(x.s) +Base.next(x::CharacterIterator, i::Integer) = next(x.s, i) +Base.done(x::CharacterIterator, i::Integer) = done(x.s, i) +Base.length(x::CharacterIterator) = length(x.s) + +Base.nextind(x::CharacterIterator, i::Integer) = nextind(x.s, i) +Base.chr2ind(x::CharacterIterator, i::Integer) = chr2ind(x.s, i::Integer) +iteratortype{T <: CharacterIterator}(::Type{T}) = CharacterIterator +Base.convert{T}(::Type{CharacterIterator{T}}, x::T) = CharacterIterator(x) # add the following methods Base.nextind(g::GraphemeIterator, state::Integer) = next(g, state)[2] @@ -52,8 +64,22 @@ function Base.chr2ind(s::GraphemeIterator, i::Integer) k = l end end -Base.endof(g::GraphemeIterator) = endof(g.s) -Base.SubString(x::GraphemeIterator, i, j) = graphemes(SubString(x.s, i, j)) +iteratortype{T <: GraphemeIterator}(::Type{T}) = GraphemeIterator +Base.convert{T}(::Type{GraphemeIterator{T}}, x::T) = GraphemeIterator(x) + + +typealias StringIterator{T} Union{GraphemeIterator{T}, CharacterIterator{T}} +Base.endof(g::StringIterator) = endof(g.s) +Base.SubString(x::StringIterator, i, j) = iteratortype(x)(SubString(x.s, i, j)) +Base.string(x::StringIterator) = x.s +Base.isempty(x::StringIterator) = isempty(x.s) +Base.eltype{T}(x::StringIterator{T}) = T +Base.isless(x::StringIterator, y::StringIterator) = isless(x.s, y.s) +Base.search(x::StringIterator, args...) = search(x.s, args...) +Base.searchsortedfirst(x::StringIterator, args...) = searchsortedfirst(x.s, args...) +Base.searchsortedlast(x::StringIterator, args...) = searchsortedlast(x.s, args...) +Base.searchsorted(x::StringIterator, args...) = searchsorted(x.s, args...) +iteratortype(x::StringIterator) = iteratortype(typeof(x)) ############################################################################## ## @@ -72,13 +98,24 @@ include("modifiers/fuzzywuzzy.jl") ## Higher level functions ## ############################################################################## +for x in (:evaluate, :compare) + @eval begin + function $x(dist::PreMetric, s1::AbstractString, s2::AbstractString) + $x(dist, CharacterIterator(s1), CharacterIterator(s2)) + end -function evaluate(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString) - len1, len2 = length(s1), length(s2) - if len1 > len2 - return evaluate(dist, s2, s1, len2, len1) - else - return evaluate(dist, s1, s2, len1, len2) + function $x(dist::PreMetric, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) + $x(dist, CharacterIterator(s1), CharacterIterator(s2), len1, len2) + end + + function $x(dist::PreMetric, s1::StringIterator, s2::StringIterator) + len1, len2 = length(s1), length(s2) + if len1 > len2 + return $x(dist, s2, s1, len2, len1) + else + return $x(dist, s1, s2, len1, len2) + end + end end end @@ -88,22 +125,13 @@ end ## ############################################################################## -function compare(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString) - len1, len2 = length(s1), length(s2) - if len1 > len2 - return compare(dist, s2, s1, len2, len1) - else - return compare(dist, s1, s2, len1, len2) - end -end - -function compare(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString, +function compare(dist::PreMetric, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) 1.0 - evaluate(dist, s1, s2, len1, len2) end function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein}, - s1::GraphemeOrString, s2::GraphemeOrString, + s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) distance = evaluate(dist, s1, s2, len1, len2) len2 == 0 ? 1.0 : 1.0 - distance / len2 @@ -112,14 +140,14 @@ end # compare always return a value between 0 and 1. # When string length < q for qgram distance, returns s1 == s2 function compare(dist::AbstractQGram, - s1::GraphemeOrString, s2::GraphemeOrString, + s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) len1 <= (dist.q - 1) && return convert(Float64, s1 == s2) evaluate(dist, s1, s2, len1, len2) end function compare(dist::QGram, - s1::GraphemeOrString, s2::GraphemeOrString, + s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) len1 <= (dist.q - 1) && return convert(Float64, s1 == s2) distance = evaluate(dist, s1, s2, len1, len2) diff --git a/src/distances/RatcliffObershelp.jl b/src/distances/RatcliffObershelp.jl index 79169d2..93d40c7 100644 --- a/src/distances/RatcliffObershelp.jl +++ b/src/distances/RatcliffObershelp.jl @@ -1,6 +1,6 @@ # Return start of commn substring in s1, start of common substring in s2, and length of substring # Indexes refer to character number, not index (differ for Unicode strings) -function longest_common_substring(s1::AbstractString, s2::AbstractString) +function longest_common_substring(s1::StringIterator, s2::StringIterator) if length(s1) > length(s2) start2, start1, size= longest_common_substring(s2, s1) else @@ -28,7 +28,7 @@ function longest_common_substring(s1::AbstractString, s2::AbstractString) return start1, start2, size end -function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::GraphemeOrString, s2::GraphemeOrString, start1::Integer, start2::Integer) +function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::StringIterator, s2::StringIterator, start1::Integer, start2::Integer) a = longest_common_substring(s1, s2) if a[3] > 0 push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3])) @@ -43,14 +43,14 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::GraphemeOrString, s2 end end -function matching_blocks(s1::GraphemeOrString, s2::GraphemeOrString) +function matching_blocks(s1::StringIterator, s2::StringIterator) x = Set{Tuple{Int, Int, Int}}() matching_blocks!(x, s1, s2, 1, 1) return x end type RatcliffObershelp <: PreMetric end -function evaluate(dist::RatcliffObershelp, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) +function evaluate(dist::RatcliffObershelp, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) result = matching_blocks(s1, s2) matched = 0 for x in result @@ -59,7 +59,7 @@ function evaluate(dist::RatcliffObershelp, s1::GraphemeOrString, s2::GraphemeOrS 1.0 - 2 * matched / (len1 + len2) end -#function buildref(s::GraphemeOrString, len) +#function buildref(s::StringIterator, len) # ref = Array(Int, len) # state = start(s) # i = 0 diff --git a/src/distances/edit.jl b/src/distances/edit.jl index 3695b33..ef1ea8c 100644 --- a/src/distances/edit.jl +++ b/src/distances/edit.jl @@ -4,7 +4,7 @@ ## Assumes length(s1) <= length(s2) ############################################################################## -function common_prefix(s1::GraphemeOrString, s2::GraphemeOrString, lim::Integer = -1) +function common_prefix(s1::StringIterator, s2::StringIterator, lim::Integer = -1) start1 = start(s1) start2 = start(s2) l = 0 @@ -24,7 +24,7 @@ end ## ############################################################################## -function evaluate(dist::Hamming, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2:: Integer) +function evaluate(dist::Hamming, s1::StringIterator, s2::StringIterator, len1::Integer, len2:: Integer) count = 0 for (ch1, ch2) in zip(s1, s2) count += ch1 != ch2 @@ -42,7 +42,7 @@ end type Levenshtein <: SemiMetric end -function evaluate(dist::Levenshtein, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) +function evaluate(dist::Levenshtein, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) # prefix common to both strings can be ignored k, start1, start2 = common_prefix(s1, s2) @@ -90,7 +90,7 @@ end type DamerauLevenshtein <: SemiMetric end -function evaluate(dist::DamerauLevenshtein, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) +function evaluate(dist::DamerauLevenshtein, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) # prefix common to both strings can be ignored k, start1, start2 = common_prefix(s1, s2) @@ -158,7 +158,7 @@ end type Jaro <: SemiMetric end -function evaluate(dist::Jaro, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) +function evaluate(dist::Jaro, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) # if len2 == 0, m = 0 so should be 1.0 according to wikipedia. Nope. len2 == 0 && return 0.0 @@ -199,4 +199,4 @@ function evaluate(dist::Jaro, s1::GraphemeOrString, s2::GraphemeOrString, len1:: return 1.0 - score end -jaro(s1::GraphemeOrString, s2::GraphemeOrString) = evaluate(Jaro(), s1, s2) +jaro(s1::StringIterator, s2::StringIterator) = evaluate(Jaro(), s1, s2) diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index 3cb8362..a432dba 100644 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -4,7 +4,7 @@ ## ############################################################################## -immutable QGramIterator{S, T <: Integer} +immutable QGramIterator{S <: StringIterator, T <: Integer} s::S # grapheorstring l::Int # length of string q::T # length of q-grams @@ -23,8 +23,7 @@ function Base.done(qgram::QGramIterator, state) istart, idend = state done(qgram.s, idend) end -Base.eltype{S <: AbstractString, T}(qgram::QGramIterator{S, T}) = SubString{S} -Base.eltype{S, T}(qgram::QGramIterator{GraphemeIterator{S}, T}) = SubString{S} +Base.eltype(qgram::QGramIterator) = iteratortype(qgram.s){SubString{eltype(qgram.s)}} Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0) function Base.collect(qgram::QGramIterator) x = Array(eltype(qgram), length(qgram)) @@ -81,7 +80,7 @@ end ############################################################################## abstract AbstractQGram <: SemiMetric -function evaluate(dist::AbstractQGram, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer) +function evaluate(dist::AbstractQGram, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) sort1 = sort(QGramIterator(s1, len1, dist.q)) sort2 = sort(QGramIterator(s2, len2, dist.q)) evaluate(dist, CountInterator(sort1, sort2)) diff --git a/src/modifiers/fuzzywuzzy.jl b/src/modifiers/fuzzywuzzy.jl index c012721..cdac061 100644 --- a/src/modifiers/fuzzywuzzy.jl +++ b/src/modifiers/fuzzywuzzy.jl @@ -9,7 +9,7 @@ type Partial{T <: PreMetric} <: PreMetric end # general -function compare(dist::Partial, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function compare(dist::Partial, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) len1 == len2 && return compare(dist.dist, s1, s2, len1, len2) len1 == 0 && return compare(dist.dist, "", "", 0, 0) iter = QGramIterator(s2, len2, len1) @@ -26,7 +26,7 @@ end # Specialization for RatcliffObershelp distance # Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py -function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function compare(dist::Partial{RatcliffObershelp}, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) len1 == len2 && return compare(dist.dist, s1, s2, len1, len2) out = 0.0 result = matching_blocks(s1, s2) @@ -59,13 +59,13 @@ type TokenSort{T <: PreMetric} <: PreMetric dist::T end -function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString, +function compare(dist::TokenSort, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) - if search(s1, Base._default_delims) > 0 - s1 = join(sort!(split(s1)), " ") + if search(s1.s, Base._default_delims) > 0 + s1 = iteratortype(s1)(join(sort!(split(s1.s)), " ")) end - if search(s2, Base._default_delims) > 0 - s2 = join(sort!(split(s2)), " ") + if search(s2.s, Base._default_delims) > 0 + s2 = iteratortype(s2)(join(sort!(split(s2.s)), " ")) end compare(dist.dist, s1, s2) end @@ -80,12 +80,12 @@ type TokenSet{T <: PreMetric} <: PreMetric dist::T end -function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString, +function compare(dist::TokenSet, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) - v0, v1, v2 = _separate!(split(s1), split(s2)) - s0 = join(v0, " ") - s1 = join(chain(v0, v1), " ") - s2 = join(chain(v0, v2), " ") + v0, v1, v2 = _separate!(split(s1.s), split(s2.s)) + s0 = iteratortype(s1)(join(v0, " ")) + s1 = iteratortype(s1)(join(chain(v0, v1), " ")) + s2 = iteratortype(s1)(join(chain(v0, v2), " ")) if isempty(s0) # otherwise compare(dist, "", "a")== 1.0 compare(dist.dist, s1, s2) @@ -128,7 +128,7 @@ type TokenMax{T <: PreMetric} <: PreMetric dist::T end -function compare(dist::TokenMax, s1::AbstractString, s2::AbstractString, +function compare(dist::TokenMax, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) base = compare(dist.dist, s1, s2, len1, len2) unbase_scale = 0.95 diff --git a/src/modifiers/winkler.jl b/src/modifiers/winkler.jl index 08025a0..99420bf 100644 --- a/src/modifiers/winkler.jl +++ b/src/modifiers/winkler.jl @@ -13,7 +13,7 @@ end # restrict to distance between 0 and 1 Winkler(x) = Winkler(x, 0.1, 0.7) -function compare(dist::Winkler, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer) +function compare(dist::Winkler, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer) score = compare(dist.dist, s1, s2, len1, len2) l = common_prefix(s1, s2, 4)[1] # common prefix adjustment diff --git a/test/distances.jl b/test/distances.jl index 24e704b..716bc05 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -44,10 +44,7 @@ using StringDistances, Base.Test @test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4 @test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4 -@test matching_blocks("dwayne", "duane") == -Set([(5,4,2) (1,1,1) (3,3,1)]) -@test matching_blocks("dixon", "dicksonx") == -Set([(1,1,2) (4,6,2)]) + @test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154 @test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579 @test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666