try 2 iterator types

pull/3/head
matthieugomez 2015-11-09 18:29:08 -05:00
parent 9e066e3c5a
commit d8ac44be23
8 changed files with 80 additions and 56 deletions

0
documentation.txt Normal file
View File

View File

@ -32,10 +32,22 @@ TokenSet,
TokenMax
##############################################################################
##
## Extend methods of GraphemeIterator
## Define 2 iterators: one over characters, one over graphemes.
##
##############################################################################
typealias GraphemeOrString Union{GraphemeIterator, AbstractString}
import Base: ==

# Wrapper around a string `s`. Iteration, length and index helpers are all
# forwarded to the wrapped string, so iterating the wrapper yields whatever
# iterating `s` yields.
immutable CharacterIterator{T <: AbstractString}
    s::T
end

# Iteration protocol and index helpers: delegate to the wrapped string.
Base.start(x::CharacterIterator) = start(x.s)
Base.next(x::CharacterIterator, i::Integer) = next(x.s, i)
Base.done(x::CharacterIterator, i::Integer) = done(x.s, i)
Base.length(x::CharacterIterator) = length(x.s)
Base.nextind(x::CharacterIterator, i::Integer) = nextind(x.s, i)
# `i` is already an Integer by dispatch, so no assertion is needed at the call.
Base.chr2ind(x::CharacterIterator, i::Integer) = chr2ind(x.s, i)

# Equality and hashing follow the wrapped string. Without these methods `==`
# uses the generic `===` fallback, which can report two wrappers around equal
# (but separately allocated) strings as different.
==(x::CharacterIterator, y::CharacterIterator) = x.s == y.s
Base.hash(x::CharacterIterator, h::UInt) = hash(x.s, h)

# Map any concrete CharacterIterator type back to the bare wrapper constructor.
iteratortype{T <: CharacterIterator}(::Type{T}) = CharacterIterator
# Converting a string of type T to CharacterIterator{T} simply wraps it.
Base.convert{T}(::Type{CharacterIterator{T}}, x::T) = CharacterIterator(x)
# Add the following methods for GraphemeIterator.
# `next(g, state)` is indexed with [2] here: under the start/next/done protocol
# the second entry is the state that follows `state`, and that is what
# nextind reports.
Base.nextind(g::GraphemeIterator, state::Integer) = next(g, state)[2]
@ -52,8 +64,22 @@ function Base.chr2ind(s::GraphemeIterator, i::Integer)
k = l
end
end
Base.endof(g::GraphemeIterator) = endof(g.s)
Base.SubString(x::GraphemeIterator, i, j) = graphemes(SubString(x.s, i, j))
# Hooks specific to GraphemeIterator.
# Any concrete GraphemeIterator type maps back to the bare constructor.
iteratortype{G <: GraphemeIterator}(::Type{G}) = GraphemeIterator
# Converting a string of type S to GraphemeIterator{S} simply wraps it.
Base.convert{S}(::Type{GraphemeIterator{S}}, str::S) = GraphemeIterator(str)

# Either kind of iterator wrapped around a string of type T.
typealias StringIterator{T} Union{GraphemeIterator{T}, CharacterIterator{T}}

# Resolve the wrapper constructor for an instance through its type.
iteratortype(it::StringIterator) = iteratortype(typeof(it))

# Operations answered by the wrapped string `it.s`.
Base.endof(it::StringIterator) = endof(it.s)
Base.string(it::StringIterator) = it.s
Base.isempty(it::StringIterator) = isempty(it.s)
Base.isless(a::StringIterator, b::StringIterator) = isless(a.s, b.s)

# Note: eltype reports the type of the wrapped string, not of the items
# produced by iteration.
Base.eltype{S}(it::StringIterator{S}) = S

# A substring of an iterator is re-wrapped in the same kind of iterator.
Base.SubString(it::StringIterator, i, j) = iteratortype(it)(SubString(it.s, i, j))

# Search helpers: forwarded to the wrapped string with the extra arguments.
Base.search(it::StringIterator, args...) = search(it.s, args...)
Base.searchsortedfirst(it::StringIterator, args...) = searchsortedfirst(it.s, args...)
Base.searchsortedlast(it::StringIterator, args...) = searchsortedlast(it.s, args...)
Base.searchsorted(it::StringIterator, args...) = searchsorted(it.s, args...)
##############################################################################
##
@ -72,13 +98,24 @@ include("modifiers/fuzzywuzzy.jl")
## Higher level functions
##
##############################################################################
for x in (:evaluate, :compare)
@eval begin
function $x(dist::PreMetric, s1::AbstractString, s2::AbstractString)
$x(dist, CharacterIterator(s1), CharacterIterator(s2))
end
function evaluate(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString)
len1, len2 = length(s1), length(s2)
if len1 > len2
return evaluate(dist, s2, s1, len2, len1)
else
return evaluate(dist, s1, s2, len1, len2)
function $x(dist::PreMetric, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
$x(dist, CharacterIterator(s1), CharacterIterator(s2), len1, len2)
end
function $x(dist::PreMetric, s1::StringIterator, s2::StringIterator)
len1, len2 = length(s1), length(s2)
if len1 > len2
return $x(dist, s2, s1, len2, len1)
else
return $x(dist, s1, s2, len1, len2)
end
end
end
end
@ -88,22 +125,13 @@ end
##
##############################################################################
function compare(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString)
len1, len2 = length(s1), length(s2)
if len1 > len2
return compare(dist, s2, s1, len2, len1)
else
return compare(dist, s1, s2, len1, len2)
end
end
function compare(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString,
function compare(dist::PreMetric, s1::StringIterator, s2::StringIterator,
                 len1::Integer, len2::Integer)
    # Generic fallback: the similarity score is one minus the distance
    # returned by `evaluate` for the same arguments.
    distance = evaluate(dist, s1, s2, len1, len2)
    return 1.0 - distance
end
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
s1::GraphemeOrString, s2::GraphemeOrString,
s1::StringIterator, s2::StringIterator,
len1::Integer, len2::Integer)
distance = evaluate(dist, s1, s2, len1, len2)
len2 == 0 ? 1.0 : 1.0 - distance / len2
@ -112,14 +140,14 @@ end
# compare always returns a value between 0 and 1.
# When the string length is < q for a qgram distance, it returns s1 == s2 (as a Float64)
function compare(dist::AbstractQGram,
s1::GraphemeOrString, s2::GraphemeOrString,
s1::StringIterator, s2::StringIterator,
len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
evaluate(dist, s1, s2, len1, len2)
end
function compare(dist::QGram,
s1::GraphemeOrString, s2::GraphemeOrString,
s1::StringIterator, s2::StringIterator,
len1::Integer, len2::Integer)
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
distance = evaluate(dist, s1, s2, len1, len2)

View File

@ -1,6 +1,6 @@
# Return start of common substring in s1, start of common substring in s2, and length of substring
# Indexes refer to character numbers, not string indices (the two differ for Unicode strings)
function longest_common_substring(s1::AbstractString, s2::AbstractString)
function longest_common_substring(s1::StringIterator, s2::StringIterator)
if length(s1) > length(s2)
start2, start1, size= longest_common_substring(s2, s1)
else
@ -28,7 +28,7 @@ function longest_common_substring(s1::AbstractString, s2::AbstractString)
return start1, start2, size
end
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::GraphemeOrString, s2::GraphemeOrString, start1::Integer, start2::Integer)
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::StringIterator, s2::StringIterator, start1::Integer, start2::Integer)
a = longest_common_substring(s1, s2)
if a[3] > 0
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
@ -43,14 +43,14 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::GraphemeOrString, s2
end
end
function matching_blocks(s1::GraphemeOrString, s2::GraphemeOrString)
function matching_blocks(s1::StringIterator, s2::StringIterator)
    # Collect (start in s1, start in s2, length) triples into a fresh set;
    # matching_blocks! fills it, with both start offsets set to 1.
    blocks = Set{Tuple{Int, Int, Int}}()
    matching_blocks!(blocks, s1, s2, 1, 1)
    blocks
end
type RatcliffObershelp <: PreMetric end
function evaluate(dist::RatcliffObershelp, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
function evaluate(dist::RatcliffObershelp, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
result = matching_blocks(s1, s2)
matched = 0
for x in result
@ -59,7 +59,7 @@ function evaluate(dist::RatcliffObershelp, s1::GraphemeOrString, s2::GraphemeOrS
1.0 - 2 * matched / (len1 + len2)
end
#function buildref(s::GraphemeOrString, len)
#function buildref(s::StringIterator, len)
# ref = Array(Int, len)
# state = start(s)
# i = 0

View File

@ -4,7 +4,7 @@
## Assumes length(s1) <= length(s2)
##############################################################################
function common_prefix(s1::GraphemeOrString, s2::GraphemeOrString, lim::Integer = -1)
function common_prefix(s1::StringIterator, s2::StringIterator, lim::Integer = -1)
start1 = start(s1)
start2 = start(s2)
l = 0
@ -24,7 +24,7 @@ end
##
##############################################################################
function evaluate(dist::Hamming, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2:: Integer)
function evaluate(dist::Hamming, s1::StringIterator, s2::StringIterator, len1::Integer, len2:: Integer)
count = 0
for (ch1, ch2) in zip(s1, s2)
count += ch1 != ch2
@ -42,7 +42,7 @@ end
type Levenshtein <: SemiMetric end
function evaluate(dist::Levenshtein, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
function evaluate(dist::Levenshtein, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
@ -90,7 +90,7 @@ end
type DamerauLevenshtein <: SemiMetric end
function evaluate(dist::DamerauLevenshtein, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
function evaluate(dist::DamerauLevenshtein, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
@ -158,7 +158,7 @@ end
type Jaro <: SemiMetric end
function evaluate(dist::Jaro, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
function evaluate(dist::Jaro, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
# if len2 == 0, m = 0 so should be 1.0 according to wikipedia. Nope.
len2 == 0 && return 0.0
@ -199,4 +199,4 @@ function evaluate(dist::Jaro, s1::GraphemeOrString, s2::GraphemeOrString, len1::
return 1.0 - score
end
jaro(s1::GraphemeOrString, s2::GraphemeOrString) = evaluate(Jaro(), s1, s2)
function jaro(s1::StringIterator, s2::StringIterator)
    # Shorthand for evaluating the Jaro distance on two iterators.
    return evaluate(Jaro(), s1, s2)
end

View File

@ -4,7 +4,7 @@
##
##############################################################################
immutable QGramIterator{S, T <: Integer}
immutable QGramIterator{S <: StringIterator, T <: Integer}
s::S # grapheorstring
l::Int # length of string
q::T # length of q-grams
@ -23,8 +23,7 @@ function Base.done(qgram::QGramIterator, state)
istart, idend = state
done(qgram.s, idend)
end
Base.eltype{S <: AbstractString, T}(qgram::QGramIterator{S, T}) = SubString{S}
Base.eltype{S, T}(qgram::QGramIterator{GraphemeIterator{S}, T}) = SubString{S}
Base.eltype(qgram::QGramIterator) = iteratortype(qgram.s){SubString{eltype(qgram.s)}}
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
function Base.collect(qgram::QGramIterator)
x = Array(eltype(qgram), length(qgram))
@ -81,7 +80,7 @@ end
##############################################################################
abstract AbstractQGram <: SemiMetric
function evaluate(dist::AbstractQGram, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
function evaluate(dist::AbstractQGram, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
sort1 = sort(QGramIterator(s1, len1, dist.q))
sort2 = sort(QGramIterator(s2, len2, dist.q))
evaluate(dist, CountInterator(sort1, sort2))

View File

@ -9,7 +9,7 @@ type Partial{T <: PreMetric} <: PreMetric
end
# general
function compare(dist::Partial, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function compare(dist::Partial, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
len1 == len2 && return compare(dist.dist, s1, s2, len1, len2)
len1 == 0 && return compare(dist.dist, "", "", 0, 0)
iter = QGramIterator(s2, len2, len1)
@ -26,7 +26,7 @@ end
# Specialization for RatcliffObershelp distance
# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function compare(dist::Partial{RatcliffObershelp}, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
len1 == len2 && return compare(dist.dist, s1, s2, len1, len2)
out = 0.0
result = matching_blocks(s1, s2)
@ -59,13 +59,13 @@ type TokenSort{T <: PreMetric} <: PreMetric
dist::T
end
function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString,
function compare(dist::TokenSort, s1::StringIterator, s2::StringIterator,
len1::Integer, len2::Integer)
if search(s1, Base._default_delims) > 0
s1 = join(sort!(split(s1)), " ")
if search(s1.s, Base._default_delims) > 0
s1 = iteratortype(s1)(join(sort!(split(s1.s)), " "))
end
if search(s2, Base._default_delims) > 0
s2 = join(sort!(split(s2)), " ")
if search(s2.s, Base._default_delims) > 0
s2 = iteratortype(s2)(join(sort!(split(s2.s)), " "))
end
compare(dist.dist, s1, s2)
end
@ -80,12 +80,12 @@ type TokenSet{T <: PreMetric} <: PreMetric
dist::T
end
function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString,
function compare(dist::TokenSet, s1::StringIterator, s2::StringIterator,
len1::Integer, len2::Integer)
v0, v1, v2 = _separate!(split(s1), split(s2))
s0 = join(v0, " ")
s1 = join(chain(v0, v1), " ")
s2 = join(chain(v0, v2), " ")
v0, v1, v2 = _separate!(split(s1.s), split(s2.s))
s0 = iteratortype(s1)(join(v0, " "))
s1 = iteratortype(s1)(join(chain(v0, v1), " "))
s2 = iteratortype(s1)(join(chain(v0, v2), " "))
if isempty(s0)
# otherwise compare(dist, "", "a")== 1.0
compare(dist.dist, s1, s2)
@ -128,7 +128,7 @@ type TokenMax{T <: PreMetric} <: PreMetric
dist::T
end
function compare(dist::TokenMax, s1::AbstractString, s2::AbstractString,
function compare(dist::TokenMax, s1::StringIterator, s2::StringIterator,
len1::Integer, len2::Integer)
base = compare(dist.dist, s1, s2, len1, len2)
unbase_scale = 0.95

View File

@ -13,7 +13,7 @@ end
# restrict to distance between 0 and 1
Winkler(x) = Winkler(x, 0.1, 0.7)
function compare(dist::Winkler, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function compare(dist::Winkler, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
score = compare(dist.dist, s1, s2, len1, len2)
l = common_prefix(s1, s2, 4)[1]
# common prefix adjustment

View File

@ -44,10 +44,7 @@ using StringDistances, Base.Test
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
@test matching_blocks("dwayne", "duane") ==
Set([(5,4,2) (1,1,1) (3,3,1)])
@test matching_blocks("dixon", "dicksonx") ==
Set([(1,1,2) (4,6,2)])
@test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154
@test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579
@test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666