try 2 iterator types
parent
9e066e3c5a
commit
d8ac44be23
|
@ -32,10 +32,22 @@ TokenSet,
|
|||
TokenMax
|
||||
##############################################################################
|
||||
##
|
||||
## Extend methods of GraphemeIterator
|
||||
## Define 2 iterators: one over characters, one over graphemes.
|
||||
##
|
||||
##############################################################################
|
||||
typealias GraphemeOrString Union{GraphemeIterator, AbstractString}
|
||||
|
||||
immutable CharacterIterator{T <: AbstractString}
|
||||
s::T
|
||||
end
|
||||
# Initial iteration state: taken directly from the wrapped string.
function Base.start(x::CharacterIterator)
    return start(x.s)
end
|
||||
# Advance iteration: returns whatever `next` yields for the wrapped string at state `i`.
function Base.next(x::CharacterIterator, i::Integer)
    return next(x.s, i)
end
|
||||
# Iteration is finished exactly when it is finished for the wrapped string.
function Base.done(x::CharacterIterator, i::Integer)
    return done(x.s, i)
end
|
||||
# Length of the iterator is the length of the wrapped string.
function Base.length(x::CharacterIterator)
    return length(x.s)
end
|
||||
|
||||
# Index following `i`, computed on the wrapped string.
function Base.nextind(x::CharacterIterator, i::Integer)
    return nextind(x.s, i)
end
|
||||
# Delegates `chr2ind` to the wrapped string.
# `i` is already constrained to `Integer` by the method signature, so it is
# forwarded as-is rather than re-asserted at the call site.
Base.chr2ind(x::CharacterIterator, i::Integer) = chr2ind(x.s, i)
|
||||
# Maps any CharacterIterator type (whatever its string parameter) to the
# unparameterized `CharacterIterator`, which callers use as a constructor.
function iteratortype{T <: CharacterIterator}(::Type{T})
    return CharacterIterator
end
|
||||
# Converting a value of type T to CharacterIterator{T} simply wraps it.
function Base.convert{T}(::Type{CharacterIterator{T}}, x::T)
    return CharacterIterator(x)
end
|
||||
|
||||
# add the following methods
|
||||
# Index following `state`: the second component of what `next` returns,
# i.e. the state after consuming one item.
function Base.nextind(g::GraphemeIterator, state::Integer)
    following = next(g, state)[2]
    return following
end
|
||||
|
@ -52,8 +64,22 @@ function Base.chr2ind(s::GraphemeIterator, i::Integer)
|
|||
k = l
|
||||
end
|
||||
end
|
||||
Base.endof(g::GraphemeIterator) = endof(g.s)
|
||||
Base.SubString(x::GraphemeIterator, i, j) = graphemes(SubString(x.s, i, j))
|
||||
# Maps any GraphemeIterator type (whatever its parameter) to the
# unparameterized `GraphemeIterator`, which callers use as a constructor.
function iteratortype{T <: GraphemeIterator}(::Type{T})
    return GraphemeIterator
end
|
||||
# Converting a value of type T to GraphemeIterator{T} simply wraps it.
function Base.convert{T}(::Type{GraphemeIterator{T}}, x::T)
    return GraphemeIterator(x)
end
|
||||
|
||||
|
||||
# Either of the two wrappers (grapheme-based or character-based) with the same
# type parameter T. The methods defined on this alias all read the wrapper's `s` field.
typealias StringIterator{T} Union{GraphemeIterator{T}, CharacterIterator{T}}
|
||||
# Last index of the iterator is the last index of the wrapped string.
function Base.endof(g::StringIterator)
    return endof(g.s)
end
|
||||
# Takes the substring i..j of the wrapped string and re-wraps it in the same
# kind of iterator as `x`.
function Base.SubString(x::StringIterator, i, j)
    piece = SubString(x.s, i, j)
    return iteratortype(x)(piece)
end
|
||||
# Unwraps the iterator: returns the wrapped string itself (no copy is made here).
function Base.string(x::StringIterator)
    return x.s
end
|
||||
# An iterator is empty exactly when its wrapped string is empty.
function Base.isempty(x::StringIterator)
    return isempty(x.s)
end
|
||||
# Returns the wrapper's type parameter T (the type of the wrapped value),
# not the type of the items produced by iteration.
function Base.eltype{T}(x::StringIterator{T})
    return T
end
|
||||
# Ordering of two iterators is the ordering of their wrapped strings.
function Base.isless(x::StringIterator, y::StringIterator)
    return isless(x.s, y.s)
end
|
||||
# Forwards `search`, with all extra arguments unchanged, to the wrapped string.
function Base.search(x::StringIterator, args...)
    return search(x.s, args...)
end
|
||||
# Forwards `searchsortedfirst`, with all extra arguments unchanged, to the wrapped string.
function Base.searchsortedfirst(x::StringIterator, args...)
    return searchsortedfirst(x.s, args...)
end
|
||||
# Forwards `searchsortedlast`, with all extra arguments unchanged, to the wrapped string.
function Base.searchsortedlast(x::StringIterator, args...)
    return searchsortedlast(x.s, args...)
end
|
||||
# Forwards `searchsorted`, with all extra arguments unchanged, to the wrapped string.
function Base.searchsorted(x::StringIterator, args...)
    return searchsorted(x.s, args...)
end
|
||||
# Instance form: looks up the iterator kind from the value's type.
function iteratortype(x::StringIterator)
    return iteratortype(typeof(x))
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
|
@ -72,13 +98,24 @@ include("modifiers/fuzzywuzzy.jl")
|
|||
## Higher level functions
|
||||
##
|
||||
##############################################################################
|
||||
for x in (:evaluate, :compare)
|
||||
@eval begin
|
||||
function $x(dist::PreMetric, s1::AbstractString, s2::AbstractString)
|
||||
$x(dist, CharacterIterator(s1), CharacterIterator(s2))
|
||||
end
|
||||
|
||||
function evaluate(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
if len1 > len2
|
||||
return evaluate(dist, s2, s1, len2, len1)
|
||||
else
|
||||
return evaluate(dist, s1, s2, len1, len2)
|
||||
function $x(dist::PreMetric, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
$x(dist, CharacterIterator(s1), CharacterIterator(s2), len1, len2)
|
||||
end
|
||||
|
||||
function $x(dist::PreMetric, s1::StringIterator, s2::StringIterator)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
if len1 > len2
|
||||
return $x(dist, s2, s1, len2, len1)
|
||||
else
|
||||
return $x(dist, s1, s2, len1, len2)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -88,22 +125,13 @@ end
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
function compare(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
if len1 > len2
|
||||
return compare(dist, s2, s1, len2, len1)
|
||||
else
|
||||
return compare(dist, s1, s2, len1, len2)
|
||||
end
|
||||
end
|
||||
|
||||
function compare(dist::PreMetric, s1::GraphemeOrString, s2::GraphemeOrString,
|
||||
function compare(dist::PreMetric, s1::StringIterator, s2::StringIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
1.0 - evaluate(dist, s1, s2, len1, len2)
|
||||
end
|
||||
|
||||
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
|
||||
s1::GraphemeOrString, s2::GraphemeOrString,
|
||||
s1::StringIterator, s2::StringIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
distance = evaluate(dist, s1, s2, len1, len2)
|
||||
len2 == 0 ? 1.0 : 1.0 - distance / len2
|
||||
|
@ -112,14 +140,14 @@ end
|
|||
# compare always returns a value between 0 and 1.
|
||||
# When string length < q for qgram distance, returns s1 == s2
|
||||
function compare(dist::AbstractQGram,
|
||||
s1::GraphemeOrString, s2::GraphemeOrString,
|
||||
s1::StringIterator, s2::StringIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
||||
evaluate(dist, s1, s2, len1, len2)
|
||||
end
|
||||
|
||||
function compare(dist::QGram,
|
||||
s1::GraphemeOrString, s2::GraphemeOrString,
|
||||
s1::StringIterator, s2::StringIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
||||
distance = evaluate(dist, s1, s2, len1, len2)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# Return start of common substring in s1, start of common substring in s2, and length of substring
|
||||
# Indexes refer to character number, not index (differ for Unicode strings)
|
||||
function longest_common_substring(s1::AbstractString, s2::AbstractString)
|
||||
function longest_common_substring(s1::StringIterator, s2::StringIterator)
|
||||
if length(s1) > length(s2)
|
||||
start2, start1, size= longest_common_substring(s2, s1)
|
||||
else
|
||||
|
@ -28,7 +28,7 @@ function longest_common_substring(s1::AbstractString, s2::AbstractString)
|
|||
return start1, start2, size
|
||||
end
|
||||
|
||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::GraphemeOrString, s2::GraphemeOrString, start1::Integer, start2::Integer)
|
||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::StringIterator, s2::StringIterator, start1::Integer, start2::Integer)
|
||||
a = longest_common_substring(s1, s2)
|
||||
if a[3] > 0
|
||||
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
||||
|
@ -43,14 +43,14 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::GraphemeOrString, s2
|
|||
end
|
||||
end
|
||||
|
||||
function matching_blocks(s1::GraphemeOrString, s2::GraphemeOrString)
|
||||
function matching_blocks(s1::StringIterator, s2::StringIterator)
|
||||
x = Set{Tuple{Int, Int, Int}}()
|
||||
matching_blocks!(x, s1, s2, 1, 1)
|
||||
return x
|
||||
end
|
||||
|
||||
# Field-less marker type for the Ratcliff/Obershelp distance; `evaluate` dispatches on it.
type RatcliffObershelp <: PreMetric
end
|
||||
function evaluate(dist::RatcliffObershelp, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::RatcliffObershelp, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
|
||||
result = matching_blocks(s1, s2)
|
||||
matched = 0
|
||||
for x in result
|
||||
|
@ -59,7 +59,7 @@ function evaluate(dist::RatcliffObershelp, s1::GraphemeOrString, s2::GraphemeOrS
|
|||
1.0 - 2 * matched / (len1 + len2)
|
||||
end
|
||||
|
||||
#function buildref(s::GraphemeOrString, len)
|
||||
#function buildref(s::StringIterator, len)
|
||||
# ref = Array(Int, len)
|
||||
# state = start(s)
|
||||
# i = 0
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
## Assumes length(s1) <= length(s2)
|
||||
##############################################################################
|
||||
|
||||
function common_prefix(s1::GraphemeOrString, s2::GraphemeOrString, lim::Integer = -1)
|
||||
function common_prefix(s1::StringIterator, s2::StringIterator, lim::Integer = -1)
|
||||
start1 = start(s1)
|
||||
start2 = start(s2)
|
||||
l = 0
|
||||
|
@ -24,7 +24,7 @@ end
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
function evaluate(dist::Hamming, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2:: Integer)
|
||||
function evaluate(dist::Hamming, s1::StringIterator, s2::StringIterator, len1::Integer, len2:: Integer)
|
||||
count = 0
|
||||
for (ch1, ch2) in zip(s1, s2)
|
||||
count += ch1 != ch2
|
||||
|
@ -42,7 +42,7 @@ end
|
|||
|
||||
|
||||
# Field-less marker type for the Levenshtein distance; `evaluate` dispatches on it.
type Levenshtein <: SemiMetric
end
|
||||
function evaluate(dist::Levenshtein, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::Levenshtein, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
|
||||
|
||||
# prefix common to both strings can be ignored
|
||||
k, start1, start2 = common_prefix(s1, s2)
|
||||
|
@ -90,7 +90,7 @@ end
|
|||
|
||||
# Field-less marker type for the Damerau-Levenshtein distance; `evaluate` dispatches on it.
type DamerauLevenshtein <: SemiMetric
end
|
||||
|
||||
function evaluate(dist::DamerauLevenshtein, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::DamerauLevenshtein, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
|
||||
|
||||
# prefix common to both strings can be ignored
|
||||
k, start1, start2 = common_prefix(s1, s2)
|
||||
|
@ -158,7 +158,7 @@ end
|
|||
|
||||
# Field-less marker type for the Jaro distance; `evaluate` dispatches on it.
type Jaro <: SemiMetric
end
|
||||
|
||||
function evaluate(dist::Jaro, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::Jaro, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
|
||||
# if len2 == 0, m = 0 so should be 1.0 according to wikipedia. Nope.
|
||||
len2 == 0 && return 0.0
|
||||
|
||||
|
@ -199,4 +199,4 @@ function evaluate(dist::Jaro, s1::GraphemeOrString, s2::GraphemeOrString, len1::
|
|||
return 1.0 - score
|
||||
end
|
||||
|
||||
jaro(s1::GraphemeOrString, s2::GraphemeOrString) = evaluate(Jaro(), s1, s2)
|
||||
# Shorthand for evaluating the Jaro distance between two wrapped strings.
function jaro(s1::StringIterator, s2::StringIterator)
    return evaluate(Jaro(), s1, s2)
end
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
immutable QGramIterator{S, T <: Integer}
|
||||
immutable QGramIterator{S <: StringIterator, T <: Integer}
|
||||
s::S # grapheorstring
|
||||
l::Int # length of string
|
||||
q::T # length of q-grams
|
||||
|
@ -23,8 +23,7 @@ function Base.done(qgram::QGramIterator, state)
|
|||
istart, idend = state
|
||||
done(qgram.s, idend)
|
||||
end
|
||||
Base.eltype{S <: AbstractString, T}(qgram::QGramIterator{S, T}) = SubString{S}
|
||||
Base.eltype{S, T}(qgram::QGramIterator{GraphemeIterator{S}, T}) = SubString{S}
|
||||
# Element type of a q-gram iterator: the same kind of wrapper as `qgram.s`,
# parameterized by a SubString of whatever `eltype(qgram.s)` reports.
function Base.eltype(qgram::QGramIterator)
    wrapper = iteratortype(qgram.s)
    return wrapper{SubString{eltype(qgram.s)}}
end
|
||||
# Number of q-grams: l - q + 1, clamped at zero when the string is shorter than q.
function Base.length(qgram::QGramIterator)
    n = qgram.l - qgram.q + 1
    return max(n, 0)
end
|
||||
function Base.collect(qgram::QGramIterator)
|
||||
x = Array(eltype(qgram), length(qgram))
|
||||
|
@ -81,7 +80,7 @@ end
|
|||
##############################################################################
|
||||
# Supertype of the q-gram based distances. The generic `evaluate` and `compare`
# methods for this type read a `q` field from the distance object.
abstract AbstractQGram <: SemiMetric
|
||||
|
||||
function evaluate(dist::AbstractQGram, s1::GraphemeOrString, s2::GraphemeOrString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::AbstractQGram, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
|
||||
sort1 = sort(QGramIterator(s1, len1, dist.q))
|
||||
sort2 = sort(QGramIterator(s2, len2, dist.q))
|
||||
evaluate(dist, CountInterator(sort1, sort2))
|
||||
|
|
|
@ -9,7 +9,7 @@ type Partial{T <: PreMetric} <: PreMetric
|
|||
end
|
||||
|
||||
# general
|
||||
function compare(dist::Partial, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function compare(dist::Partial, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
|
||||
len1 == len2 && return compare(dist.dist, s1, s2, len1, len2)
|
||||
len1 == 0 && return compare(dist.dist, "", "", 0, 0)
|
||||
iter = QGramIterator(s2, len2, len1)
|
||||
|
@ -26,7 +26,7 @@ end
|
|||
|
||||
# Specialization for RatcliffObershelp distance
|
||||
# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
|
||||
function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function compare(dist::Partial{RatcliffObershelp}, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
|
||||
len1 == len2 && return compare(dist.dist, s1, s2, len1, len2)
|
||||
out = 0.0
|
||||
result = matching_blocks(s1, s2)
|
||||
|
@ -59,13 +59,13 @@ type TokenSort{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString,
|
||||
function compare(dist::TokenSort, s1::StringIterator, s2::StringIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
if search(s1, Base._default_delims) > 0
|
||||
s1 = join(sort!(split(s1)), " ")
|
||||
if search(s1.s, Base._default_delims) > 0
|
||||
s1 = iteratortype(s1)(join(sort!(split(s1.s)), " "))
|
||||
end
|
||||
if search(s2, Base._default_delims) > 0
|
||||
s2 = join(sort!(split(s2)), " ")
|
||||
if search(s2.s, Base._default_delims) > 0
|
||||
s2 = iteratortype(s2)(join(sort!(split(s2.s)), " "))
|
||||
end
|
||||
compare(dist.dist, s1, s2)
|
||||
end
|
||||
|
@ -80,12 +80,12 @@ type TokenSet{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString,
|
||||
function compare(dist::TokenSet, s1::StringIterator, s2::StringIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
v0, v1, v2 = _separate!(split(s1), split(s2))
|
||||
s0 = join(v0, " ")
|
||||
s1 = join(chain(v0, v1), " ")
|
||||
s2 = join(chain(v0, v2), " ")
|
||||
v0, v1, v2 = _separate!(split(s1.s), split(s2.s))
|
||||
s0 = iteratortype(s1)(join(v0, " "))
|
||||
s1 = iteratortype(s1)(join(chain(v0, v1), " "))
|
||||
s2 = iteratortype(s1)(join(chain(v0, v2), " "))
|
||||
if isempty(s0)
|
||||
# otherwise compare(dist, "", "a")== 1.0
|
||||
compare(dist.dist, s1, s2)
|
||||
|
@ -128,7 +128,7 @@ type TokenMax{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(dist::TokenMax, s1::AbstractString, s2::AbstractString,
|
||||
function compare(dist::TokenMax, s1::StringIterator, s2::StringIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
base = compare(dist.dist, s1, s2, len1, len2)
|
||||
unbase_scale = 0.95
|
||||
|
|
|
@ -13,7 +13,7 @@ end
|
|||
# restrict to distance between 0 and 1
|
||||
# Convenience constructor: supplies 0.1 and 0.7 for the second and third arguments.
function Winkler(x)
    return Winkler(x, 0.1, 0.7)
end
|
||||
|
||||
function compare(dist::Winkler, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function compare(dist::Winkler, s1::StringIterator, s2::StringIterator, len1::Integer, len2::Integer)
|
||||
score = compare(dist.dist, s1, s2, len1, len2)
|
||||
l = common_prefix(s1, s2, 4)[1]
|
||||
# common prefix adjustment
|
||||
|
|
|
@ -44,10 +44,7 @@ using StringDistances, Base.Test
|
|||
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
|
||||
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
|
||||
|
||||
@test matching_blocks("dwayne", "duane") ==
|
||||
Set([(5,4,2) (1,1,1) (3,3,1)])
|
||||
@test matching_blocks("dixon", "dicksonx") ==
|
||||
Set([(1,1,2) (4,6,2)])
|
||||
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666
|
||||
|
|
Loading…
Reference in New Issue