parent
0f6c814b26
commit
bed352eef7
|
@ -96,6 +96,15 @@ The package includes distance modifiers:
|
|||
#> 0.855
|
||||
```
|
||||
|
||||
## Unicode
|
||||
To iterate over graphemes rather than characters, use `graphemeiterator`:
|
||||
|
||||
```julia
|
||||
evaluate(Hamming(), "b\u0300", "a")
|
||||
#> 2
|
||||
evaluate(Hamming(), graphemeiterator("b\u0300"), graphemeiterator("a"))
|
||||
#> 1
|
||||
```
|
||||
## References
|
||||
- [The stringdist Package for Approximate String Matching](https://journal.r-project.org/archive/2014-1/loo.pdf) Mark P.J. van der Loo
|
||||
- [fuzzywuzzy blog post](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)
|
||||
|
|
|
@ -32,54 +32,26 @@ TokenSort,
|
|||
TokenSet,
|
||||
TokenMax,
|
||||
graphemeiterator
|
||||
##############################################################################
|
||||
##
|
||||
## Define GraphemeIterator as AbstractString
|
||||
##
|
||||
## Argument for AbstractString inheritance:
|
||||
## (i) prevind, nextind, and chr2ind are defined once start, next, done, isvalid, and endof are defined
|
||||
## (ii) SubString(x::GraphemeIterator, i, j) works
|
||||
## (iii) I can define functions with AbstractString signature in this package (but I could also just define a union type)
|
||||
## Argument for non inheritance:
|
||||
## (i) All existing types <: AbstractString yield Chars as their individual elements, which is important for print_escaped & search.
|
||||
## (ii) How to make split return GraphemeIterator rather than strings? How to join multiple GraphemeIterator w/o rewriting join?
|
||||
##
|
||||
##############################################################################
|
||||
# From Base. I redefine it because I want AbstractString inheritance
|
||||
immutable GraphemeIterator{S<:AbstractString} <: AbstractString
|
||||
s::S # original string (for generation of SubStrings)
|
||||
end
|
||||
graphemeiterator(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
|
||||
eltype{S}(::Type{GraphemeIterator{S}}) = SubString{S}
|
||||
function length(g::GraphemeIterator)
|
||||
c0 = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
|
||||
n = 0
|
||||
for c in g.s
|
||||
n += isgraphemebreak(c0, c)
|
||||
c0 = c
|
||||
end
|
||||
return n
|
||||
end
|
||||
start(g::GraphemeIterator) = start(g.s)
|
||||
done(g::GraphemeIterator, i::Int) = done(g.s, i)
|
||||
function next(g::GraphemeIterator, i::Int)
|
||||
s = g.s
|
||||
j = i
|
||||
c0, k = next(s, i)
|
||||
while !done(s, k) # loop until next grapheme is s[i:j]
|
||||
c, ℓ = next(s, k)
|
||||
isgraphemebreak(c0, c) && break
|
||||
j = k
|
||||
k = ℓ
|
||||
c0 = c
|
||||
end
|
||||
return (SubString(s, i, j), k)
|
||||
end
|
||||
==(g1::GraphemeIterator, g2::GraphemeIterator) = g1.s == g2.s
|
||||
hash(g::GraphemeIterator, h::UInt) = hash(g.s, h)
|
||||
isless(g1::GraphemeIterator, g2::GraphemeIterator) = isless(g1.s, g2.s)
|
||||
show{S}(io::IO, g::GraphemeIterator{S}) = print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## TypeAlias
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
typealias GraphemeIterator Base.UTF8proc.GraphemeIterator
|
||||
typealias AbstractStringorGraphemeIterator Union{AbstractString, Base.UTF8proc.GraphemeIterator}
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## GraphemeIterator iterates on Grapheme
|
||||
##
|
||||
##############################################################################
|
||||
Base.prevind(x::GraphemeIterator, i::Integer) = prevind(x.s, i)
|
||||
Base.nextind(x::GraphemeIterator, i::Integer) = nextind(x.s, i)
|
||||
Base.chr2ind(x::GraphemeIterator, i::Integer) = chr2ind(x.s, i)
|
||||
Base.SubString(x::GraphemeIterator, i::Integer, j::Integer) = graphemeiterator(SubString(x.s, i::Integer, j::Integer))
|
||||
graphemeiterator(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
|
||||
|
||||
# added
|
||||
# these 2 functions allow prevind, nextind, chr2ind, etc. to be defined
|
||||
|
@ -91,7 +63,7 @@ function Base.isvalid(s::GraphemeIterator, i::Integer)
|
|||
return i0 < start(s.s) || isgraphemebreak(s.s[i0], s.s[i])
|
||||
end
|
||||
end
|
||||
function endof(s::GraphemeIterator)
|
||||
function Base.endof(s::GraphemeIterator)
|
||||
c0 = Char(0x00ad)
|
||||
i = endof(s.s)
|
||||
i0 = start(s.s)
|
||||
|
@ -128,7 +100,7 @@ include("modifiers/fuzzywuzzy.jl")
|
|||
##############################################################################
|
||||
for x in (:evaluate, :compare)
|
||||
@eval begin
|
||||
function $x(dist::PreMetric, s1::AbstractString, s2::AbstractString)
|
||||
function $x(dist::PreMetric, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
if len1 > len2
|
||||
return $x(dist, s2, s1, len2, len1)
|
||||
|
@ -139,19 +111,20 @@ for x in (:evaluate, :compare)
|
|||
end
|
||||
end
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## compare
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
|
||||
function compare(dist::PreMetric, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
1.0 - evaluate(dist, s1, s2, len1, len2)
|
||||
end
|
||||
|
||||
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
|
||||
s1::AbstractString, s2::AbstractString,
|
||||
s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
distance = evaluate(dist, s1, s2, len1, len2)
|
||||
len2 == 0 ? 1.0 : 1.0 - distance / len2
|
||||
|
@ -160,14 +133,14 @@ end
|
|||
# compare always returns a value between 0 and 1.
|
||||
# When string length < q for qgram distance, returns s1 == s2
|
||||
function compare(dist::AbstractQGram,
|
||||
s1::AbstractString, s2::AbstractString,
|
||||
s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
||||
evaluate(dist, s1, s2, len1, len2)
|
||||
end
|
||||
|
||||
function compare(dist::QGram,
|
||||
s1::AbstractString, s2::AbstractString,
|
||||
s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
|
||||
distance = evaluate(dist, s1, s2, len1, len2)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# Return start of common substring in s1, start of common substring in s2, and length of substring
|
||||
# Indexes refer to character number, not string (byte) index (the two differ for Unicode strings)
|
||||
function longest_common_substring(s1::AbstractString, s2::AbstractString)
|
||||
function longest_common_substring(s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator)
|
||||
if length(s1) > length(s2)
|
||||
start2, start1, size= longest_common_substring(s2, s1)
|
||||
else
|
||||
|
@ -28,7 +28,7 @@ function longest_common_substring(s1::AbstractString, s2::AbstractString)
|
|||
return start1, start2, size
|
||||
end
|
||||
|
||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString, start1::Integer, start2::Integer)
|
||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, start1::Integer, start2::Integer)
|
||||
a = longest_common_substring(s1, s2)
|
||||
if a[3] > 0
|
||||
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
||||
|
@ -43,14 +43,14 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
|
|||
end
|
||||
end
|
||||
|
||||
function matching_blocks(s1::AbstractString, s2::AbstractString)
|
||||
function matching_blocks(s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator)
|
||||
x = Set{Tuple{Int, Int, Int}}()
|
||||
matching_blocks!(x, s1, s2, 1, 1)
|
||||
return x
|
||||
end
|
||||
|
||||
type RatcliffObershelp <: PreMetric end
|
||||
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::RatcliffObershelp, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
result = matching_blocks(s1, s2)
|
||||
matched = 0
|
||||
for x in result
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
## Assumes length(s1) <= length(s2)
|
||||
##############################################################################
|
||||
|
||||
function common_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
|
||||
function common_prefix(s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, lim::Integer = -1)
|
||||
start1 = start(s1)
|
||||
start2 = start(s2)
|
||||
l = 0
|
||||
|
@ -24,7 +24,7 @@ end
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString, len1::Integer, len2:: Integer)
|
||||
function evaluate(dist::Hamming, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2:: Integer)
|
||||
count = 0
|
||||
for (ch1, ch2) in zip(s1, s2)
|
||||
count += ch1 != ch2
|
||||
|
@ -42,7 +42,7 @@ end
|
|||
|
||||
|
||||
type Levenshtein <: SemiMetric end
|
||||
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::Levenshtein, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
|
||||
# prefix common to both strings can be ignored
|
||||
k, start1, start2 = common_prefix(s1, s2)
|
||||
|
@ -90,7 +90,7 @@ end
|
|||
|
||||
type DamerauLevenshtein <: SemiMetric end
|
||||
|
||||
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::DamerauLevenshtein, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
|
||||
# prefix common to both strings can be ignored
|
||||
k, start1, start2 = common_prefix(s1, s2)
|
||||
|
@ -158,7 +158,7 @@ end
|
|||
|
||||
type Jaro <: SemiMetric end
|
||||
|
||||
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::Jaro, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
# if len2 == 0, m = 0 so should be 1.0 according to wikipedia. Nope.
|
||||
len2 == 0 && return 0.0
|
||||
|
||||
|
@ -199,4 +199,4 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString, len1::Inte
|
|||
return 1.0 - score
|
||||
end
|
||||
|
||||
jaro(s1::AbstractString, s2::AbstractString) = evaluate(Jaro(), s1, s2)
|
||||
jaro(s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator) = evaluate(Jaro(), s1, s2)
|
||||
|
|
|
@ -4,8 +4,8 @@
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
immutable QGramIterator{S <: AbstractString, T <: Integer}
|
||||
s::S # grapheorstring
|
||||
immutable QGramIterator{S <: AbstractStringorGraphemeIterator, T <: Integer}
|
||||
s::S # grapheme
|
||||
l::Int # length of string
|
||||
q::T # length of q-grams
|
||||
end
|
||||
|
@ -23,7 +23,8 @@ function Base.done(qgram::QGramIterator, state)
|
|||
istart, idend = state
|
||||
done(qgram.s, idend)
|
||||
end
|
||||
Base.eltype(qgram::QGramIterator) = SubString{typeof(qgram.s)}
|
||||
Base.eltype{S <: AbstractString, T}(qgram::QGramIterator{S, T}) = SubString{typeof(qgram.s)}
|
||||
Base.eltype{S <: GraphemeIterator, T}(qgram::QGramIterator{S, T}) = SubString{typeof(qgram.s.s)}
|
||||
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
||||
function Base.collect(qgram::QGramIterator)
|
||||
x = Array(eltype(qgram), length(qgram))
|
||||
|
@ -80,7 +81,7 @@ end
|
|||
##############################################################################
|
||||
abstract AbstractQGram <: SemiMetric
|
||||
|
||||
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function evaluate(dist::AbstractQGram, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
sort1 = sort(QGramIterator(s1, len1, dist.q))
|
||||
sort2 = sort(QGramIterator(s2, len2, dist.q))
|
||||
evaluate(dist, CountInterator(sort1, sort2))
|
||||
|
|
|
@ -9,7 +9,7 @@ type Partial{T <: PreMetric} <: PreMetric
|
|||
end
|
||||
|
||||
# general
|
||||
function compare(dist::Partial, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function compare(dist::Partial, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
len1 == len2 && return compare(dist.dist, s1, s2, len1, len2)
|
||||
len1 == 0 && return compare(dist.dist, "", "", 0, 0)
|
||||
iter = QGramIterator(s2, len2, len1)
|
||||
|
@ -26,7 +26,7 @@ end
|
|||
|
||||
# Specialization for RatcliffObershelp distance
|
||||
# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
|
||||
function compare(dist::Partial{RatcliffObershelp}, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function compare(dist::Partial{RatcliffObershelp}, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
len1 == len2 && return compare(dist.dist, s1, s2, len1, len2)
|
||||
out = 0.0
|
||||
result = matching_blocks(s1, s2)
|
||||
|
@ -59,7 +59,7 @@ type TokenSort{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(dist::TokenSort, s1::AbstractString, s2::AbstractString,
|
||||
function compare(dist::TokenSort, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
if search(s1, Base._default_delims) > 0
|
||||
s1 = iterator(typeof(s1), join(sort!(split(s1)), " "))
|
||||
|
@ -81,7 +81,7 @@ type TokenSet{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(dist::TokenSet, s1::AbstractString, s2::AbstractString,
|
||||
function compare(dist::TokenSet, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
v0, v1, v2 = _separate!(split(s1), split(s2))
|
||||
s0 = iterator(typeof(s1), join(v0, " "))
|
||||
|
@ -129,7 +129,7 @@ type TokenMax{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(dist::TokenMax, s1::AbstractString, s2::AbstractString,
|
||||
function compare(dist::TokenMax, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator,
|
||||
len1::Integer, len2::Integer)
|
||||
base = compare(dist.dist, s1, s2, len1, len2)
|
||||
unbase_scale = 0.95
|
||||
|
|
|
@ -13,7 +13,7 @@ end
|
|||
# restrict to distance between 0 and 1
|
||||
Winkler(x) = Winkler(x, 0.1, 0.7)
|
||||
|
||||
function compare(dist::Winkler, s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function compare(dist::Winkler, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
score = compare(dist.dist, s1, s2, len1, len2)
|
||||
l = common_prefix(s1, s2, 4)[1]
|
||||
# common prefix adjustment
|
||||
|
|
Loading…
Reference in New Issue