StringDistances.jl/src/StringDistances.jl

181 lines
5.8 KiB
Julia
Raw Normal View History

2015-10-22 18:12:44 +02:00
__precompile__(true)
module StringDistances
##############################################################################
##
## Export
##
##############################################################################
2015-11-10 15:47:12 +01:00
import Base: eltype, length, start, done, next, ==, hash, isless, convert, show, endof
import Base.UTF8proc: isgraphemebreak
2015-10-25 22:26:17 +01:00
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
2015-11-04 18:40:30 +01:00
import Iterators: chain
export
evaluate,
compare,
Hamming,
Levenshtein,
DamerauLevenshtein,
Jaro,
QGram,
Cosine,
Jaccard,
2015-11-05 16:51:32 +01:00
SorensenDice,
Overlap,
2015-11-04 18:40:30 +01:00
longest_common_substring,
matching_blocks,
RatcliffObershelp,
Winkler,
Partial,
TokenSort,
2015-11-06 16:47:15 +01:00
TokenSet,
2015-11-10 15:47:12 +01:00
TokenMax,
graphemeiterator
2015-11-06 20:43:04 +01:00
##############################################################################
##
2015-11-10 15:47:12 +01:00
## Define GraphemeIterator as AbstractString
2015-11-06 20:43:04 +01:00
##
2015-11-10 16:01:45 +01:00
## Argument for AbstractString inheritance:
## (i) prevind, nextind, chr2ind, are defined once start, next, done, isvalid, endof are defined
## (ii) SubString(x::GraphemeIterator, i, j) works
## (iii) I can define functions with an AbstractString signature in this package (but I could also just define a union type)
## Argument for non inheritance:
2016-04-28 15:29:16 +02:00
## (i) All existing subtypes of AbstractString yield a Char as their individual element, which matters for print_escaped & search.
2015-11-10 16:01:45 +01:00
## (ii) How to make split return GraphemeIterator rather than strings? How to join multiple GraphemeIterator w/o rewriting join?
##
2015-11-06 20:43:04 +01:00
##############################################################################
2015-11-10 15:47:12 +01:00
# Adapted from Base. I redefine it because I want AbstractString inheritance.
# Thin wrapper around a string of type `S`. It is declared as a subtype of
# AbstractString so that generic string functions accept it; the iteration
# methods defined for it in this file yield SubStrings of the wrapped string.
immutable GraphemeIterator{S <: AbstractString} <: AbstractString
    s::S    # the wrapped string, kept so SubStrings can be built from it
end
2015-11-10 15:47:12 +01:00
# Wrap `s` in a GraphemeIterator whose type parameter is the concrete type of `s`.
function graphemeiterator(s::AbstractString)
    S = typeof(s)
    return GraphemeIterator{S}(s)
end
# Element type of a GraphemeIterator over a string of type `S`: SubString{S}.
function eltype{S}(::Type{GraphemeIterator{S}})
    return SubString{S}
end
# Count the grapheme boundaries met while scanning the characters of the
# wrapped string. Each character is compared with the one before it; the very
# first character is compared with a soft hyphen.
function length(g::GraphemeIterator)
    previous = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
    ngraphemes = 0
    for current in g.s
        if isgraphemebreak(previous, current)
            ngraphemes += 1
        end
        previous = current
    end
    return ngraphemes
end
# The initial iteration state is the initial iteration state of the wrapped string.
function start(g::GraphemeIterator)
    return start(g.s)
end
# Iteration is finished exactly when iteration of the wrapped string is
# finished at state `i`.
function done(g::GraphemeIterator, i::Int)
    return done(g.s, i)
end
# Return the grapheme that starts at state `i` together with the next state.
#
# The result is `(SubString(s, i, j), k)`, where `s` is the wrapped string,
# `j` is the index of the last character of the grapheme and `k` is the state
# at which the following grapheme starts (or the end state of `s`).
function next(g::GraphemeIterator, i::Int)
    s = g.s
    j = i
    c0, k = next(s, i)
    while !done(s, k) # loop until next grapheme is s[i:j]
        # `knext` is the state following the character at `k`; it must be
        # captured here so that `k` can advance when no break is found.
        c, knext = next(s, k)
        isgraphemebreak(c0, c) && break
        j = k
        k = knext
        c0 = c
    end
    return (SubString(s, i, j), k)
end
2015-11-10 15:47:12 +01:00
# Two GraphemeIterators are equal when the strings they wrap are equal.
function ==(g1::GraphemeIterator, g2::GraphemeIterator)
    return g1.s == g2.s
end
# Hash of a GraphemeIterator is the hash of the wrapped string mixed with `h`.
function hash(g::GraphemeIterator, h::UInt)
    return hash(g.s, h)
end
# GraphemeIterators are ordered by the ordering of the strings they wrap.
function isless(g1::GraphemeIterator, g2::GraphemeIterator)
    return isless(g1.s, g2.s)
end
# Print a one-line description: the grapheme count, the wrapped string type
# and the wrapped string itself in double quotes.
function show{S}(io::IO, g::GraphemeIterator{S})
    description = "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\""
    return print(io, description)
end
# added
2015-11-10 16:04:57 +01:00
# These two functions (isvalid and endof) are what prevind, nextind, chr2ind, etc. need in order to work.
2015-11-10 15:47:12 +01:00
# An index `i` is valid for a GraphemeIterator when it is a valid index of the
# wrapped string and either nothing precedes it or a grapheme break separates
# the preceding character from the character at `i`.
function Base.isvalid(s::GraphemeIterator, i::Integer)
    str = s.s
    isvalid(str, i) || return false
    iprev = prevind(str, i)
    return iprev < start(str) || isgraphemebreak(str[iprev], str[i])
end
# Index, in the wrapped string, at which the last grapheme starts.
#
# Starting from the last character index of the wrapped string, step back one
# character at a time until a grapheme break separates the preceding character
# from the character at `i`, or until the first index is reached. The returned
# index therefore satisfies `isvalid(s, i)` as defined for GraphemeIterator in
# this file. For an empty wrapped string, `endof` of that string is returned
# unchanged.
function endof(s::GraphemeIterator)
    str = s.s
    lo = start(str)
    i = endof(str)
    while i > lo
        iprev = prevind(str, i)
        # A break between str[iprev] and str[i] means a grapheme starts at i.
        isgraphemebreak(str[iprev], str[i]) && break
        i = iprev
    end
    return i
end
# 1. Functions like search or print_escaped depend on character iteration rather than grapheme iteration, so for now I forward them to the original string.
# Searching for any of the characters in `s` is delegated to the wrapped string.
function Base.search(x::GraphemeIterator, s::Vector{Char})
    return search(x.s, s)
end
# 2. Keeping the iterator property through functions like split and join is an issue. For now, I choose to lose the grapheme-iterator property and add it back after join. Note that SubString, for instance, does not lose the property.
# Splitting is delegated to the wrapped string, with any extra arguments
# forwarded unchanged; the result is whatever `split` returns for that string.
function Base.split(x::GraphemeIterator, args...)
    return split(x.s, args...)
end
# When the requested type is a GraphemeIterator type, wrap `x` with
# `graphemeiterator`.
function iterator{T <: GraphemeIterator}(::Type{T}, x::AbstractString)
    return graphemeiterator(x)
end
# For any other requested AbstractString type, `x` is returned as is.
function iterator{T <: AbstractString}(::Type{T}, x::AbstractString)
    return x
end
2015-11-07 16:07:03 +01:00
2015-11-06 20:43:04 +01:00
##############################################################################
##
## include
##
##############################################################################
2015-11-04 18:40:30 +01:00
include("distances/edit.jl")
include("distances/qgram.jl")
include("distances/RatcliffObershelp.jl")
include("modifiers/winkler.jl")
2015-11-06 16:47:15 +01:00
include("modifiers/fuzzywuzzy.jl")
2015-10-22 18:53:27 +02:00
2015-11-06 03:03:45 +01:00
##############################################################################
##
## Higher level functions
##
##############################################################################
2015-11-10 00:29:08 +01:00
# Generate the 3-argument methods of `evaluate` and `compare`. Each one
# measures the lengths of both strings and forwards to the 5-argument method
# of the same function, passing the shorter string (and its length) first.
# When the lengths are equal, the original argument order is kept.
for fname in (:evaluate, :compare)
    @eval function $fname(dist::PreMetric, s1::AbstractString, s2::AbstractString)
        len1 = length(s1)
        len2 = length(s2)
        return len1 > len2 ? $fname(dist, s2, s1, len2, len1) :
                             $fname(dist, s1, s2, len1, len2)
    end
end
##############################################################################
##
## compare
##
##############################################################################
2015-11-10 15:47:12 +01:00
# Generic fallback: one minus the value returned by `evaluate` for the same
# arguments.
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    distance = evaluate(dist, s1, s2, len1, len2)
    return 1.0 - distance
end
# Edit distances: one minus the distance divided by `len2`.
# The 3-argument wrapper in this file passes the longer length as `len2`.
# When `len2` is zero the result is 1.0.
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    distance = evaluate(dist, s1, s2, len1, len2)
    if len2 == 0
        return 1.0
    end
    return 1.0 - distance / len2
end
# `compare` always returns a value between 0 and 1.
# When the string length is < q for a q-gram distance, it returns s1 == s2 (converted to Float64).
# Similarity for q-gram based distances.
#
# When `len1` is smaller than `dist.q`, the result is `s1 == s2` converted to
# Float64 (1.0 for equal strings, 0.0 otherwise). Otherwise the result is one
# minus the value of `evaluate`, so that equal strings score 1.0 in both
# branches, like the other `compare` methods in this file.
# NOTE(review): assumes `evaluate` for these distances returns a value in
# [0, 1] that is 0 for identical strings -- confirm against distances/qgram.jl.
function compare(dist::AbstractQGram,
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
    return 1.0 - evaluate(dist, s1, s2, len1, len2)
end
# Similarity for the QGram distance.
#
# When `len1` is smaller than `dist.q`, the result is `s1 == s2` converted to
# Float64. Otherwise the distance is divided by
# `len1 + len2 - 2 * dist.q + 2` and subtracted from one.
function compare(dist::QGram,
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    if len1 <= (dist.q - 1)
        return convert(Float64, s1 == s2)
    end
    distance = evaluate(dist, s1, s2, len1, len2)
    normalizer = len1 + len2 - 2 * dist.q + 2
    return 1 - distance / normalizer
end
2015-10-22 18:12:44 +02:00
2015-11-06 20:43:04 +01:00
2015-10-22 18:38:04 +02:00
end