2015-10-22 18:12:44 +02:00
__precompile__ ( true )
module StringDistances
##############################################################################
##
## Export
##
##############################################################################
2015-11-10 15:47:12 +01:00
import Base : eltype , length , start , done , next , == , hash , isless , convert , show , endof
import Base . UTF8proc : isgraphemebreak
2015-10-25 22:26:17 +01:00
import Distances : evaluate , Hamming , hamming , PreMetric , SemiMetric
2015-11-04 18:40:30 +01:00
import Iterators : chain
export
evaluate ,
compare ,
Hamming ,
Levenshtein ,
DamerauLevenshtein ,
Jaro ,
QGram ,
Cosine ,
Jaccard ,
2015-11-05 16:51:32 +01:00
SorensenDice ,
Overlap ,
2015-11-04 18:40:30 +01:00
longest_common_substring ,
matching_blocks ,
RatcliffObershelp ,
Winkler ,
Partial ,
TokenSort ,
2015-11-06 16:47:15 +01:00
TokenSet ,
2015-11-10 15:47:12 +01:00
TokenMax ,
graphemeiterator
2015-11-06 20:43:04 +01:00
##############################################################################
##
2015-11-10 15:47:12 +01:00
## Define GraphemeIterator as AbstractString
2015-11-06 20:43:04 +01:00
##
2015-11-10 16:01:45 +01:00
## Argument for AbstractString inheritance:
## (i) prevind, nextind, chr2ind, are defined once start, next, done, isvalid, endof are defined
## (ii) SubString(x::GraphemeIterator, i, j) works
## (iii) I can define functions with AbstractString signature in this package (but I could also just define a union type)
## Argument for non inheritance:
2016-04-28 15:29:16 +02:00
## (i) All existing subtypes of AbstractString yield a Char as their individual element, which is important for print_escaped & search.
2015-11-10 16:01:45 +01:00
## (ii) How to make split return GraphemeIterator rather than strings? How to join multiple GraphemeIterator w/o rewriting join?
##
2015-11-06 20:43:04 +01:00
##############################################################################
2015-11-10 15:47:12 +01:00
# from Base. I redefine it because I want AbstractString inheritance
# Immutable wrapper around a string `s`, declared as a subtype of AbstractString.
# The iteration methods defined for it in this file walk `s` one grapheme at a
# time and yield each grapheme as a SubString of `s`.
immutable GraphemeIterator { S <: AbstractString } <: AbstractString
s :: S # original string (for generation of SubStrings)
2015-11-10 00:29:08 +01:00
end
2015-11-10 15:47:12 +01:00
# Wrap `s` in a GraphemeIterator whose type parameter is the concrete type of `s`.
function graphemeiterator(s::AbstractString)
    return GraphemeIterator{typeof(s)}(s)
end
# Element type of a GraphemeIterator{S}: each grapheme is a SubString{S}.
function eltype{S}(::Type{GraphemeIterator{S}})
    return SubString{S}
end
# Count the graphemes of the wrapped string by counting the grapheme breaks
# found between each pair of consecutive characters.
function length(g::GraphemeIterator)
    # Seed with a soft hyphen: a grapheme break is always allowed after it,
    # so the first character of the string always opens a grapheme.
    previous = Char(0x00ad)
    total = 0
    for current in g.s
        if isgraphemebreak(previous, current)
            total += 1
        end
        previous = current
    end
    return total
end
# Initial iteration state: the same as for the wrapped string.
function start(g::GraphemeIterator)
    return start(g.s)
end
# Iteration is finished exactly when iteration over the wrapped string is.
function done(g::GraphemeIterator, i::Int)
    return done(g.s, i)
end
# Return the grapheme starting at index `i` (as a SubString of the wrapped
# string) together with the index at which the following grapheme starts.
function next(g::GraphemeIterator, i::Int)
    str = g.s
    stop = i                          # index of the last character of the grapheme
    previous, upcoming = next(str, i)
    # Extend the grapheme until a break is found or the string is exhausted.
    while !done(str, upcoming)
        current, following = next(str, upcoming)
        if isgraphemebreak(previous, current)
            break
        end
        stop = upcoming
        upcoming = following
        previous = current
    end
    return (SubString(str, i, stop), upcoming)
end
2015-11-10 15:47:12 +01:00
# Two GraphemeIterators are equal when their wrapped strings are equal.
function ==(g1::GraphemeIterator, g2::GraphemeIterator)
    return g1.s == g2.s
end
# Hash of a GraphemeIterator is the hash of its wrapped string.
function hash(g::GraphemeIterator, h::UInt)
    return hash(g.s, h)
end
# Ordering of GraphemeIterators follows the ordering of their wrapped strings.
function isless(g1::GraphemeIterator, g2::GraphemeIterator)
    return isless(g1.s, g2.s)
end
# Print a one-line summary to `io`: the grapheme count, the type parameter S
# of the iterator, and the wrapped string.
# NOTE(review): the whitespace inside the string literal looks altered by
# tooling (spaces after `$`) — confirm against the upstream file.
show { S } ( io :: IO , g :: GraphemeIterator { S } ) = print ( io , " length- $ ( length ( g ) ) GraphemeIterator{ $S } for \" $ ( g . s ) \" " )
# added
2015-11-10 16:04:57 +01:00
# these 2 functions allow prevind, nextind, chr2ind, etc. to be defined
2015-11-10 15:47:12 +01:00
# An index is valid for a GraphemeIterator when it is a valid index of the
# wrapped string AND a grapheme starts there (i.e. it is the first index of
# the string, or there is a grapheme break between the preceding character
# and the character at `i`).
function Base.isvalid(s::GraphemeIterator, i::Integer)
    str = s.s
    isvalid(str, i) || return false
    before = prevind(str, i)
    # Nothing precedes `i`: it necessarily opens a grapheme.
    before < start(str) && return true
    return isgraphemebreak(str[before], str[i])
end
# Return the index at which the last grapheme of the wrapped string starts
# (0 for an empty string, like `endof` on the wrapped string).
#
# Walk backwards from the last character, comparing each character with the
# one that precedes it, and stop at the first position where a grapheme break
# occurs. The previous implementation updated its "following character" to the
# character at the new position after stepping back, so it compared a
# character with itself and could index position 0 of the wrapped string.
function endof(s::GraphemeIterator)
    str = s.s
    i = endof(str)
    i0 = start(str)
    while i > i0
        j = prevind(str, i)
        # A break between str[j] and str[i] means a grapheme starts at i.
        isgraphemebreak(str[j], str[i]) && break
        i = j
    end
    return i
end
# 1. issues with stuff like search or print_escaped where character iteration vs string iteration matters. I need to pass the original string for now
# Character-level search: delegate to the wrapped string.
function Base.search(x::GraphemeIterator, s::Vector{Char})
    return search(x.s, s)
end
# 2. issue with keeping iterator property for stuff like split, join. For now, I decide to lose the enumerator property but add it back after join. But SubString, for instance, does not lose the property
# Split the wrapped string; the result is whatever `split` returns for it,
# not GraphemeIterators.
function Base.split(x::GraphemeIterator, args...)
    return split(x.s, args...)
end
# Convert string `x` to the iteration flavour designated by type `T`:
# a GraphemeIterator type asks for `x` wrapped by `graphemeiterator` ...
function iterator{T <: GraphemeIterator}(::Type{T}, x::AbstractString)
    return graphemeiterator(x)
end
# ... while any other AbstractString type leaves `x` untouched.
function iterator{T <: AbstractString}(::Type{T}, x::AbstractString)
    return x
end
2015-11-07 16:07:03 +01:00
2015-11-06 20:43:04 +01:00
##############################################################################
##
## include
##
##############################################################################
2015-11-04 18:40:30 +01:00
# Load the implementation files into this module. Names used or exported here
# but not defined in this file (e.g. Levenshtein, AbstractQGram, Winkler) are
# presumably defined in these files — confirm.
include ( " distances/edit.jl " )
include ( " distances/qgram.jl " )
include ( " distances/RatcliffObershelp.jl " )
include ( " modifiers/winkler.jl " )
2015-11-06 16:47:15 +01:00
include ( " modifiers/fuzzywuzzy.jl " )
2015-10-22 18:53:27 +02:00
2015-11-06 03:03:45 +01:00
##############################################################################
##
## Higher level functions
##
##############################################################################
2015-11-10 00:29:08 +01:00
# Generate the 3-argument `evaluate` and `compare` methods. Each computes the
# lengths of both strings once and forwards to the 5-argument method with the
# shorter string (and its length) passed first.
for fn in (:evaluate, :compare)
    @eval function $fn(dist::PreMetric, s1::AbstractString, s2::AbstractString)
        len1 = length(s1)
        len2 = length(s2)
        len1 > len2 && return $fn(dist, s2, s1, len2, len1)
        return $fn(dist, s1, s2, len1, len2)
    end
end
##############################################################################
##
## compare
##
##############################################################################
2015-11-10 15:47:12 +01:00
# Generic similarity: one minus the distance returned by `evaluate`.
# `len1` and `len2` are the lengths of `s1` and `s2`, forwarded unchanged.
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    distance = evaluate(dist, s1, s2, len1, len2)
    return 1.0 - distance
end
# Similarity for the edit distances: the distance is divided by `len2` before
# being subtracted from 1. When `len2` is 0 the similarity is 1.0.
# (The 3-argument wrapper passes the longer string's length as `len2`.)
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    distance = evaluate(dist, s1, s2, len1, len2)
    if len2 == 0
        return 1.0
    end
    return 1.0 - distance / len2
end
# compare always returns a value between 0 and 1.
# When string length < q for qgram distance, returns s1 == s2
# Similarity for q-gram based distances.
#
# When `len1` is smaller than `dist.q` the result is 1.0 if the strings are
# equal and 0.0 otherwise. Otherwise the similarity is one minus the distance
# returned by `evaluate`, consistent with the short-input branch (equal
# strings score 1.0) and with the other `compare` methods of this file.
# The previous implementation returned the raw `evaluate` result here.
# NOTE(review): assumes `evaluate` returns a distance in [0, 1] for every
# AbstractQGram other than QGram (handled by its own method) — confirm.
function compare(dist::AbstractQGram,
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
    return 1.0 - evaluate(dist, s1, s2, len1, len2)
end
# Similarity for the QGram distance.
#
# When `len1` is smaller than `dist.q` the result is 1.0 if the strings are
# equal and 0.0 otherwise. Otherwise the distance is divided by
# (len1 - q + 1) + (len2 - q + 1) and subtracted from 1.
function compare(dist::QGram,
                 s1::AbstractString, s2::AbstractString,
                 len1::Integer, len2::Integer)
    if len1 <= (dist.q - 1)
        return convert(Float64, s1 == s2)
    end
    distance = evaluate(dist, s1, s2, len1, len2)
    normalizer = len1 + len2 - 2 * dist.q + 2
    return 1 - distance / normalizer
end
2015-10-22 18:12:44 +02:00
2015-11-06 20:43:04 +01:00
2015-10-22 18:38:04 +02:00
end