2015-10-22 18:12:44 +02:00
__precompile__ ( true )
module StringDistances
##############################################################################
##
## Export
##
##############################################################################
2015-11-10 15:47:12 +01:00
import Base : eltype , length , start , done , next , == , hash , isless , convert , show , endof
import Base . UTF8proc : isgraphemebreak
2015-10-25 22:26:17 +01:00
import Distances : evaluate , Hamming , hamming , PreMetric , SemiMetric
2017-07-01 10:13:43 +02:00
import IterTools : chain
2015-11-04 18:40:30 +01:00
export
evaluate ,
compare ,
Hamming ,
Levenshtein ,
DamerauLevenshtein ,
Jaro ,
QGram ,
Cosine ,
Jaccard ,
2015-11-05 16:51:32 +01:00
SorensenDice ,
Overlap ,
2015-11-04 18:40:30 +01:00
longest_common_substring ,
matching_blocks ,
RatcliffObershelp ,
Winkler ,
Partial ,
TokenSort ,
2015-11-06 16:47:15 +01:00
TokenSet ,
2015-11-10 15:47:12 +01:00
TokenMax ,
graphemeiterator
2016-06-28 16:52:42 +02:00
2015-11-06 20:43:04 +01:00
##############################################################################
##
2016-06-28 16:52:42 +02:00
## TypeAlias
2015-11-06 20:43:04 +01:00
##
2016-06-28 16:52:42 +02:00
##############################################################################
2017-05-12 23:41:56 +02:00
# Short local alias for the grapheme iterator type that lives in Base.UTF8proc.
const GraphemeIterator = Base . UTF8proc . GraphemeIterator
# Argument type used in the signatures of `evaluate` and `compare` in this file.
# The wider union that also admitted grapheme iterators is kept below, commented
# out, so the alias currently stands for plain `AbstractString` only.
#const AbstractStringorGraphemeIterator = Union{AbstractString, Base.UTF8proc.GraphemeIterator}
const AbstractStringorGraphemeIterator = AbstractString
2016-06-28 16:52:42 +02:00
##############################################################################
##
2016-08-31 22:05:38 +02:00
## GraphemeIterator. Unicode iteration broken because Unicode 9 has different iteration properties.
2015-11-10 16:01:45 +01:00
##
2015-11-06 20:43:04 +01:00
##############################################################################
2016-08-31 22:05:38 +02:00
#Base.prevind(x::GraphemeIterator, i::Integer) = prevind(x.s, i)
#Base.nextind(x::GraphemeIterator, i::Integer) = nextind(x.s, i)
#Base.chr2ind(x::GraphemeIterator, i::Integer) = chr2ind(x.s, i)
#Base.SubString(x::GraphemeIterator, i::Integer, j::Integer) = graphemeiterator(SubString(x.s, i::Integer, j::Integer))
#graphemeiterator(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
#
## added
## these 2 functions make it possible to define prevind, nextind, chr2ind, etc.
#function Base.isvalid(s::GraphemeIterator, i::Integer)
2017-07-01 10:13:43 +02:00
# if !isvalid(s.s, i)
2016-08-31 22:05:38 +02:00
# return false
# else
# k = start(s)
# while !done(s, k)
# j = k[1]
# if j == i
# return true
# end
# end
# return false
# end
#end
#function Base.endof(s::GraphemeIterator)
# k = start(s)
# while !done(s, k)
# i = k[1]
# c, k = next(s, k)
# end
# return i
#end
#
## 1. Issues with functions like search or print_escaped, where character iteration vs. string iteration matters. I need to pass the original string for now.
#Base.search(x::GraphemeIterator, s::Vector{Char}) = search(x.s, s)
## 2. Issue with keeping the iterator property for functions like split and join. For now, I decided to lose the enumerator property but add it back after join. SubString, for instance, does not lose the property.
#Base.split(x::GraphemeIterator, args...) = split(x.s, args...)
#iterator{T <: GraphemeIterator}(::Type{T}, x::AbstractString) = graphemeiterator(x)
2017-05-20 01:54:38 +02:00
# Identity wrapper: when the requested type is a subtype of `AbstractString`,
# the string `x` is handed back unchanged.
function iterator(::Type{S}, x::AbstractString) where {S <: AbstractString}
    return x
end
2016-08-31 22:05:38 +02:00
#
2015-11-06 20:43:04 +01:00
##############################################################################
##
## include
##
##############################################################################
2015-11-04 18:40:30 +01:00
include ( " distances/edit.jl " )
include ( " distances/qgram.jl " )
include ( " distances/RatcliffObershelp.jl " )
include ( " modifiers/winkler.jl " )
2015-11-06 16:47:15 +01:00
include ( " modifiers/fuzzywuzzy.jl " )
2015-10-22 18:53:27 +02:00
2015-11-06 03:03:45 +01:00
##############################################################################
##
## Higher level functions
##
##############################################################################
2015-11-10 00:29:08 +01:00
# Generate the three-argument entry point for both `evaluate` and `compare`.
# Each generated method measures its two inputs with `length` and forwards to
# the five-argument method of the same name, ordering the arguments so that the
# input with the smaller length (and that length) always comes first.
for fname in (:evaluate, :compare)
    @eval function $fname(dist::PreMetric,
                          s1::AbstractStringorGraphemeIterator,
                          s2::AbstractStringorGraphemeIterator)
        n1 = length(s1)
        n2 = length(s2)
        # Already in canonical order (ties keep the original order).
        n1 <= n2 && return $fname(dist, s1, s2, n1, n2)
        # Otherwise swap so the shorter input is passed first.
        return $fname(dist, s2, s1, n2, n1)
    end
end
2016-06-28 16:52:42 +02:00
2015-11-06 03:03:45 +01:00
##############################################################################
##
## compare
##
##############################################################################
2017-07-01 10:13:43 +02:00
# Generic similarity score: one minus the distance that `evaluate` reports for
# the same arguments.
#
# `len1` and `len2` are forwarded to `evaluate` untouched; the three-argument
# entry point supplies them as the lengths of `s1` and `s2`.
function compare(dist::PreMetric,
                 s1::AbstractStringorGraphemeIterator,
                 s2::AbstractStringorGraphemeIterator,
                 len1::Integer, len2::Integer)
    distance = evaluate(dist, s1, s2, len1, len2)
    return 1.0 - distance
end
2017-07-01 10:13:43 +02:00
# Similarity score for the edit distances (Hamming, Levenshtein,
# DamerauLevenshtein): the distance from `evaluate` is divided by `len2` and
# subtracted from one.
#
# The three-argument entry point always passes the larger length as `len2`.
# When `len2` is zero the score is 1.0, which also avoids dividing by zero.
function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
                 s1::AbstractStringorGraphemeIterator,
                 s2::AbstractStringorGraphemeIterator,
                 len1::Integer, len2::Integer)
    # The distance is computed first, even when `len2` is zero.
    distance = evaluate(dist, s1, s2, len1, len2)
    if len2 == 0
        return 1.0
    else
        return 1.0 - distance / len2
    end
end
2017-07-01 10:13:43 +02:00
# compare always returns a value between 0 and 1.
2015-11-06 03:03:45 +01:00
# When string length < q for qgram distance, returns s1 == s2
2017-07-01 10:13:43 +02:00
# Similarity score for q-gram based distances.
#
# When `len1` is smaller than `dist.q`, the score is simply whether the two
# inputs are equal, converted to Float64 (1.0 if equal, 0.0 otherwise).
# Otherwise the score is one minus the distance returned by `evaluate`, so that
# both paths agree on the meaning of the result: higher means more similar.
#
# Fix: the previous version returned the raw `evaluate` distance on the main
# path, which contradicted the short-string path and the other `compare`
# methods.
# NOTE(review): assumes `evaluate` for these distances is normalized to [0, 1];
# confirm against distances/qgram.jl.
function compare(dist::AbstractQGram,
                 s1::AbstractStringorGraphemeIterator,
                 s2::AbstractStringorGraphemeIterator,
                 len1::Integer, len2::Integer)
    len1 <= (dist.q - 1) && return convert(Float64, s1 == s2)
    return 1.0 - evaluate(dist, s1, s2, len1, len2)
end
2017-07-01 10:13:43 +02:00
# Similarity score for the `QGram` distance.
#
# When `len1` is smaller than `dist.q`, the score is whether the two inputs are
# equal, converted to Float64. Otherwise the distance from `evaluate` is divided
# by `len1 + len2 - 2 * dist.q + 2` and subtracted from one.
function compare(dist::QGram,
                 s1::AbstractStringorGraphemeIterator,
                 s2::AbstractStringorGraphemeIterator,
                 len1::Integer, len2::Integer)
    if len1 <= (dist.q - 1)
        return convert(Float64, s1 == s2)
    end
    distance = evaluate(dist, s1, s2, len1, len2)
    # Algebraically (len1 - q + 1) + (len2 - q + 1); presumably the combined
    # number of q-grams in the two inputs.
    normalizer = len1 + len2 - 2 * dist.q + 2
    return 1 - distance / normalizer
end
2015-10-22 18:12:44 +02:00
2015-11-06 20:43:04 +01:00
2017-07-01 10:13:43 +02:00
end