2015-10-22 18:12:44 +02:00
module StringDistances
2019-12-12 15:38:20 +01:00
using Distances
import Distances : evaluate , result_type
using DataStructures # for SortedSet in TokenSort
2015-10-22 18:12:44 +02:00
##############################################################################
##
## Export
##
##############################################################################
2019-12-12 15:38:20 +01:00
2015-11-04 18:40:30 +01:00
export
evaluate ,
compare ,
2019-12-11 20:45:58 +01:00
result_type ,
2015-11-04 18:40:30 +01:00
Hamming ,
Levenshtein ,
DamerauLevenshtein ,
Jaro ,
2019-08-17 18:57:35 +02:00
RatcliffObershelp ,
2015-11-04 18:40:30 +01:00
QGram ,
Cosine ,
Jaccard ,
2015-11-05 16:51:32 +01:00
SorensenDice ,
Overlap ,
2015-11-04 18:40:30 +01:00
Winkler ,
Partial ,
TokenSort ,
2015-11-06 16:47:15 +01:00
TokenSet ,
2019-08-17 18:15:14 +02:00
TokenMax ,
2019-08-20 18:32:52 +02:00
qgram ,
2019-08-20 19:21:31 +02:00
find_best ,
find_all
2016-06-28 16:52:42 +02:00
2015-11-06 20:43:04 +01:00
##############################################################################
##
## include
##
##############################################################################
2019-08-18 18:52:37 +02:00
include ( " utils.jl " )
include ( " edit.jl " )
include ( " qgram.jl " )
2017-08-05 20:45:19 +02:00
include ( " compare.jl " )
2019-08-20 19:21:31 +02:00
include ( " find.jl " )
2016-06-28 16:52:42 +02:00
2019-12-11 22:24:47 +01:00
"""
    result_type(m, a::AbstractString, b::AbstractString)

Return the type of the value produced by `evaluate` for the string distance (or
distance modifier) `m`.

The type is not computed symbolically: `evaluate` is actually called on
`oneunit(a)` and `oneunit(b)` rather than on `a` and `b` themselves, and the
`typeof` of that result is returned.
"""
function result_type(
    m::Union{
        Hamming,
        Jaro,
        Levenshtein,
        DamerauLevenshtein,
        RatcliffObershelp,
        AbstractQGramDistance,
        Winkler,
        Partial,
        TokenSort,
        TokenSet,
        TokenMax,
    },
    a::AbstractString,
    b::AbstractString,
)
    # Stand-in inputs of the same string types as `a` and `b`.
    unit_a = oneunit(a)
    unit_b = oneunit(b)
    return typeof(evaluate(m, unit_a, unit_b))
end
2015-11-06 03:03:45 +01:00
end
2019-08-14 16:30:22 +02:00
##############################################################################
##
## Some memo about Strings
# length: number of characters
# ncodeunits: Return the number of code units in a string (akin to the indices of a vector). Not all such indices are valid – they may not be the start of a character.
# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str multiplied by the size, in bytes, of one code unit in str.
# lastindex: Return the last index of a collection
# nextind(s, i): return the index of the start of the character whose encoding starts after index i
# nextind(s, 0, N): return the index of the Nth character of s (or, if there are fewer than N characters, return ncodeunits(s) + (N - length(s)))
2019-12-11 20:45:58 +01:00
##############################################################################