2015-10-22 18:12:44 +02:00
module StringDistances
2019-12-12 15:38:20 +01:00
using Distances
import Distances : evaluate , result_type
using DataStructures # for SortedSet in TokenSort
2019-12-12 20:48:52 +01:00
##############################################################################
##
## include
##
##############################################################################
"""
    StringDistance

Abstract supertype for the string distances defined in this module.

It is declared as a subtype of `SemiMetric` (presumably provided by the
`Distances` package loaded above — confirm), and it is the type on which the
`evaluate`, `compare` and `result_type` methods in this file dispatch.
"""
abstract type StringDistance <: SemiMetric
end
# Pull in the implementation files. They are included after `StringDistance`
# has been declared, so the included code may refer to that type.
# Relative paths passed to `include` are resolved against the directory of
# this file, so the names must match the file names exactly (no padding
# whitespace inside the literals).
include("utils.jl")
include("edit.jl")
include("qgram.jl")
include("compare.jl")
include("find.jl")
2019-12-12 21:32:59 +01:00
##############################################################################
##
## Handle missing values
##
##############################################################################
# Missing-value propagation for `evaluate`: whenever at least one of the two
# compared values is `missing`, the result is `missing`, for every
# `StringDistance`. One method per combination, so that calls with two
# `AbstractString` arguments are not captured here.

# First value missing, second value a string.
function evaluate(::StringDistance, ::Missing, ::AbstractString)
    return missing
end

# First value a string, second value missing.
function evaluate(::StringDistance, ::AbstractString, ::Missing)
    return missing
end

# Both values missing.
function evaluate(::StringDistance, ::Missing, ::Missing)
    return missing
end
# Missing-value propagation for `compare`: whenever at least one of the two
# compared values is `missing`, the result is `missing`, for every
# `StringDistance`. The `min_score` keyword (default `0.0`) is accepted so the
# call signature stays uniform, but these methods do not read it.

# First value missing, second value a string.
function compare(::Missing, ::AbstractString, ::StringDistance; min_score = 0.0)
    return missing
end

# First value a string, second value missing.
function compare(::AbstractString, ::Missing, ::StringDistance; min_score = 0.0)
    return missing
end

# Both values missing.
function compare(::Missing, ::Missing, ::StringDistance; min_score = 0.0)
    return missing
end
"""
    result_type(dist::StringDistance, s1::AbstractString, s2::AbstractString)

Return the type of the value produced by `evaluate` for `dist`.

The type is obtained by actually calling `evaluate` on `oneunit(s1)` and
`oneunit(s2)` rather than on `s1` and `s2` themselves, and taking `typeof` of
the result.
"""
function result_type(dist::StringDistance, s1::AbstractString, s2::AbstractString)
    # Stand-in values with the same types as the inputs.
    unit1 = oneunit(s1)
    unit2 = oneunit(s2)
    sample = evaluate(dist, unit1, unit2)
    return typeof(sample)
end
2015-10-22 18:12:44 +02:00
##############################################################################
##
## Export
##
##############################################################################
2019-12-12 15:38:20 +01:00
2015-11-04 18:40:30 +01:00
# Names made available to users via `using StringDistances`.
# The order of the names is the same as in the original list.
export StringDistance,
    Levenshtein, DamerauLevenshtein, Jaro, RatcliffObershelp,
    QGram, Cosine, Jaccard, SorensenDice, Overlap,
    Winkler, Partial, TokenSort, TokenSet, TokenMax,
    evaluate, compare, result_type, qgram
2015-11-06 03:03:45 +01:00
end
2019-08-14 16:30:22 +02:00
##############################################################################
##
## Some memo about Strings
# length: number of characters
# ncodeunits: return the number of code units in a string (akin to the indices of a vector). Not all such indices are valid – they may not be the start of a character.
# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str multiplied by the size, in bytes, of one code unit in str.
# lastindex: Return the last index of a collection
# nextind(s, i): return the index of the start of the character whose encoding starts after index i
# nextind(s, 0, N): return the index of the Nth character of s (or, if there are fewer than N characters, return ncodeunits(s) + (N - length(s)))
2019-12-11 20:45:58 +01:00
##############################################################################