StringDistances.jl/src/StringDistances.jl

68 lines
1.9 KiB
Julia
Raw Normal View History

2015-10-22 18:12:44 +02:00
module StringDistances
2019-12-12 15:38:20 +01:00
using Distances
import Distances: evaluate, result_type
using DataStructures # for SortedSet in TokenSort
2015-10-22 18:12:44 +02:00
##############################################################################
##
## Export
##
##############################################################################
2019-12-12 15:38:20 +01:00
2015-11-04 18:40:30 +01:00
export
evaluate,
compare,
result_type,
2015-11-04 18:40:30 +01:00
Hamming,
Levenshtein,
DamerauLevenshtein,
Jaro,
2019-08-17 18:57:35 +02:00
RatcliffObershelp,
2015-11-04 18:40:30 +01:00
QGram,
Cosine,
Jaccard,
2015-11-05 16:51:32 +01:00
SorensenDice,
Overlap,
2015-11-04 18:40:30 +01:00
Winkler,
Partial,
TokenSort,
2015-11-06 16:47:15 +01:00
TokenSet,
2019-08-17 18:15:14 +02:00
TokenMax,
2019-08-20 18:32:52 +02:00
qgram,
2019-08-20 19:21:31 +02:00
find_best,
find_all
2015-11-06 20:43:04 +01:00
##############################################################################
##
## include
##
##############################################################################
2019-08-18 18:52:37 +02:00
include("utils.jl")
include("edit.jl")
include("qgram.jl")
2017-08-05 20:45:19 +02:00
include("compare.jl")
2019-08-20 19:21:31 +02:00
include("find.jl")
2019-12-11 22:24:47 +01:00
# Type of the value produced by `evaluate` for the string distances listed in
# the signature. The distance is evaluated once on `oneunit(a)` and
# `oneunit(b)`, and the `typeof` of that result is returned.
function result_type(
    m::Union{
        Hamming, Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
        AbstractQGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax
    },
    a::AbstractString,
    b::AbstractString
)
    unit_a = oneunit(a)
    unit_b = oneunit(b)
    return typeof(evaluate(m, unit_a, unit_b))
end
2015-11-06 03:03:45 +01:00
end
2019-08-14 16:30:22 +02:00
##############################################################################
##
## Some memo about Strings
# length: number of characters
# ncodeunits: Return the number of code units in a string (akin to the indices of a vector). Not all such indices are valid: they may not be the start of a character.
# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str multiplied by the size, in bytes, of one code unit in str.
# lastindex: Return the last index of a collection
# nextind(s, i): return the index of the start of the character whose encoding starts after index i
# nextind(s, 0, N): return the index of the Nth character of s (or, if there are fewer than N characters, return ncodeunits(s) + (N - length(s)))
##############################################################################