StringDistances.jl/src/StringDistances.jl

69 lines
1.8 KiB
Julia
Raw Normal View History

2015-10-22 18:12:44 +02:00
module StringDistances
2019-12-12 15:38:20 +01:00
using Distances
import Distances: evaluate, result_type
2019-12-18 16:17:08 +01:00
2019-12-12 20:48:52 +01:00
include("utils.jl")
include("edit.jl")
include("qgram.jl")
include("compare.jl")
2020-02-08 18:00:44 +01:00
# Union of the types this module treats as string distances; the `result_type`
# method defined in this file dispatches on it.
# NOTE(review): declared before `include("find.jl")`, presumably because that
# file refers to `StringDistance` — confirm before reordering.
const StringDistance = Union{
    Jaro,
    Levenshtein,
    DamerauLevenshtein,
    RatcliffObershelp,
    QGramDistance,
    Winkler,
    Partial,
    TokenSort,
    TokenSet,
    TokenMax
}
2019-12-12 20:48:52 +01:00
include("find.jl")
2019-12-12 21:32:59 +01:00
##############################################################################
##
2020-02-08 17:49:53 +01:00
## Distances API
2019-12-12 21:32:59 +01:00
##
##############################################################################
2020-02-08 17:49:53 +01:00
"""
    result_type(dist::StringDistance, s1, s2)

Return the type of the value produced by evaluating `dist` on two empty strings.

`s1` and `s2` are ignored: the result type is determined solely from
`evaluate(dist, "", "")`.
"""
result_type(dist::StringDistance, s1, s2) = typeof(evaluate(dist, "", ""))
2020-02-08 17:49:53 +01:00
2015-10-22 18:12:44 +02:00
##############################################################################
##
## Export
##
##############################################################################
2019-12-12 15:38:20 +01:00
2015-11-04 18:40:30 +01:00
# Public capitalized names: the `StringDistance` union followed by the
# individual distance names.
export StringDistance,
    Levenshtein, DamerauLevenshtein, Jaro, RatcliffObershelp,
    QGram, Cosine, Jaccard, SorensenDice, Overlap,
    Winkler, Partial, TokenSort, TokenSet, TokenMax

# Public lowercase names (`evaluate` and `result_type` are the ones imported
# from Distances at the top of the module).
export evaluate, compare, result_type, qgrams
2015-11-06 03:03:45 +01:00
end
2019-08-14 16:30:22 +02:00
##############################################################################
##
2019-12-18 16:17:08 +01:00
## Some things about Strings
2019-08-14 16:30:22 +02:00
# length: number of characters
2019-12-13 00:55:41 +01:00
# ncodeunits: Return the number of code units in a string (akin to the index of a vector).
# Not all such indices are valid: they may not be the start of a character.
# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str
# multiplied by the size, in bytes, of one code unit in str.
2019-08-14 16:30:22 +02:00
# lastindex: Return the last index of a collection
# nextind(s, i): return the index of the start of the character whose encoding starts after index i
2019-12-13 00:55:41 +01:00
# nextind(s, 0, N): return the index of the Nth character of s (or, if there are
# fewer than N characters, return ncodeunits(s) + (N - length(s)))
2019-08-14 16:30:22 +02:00
##############################################################################