2015-10-22 18:12:44 +02:00
|
|
|
|
module StringDistances
|
|
|
|
|
|
2019-12-12 15:38:20 +01:00
|
|
|
|
using Distances
|
|
|
|
|
import Distances: evaluate, result_type
|
|
|
|
|
|
2019-12-18 16:17:08 +01:00
|
|
|
|
|
2019-12-12 20:48:52 +01:00
|
|
|
|
include("utils.jl")
|
|
|
|
|
include("edit.jl")
|
|
|
|
|
include("qgram.jl")
|
|
|
|
|
include("compare.jl")
|
2020-02-08 18:00:44 +01:00
|
|
|
|
# Union of the distance types handled by this package's `result_type` method;
# grouping them lets a single method dispatch on any of them.
const StringDistance = Union{
    Jaro,
    Levenshtein,
    DamerauLevenshtein,
    RatcliffObershelp,
    QGramDistance,
    Winkler,
    Partial,
    TokenSort,
    TokenSet,
    TokenMax
}
|
2019-12-12 20:48:52 +01:00
|
|
|
|
include("find.jl")
|
|
|
|
|
|
2019-12-12 21:32:59 +01:00
|
|
|
|
##############################################################################
|
|
|
|
|
##
|
2020-02-08 17:49:53 +01:00
|
|
|
|
## Distances API
|
2019-12-12 21:32:59 +01:00
|
|
|
|
##
|
|
|
|
|
##############################################################################
|
|
|
|
|
|
2020-02-08 17:49:53 +01:00
|
|
|
|
# Extends `Distances.result_type` for every `StringDistance`.
# `s1` and `s2` are accepted to match the three-argument signature but are not
# inspected: the result type is probed by evaluating `dist` on two empty strings
# and taking the type of whatever comes back.
function result_type(dist::StringDistance, s1, s2)
    probe = evaluate(dist, "", "")
    return typeof(probe)
end
|
|
|
|
|
|
2020-02-08 17:49:53 +01:00
|
|
|
|
|
2015-10-22 18:12:44 +02:00
|
|
|
|
##############################################################################
|
|
|
|
|
##
|
|
|
|
|
## Export
|
|
|
|
|
##
|
|
|
|
|
##############################################################################
|
2019-12-12 15:38:20 +01:00
|
|
|
|
|
2015-11-04 18:40:30 +01:00
|
|
|
|
export
|
2019-12-12 20:48:52 +01:00
|
|
|
|
StringDistance,
|
2015-11-04 18:40:30 +01:00
|
|
|
|
Levenshtein,
|
|
|
|
|
DamerauLevenshtein,
|
|
|
|
|
Jaro,
|
2019-08-17 18:57:35 +02:00
|
|
|
|
RatcliffObershelp,
|
2015-11-04 18:40:30 +01:00
|
|
|
|
QGram,
|
|
|
|
|
Cosine,
|
|
|
|
|
Jaccard,
|
2015-11-05 16:51:32 +01:00
|
|
|
|
SorensenDice,
|
|
|
|
|
Overlap,
|
2015-11-04 18:40:30 +01:00
|
|
|
|
Winkler,
|
|
|
|
|
Partial,
|
|
|
|
|
TokenSort,
|
2015-11-06 16:47:15 +01:00
|
|
|
|
TokenSet,
|
2019-08-17 18:15:14 +02:00
|
|
|
|
TokenMax,
|
2019-12-12 20:48:52 +01:00
|
|
|
|
evaluate,
|
|
|
|
|
compare,
|
|
|
|
|
result_type,
|
2019-12-18 16:17:08 +01:00
|
|
|
|
qgrams
|
2015-11-06 03:03:45 +01:00
|
|
|
|
end
|
|
|
|
|
|
2019-08-14 16:30:22 +02:00
|
|
|
|
##############################################################################
|
|
|
|
|
##
|
2019-12-18 16:17:08 +01:00
|
|
|
|
## Some things about Strings
|
2019-08-14 16:30:22 +02:00
|
|
|
|
|
|
|
|
|
# length: number of characters
|
2019-12-13 00:55:41 +01:00
|
|
|
|
# ncodeunits: Return the number of code units in a string (akin to the indices of a vector).
|
|
|
|
|
# Not all such indices are valid – they may not be the start of a character.
|
|
|
|
|
# sizeof: Size, in bytes, of the string str. Equal to the number of code units in str
|
|
|
|
|
# multiplied by the size, in bytes, of one code unit in str.
|
2019-08-14 16:30:22 +02:00
|
|
|
|
|
|
|
|
|
# lastindex: Return the last index of a collection
|
|
|
|
|
# nextind(s, i): return the index of the start of the character whose encoding starts after index i
|
2019-12-13 00:55:41 +01:00
|
|
|
|
# nextind(s, 0, N): return the index of the Nth character of s (or, if there are
|
|
|
|
|
# fewer than N characters, return ncodeunits(s) + (N - length(s)))
|
2019-08-14 16:30:22 +02:00
|
|
|
|
|
2019-12-11 20:45:58 +01:00
|
|
|
|
##############################################################################
|