StringDistances.jl/src/StringDistances.jl

74 lines
1.9 KiB
Julia
Raw Normal View History

2015-10-22 18:12:44 +02:00
module StringDistances

using Distances
import Distances: evaluate, result_type

# Common supertype for the string distances of this package. It subtypes
# `SemiMetric` (brought into scope by `using Distances`).
abstract type StringDistance <: SemiMetric end

# Implementation files. They are included after `StringDistance` is declared
# and before the methods below, which reference `RatcliffObershelp`.
include("utils.jl")
include("edit.jl")
include("qgram.jl")
include("compare.jl")
include("find.jl")

##############################################################################
##
## Handle missing values
##
##############################################################################

# `RatcliffObershelp` propagates `missing`: when either argument, or both,
# is `missing`, the result is `missing`. The `(Missing, Missing)` method
# makes the both-missing case explicit instead of relying on a fallback.
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
evaluate(::RatcliffObershelp, ::Missing, ::Missing) = missing

"""
    result_type(dist::StringDistance, s1::AbstractString, s2::AbstractString)

Return the type of the value obtained when evaluating `dist` on strings of the
same types as `s1` and `s2`.

The type is computed as `typeof(evaluate(dist, oneunit(s1), oneunit(s2)))`, i.e.
`dist` is evaluated on `oneunit` of each argument rather than on `s1` and `s2`
themselves.
"""
function result_type(dist::StringDistance, s1::AbstractString, s2::AbstractString)
    return typeof(evaluate(dist, oneunit(s1), oneunit(s2)))
end

##############################################################################
##
## Export
##
##############################################################################

export
    StringDistance,
    Levenshtein,
    DamerauLevenshtein,
    Jaro,
    RatcliffObershelp,
    QGram,
    Cosine,
    Jaccard,
    SorensenDice,
    Overlap,
    Winkler,
    Partial,
    TokenSort,
    TokenSet,
    TokenMax,
    evaluate,
    compare,
    result_type,
    qgrams

end
##############################################################################
##
## Some things about Strings
# length: number of characters
# ncodeunits: return the number of code units in a string (akin to the index of a vector).
#   Not all such indices are valid: they may not be the start of a character.
# sizeof: size, in bytes, of the string str. Equal to the number of code units in str
#   multiplied by the size, in bytes, of one code unit in str.
# lastindex: return the last index of a collection
# nextind(s, i): return the index of the start of the character whose encoding starts
#   after index i
# nextind(s, 0, N): return the index of the Nth character of s (or, if there are
#   fewer than N characters, return ncodeunits(s) + (N - length(s)))
##############################################################################