2015-10-22 18:12:44 +02:00
|
|
|
module StringDistances
|
|
|
|
|
2019-12-12 15:38:20 +01:00
|
|
|
using Distances
|
|
|
|
|
2019-12-12 20:48:52 +01:00
|
|
|
include("utils.jl")
|
|
|
|
include("edit.jl")
|
|
|
|
include("qgram.jl")
|
2020-02-13 15:48:35 +01:00
|
|
|
include("normalize.jl")
|
2020-02-09 19:37:37 +01:00
|
|
|
|
2020-02-12 15:41:46 +01:00
|
|
|
# Union of the distance types that the methods in this file dispatch on
# (see `Distances.result_type` and `compare`).
const StringDistance = Union{
    Jaro,
    Levenshtein,
    DamerauLevenshtein,
    RatcliffObershelp,
    QGramDistance,
    Winkler,
    Partial,
    TokenSort,
    TokenSet,
    TokenMax,
    Normalize
}
|
2020-02-13 15:44:27 +01:00
|
|
|
# Result type reported for a string distance: the type of the value obtained by
# evaluating `dist` on two empty strings. `s1` and `s2` are accepted to match
# the three-argument call form but are not inspected.
function Distances.result_type(dist::StringDistance, s1, s2)
    sample = dist("", "")
    return typeof(sample)
end
|
|
|
|
|
2020-02-13 15:48:35 +01:00
|
|
|
"""
    compare(s1, s2, dist::StringDistance; min_score = 0.0)

Return a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the distance `dist`.

The score is computed as one minus the normalized distance `normalize(dist)`
evaluated on `s1` and `s2`.

### Keywords
- `min_score`: converted to the distance scale as `1 - min_score` and passed as
  the third argument to the normalized distance (defaults to `0.0`).

### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
    normalized = normalize(dist)
    # similarity = 1 - distance, so `min_score` corresponds to `1 - min_score`
    # on the distance scale.
    max_dist = 1 - min_score
    return 1 - normalized(s1, s2, max_dist)
end
|
2020-02-13 15:44:27 +01:00
|
|
|
|
2020-02-09 19:37:37 +01:00
|
|
|
include("find.jl")
|
2020-02-08 17:49:53 +01:00
|
|
|
|
2015-10-22 18:12:44 +02:00
|
|
|
##############################################################################
|
|
|
|
##
|
|
|
|
## Export
|
|
|
|
##
|
|
|
|
##############################################################################
|
2019-12-12 15:38:20 +01:00
|
|
|
|
2015-11-04 18:40:30 +01:00
|
|
|
# The dispatch union itself.
export StringDistance

# Types listed directly in the `StringDistance` union.
export Levenshtein, DamerauLevenshtein, Jaro, RatcliffObershelp
export Winkler, Partial, TokenSort, TokenSet, TokenMax

# Remaining capitalized names (not direct members of the union; presumably
# defined in the included files).
export QGram, Cosine, Jaccard, SorensenDice, Overlap

# Lowercase API names.
export evaluate, compare, result_type, qgrams, normalize
|
2015-11-06 03:03:45 +01:00
|
|
|
end
|
|
|
|
|