parent
6e1013d49c
commit
aed1fc2ad8
|
@ -11,7 +11,6 @@ Distances are defined for `AbstractStrings`, and any iterator that define `lengt
|
|||
The available distances are:
|
||||
|
||||
- Edit Distances
|
||||
- [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()`
|
||||
- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
|
||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
|
||||
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
|
||||
|
|
|
@ -7,7 +7,7 @@ include("distances/edit.jl")
|
|||
include("distances/qgram.jl")
|
||||
include("normalize.jl")
|
||||
|
||||
const StringDistance = Union{Hamming, Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
|
||||
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
|
||||
# Distances API
|
||||
# The result type of a string distance is found by evaluating the distance on
# two empty strings; the actual arguments `s1` and `s2` are not inspected.
function Distances.result_type(dist::StringDistance, s1, s2)
    return typeof(dist("", ""))
end
|
||||
include("find.jl")
|
||||
|
@ -21,7 +21,6 @@ include("find.jl")
|
|||
|
||||
export
|
||||
StringDistance,
|
||||
Hamming,
|
||||
Levenshtein,
|
||||
DamerauLevenshtein,
|
||||
Jaro,
|
||||
|
|
|
@ -1,16 +1,3 @@
|
|||
|
||||
# Hamming distance between `s1` and `s2`.
#
# The result is the number of positions at which the paired characters differ,
# plus the absolute difference between the two lengths. Returns `missing` when
# either argument is `missing`. When `max_dist` is given, the computation stops
# as soon as the running count exceeds it and `max_dist + 1` is returned.
function (dist::Hamming)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, max_dist::Union{Integer, Nothing} = nothing)
    if ismissing(s1) || ismissing(s2)
        return missing
    end
    # Start from the length difference: unpaired characters always count.
    mismatches = abs(length(s2) - length(s1))
    if max_dist !== nothing && mismatches > max_dist
        return max_dist + 1
    end
    for (c1, c2) in zip(s1, s2)
        mismatches += (c1 != c2)
        # Early exit once the bound is exceeded.
        if max_dist !== nothing && mismatches > max_dist
            return max_dist + 1
        end
    end
    return mismatches
end
|
||||
|
||||
|
||||
"""
|
||||
Jaro()
|
||||
|
||||
|
|
|
@ -12,18 +12,8 @@ normalize(dist::SemiMetric) = Normalize(dist)
|
|||
normalize(dist::Normalize) = dist
|
||||
|
||||
|
||||
# Normalized Hamming distance: the raw distance divided by the length of the
# second string returned by `reorder`. Returns 1.0 when that length is zero,
# and also whenever the normalized value exceeds `max_dist`.
#
# NOTE(review): `reorder` is defined elsewhere; presumably it places the longer
# string second, so the divisor is the larger length. Confirm.
# NOTE(review): `evaluate` is called with `dist` itself (the `Normalize`
# wrapper) and a positional bound of `max_dist * length`. Verify which method
# this dispatches to and that it reaches the underlying Hamming distance.
function (dist::Normalize{Hamming})(s1::AbstractString, s2::AbstractString; max_dist = 1.0)
    a, b = reorder(s1, s2)
    n = length(b)
    if n == 0
        return 1.0
    end
    # Scale the bound up to raw-distance units, then scale the result back down.
    scaled = evaluate(dist, a, b, max_dist * n) / n
    return scaled > max_dist ? 1.0 : scaled
end
|
||||
|
||||
|
||||
|
||||
# A normalized distance is between 0 and 1, and accepts a third argument, max_dist.
|
||||
function (dist::Normalize{<: Union{Hamming, Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
|
||||
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
|
||||
((s1 === missing) | (s2 === missing)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
|
|
@ -3,14 +3,6 @@ using StringDistances, Unicode, Test
|
|||
|
||||
@testset "Distances" begin
|
||||
|
||||
@testset "Hamming" begin
    hamming = Hamming()
    # Distances between equal-length strings.
    @test evaluate(hamming, "martha", "marhta") ≈ 2
    @test evaluate(hamming, "es an ", " vs an") ≈ 6
    # The distance is reported as a machine integer.
    @test result_type(hamming, "hello", "world") == Int
    # The return type must be inferrable for concrete string arguments.
    @inferred evaluate(hamming, "", "")
    # `missing` propagates instead of raising an error.
    @test ismissing(evaluate(hamming, "", missing))
end
|
||||
|
||||
@testset "Jaro" begin
|
||||
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
||||
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
|
||||
|
|
|
@ -16,10 +16,6 @@ using StringDistances, Unicode, Test
|
|||
@test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol = 1e-4
|
||||
|
||||
# Hamming
# These comparisons must be wrapped in `@test`: without it the result is never
# asserted, and the trailing `atol = ...` keyword is only meaningful to `@test`.
# One of the three characters differs, so the similarity is 2/3.
@test compare("aüa", "aua", Hamming()) ≈ 2/3 atol = 1e-4
# The shorter string is expected to match part of the longer one exactly.
@test compare("aaua", "aa", Partial(Hamming())) ≈ 1.0 atol = 1e-4
|
||||
|
||||
# Jaro
|
||||
compare("aüa", "aua", Jaro())
|
||||
|
||||
|
|
Loading…
Reference in New Issue