parent
6e1013d49c
commit
aed1fc2ad8
|
@ -11,7 +11,6 @@ Distances are defined for `AbstractStrings`, and any iterator that define `lengt
|
||||||
The available distances are:
|
The available distances are:
|
||||||
|
|
||||||
- Edit Distances
|
- Edit Distances
|
||||||
- [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()`
|
|
||||||
- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
|
- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
|
||||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
|
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
|
||||||
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
|
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
|
||||||
|
|
|
@ -7,7 +7,7 @@ include("distances/edit.jl")
|
||||||
include("distances/qgram.jl")
|
include("distances/qgram.jl")
|
||||||
include("normalize.jl")
|
include("normalize.jl")
|
||||||
|
|
||||||
const StringDistance = Union{Hamming, Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
|
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
|
||||||
# Distances API
|
# Distances API
|
||||||
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
|
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
|
||||||
include("find.jl")
|
include("find.jl")
|
||||||
|
@ -21,7 +21,6 @@ include("find.jl")
|
||||||
|
|
||||||
export
|
export
|
||||||
StringDistance,
|
StringDistance,
|
||||||
Hamming,
|
|
||||||
Levenshtein,
|
Levenshtein,
|
||||||
DamerauLevenshtein,
|
DamerauLevenshtein,
|
||||||
Jaro,
|
Jaro,
|
||||||
|
|
|
@ -1,16 +1,3 @@
|
||||||
|
|
||||||
function (dist::Hamming)(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, max_dist::Union{Integer, Nothing} = nothing)
|
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
|
||||||
current = abs(length(s2) - length(s1))
|
|
||||||
max_dist !== nothing && current > max_dist && return max_dist + 1
|
|
||||||
for (ch1, ch2) in zip(s1, s2)
|
|
||||||
current += ch1 != ch2
|
|
||||||
max_dist !== nothing && current > max_dist && return max_dist + 1
|
|
||||||
end
|
|
||||||
return current
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Jaro()
|
Jaro()
|
||||||
|
|
||||||
|
|
|
@ -12,18 +12,8 @@ normalize(dist::SemiMetric) = Normalize(dist)
|
||||||
normalize(dist::Normalize) = dist
|
normalize(dist::Normalize) = dist
|
||||||
|
|
||||||
|
|
||||||
function (dist::Normalize{Hamming})(s1::AbstractString, s2::AbstractString; max_dist = 1.0)
|
|
||||||
s1, s2 = reorder(s1, s2)
|
|
||||||
len1, len2 = length(s1), length(s2)
|
|
||||||
len2 == 0 && return 1.0
|
|
||||||
out = evaluate(dist, s1, s2, max_dist * len2) / len2
|
|
||||||
out > max_dist ? 1.0 : out
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# A normalized distance is between 0 and 1, and accept a third argument, max_dist.
|
# A normalized distance is between 0 and 1, and accept a third argument, max_dist.
|
||||||
function (dist::Normalize{<: Union{Hamming, Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
|
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
|
||||||
((s1 === missing) | (s2 === missing)) && return missing
|
((s1 === missing) | (s2 === missing)) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
|
|
|
@ -3,14 +3,6 @@ using StringDistances, Unicode, Test
|
||||||
|
|
||||||
@testset "Distances" begin
|
@testset "Distances" begin
|
||||||
|
|
||||||
@testset "Hamming" begin
|
|
||||||
@test evaluate(Hamming(), "martha", "marhta") ≈ 2
|
|
||||||
@test evaluate(Hamming(), "es an ", " vs an") ≈ 6
|
|
||||||
@test result_type(Hamming(), "hello", "world") == typeof(1)
|
|
||||||
@inferred evaluate(Hamming(), "", "")
|
|
||||||
@test ismissing(evaluate(Hamming(), "", missing))
|
|
||||||
end
|
|
||||||
|
|
||||||
@testset "Jaro" begin
|
@testset "Jaro" begin
|
||||||
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
||||||
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
|
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
|
||||||
|
|
|
@ -16,10 +16,6 @@ using StringDistances, Unicode, Test
|
||||||
@test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol = 1e-4
|
@test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol = 1e-4
|
||||||
@test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol = 1e-4
|
@test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol = 1e-4
|
||||||
|
|
||||||
#Hamming
|
|
||||||
compare("aüa", "aua", Hamming()) ≈ 2/3 atol = 1e-4
|
|
||||||
compare("aaua", "aa", Partial(Hamming())) ≈ 1.0 atol = 1e-4
|
|
||||||
|
|
||||||
# Jaro
|
# Jaro
|
||||||
compare("aüa", "aua", Jaro())
|
compare("aüa", "aua", Jaro())
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue