add functors
parent
1ccef94f1a
commit
4806349088
|
@ -50,6 +50,12 @@ evaluate(TokenMax(RatcliffObershelp()), "martha", "marhta")
|
|||
|
||||
A good distance to match strings composed of multiple words (like addresses) is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).
|
||||
|
||||
Alternatively, each distance struct is itself callable: calling it on two strings is equivalent to calling `evaluate` with that metric or modified metric. For example:
|
||||
```julia
|
||||
Jaro()("martha", "marhta")
|
||||
Winkler(Jaro())("martha", "marhta")
|
||||
QGram(2)("martha", "marhta")
|
||||
```
|
||||
## Compare
|
||||
The function `compare` is defined as 1 minus the normalized distance between two strings. It always returns a number between 0 and 1: a value of 0 means completely different and a value of 1 means completely similar.
|
||||
```julia
|
||||
|
|
|
@ -1,16 +1,15 @@
|
|||
module StringDistances
|
||||
|
||||
using Distances
|
||||
import Distances: evaluate, result_type
|
||||
|
||||
include("utils.jl")
|
||||
include("edit.jl")
|
||||
include("qgram.jl")
|
||||
include("modifiers.jl")
|
||||
|
||||
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax}
|
||||
result_type(dist::StringDistance, s1, s2) = typeof(evaluate(dist, "", ""))
|
||||
|
||||
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax, Normalize}
|
||||
Distances.result_type(dist::StringDistance, s1, s2) = typeof(dist("", ""))
|
||||
Distances.evaluate(dist::StringDistance, args...) = dist(args...)
|
||||
include("find.jl")
|
||||
|
||||
##############################################################################
|
||||
|
|
|
@ -16,7 +16,7 @@ isnormalized(::Jaro) = true
|
|||
|
||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||
## accepts any iterator, including AbstractString
|
||||
function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
|
||||
function (dist::Jaro)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
@ -85,7 +85,7 @@ struct Levenshtein <: Metric end
|
|||
# Return max_dist + 1 if the distance is higher than max_dist
|
||||
# This makes it possible to differentiate a distance equal to max_dist from one strictly higher
|
||||
# This is important for find_all
|
||||
function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
|
||||
function (dist::Levenshtein)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
@ -139,7 +139,7 @@ required to change one string into the other.
|
|||
struct DamerauLevenshtein <: SemiMetric end
|
||||
|
||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
|
||||
function (dist::DamerauLevenshtein)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
@ -225,7 +225,7 @@ struct RatcliffObershelp <: SemiMetric end
|
|||
|
||||
isnormalized(::RatcliffObershelp) = true
|
||||
|
||||
function evaluate(dist::RatcliffObershelp, s1, s2, max_dist = nothing)
|
||||
function (dist::RatcliffObershelp)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
|
|
@ -10,7 +10,7 @@ julia> compare("martha", "marhta", Levenshtein())
|
|||
0.6666666666666667
|
||||
```
|
||||
"""
|
||||
compare(s1, s2, dist::StringDistance; min_score = 0.0) = 1 - evaluate(normalize(dist), s1, s2, 1 - min_score)
|
||||
compare(s1, s2, dist::StringDistance; min_score = 0.0) = 1 - normalize(dist)(s1, s2, 1 - min_score)
|
||||
|
||||
|
||||
"""
|
||||
|
|
|
@ -14,26 +14,26 @@ isnormalized(dist::SemiMetric) = false
|
|||
isnormalized(dist::Normalize) = true
|
||||
|
||||
|
||||
function evaluate(dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}}, s1, s2, max_dist = 1.0)
|
||||
function (dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}})(s1, s2, max_dist = 1.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len2 == 0 && return 1.0
|
||||
d = evaluate(dist.dist, s1, s2, ceil(Int, len2 * max_dist))
|
||||
d = dist.dist(s1, s2, ceil(Int, len2 * max_dist))
|
||||
out = d / len2
|
||||
out > max_dist ? 1.0 : out
|
||||
end
|
||||
|
||||
function evaluate(dist::Normalize{<: QGramDistance}, s1, s2, max_dist = 1.0)
|
||||
function (dist::Normalize{<: QGramDistance})(s1, s2, max_dist = 1.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
# When len1 < q for a qgram distance, return 1.0 if s1 != s2 and 0.0 otherwise
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 <= dist.dist.q - 1 && return convert(Float64, s1 != s2)
|
||||
if typeof(dist.dist) <: QGram
|
||||
evaluate(dist.dist, s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
|
||||
dist.dist(s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
|
||||
else
|
||||
evaluate(dist.dist, s1, s2)
|
||||
dist.dist(s1, s2)
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -55,17 +55,16 @@ struct Winkler{S <: SemiMetric} <: SemiMetric
|
|||
maxlength::Integer # max length of common prefix. Default to 4
|
||||
Winkler{S}(dist::S, p, threshold, maxlength) where {S <: SemiMetric} = new(dist, p, threshold, maxlength)
|
||||
end
|
||||
isnormalized(dist::Winkler) = true
|
||||
|
||||
# Outer constructor for the Winkler modifier.
#
# `dist` is normalized before being wrapped. The keyword arguments are stored
# on the resulting struct and read by the Winkler evaluation:
#   - `p`:         scaling factor applied per character of common prefix
#   - `threshold`: the prefix boost is applied only when score <= 1 - threshold
#   - `maxlength`: max length of common prefix taken into account
#
# Throws if `p * maxlength` exceeds one.
function Winkler(dist::SemiMetric; p = 0.1, threshold = 0.7, maxlength = 4)
    p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
    # Normalize once and reuse the result for both the type parameter and the field.
    normalized = normalize(dist)
    # Forward the caller's keyword values; they were previously replaced by
    # the hard-coded defaults 0.1, 0.7 and 4.
    return Winkler{typeof(normalized)}(normalized, p, threshold, maxlength)
end
|
||||
isnormalized(dist::Winkler) = true
|
||||
|
||||
|
||||
function evaluate(dist::Winkler, s1, s2, max_dist = 1.0)
|
||||
function (dist::Winkler)(s1, s2, max_dist = 1.0)
|
||||
# cannot do min_score because of boosting threshold
|
||||
score = evaluate(dist.dist, s1, s2)
|
||||
score = dist.dist(s1, s2)
|
||||
if score <= 1 - dist.threshold
|
||||
l = common_prefix(s1, s2)[1]
|
||||
score -= min(l, dist.maxlength) * dist.p * score
|
||||
|
@ -97,24 +96,24 @@ end
|
|||
Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
|
||||
isnormalized(dist::Partial) = true
|
||||
|
||||
function evaluate(dist::Partial, s1, s2, max_dist = 1.0)
|
||||
function (dist::Partial)(s1, s2, max_dist = 1.0)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return evaluate(dist.dist, s1, s2, max_dist)
|
||||
len1 == len2 && return dist.dist(s1, s2, max_dist)
|
||||
len1 == 0 && return 0.0
|
||||
out = 1.0
|
||||
for x in qgrams(s2, len1)
|
||||
curr = evaluate(dist.dist, s1, x, max_dist)
|
||||
curr = dist.dist(s1, x, max_dist)
|
||||
out = min(out, curr)
|
||||
max_dist = min(out, max_dist)
|
||||
end
|
||||
return out
|
||||
end
|
||||
|
||||
function evaluate(dist::Partial{RatcliffObershelp}, s1, s2, max_dist = 1.0)
|
||||
function (dist::Partial{RatcliffObershelp})(s1, s2, max_dist = 1.0)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return evaluate(dist.dist, s1, s2)
|
||||
len1 == len2 && return dist.dist(s1, s2)
|
||||
out = 1.0
|
||||
for r in matching_blocks(s1, s2)
|
||||
# Make sure the substring of s2 has length len1
|
||||
|
@ -127,7 +126,7 @@ function evaluate(dist::Partial{RatcliffObershelp}, s1, s2, max_dist = 1.0)
|
|||
s2_start += len2 - s2_end
|
||||
s2_end += len2 - s2_end
|
||||
end
|
||||
curr = evaluate(dist.dist, s1, _slice(s2, s2_start - 1, s2_end))
|
||||
curr = dist.dist(s1, _slice(s2, s2_start - 1, s2_end))
|
||||
|
||||
out = min(out, curr)
|
||||
end
|
||||
|
@ -159,10 +158,10 @@ TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist)
|
|||
isnormalized(dist::TokenSort) = true
|
||||
|
||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
function evaluate(dist::TokenSort, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
|
||||
function (dist::TokenSort)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
|
||||
s1 = join(sort!(split(s1)), " ")
|
||||
s2 = join(sort!(split(s2)), " ")
|
||||
evaluate(dist.dist, s1, s2, max_dist)
|
||||
dist.dist(s1, s2, max_dist)
|
||||
end
|
||||
|
||||
|
||||
|
@ -190,19 +189,19 @@ TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
|
|||
isnormalized(dist::TokenSet) = true
|
||||
|
||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
function evaluate(dist::TokenSet, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
|
||||
function (dist::TokenSet)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
|
||||
v1 = unique!(sort!(split(s1)))
|
||||
v2 = unique!(sort!(split(s2)))
|
||||
v0 = intersect(v1, v2)
|
||||
s0 = join(v0, " ")
|
||||
s1 = join(v1, " ")
|
||||
s2 = join(v2, " ")
|
||||
isempty(s0) && return evaluate(dist.dist, s1, s2, max_dist)
|
||||
score_01 = evaluate(dist.dist, s0, s1, max_dist)
|
||||
isempty(s0) && return dist.dist(s1, s2, max_dist)
|
||||
score_01 = dist.dist(s0, s1, max_dist)
|
||||
max_dist = min(max_dist, score_01)
|
||||
score_02 = evaluate(dist.dist, s0, s2, max_dist)
|
||||
score_02 = dist.dist(s0, s2, max_dist)
|
||||
max_dist = min(max_dist, score_02)
|
||||
score_12 = evaluate(dist.dist, s1, s2, max_dist)
|
||||
score_12 = dist.dist(s1, s2, max_dist)
|
||||
min(score_01, score_02, score_12)
|
||||
end
|
||||
|
||||
|
@ -232,29 +231,29 @@ end
|
|||
TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
|
||||
isnormalized(dist::TokenMax) = true
|
||||
|
||||
function evaluate(dist::TokenMax, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
|
||||
function (dist::TokenMax)(s1::AbstractString, s2::AbstractString, max_dist = 1.0)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
score = evaluate(dist.dist, s1, s2, max_dist)
|
||||
score = dist.dist(s1, s2, max_dist)
|
||||
min_score = min(max_dist, score)
|
||||
unbase_scale = 0.95
|
||||
# if one string is much shorter than the other, use partial
|
||||
if length(s2) >= 1.5 * length(s1)
|
||||
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
|
||||
score_partial = 1 - partial_scale * (1 - evaluate(Partial(dist.dist), s1, s2, 1 - (1 - max_dist) / partial_scale))
|
||||
score_partial = 1 - partial_scale * (1 - Partial(dist.dist)(s1, s2, 1 - (1 - max_dist) / partial_scale))
|
||||
min_score = min(max_dist, score_partial)
|
||||
score_sort = 1 - unbase_scale * partial_scale *
|
||||
(1 - evaluate(TokenSort(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
||||
(1 - TokenSort(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
||||
max_dist = min(max_dist, score_sort)
|
||||
score_set = 1 - unbase_scale * partial_scale *
|
||||
(1 - evaluate(TokenSet(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
||||
(1 - TokenSet(Partial(dist.dist))(s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
||||
return min(score, score_partial, score_sort, score_set)
|
||||
else
|
||||
score_sort = 1 - unbase_scale *
|
||||
(1 - evaluate(TokenSort(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
|
||||
(1 - TokenSort(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
|
||||
max_dist = min(max_dist, score_sort)
|
||||
score_set = 1 - unbase_scale *
|
||||
(1 - evaluate(TokenSet(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
|
||||
(1 - TokenSet(dist.dist)(s1, s2, 1 - (1 - max_dist) / unbase_scale))
|
||||
return min(score, score_sort, score_set)
|
||||
end
|
||||
end
|
16
src/qgram.jl
16
src/qgram.jl
|
@ -45,9 +45,6 @@ qgrams(s::Union{AbstractString, AbstractVector}, q::Integer) = QGramIterator(s,
|
|||
qgrams(s, q::Integer) = QGramIterator(collect(s), q)
|
||||
|
||||
|
||||
|
||||
abstract type QGramDistance <: SemiMetric end
|
||||
|
||||
# For two iterators x1 and x2, that define a length and eltype method,
|
||||
# this returns a dictionary which, for each element in x1 or x2,
|
||||
# returns a tuple with the numbers of times it appears in x1 and x2
|
||||
|
@ -80,6 +77,9 @@ function count_map(s1, s2)
|
|||
return d
|
||||
end
|
||||
|
||||
|
||||
abstract type QGramDistance <: SemiMetric end
|
||||
|
||||
"""
|
||||
QGram(q::Int)
|
||||
|
||||
|
@ -96,7 +96,7 @@ struct QGram <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::QGram, s1, s2)
|
||||
function (dist::QGram)(s1, s2)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
n = 0
|
||||
|
@ -122,7 +122,7 @@ struct Cosine <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Cosine, s1, s2, max_dist = nothing)
|
||||
function (dist::Cosine)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
norm1, norm2, prodnorm = 0, 0, 0
|
||||
|
@ -149,7 +149,7 @@ struct Jaccard <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Jaccard, s1, s2, max_dist = nothing)
|
||||
function (dist::Jaccard)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
|
@ -176,7 +176,7 @@ struct SorensenDice <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::SorensenDice, s1, s2, max_dist = nothing)
|
||||
function (dist::SorensenDice)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
|
@ -203,7 +203,7 @@ struct Overlap <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Overlap, s1, s2, max_dist = nothing)
|
||||
function (dist::Overlap)(s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
|
|
|
@ -9,11 +9,15 @@ using StringDistances, Unicode, Test
|
|||
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
|
||||
@test evaluate(Jaro(), [1, 2, 3], [1,2, 4]) ≈ 0.2222222222222222
|
||||
@test evaluate(Jaro(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Jaro(), "alborgów", "amoniak")
|
||||
@test Jaro()(" vs an", "es an ") ≈ 0.2777777777777777
|
||||
@test result_type(Jaro(), "hello", "world") == typeof(float(1))
|
||||
@inferred Float64 evaluate(Jaro(), "", "")
|
||||
@test ismissing(evaluate(Jaro(), "", missing))
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
@testset "Levenshtein" begin
|
||||
@test evaluate(Levenshtein(), "", "") == 0
|
||||
@test evaluate(Levenshtein(), "abc", "") == 3
|
||||
|
@ -25,6 +29,7 @@ using StringDistances, Unicode, Test
|
|||
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
|
||||
@test evaluate(Levenshtein(), [1, 2, 3], [1,2, 4]) == 1
|
||||
@test evaluate(Levenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(Levenshtein(), "alborgów", "amoniak")
|
||||
@test Levenshtein()("", "abc") == 3
|
||||
@test result_type(Levenshtein(), "hello", "world") == Int
|
||||
@inferred Int evaluate(Levenshtein(), "", "")
|
||||
@test ismissing(evaluate(Levenshtein(), "", missing))
|
||||
|
@ -41,6 +46,7 @@ using StringDistances, Unicode, Test
|
|||
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
|
||||
@test evaluate(DamerauLevenshtein(), [1, 2, 3], [1,2, 4]) == 1
|
||||
@test evaluate(DamerauLevenshtein(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(DamerauLevenshtein(), "alborgów", "amoniak")
|
||||
@test DamerauLevenshtein()("bc", "abc") == 1
|
||||
@test result_type(DamerauLevenshtein(), "hello", "world") == Int
|
||||
@inferred Int evaluate(DamerauLevenshtein(), "", "")
|
||||
@test ismissing(evaluate(DamerauLevenshtein(), "", missing))
|
||||
|
@ -56,6 +62,7 @@ using StringDistances, Unicode, Test
|
|||
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
|
||||
@test evaluate(RatcliffObershelp(), [1, 2, 3], [1,2, 4]) ≈ 1/3
|
||||
@test evaluate(RatcliffObershelp(), graphemes("alborgów"), graphemes("amoniak")) == evaluate(RatcliffObershelp(), "alborgów", "amoniak")
|
||||
@test RatcliffObershelp()("pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
|
||||
@test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
|
||||
@inferred Float64 evaluate(RatcliffObershelp(), "", "")
|
||||
@test ismissing(evaluate(RatcliffObershelp(), "", missing))
|
||||
|
@ -68,19 +75,23 @@ using StringDistances, Unicode, Test
|
|||
@test evaluate(QGram(1), "abc", "cba") == 0
|
||||
@test evaluate(QGram(1), "abc", "ccc") == 4
|
||||
@test evaluate(QGram(4), "aü☃", "aüaüafs") == 4
|
||||
@test evaluate( QGram(2), SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2
|
||||
@test evaluate(QGram(2), SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2
|
||||
@test evaluate(QGram(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(QGram(2), "alborgów", "amoniak")
|
||||
@test QGram(1)("abc", "cba") == 0
|
||||
@test result_type(QGram(1), "hello", "world") == Int
|
||||
@test ismissing(evaluate(QGram(1), "", missing))
|
||||
@inferred Int evaluate(QGram(1), "", "")
|
||||
end
|
||||
|
||||
|
||||
|
||||
@testset "Cosine" begin
|
||||
@test isnan(evaluate(Cosine(2), "", "abc"))
|
||||
@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
|
||||
@test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
|
||||
@test evaluate(Cosine(2), [1, 2, 3], [1, 2, 4]) ≈ 0.5
|
||||
@test evaluate(Cosine(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Cosine(2), "alborgów", "amoniak")
|
||||
@test Cosine(2)("leia", "leela") ≈ 0.7113249 atol = 1e-4
|
||||
@test result_type(Cosine(2), "hello", "world") == typeof(float(1))
|
||||
@inferred Float64 evaluate(Cosine(2), "", "")
|
||||
@test ismissing(evaluate(Cosine(2), "", missing))
|
||||
|
@ -92,6 +103,7 @@ using StringDistances, Unicode, Test
|
|||
@test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
|
||||
@test evaluate(Jaccard(2), [1, 2, 3], [1, 2, 4]) ≈ 2/3 atol = 1e-4
|
||||
@test evaluate(Jaccard(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(Jaccard(2), "alborgów", "amoniak")
|
||||
@test Jaccard(2)("leia", "leela") ≈ 0.83333 atol = 1e-4
|
||||
@test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
|
||||
@inferred Float64 evaluate(Jaccard(1), "", "")
|
||||
@test ismissing(evaluate(Jaccard(1), "", missing))
|
||||
|
@ -101,6 +113,7 @@ using StringDistances, Unicode, Test
|
|||
@test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
|
||||
@test evaluate(SorensenDice(2), graphemes("alborgów"), graphemes("amoniak")) ≈ evaluate(SorensenDice(2), "alborgów", "amoniak")
|
||||
@test SorensenDice(2)("night", "nacht") ≈ 0.75 atol = 1e-4
|
||||
@test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
|
||||
@inferred Float64 evaluate(SorensenDice(1), "", "")
|
||||
@test ismissing(evaluate(SorensenDice(1), "", missing))
|
||||
|
@ -109,6 +122,7 @@ using StringDistances, Unicode, Test
|
|||
@testset "Overlap" begin
|
||||
@test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4
|
||||
@test Overlap(1)("context", "contact") ≈ .2 atol = 1e-4
|
||||
@test result_type(Overlap(1), "hello", "world") == typeof(float(1))
|
||||
@inferred Float64 evaluate(Overlap(1), "", "")
|
||||
@test ismissing(evaluate(Overlap(1), "", missing))
|
||||
|
|
Loading…
Reference in New Issue