pull/17/head
matthieugomez 2019-08-20 13:21:31 -04:00
parent 385461a61c
commit 318ede6665
8 changed files with 134 additions and 97 deletions

View File

@ -88,7 +88,9 @@ The package includes distance "modifiers", that can be applied to any distance.
compare("mariners vs angels", "los angeles angels at seattle mariners", TokenMax(RatcliffObershelp()))
#> 0.855
```
## Compare vs Evaluate
The function `compare` returns a similarity score: a value of 0 means completely different and a value of 1 means completely similar.
In contrast, the function `evaluate` returns the literal distance between two strings, with a value of 0 being completely similar. Some distances are between 0 and 1. Others are unbounded.
@ -99,13 +101,22 @@ evaluate(Levenshtein(), "New York", "New York")
#> 0
```
## Extract
On master, there is an experimental `extract` function that returns the best match in an iterator of AbstractStrings:
## Find (experimental)
`find_best` returns the best match in an iterator of AbstractStrings:
```julia
extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein())
find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein())
#> "NewYork"
```
The function is particularly fast for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
`find_all` returns a `Vector` with all the matches in an iterator of AbstractStrings:
```julia
find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.8)
#> 1-element Array{String,1}:
#> "NewYork"
```
These functions are optimized for `Levenshtein` and `DamerauLevenshtein` distances (as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
## Which distance should I use?

View File

@ -27,7 +27,8 @@ TokenSort,
TokenSet,
TokenMax,
qgram,
extract
find_best,
find_all
##############################################################################
##
@ -38,7 +39,7 @@ include("utils.jl")
include("edit.jl")
include("qgram.jl")
include("compare.jl")
include("extract.jl")
include("find.jl")
end

View File

@ -10,26 +10,38 @@
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
"""
function compare(s1::AbstractString, s2::AbstractString,
dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = nothing)
function compare(s1::AbstractString, s2::AbstractString, dist::Hamming; min_score::Nothing = nothing)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 == 0 && return 1.0
if min_dist === nothing
1.0 - evaluate(dist, s1, s2) / len2
else
max_dist = ceil(Int, len2 * (1 - min_dist))
# need to add max in case of integer stuff
max(1.0 - evaluate(dist, s1, s2; max_dist = max_dist) / len2, min_dist)
end
1.0 - evaluate(dist, s1, s2) / len2
end
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_dist::Nothing = nothing)
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_score::Nothing = nothing)
1.0 - evaluate(dist, s1, s2)
end
# Similarity score for the edit distances that accept a `max_dist` cutoff.
# When `min_score` is given, any pair whose score would fall below it yields 0.0.
function compare(
    s1::AbstractString,
    s2::AbstractString,
    dist::Union{Levenshtein, DamerauLevenshtein};
    min_score = nothing,
)
    s1, s2 = reorder(s1, s2)
    # The length of `s2` after `reorder` is the normalizer for the distance.
    norm_len = length(s2)
    norm_len == 0 && return 1.0
    if min_score === nothing
        return 1.0 - evaluate(dist, s1, s2) / norm_len
    end
    # Largest distance still compatible with `min_score`; `ceil` keeps the bound conservative.
    budget = ceil(Int, norm_len * (1 - min_score))
    d = evaluate(dist, s1, s2; max_dist = budget)
    score = 1.0 - d / norm_len
    # `evaluate` signals "cutoff exceeded" with -1; the explicit score check covers
    # the case where rounding made the budget one unit too generous.
    return (d == -1 || score < min_score) ? 0.0 : score
end
function compare(s1::AbstractString, s2::AbstractString, dist::AbstractQGramDistance;
min_dist::Nothing = nothing)
min_score::Nothing = nothing)
# When string length < q for qgram distance, returns s1 == s2
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@ -67,10 +79,10 @@ end
Winkler(x) = Winkler(x, 0.1, 0.7, 4)
# hard to use min_dist because of whether there is boost or not in the end
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_dist::Nothing = nothing)
# hard to use min_score because of whether there is boost or not in the end
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_score::Nothing = nothing)
l = remove_prefix(s1, s2, dist.l)[1]
# cannot do min_dist because of boosting threshold
# cannot do min_score because of boosting threshold
score = compare(s1, s2, dist.dist)
if score >= dist.boosting_threshold
score += l * dist.p * (1 - score)
@ -95,21 +107,21 @@ struct Partial{T <: PreMetric} <: PreMetric
dist::T
end
function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_dist = nothing)
function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_score = nothing)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return compare(s1, s2, dist.dist; min_dist = min_dist)
len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
len1 == 0 && return 1.0
out = 0.0
for x in qgram(s2, len1)
curr = compare(s1, x, dist.dist; min_dist = min_dist)
curr = compare(s1, x, dist.dist; min_score = min_score)
out = max(out, curr)
end
return out
end
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp};
min_dist = nothing)
min_score = nothing)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return compare(s1, s2, dist.dist)
@ -148,10 +160,10 @@ struct TokenSort{T <: PreMetric} <: PreMetric
dist::T
end
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort; min_dist = nothing)
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort; min_score = nothing)
s1 = join(sort!(split(s1)), " ")
s2 = join(sort!(split(s2)), " ")
compare(s1, s2, dist.dist; min_dist = min_dist)
compare(s1, s2, dist.dist; min_score = min_score)
end
##############################################################################
@ -169,17 +181,17 @@ struct TokenSet{T <: PreMetric} <: PreMetric
dist::T
end
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_dist = nothing)
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = nothing)
v1 = SortedSet(split(s1))
v2 = SortedSet(split(s2))
v0 = intersect(v1, v2)
s0 = join(v0, " ")
s1 = join(v1, " ")
s2 = join(v2, " ")
isempty(s0) && return compare(s1, s2, dist.dist; min_dist = min_dist)
max(compare(s0, s1, dist.dist; min_dist = min_dist),
compare(s0, s2, dist.dist; min_dist = min_dist),
compare(s1, s2, dist.dist; min_dist = min_dist))
isempty(s0) && return compare(s1, s2, dist.dist; min_score = min_score)
max(compare(s0, s1, dist.dist; min_score = min_score),
compare(s0, s2, dist.dist; min_score = min_score),
compare(s1, s2, dist.dist; min_score = min_score))
end
@ -197,24 +209,24 @@ struct TokenMax{T <: PreMetric} <: PreMetric
dist::T
end
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_dist = nothing)
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = nothing)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
dist0 = compare(s1, s2, dist.dist)
unbase_scale = 0.95
# if one string is much shorter than the other, use partial
if length(s2) >= 1.5 * length(s1)
partial = compare(s1, s2, Partial(dist.dist); min_dist = min_dist)
ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)); min_dist = min_dist)
ptser = compare(s1, s2, TokenSet(Partial(dist.dist)); min_dist = min_dist)
partial = compare(s1, s2, Partial(dist.dist); min_score = min_score)
ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)); min_score = min_score)
ptser = compare(s1, s2, TokenSet(Partial(dist.dist)); min_score = min_score)
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
return max(dist0,
partial * partial_scale,
ptsor * unbase_scale * partial_scale,
ptser * unbase_scale * partial_scale)
else
ptsor = compare(s1, s2, TokenSort(dist.dist); min_dist = min_dist)
ptser = compare(s1, s2, TokenSet(dist.dist); min_dist = min_dist)
ptsor = compare(s1, s2, TokenSort(dist.dist); min_score = min_score)
ptser = compare(s1, s2, TokenSet(dist.dist); min_score = min_score)
return max(dist0,
ptsor * unbase_scale,
ptser * unbase_scale)
@ -228,13 +240,13 @@ end
##
##############################################################################
function compare(s1::AbstractString, ::Missing, dist::PreMetric; min_dist = nothing)
function compare(s1::AbstractString, ::Missing, dist::PreMetric; min_score = nothing)
missing
end
function compare(::Missing, s2::AbstractString, dist::PreMetric; min_dist = nothing)
function compare(::Missing, s2::AbstractString, dist::PreMetric; min_score = nothing)
missing
end
function compare(::Missing, ::Missing, dist::PreMetric; min_dist = nothing)
function compare(::Missing, ::Missing, dist::PreMetric; min_score = nothing)
missing
end

View File

@ -4,12 +4,10 @@
## Hamming
##
##############################################################################
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString; max_dist = nothing)
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
current = abs(length(s2) - length(s1))
max_dist !== nothing && current >= max_dist && return max_dist
for (ch1, ch2) in zip(s1, s2)
current += ch1 != ch2
max_dist !== nothing && current >= max_dist && return max_dist
end
return current
end
@ -110,7 +108,7 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
max_dist = nothing)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 >= max_dist && return max_dist
max_dist !== nothing && len2 - len1 > max_dist && return -1
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
x1 == nothing && return len2 - k
@ -139,11 +137,11 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
x2 = iterate(s2, state2)
i2 += 1
end
max_dist !== nothing && min_dist >= max_dist && return max_dist
max_dist !== nothing && min_dist > max_dist && return -1
x1 = iterate(s1, state1)
i1 += 1
end
max_dist !== nothing && return min(current, max_dist)
max_dist !== nothing && current > max_dist && return - 1
return current
end
@ -166,7 +164,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
max_dist = nothing)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 >= max_dist && return max_dist
max_dist !== nothing && len2 - len1 > max_dist && return -1
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
(x1 == nothing) && return len2 - k
@ -214,12 +212,12 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
i2 += 1
prevch2 = ch2
end
max_dist !== nothing && (v0[i1 + len2 - len1] >= max_dist) && return max_dist
max_dist !== nothing && (v0[i1 + len2 - len1] > max_dist) && return -1
x1 = iterate(s1, state1)
i1 += 1
prevch1 = ch1
end
max_dist !== nothing && return min(current, max_dist)
max_dist !== nothing && current > max_dist && return - 1
return current
end
@ -240,8 +238,7 @@ The distance between two strings is defined as one minus the number of matching
"""
struct RatcliffObershelp <: PreMetric end
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString;
max_dist::Nothing = nothing)
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
n_matched = sum(last.(matching_blocks(s1, s2)))
len1, len2 = length(s1), length(s2)
len1 + len2 == 0 ? 0 : 1.0 - 2 * n_matched / (len1 + len2)

View File

@ -1,35 +0,0 @@
"""
extract(s1::AbstractString, iter, dist::PreMetric)
`extract` returns the element of `iter` that has the best similarity score with `s1` according to the distance `dist`.
The function is particularly fast for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
"""
function extract(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}) where T <: Union{Levenshtein, DamerauLevenshtein}
# Method for Levenshtein / DamerauLevenshtein, either bare or wrapped in
# Partial, TokenSort, TokenSet or TokenMax.
# A candidate must score strictly above `best_score` to become the result,
# so `nothing` is returned when no candidate scores above 0.0.
best_score = 0.0
best_s2 = nothing
for s2 in iter_s2
# The running best is forwarded as `min_dist` to `compare`.
score = compare(s1, s2, dist; min_dist = best_score)
# Candidates whose score is `missing` are skipped.
if (score !== missing) && (score > best_score)
best_s2 = s2
best_score = score
end
end
return best_s2
end
# Generic fallback for any other `PreMetric`: every candidate is scored in full
# (no `min_dist` is passed to `compare`).
function extract(s1::AbstractString, iter_s2, dist::PreMetric)
# A candidate must score strictly above `best_score` to become the result,
# so `nothing` is returned when no candidate scores above 0.0.
best_score = 0.0
best_s2 = nothing
for s2 in iter_s2
score = compare(s1, s2, dist)
# Candidates whose score is `missing` are skipped.
if (score !== missing) && (score > best_score)
best_s2 = s2
best_score = score
end
end
return best_s2
end
# A `missing` query propagates: the result is `missing` without looking at `iter_s2`.
function extract(::Missing, iter_s2, dist::PreMetric)
return missing
end

43
src/find.jl Executable file
View File

@ -0,0 +1,43 @@
"""
find_best(s1::AbstractString, iter, dist::PreMetric)
`find_best` returns the element of `iter` that has the best similarity score with `s1` according to the distance `dist`.
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
"""
function find_best(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}) where T <: Union{Levenshtein, DamerauLevenshtein}
    # `winner` stays `nothing` unless some candidate scores strictly above 0.0;
    # on ties the earliest candidate is kept because the comparison is strict.
    winner = nothing
    top_score = 0.0
    for candidate in iter_s2
        # The running best is forwarded as `min_score` so that `compare` can
        # give up early on candidates that cannot beat it.
        candidate_score = compare(s1, candidate, dist; min_score = top_score)
        candidate_score > top_score || continue
        winner, top_score = candidate, candidate_score
    end
    return winner
end
# Generic fallback for any other `PreMetric`: every candidate is scored in full.
function find_best(s1::AbstractString, iter_s2, dist::PreMetric)
    # `winner` stays `nothing` unless some candidate scores strictly above 0.0;
    # on ties the earliest candidate is kept because the comparison is strict.
    winner, top_score = nothing, 0.0
    for candidate in iter_s2
        candidate_score = compare(s1, candidate, dist)
        if candidate_score > top_score
            winner, top_score = candidate, candidate_score
        end
    end
    return winner
end
"""
find_all(s1::AbstractString, iter, dist::PreMetric; min_score = 0.8)
`find_all` returns a vector with all the elements of `iter` that have a similarity score of at least `min_score` (0.8 by default) with `s1` according to the distance `dist`.
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
"""
function find_all(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}; min_score = 0.8) where T <: Union{Levenshtein, DamerauLevenshtein}
    # `min_score` is forwarded to `compare` so it can stop early on candidates
    # that cannot reach the threshold; the `>=` filter then keeps the matches.
    return [candidate for candidate in iter_s2
            if compare(s1, candidate, dist; min_score = min_score) >= min_score]
end
# Generic fallback for any other `PreMetric`: every candidate is scored in full
# and kept when its score is at least `min_score`.
function find_all(s1::AbstractString, iter_s2, dist::PreMetric; min_score = 0.8)
    return [candidate for candidate in iter_s2
            if compare(s1, candidate, dist) >= min_score]
end

View File

@ -99,12 +99,6 @@ for x in solutions
end
# With `min_dist`, `compare` floors the score at `min_dist`, so the result must
# equal the unconstrained score clamped from below.
for dist in (Hamming, Levenshtein, DamerauLevenshtein)
    for i in eachindex(strings)
        @test compare(strings[i]..., dist(); min_dist = 1 / 3) ≈ max(compare(strings[i]..., dist()), 1 / 3)
    end
end

View File

@ -150,10 +150,24 @@ dist = QGram(2)
# check missing
@test compare(s1, missing, Levenshtein()) === missing
# check extract
@test extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork"
@test extract("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == "NewYork"
@test extract("New York", ["NewYork", "Newark", missing], Levenshtein()) == "NewYork"
@test extract("New York", [missing, "NewYork", "Newark"], Levenshtein()) == "NewYork"
# check min
# With `min_score`, a pair scoring below the threshold must come back as 0.0,
# and any other pair must keep its unconstrained score.
for dist in (Levenshtein, DamerauLevenshtein)
    for i in eachindex(strings)
        full_score = compare(strings[i]..., dist())
        expected = full_score < 1 / 3 ? 0.0 : full_score
        @test compare(strings[i]..., dist(); min_score = 1 / 3) ≈ expected
    end
end
# check find_best and find_all
# find_best: one call with Levenshtein, one with Jaro.
@test find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork"
@test find_best("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == "NewYork"
# Inputs containing `missing` are wrapped in `skipmissing` before being searched.
@test find_best("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == "NewYork"
@test find_best("New York", skipmissing([missing, "NewYork", "Newark"]), Levenshtein()) == "NewYork"
# find_all is called without `min_score`, so its default threshold applies.
@test find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ["NewYork"]
@test find_all("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ["NewYork", "Newark"]
@test find_all("New York", skipmissing([missing, "NewYork", "Newark"]), Jaro()) == ["NewYork", "Newark"]