update
parent
385461a61c
commit
318ede6665
19
README.md
19
README.md
|
@ -88,7 +88,9 @@ The package includes distance "modifiers", that can be applied to any distance.
|
|||
compare("mariners vs angels", "los angeles angels at seattle mariners", TokenMax(RatcliffObershelp()))
|
||||
#> 0.855
|
||||
```
|
||||
|
||||
## Compare vs Evaluate
|
||||
|
||||
The function `compare` returns a similarity score: a value of 0 means completely different and a value of 1 means completely similar.
|
||||
In contrast, the function `evaluate` returns the literal distance between two strings, with a value of 0 meaning the strings are identical. Some distances are between 0 and 1. Others are unbounded.
|
||||
|
||||
|
@ -99,13 +101,22 @@ evaluate(Levenshtein(), "New York", "New York")
|
|||
#> 0
|
||||
```
|
||||
|
||||
## Extract
|
||||
On master, there is an experimental `extract` function that returns the best match in an iterator of AbstractStrings:
|
||||
## Find (experimental)
|
||||
`find_best` returns the best match in an iterator of AbstractStrings:
|
||||
```julia
|
||||
extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein())
|
||||
find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein())
|
||||
#> "NewYork"
|
||||
```
|
||||
The function is particularly fast for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
|
||||
|
||||
`find_all` returns a `Vector` with all the matches in an iterator of AbstractStrings:
|
||||
|
||||
```julia
|
||||
find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.8)
|
||||
#> 1-element Array{String,1}:
|
||||
#> "NewYork"
|
||||
```
|
||||
|
||||
These functions are optimized for `Levenshtein` and `DamerauLevenshtein` distances (as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
|
||||
|
||||
## Which distance should I use?
|
||||
|
||||
|
|
|
@ -27,7 +27,8 @@ TokenSort,
|
|||
TokenSet,
|
||||
TokenMax,
|
||||
qgram,
|
||||
extract
|
||||
find_best,
|
||||
find_all
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
|
@ -38,7 +39,7 @@ include("utils.jl")
|
|||
include("edit.jl")
|
||||
include("qgram.jl")
|
||||
include("compare.jl")
|
||||
include("extract.jl")
|
||||
include("find.jl")
|
||||
|
||||
end
|
||||
|
||||
|
|
|
@ -10,26 +10,38 @@
|
|||
|
||||
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
|
||||
"""
|
||||
function compare(s1::AbstractString, s2::AbstractString,
|
||||
dist::Union{Hamming, Levenshtein, DamerauLevenshtein}; min_dist = nothing)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Hamming; min_score::Nothing = nothing)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len2 == 0 && return 1.0
|
||||
if min_dist === nothing
|
||||
1.0 - evaluate(dist, s1, s2) / len2
|
||||
else
|
||||
max_dist = ceil(Int, len2 * (1 - min_dist))
|
||||
# need to add max in case of integer stuff
|
||||
max(1.0 - evaluate(dist, s1, s2; max_dist = max_dist) / len2, min_dist)
|
||||
end
|
||||
1.0 - evaluate(dist, s1, s2) / len2
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_dist::Nothing = nothing)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_score::Nothing = nothing)
|
||||
1.0 - evaluate(dist, s1, s2)
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtein, DamerauLevenshtein}; min_score = nothing)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len2 == 0 && return 1.0
|
||||
if min_score === nothing
|
||||
1.0 - evaluate(dist, s1, s2) / len2
|
||||
else
|
||||
d = evaluate(dist, s1, s2; max_dist = ceil(Int, len2 * (1 - min_score)))
|
||||
out = 1.0 - d / len2
|
||||
if d == -1 || out < min_score
|
||||
return 0.0
|
||||
else
|
||||
return out
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::AbstractQGramDistance;
|
||||
min_dist::Nothing = nothing)
|
||||
min_score::Nothing = nothing)
|
||||
# When string length < q for qgram distance, returns s1 == s2
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
@ -67,10 +79,10 @@ end
|
|||
|
||||
Winkler(x) = Winkler(x, 0.1, 0.7, 4)
|
||||
|
||||
# hard to use min_dist because of whether there is boost or not in the end
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_dist::Nothing = nothing)
|
||||
# hard to use min_score because of whether there is boost or not in the end
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_score::Nothing = nothing)
|
||||
l = remove_prefix(s1, s2, dist.l)[1]
|
||||
# cannot do min_dist because of boosting threshold
|
||||
# cannot do min_score because of boosting threshold
|
||||
score = compare(s1, s2, dist.dist)
|
||||
if score >= dist.boosting_threshold
|
||||
score += l * dist.p * (1 - score)
|
||||
|
@ -95,21 +107,21 @@ struct Partial{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_dist = nothing)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_score = nothing)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return compare(s1, s2, dist.dist; min_dist = min_dist)
|
||||
len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
|
||||
len1 == 0 && return 1.0
|
||||
out = 0.0
|
||||
for x in qgram(s2, len1)
|
||||
curr = compare(s1, x, dist.dist; min_dist = min_dist)
|
||||
curr = compare(s1, x, dist.dist; min_score = min_score)
|
||||
out = max(out, curr)
|
||||
end
|
||||
return out
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp};
|
||||
min_dist = nothing)
|
||||
min_score = nothing)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||
|
@ -148,10 +160,10 @@ struct TokenSort{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort; min_dist = nothing)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort; min_score = nothing)
|
||||
s1 = join(sort!(split(s1)), " ")
|
||||
s2 = join(sort!(split(s2)), " ")
|
||||
compare(s1, s2, dist.dist; min_dist = min_dist)
|
||||
compare(s1, s2, dist.dist; min_score = min_score)
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
|
@ -169,17 +181,17 @@ struct TokenSet{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_dist = nothing)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = nothing)
|
||||
v1 = SortedSet(split(s1))
|
||||
v2 = SortedSet(split(s2))
|
||||
v0 = intersect(v1, v2)
|
||||
s0 = join(v0, " ")
|
||||
s1 = join(v1, " ")
|
||||
s2 = join(v2, " ")
|
||||
isempty(s0) && return compare(s1, s2, dist.dist; min_dist = min_dist)
|
||||
max(compare(s0, s1, dist.dist; min_dist = min_dist),
|
||||
compare(s0, s2, dist.dist; min_dist = min_dist),
|
||||
compare(s1, s2, dist.dist; min_dist = min_dist))
|
||||
isempty(s0) && return compare(s1, s2, dist.dist; min_score = min_score)
|
||||
max(compare(s0, s1, dist.dist; min_score = min_score),
|
||||
compare(s0, s2, dist.dist; min_score = min_score),
|
||||
compare(s1, s2, dist.dist; min_score = min_score))
|
||||
end
|
||||
|
||||
|
||||
|
@ -197,24 +209,24 @@ struct TokenMax{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_dist = nothing)
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = nothing)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
dist0 = compare(s1, s2, dist.dist)
|
||||
unbase_scale = 0.95
|
||||
# if one string is much shorter than the other, use partial
|
||||
if length(s2) >= 1.5 * length(s1)
|
||||
partial = compare(s1, s2, Partial(dist.dist); min_dist = min_dist)
|
||||
ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)); min_dist = min_dist)
|
||||
ptser = compare(s1, s2, TokenSet(Partial(dist.dist)); min_dist = min_dist)
|
||||
partial = compare(s1, s2, Partial(dist.dist); min_score = min_score)
|
||||
ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)); min_score = min_score)
|
||||
ptser = compare(s1, s2, TokenSet(Partial(dist.dist)); min_score = min_score)
|
||||
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
|
||||
return max(dist0,
|
||||
partial * partial_scale,
|
||||
ptsor * unbase_scale * partial_scale,
|
||||
ptser * unbase_scale * partial_scale)
|
||||
else
|
||||
ptsor = compare(s1, s2, TokenSort(dist.dist); min_dist = min_dist)
|
||||
ptser = compare(s1, s2, TokenSet(dist.dist); min_dist = min_dist)
|
||||
ptsor = compare(s1, s2, TokenSort(dist.dist); min_score = min_score)
|
||||
ptser = compare(s1, s2, TokenSet(dist.dist); min_score = min_score)
|
||||
return max(dist0,
|
||||
ptsor * unbase_scale,
|
||||
ptser * unbase_scale)
|
||||
|
@ -228,13 +240,13 @@ end
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
function compare(s1::AbstractString, ::Missing, dist::PreMetric; min_dist = nothing)
|
||||
function compare(s1::AbstractString, ::Missing, dist::PreMetric; min_score = nothing)
|
||||
missing
|
||||
end
|
||||
function compare(::Missing, s2::AbstractString, dist::PreMetric; min_dist = nothing)
|
||||
function compare(::Missing, s2::AbstractString, dist::PreMetric; min_score = nothing)
|
||||
missing
|
||||
end
|
||||
function compare(::Missing, ::Missing, dist::PreMetric; min_dist = nothing)
|
||||
function compare(::Missing, ::Missing, dist::PreMetric; min_score = nothing)
|
||||
missing
|
||||
end
|
||||
|
||||
|
|
19
src/edit.jl
19
src/edit.jl
|
@ -4,12 +4,10 @@
|
|||
## Hamming
|
||||
##
|
||||
##############################################################################
|
||||
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString; max_dist = nothing)
|
||||
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
|
||||
current = abs(length(s2) - length(s1))
|
||||
max_dist !== nothing && current >= max_dist && return max_dist
|
||||
for (ch1, ch2) in zip(s1, s2)
|
||||
current += ch1 != ch2
|
||||
max_dist !== nothing && current >= max_dist && return max_dist
|
||||
end
|
||||
return current
|
||||
end
|
||||
|
@ -110,7 +108,7 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
|
|||
max_dist = nothing)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
max_dist !== nothing && len2 - len1 >= max_dist && return max_dist
|
||||
max_dist !== nothing && len2 - len1 > max_dist && return -1
|
||||
# prefix common to both strings can be ignored
|
||||
k, x1, x2start = remove_prefix(s1, s2)
|
||||
x1 == nothing && return len2 - k
|
||||
|
@ -139,11 +137,11 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
|
|||
x2 = iterate(s2, state2)
|
||||
i2 += 1
|
||||
end
|
||||
max_dist !== nothing && min_dist >= max_dist && return max_dist
|
||||
max_dist !== nothing && min_dist > max_dist && return -1
|
||||
x1 = iterate(s1, state1)
|
||||
i1 += 1
|
||||
end
|
||||
max_dist !== nothing && return min(current, max_dist)
|
||||
max_dist !== nothing && current > max_dist && return - 1
|
||||
return current
|
||||
end
|
||||
|
||||
|
@ -166,7 +164,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
|||
max_dist = nothing)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
max_dist !== nothing && len2 - len1 >= max_dist && return max_dist
|
||||
max_dist !== nothing && len2 - len1 > max_dist && return -1
|
||||
# prefix common to both strings can be ignored
|
||||
k, x1, x2start = remove_prefix(s1, s2)
|
||||
(x1 == nothing) && return len2 - k
|
||||
|
@ -214,12 +212,12 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
|||
i2 += 1
|
||||
prevch2 = ch2
|
||||
end
|
||||
max_dist !== nothing && (v0[i1 + len2 - len1] >= max_dist) && return max_dist
|
||||
max_dist !== nothing && (v0[i1 + len2 - len1] > max_dist) && return -1
|
||||
x1 = iterate(s1, state1)
|
||||
i1 += 1
|
||||
prevch1 = ch1
|
||||
end
|
||||
max_dist !== nothing && return min(current, max_dist)
|
||||
max_dist !== nothing && current > max_dist && return - 1
|
||||
return current
|
||||
end
|
||||
|
||||
|
@ -240,8 +238,7 @@ The distance between two strings is defined as one minus the number of matching
|
|||
"""
|
||||
struct RatcliffObershelp <: PreMetric end
|
||||
|
||||
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString;
|
||||
max_dist::Nothing = nothing)
|
||||
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
|
||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 + len2 == 0 ? 0 : 1.0 - 2 * n_matched / (len1 + len2)
|
||||
|
|
|
@ -1,35 +0,0 @@
|
|||
"""
|
||||
extract(s1::AbstractString, iter, dist::PreMetric)
|
||||
|
||||
`extract` returns the element of `iter` that has the best similarity score with `s1` according to the distance `dist`.
|
||||
The function is particularly fast for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
|
||||
"""
|
||||
function extract(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}) where T <: Union{Levenshtein, DamerauLevenshtein}
|
||||
best_score = 0.0
|
||||
best_s2 = nothing
|
||||
for s2 in iter_s2
|
||||
score = compare(s1, s2, dist; min_dist = best_score)
|
||||
if (score !== missing) && (score > best_score)
|
||||
best_s2 = s2
|
||||
best_score = score
|
||||
end
|
||||
end
|
||||
return best_s2
|
||||
end
|
||||
function extract(s1::AbstractString, iter_s2, dist::PreMetric)
|
||||
best_score = 0.0
|
||||
best_s2 = nothing
|
||||
for s2 in iter_s2
|
||||
score = compare(s1, s2, dist)
|
||||
if (score !== missing) && (score > best_score)
|
||||
best_s2 = s2
|
||||
best_score = score
|
||||
end
|
||||
end
|
||||
return best_s2
|
||||
end
|
||||
|
||||
|
||||
function extract(::Missing, iter_s2, dist::PreMetric)
|
||||
return missing
|
||||
end
|
|
@ -0,0 +1,43 @@
|
|||
"""
|
||||
find_best(s1::AbstractString, iter, dist::PreMetric)
|
||||
|
||||
`find_best` returns the element of `iter` that has the best similarity score with `s1` according to the distance `dist`.
|
||||
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
|
||||
"""
|
||||
function find_best(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}) where T <: Union{Levenshtein, DamerauLevenshtein}
|
||||
best_score = 0.0
|
||||
best_s2 = nothing
|
||||
for s2 in iter_s2
|
||||
score = compare(s1, s2, dist; min_score = best_score)
|
||||
if score > best_score
|
||||
best_s2 = s2
|
||||
best_score = score
|
||||
end
|
||||
end
|
||||
return best_s2
|
||||
end
|
||||
function find_best(s1::AbstractString, iter_s2, dist::PreMetric)
|
||||
best_score = 0.0
|
||||
best_s2 = nothing
|
||||
for s2 in iter_s2
|
||||
score = compare(s1, s2, dist)
|
||||
if score > best_score
|
||||
best_s2 = s2
|
||||
best_score = score
|
||||
end
|
||||
end
|
||||
return best_s2
|
||||
end
|
||||
|
||||
|
||||
"""
|
||||
find_all(s1::AbstractString, iter, dist::PreMetric; min_score = 0.8)
|
||||
`find_all` returns a vector with all the elements of `iter` that have a similarity score greater than or equal to `min_score` (0.8 by default) according to the distance `dist`.
|
||||
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances (potentially modified by `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`)
|
||||
"""
|
||||
function find_all(s1::AbstractString, iter_s2, dist::Union{T, Partial{T}, TokenSort{T}, TokenSet{T}, TokenMax{T}}; min_score = 0.8) where T <: Union{Levenshtein, DamerauLevenshtein}
|
||||
collect(s2 for s2 in iter_s2 if compare(s1, s2, dist; min_score = min_score) >= min_score)
|
||||
end
|
||||
function find_all(s1::AbstractString, iter_s2, dist::PreMetric; min_score = 0.8)
|
||||
collect(s2 for s2 in iter_s2 if compare(s1, s2, dist) >= min_score)
|
||||
end
|
|
@ -99,12 +99,6 @@ for x in solutions
|
|||
end
|
||||
|
||||
|
||||
for dist in (Hamming, Levenshtein, DamerauLevenshtein)
|
||||
for i in eachindex(strings)
|
||||
@test compare(strings[i]..., dist() ; min_dist = 1/ 3) ≈ max(compare(strings[i]..., dist()), 1 / 3)
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -150,10 +150,24 @@ dist = QGram(2)
|
|||
# check missing
|
||||
@test compare(s1, missing, Levenshtein()) === missing
|
||||
|
||||
# check extract
|
||||
@test extract("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork"
|
||||
@test extract("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == "NewYork"
|
||||
@test extract("New York", ["NewYork", "Newark", missing], Levenshtein()) == "NewYork"
|
||||
@test extract("New York", [missing, "NewYork", "Newark"], Levenshtein()) == "NewYork"
|
||||
# check min
|
||||
for dist in (Levenshtein, DamerauLevenshtein)
|
||||
for i in eachindex(strings)
|
||||
if compare(strings[i]..., dist()) < 1 / 3
|
||||
@test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ 0.0
|
||||
else
|
||||
@test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ compare(strings[i]..., dist())
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# check find_best and find_all
|
||||
@test find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork"
|
||||
@test find_best("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == "NewYork"
|
||||
@test find_best("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == "NewYork"
|
||||
@test find_best("New York", skipmissing([missing, "NewYork", "Newark"]), Levenshtein()) == "NewYork"
|
||||
@test find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ["NewYork"]
|
||||
@test find_all("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ["NewYork", "Newark"]
|
||||
@test find_all("New York", skipmissing([missing, "NewYork", "Newark"]), Jaro()) == ["NewYork", "Newark"]
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue