allow any iterator in. Define evaluate for modifiers.

pull/23/head
matthieugomez 2020-02-09 13:37:37 -05:00
parent a949f7bd62
commit 5cbbfc5bde
7 changed files with 157 additions and 111 deletions

View File

@ -10,9 +10,11 @@ The package is registered in the [`General`](https://github.com/JuliaRegistries/
The function `compare` returns a similarity score between two strings. The score is always between 0 and 1, where 0 means the strings are completely different and 1 means they are identical. Its syntax is:
```julia
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
compare(s1, s2, dist::StringDistance)
```
where `s1` and `s2` can be any iterator with a `length` method (e.g. `AbstractString`, `GraphemeIterator`, `AbstractVector`...).
- Edit Distances
- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`

View File

@ -3,24 +3,37 @@ module StringDistances
using Distances
import Distances: evaluate, result_type
isnormalized(dist::SemiMetric) = false
include("utils.jl")
include("edit.jl")
include("qgram.jl")
include("compare.jl")
include("modifier.jl")
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax}
include("find.jl")
##############################################################################
##
## Distances API
##
##############################################################################
"""
    compare(s1, s2, dist::StringDistance; min_score = 0.0)

Return a similarity score between 0 and 1 for `s1` and `s2`, computed as one
minus the normalized form of the distance `dist`.

`min_score` is forwarded to the distance as the bound `1 - min_score`.

### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
    normalized = normalize(dist)
    max_dist = 1 - min_score
    return 1 - evaluate(normalized, s1, s2, max_dist)
end
# Distances API: the result type is the type `evaluate` returns on two empty
# strings; `s1` and `s2` themselves are not inspected.
result_type(dist::StringDistance, s1, s2) = typeof(evaluate(dist, "", ""))
include("find.jl")
##############################################################################
##

View File

@ -12,10 +12,11 @@ where ``m`` is the number of matching characters and
``t`` is half the number of transpositions.
"""
struct Jaro <: SemiMetric end
isnormalized(::Jaro) = true
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
## accepts any iterator, including AbstractString
function evaluate(dist::Jaro, s1, s2)
function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@ -87,7 +88,7 @@ struct Levenshtein <: Metric end
# This makes it possible to differentiate a distance equal to max_dist from a strictly higher one.
# This is important for find_all
## accepts any iterator, including AbstractString
function evaluate(dist::Levenshtein, s1, s2; max_dist = nothing)
function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@ -142,7 +143,7 @@ struct DamerauLevenshtein <: SemiMetric end
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
## accepts any iterator, including AbstractString
function evaluate(dist::DamerauLevenshtein, s1, s2; max_dist = nothing)
function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
@ -226,20 +227,20 @@ region on either side of the longest common subsequence.
"""
struct RatcliffObershelp <: SemiMetric end
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
isnormalized(::RatcliffObershelp) = true
# Ratcliff/Obershelp distance: one minus twice the total length of the matching
# blocks divided by the summed lengths of `s1` and `s2`. Returns `missing` if either
# argument is `missing` and `0.0` when both are empty. `max_dist` is accepted so the
# signature matches the other distances; it is not used.
function evaluate(dist::RatcliffObershelp, s1, s2, max_dist = nothing)
    (ismissing(s1) | ismissing(s2)) && return missing
    # Reduce over the block set directly: no temporary array, and `init` keeps the
    # no-common-block case well defined without relying on type inference.
    n_matched = mapreduce(last, +, matching_blocks(s1, s2); init = 0)
    total = length(s1) + length(s2)
    return total == 0 ? 0.0 : 1.0 - 2 * n_matched / total
end
function matching_blocks(s1::AbstractString, s2::AbstractString)
function matching_blocks(s1, s2)
matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1)
end
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString,
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2,
len1::Integer, len2::Integer, start1::Integer, start2::Integer)
a = longest_common_pattern(s1, s2, len1 , len2)
# exit if there is no common substring
@ -247,18 +248,18 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
# add the info of the common to the existing set
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
# add the longest common substring that happens before
s1before = SubString(s1, 1, nextind(s1, 0, a[1] - 1))
s2before = SubString(s2, 1, nextind(s2, 0, a[2] - 1))
s1before = _take(s1, a[1] - 1)
s2before = _take(s2, a[2] - 1)
matching_blocks!(x, s1before, s2before, a[1] - 1, a[2] - 1, start1, start2)
# add the longest common substring that happens after
s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))
s1after = _drop(s1, a[1] + a[3] - 1)
s2after = _drop(s2, a[2] + a[3] - 1)
matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1,
len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
return x
end
function longest_common_pattern(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
function longest_common_pattern(s1, s2, len1::Integer, len2::Integer)
if len1 > len2
start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
else

View File

@ -1,42 +1,36 @@
"""
compare(s1, s2, dist)
return a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the distance `dist`.
### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1, s2, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
1.0 - evaluate(dist, s1, s2)
# Wrapper around a distance `dist`; `normalize` creates it for distances that
# are not already normalized, and `evaluate` has dedicated methods for it.
struct Normalize{S <: SemiMetric} <: SemiMetric
dist::S
end
# Return `dist` itself when it is already normalized, otherwise wrap it in `Normalize`.
function normalize(dist::SemiMetric)
    if isnormalized(dist)
        return dist
    end
    return Normalize{typeof(dist)}(dist)
end
isnormalized(dist::Normalize) = true
# Normalized edit distance: the distance of `dist.dist` divided by the length of the
# longer argument. Returns `missing` if either argument is `missing`, and 1.0 when
# the normalized distance is greater than `max_dist`.
function evaluate(dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}}, s1, s2, max_dist = 1.0)
    (ismissing(s1) | ismissing(s2)) && return missing
    s1, s2 = reorder(s1, s2)
    len2 = length(s2)
    # After `reorder`, `s2` is the longer argument, so both are empty here. Two empty
    # sequences are identical, hence distance 0.0 (as a similarity, `compare`
    # used to return 1.0 for this case).
    len2 == 0 && return 0.0
    # Pass the bound down as an absolute number of edits
    d = evaluate(dist.dist, s1, s2, ceil(Int, len2 * max_dist))
    out = d / len2
    return out > max_dist ? 1.0 : out
end
# Normalized q-gram distance. A `QGram` distance is divided by
# `len1 + len2 - 2q + 2`; the other q-gram distances are returned as they are.
# Returns `missing` if either argument is `missing`. `max_dist` is not used.
function evaluate(dist::Normalize{<: QGramDistance}, s1, s2, max_dist = 1.0)
    (ismissing(s1) | ismissing(s2)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    q = dist.dist.q
    # When the shorter argument has fewer than q elements, fall back to plain
    # equality: distance 0.0 if s1 == s2, 1.0 otherwise
    len1 <= q - 1 && return convert(Float64, !(s1 == s2))
    d = evaluate(dist.dist, s1, s2)
    return dist.dist isa QGram ? d / (len1 + len2 - 2 * q + 2) : d
end
"""
Winkler(dist; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
@ -52,19 +46,22 @@ struct Winkler{S <: SemiMetric} <: SemiMetric
p::Float64 # scaling factor. Default to 0.1
threshold::Float64 # boost threshold. Default to 0.7
maxlength::Integer # max length of common prefix. Default to 4
Winkler{S}(dist::S, p, threshold, maxlength) where {S <: SemiMetric} = new(dist, p, threshold, maxlength)
end
# Keyword constructor: wraps the normalized form of `dist` together with the scaling
# factor `p`, the boost threshold `threshold` and the maximum common-prefix length
# `maxlength`.
# NOTE(review): the validation below throws a bare String; left unchanged so that
# what callers may catch stays the same.
function Winkler(dist::SemiMetric; p = 0.1, threshold = 0.7, maxlength = 4)
    p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
    inner = normalize(dist)
    # Forward the keyword values: the literals 0.1, 0.7 and 4 used to be passed here,
    # so any `p`, `threshold` or `maxlength` given by the caller was silently ignored.
    return Winkler{typeof(inner)}(inner, p, threshold, maxlength)
end
isnormalized(dist::Winkler) = true
# Winkler-adjusted distance: when the base distance is at most `1 - threshold`, it is
# reduced in proportion to the length of the common prefix, capped at `maxlength`.
# `max_dist` is accepted but not forwarded, because of the boosting threshold.
function evaluate(dist::Winkler, s1, s2, max_dist = 1.0)
    # Propagate `missing` like the other distances; otherwise the comparison
    # below would be evaluated on a `missing` score.
    (ismissing(s1) | ismissing(s2)) && return missing
    score = evaluate(dist.dist, s1, s2)
    if score <= 1 - dist.threshold
        l = common_prefix(s1, s2)[1]
        score -= min(l, dist.maxlength) * dist.p * score
    end
    return score
end
@ -88,27 +85,30 @@ julia> compare(s1, s2, Partial(RatcliffObershelp()))
"""
struct Partial{S <: SemiMetric} <: SemiMetric
dist::S
Partial{S}(dist::S) where {S <: SemiMetric} = new(dist)
end
Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::Partial) = true
function compare(s1, s2, dist::Partial; min_score = 0.0)
function evaluate(dist::Partial, s1, s2, max_dist = 1.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
len1 == 0 && return 1.0
out = 0.0
len1 == len2 && return evaluate(dist.dist, s1, s2, max_dist)
len1 == 0 && return 0.0
out = 1.0
for x in qgrams(s2, len1)
curr = compare(s1, x, dist.dist; min_score = min_score)
out = max(out, curr)
min_score = max(out, min_score)
curr = evaluate(dist.dist, s1, x, max_dist)
out = min(out, curr)
max_dist = min(out, max_dist)
end
return out
end
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; min_score = 0.0)
function evaluate(dist::Partial{RatcliffObershelp}, s1, s2, max_dist = 1.0)
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return compare(s1, s2, dist.dist)
out = 0.0
len1 == len2 && return evaluate(dist.dist, s1, s2)
out = 1.0
for r in matching_blocks(s1, s2)
# Make sure the substring of s2 has length len1
s2_start = r[2] - r[1] + 1
@ -120,10 +120,9 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffO
s2_start += len2 - s2_end
s2_end += len2 - s2_end
end
i2_start = nextind(s2, 0, s2_start)
i2_end = nextind(s2, 0, s2_end)
curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp())
out = max(out, curr)
curr = evaluate(dist.dist, s1, _slice(s2, s2_start - 1, s2_end))
out = min(out, curr)
end
return out
end
@ -147,13 +146,16 @@ julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
"""
struct TokenSort{S <: SemiMetric} <: SemiMetric
dist::S
TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist)
end
TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::TokenSort) = true
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function compare(s1, s2, dist::TokenSort; min_score = 0.0)
function evaluate(dist::TokenSort, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
s1 = join(sort!(split(s1)), " ")
s2 = join(sort!(split(s2)), " ")
compare(s1, s2, dist.dist; min_score = min_score)
evaluate(dist.dist, s1, s2, max_dist)
end
@ -175,23 +177,26 @@ julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
"""
struct TokenSet{S <: SemiMetric} <: SemiMetric
dist::S
TokenSet{S}(dist::S) where {S <: SemiMetric} = new(dist)
end
TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::TokenSet) = true
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function compare(s1, s2, dist::TokenSet; min_score = 0.0)
function evaluate(dist::TokenSet, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2)))
v0 = intersect(v1, v2)
s0 = join(v0, " ")
s1 = join(v1, " ")
s2 = join(v2, " ")
isempty(s0) && return compare(s1, s2, dist.dist; min_score = min_score)
score_01 = compare(s0, s1, dist.dist; min_score = min_score)
min_score = max(min_score, score_01)
score_02 = compare(s0, s2, dist.dist; min_score = min_score)
min_score = max(min_score, score_02)
score_12 = compare(s1, s2, dist.dist; min_score = min_score)
max(score_01, score_02, score_12)
isempty(s0) && return evaluate(dist.dist, s1, s2, max_dist)
score_01 = evaluate(dist.dist, s0, s1, max_dist)
max_dist = min(max_dist, score_01)
score_02 = evaluate(dist.dist, s0, s2, max_dist)
max_dist = min(max_dist, score_02)
score_12 = evaluate(dist.dist, s1, s2, max_dist)
min(score_01, score_02, score_12)
end
@ -214,36 +219,35 @@ julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
"""
struct TokenMax{S <: SemiMetric} <: SemiMetric
dist::S
TokenMax{S}(dist::S) where {S <: SemiMetric} = new(dist)
end
TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
isnormalized(dist::TokenMax) = true

# Smallest of the base distance and the scaled `Partial`, `TokenSort` and `TokenSet`
# variants. Each variant's similarity (one minus its distance) is multiplied by a
# scale factor before being converted back to a distance. After every step
# `max_dist` is tightened to the best distance found so far and rescaled for the
# next call, so later variants can stop early.
function evaluate(dist::TokenMax, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    score = evaluate(dist.dist, s1, s2, max_dist)
    # The tightened bound must be stored in `max_dist`; it used to be assigned to a
    # leftover `min_score` variable that was never read.
    max_dist = min(max_dist, score)
    unbase_scale = 0.95
    # if one string is much shorter than the other, use partial
    if len2 >= 1.5 * len1
        partial_scale = len2 > (8 * len1) ? 0.6 : 0.9
        score_partial = 1 - partial_scale *
            (1 - evaluate(Partial(dist.dist), s1, s2, 1 - (1 - max_dist) / partial_scale))
        max_dist = min(max_dist, score_partial)
        scale = unbase_scale * partial_scale
        score_sort = 1 - scale *
            (1 - evaluate(TokenSort(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / scale))
        max_dist = min(max_dist, score_sort)
        score_set = 1 - scale *
            (1 - evaluate(TokenSet(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / scale))
        return min(score, score_partial, score_sort, score_set)
    else
        score_sort = 1 - unbase_scale *
            (1 - evaluate(TokenSort(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
        max_dist = min(max_dist, score_sort)
        score_set = 1 - unbase_scale *
            (1 - evaluate(TokenSet(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
        return min(score, score_sort, score_set)
    end
end

View File

@ -18,12 +18,15 @@ Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
#q-grams of AbstractVector
# Alternatively, I could also use partition in IterTools but it creates a vector for each iteration
# so it does not seem to be worth it.
# Yield the q-gram starting at index `state` as a view into `qgram.s`, then move the
# start forward by one; iteration ends once a full window no longer fits.
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
    stop = state + qgram.q - 1
    if stop > lastindex(qgram.s)
        return nothing
    end
    return view(qgram.s, state:stop), state + 1
end
# Derive the element type from an empty view instead of `first(qgram)`, which throws
# when the vector has fewer than `q` elements (the iterator is then empty).
# NOTE(review): assumes `qgram.q` is an `Int`, so that the range passed to `view` in
# `iterate` is a `UnitRange{Int}` like the one used here — confirm against the struct.
function Base.eltype(qgram::QGramIterator{<: AbstractVector})
    i = firstindex(qgram.s)
    return typeof(view(qgram.s, i:(i - 1)))
end
"""
Return an iterator over the q-grams of a string
@ -120,7 +123,7 @@ struct Cosine <: QGramDistance
q::Int
end
function evaluate(dist::Cosine, s1, s2)
function evaluate(dist::Cosine, s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
norm1, norm2, prodnorm = 0, 0, 0
@ -147,7 +150,7 @@ struct Jaccard <: QGramDistance
q::Int
end
function evaluate(dist::Jaccard, s1, s2)
function evaluate(dist::Jaccard, s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
@ -174,7 +177,7 @@ struct SorensenDice <: QGramDistance
q::Int
end
function evaluate(dist::SorensenDice, s1, s2)
function evaluate(dist::SorensenDice, s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0
@ -201,7 +204,7 @@ struct Overlap <: QGramDistance
q::Int
end
function evaluate(dist::Overlap, s1, s2)
function evaluate(dist::Overlap, s1, s2, max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
ndistinct1, ndistinct2, nintersect = 0, 0, 0

View File

@ -21,8 +21,6 @@ function reorder(s1, s2)
(length(s1) <= length(s2)) ? (s1, s2) : (s2, s1)
end
function common_prefix(s1, s2)
x1 = iterate(s1)
x2 = iterate(s2)
@ -36,4 +34,29 @@ function common_prefix(s1, s2)
x2 = iterate(s2, state2)
end
return l, x1, x2
end
end
# First `n` elements of `s`, as a lazy iterator.
_take(s, n::Integer) = Base.Iterators.take(s, n)

# First `n` characters of the string `s`, as a `SubString`.
function _take(s::AbstractString, n::Integer)
    stop = nextind(s, 0, n)
    return SubString(s, firstindex(s), stop)
end
# Everything after the first `n` elements of `s`, as a lazy iterator.
_drop(s, n::Integer) = Base.Iterators.drop(s, n)

# Everything after the first `n` characters of the string `s`, as a `SubString`.
function _drop(s::AbstractString, n::Integer)
    start = nextind(s, 0, n + 1)
    return SubString(s, start, lastindex(s))
end
# Elements `n1 + 1` through `n2` of `s`, as a lazy iterator.
function _slice(s, n1::Integer, n2::Integer)
    rest = Base.Iterators.drop(s, n1)
    return Base.Iterators.take(rest, n2 - n1)
end

# Characters `n1 + 1` through `n2` of the string `s`, as a `SubString`.
function _slice(s::AbstractString, n1::Integer, n2::Integer)
    start = nextind(s, 0, n1 + 1)
    stop = nextind(s, 0, n2)
    return SubString(s, start, stop)
end

View File

@ -24,13 +24,13 @@ using StringDistances, Test
compare("aüa", "aua", DamerauLevenshtein())
# Winkler
@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) 0.9611 atol = 1e-4
@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) 0.84 atol = 1e-4
@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) 0.81333 atol = 1e-4
@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) 0.975 atol = 1e-4
@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) 0.0 atol = 1e-4
@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) 1.0 atol = 1e-4
@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) 0.0 atol = 1e-4
# The approximate-equality operator `≈` is restored on every line; without it
# `@test` receives two separate expressions.
@test compare("martha", "marhta", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.9611 atol = 1e-4
@test compare("dwayne", "duane", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.84 atol = 1e-4
@test compare("dixon", "dicksonx", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.81333 atol = 1e-4
@test compare("william", "williams", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.975 atol = 1e-4
@test compare("", "foo", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.0 atol = 1e-4
@test compare("a", "a", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 1.0 atol = 1e-4
@test compare("abc", "xyz", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.0 atol = 1e-4
# RatcliffObershelp
@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0