allow any iterator in. Define evaluate for modifiers.
parent
a949f7bd62
commit
5cbbfc5bde
|
@ -10,9 +10,11 @@ The package is registered in the [`General`](https://github.com/JuliaRegistries/
|
||||||
The function `compare` returns a similarity score between two strings. The function always returns a score between 0 and 1, with a value of 0 being completely different and a value of 1 being completely similar. Its syntax is:
|
The function `compare` returns a similarity score between two strings. The function always returns a score between 0 and 1, with a value of 0 being completely different and a value of 1 being completely similar. Its syntax is:
|
||||||
|
|
||||||
```julia
|
```julia
|
||||||
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
|
compare(s1, s2, dist::StringDistance)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
where `s1` and `s2` can be any iterator with a `length` method (e.g. `AbstractString`, `GraphemeIterator`, `AbstractVector`...).
|
||||||
|
|
||||||
- Edit Distances
|
- Edit Distances
|
||||||
- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
|
- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
|
||||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
|
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
|
||||||
|
|
|
@ -3,24 +3,37 @@ module StringDistances
|
||||||
using Distances
|
using Distances
|
||||||
import Distances: evaluate, result_type
|
import Distances: evaluate, result_type
|
||||||
|
|
||||||
|
isnormalized(dist::SemiMetric) = false
|
||||||
|
|
||||||
|
|
||||||
include("utils.jl")
|
include("utils.jl")
|
||||||
include("edit.jl")
|
include("edit.jl")
|
||||||
include("qgram.jl")
|
include("qgram.jl")
|
||||||
include("compare.jl")
|
include("modifier.jl")
|
||||||
|
|
||||||
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax}
|
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax}
|
||||||
include("find.jl")
|
|
||||||
|
|
||||||
##############################################################################
|
"""
|
||||||
##
|
compare(s1, s2, dist)
|
||||||
## Distances API
|
|
||||||
##
|
|
||||||
##############################################################################
|
|
||||||
|
|
||||||
|
return a similarity score between 0 and 1 for the strings `s1` and
|
||||||
|
`s2` based on the distance `dist`.
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
```julia-repl
|
||||||
|
julia> compare("martha", "marhta", Levenshtein())
|
||||||
|
0.6666666666666667
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
|
||||||
|
1 - evaluate(normalize(dist), s1, s2, 1 - min_score)
|
||||||
|
end
|
||||||
|
|
||||||
|
# distance API
|
||||||
function result_type(dist::StringDistance, s1, s2)
|
function result_type(dist::StringDistance, s1, s2)
|
||||||
typeof(evaluate(dist, "", ""))
|
typeof(evaluate(dist, "", ""))
|
||||||
end
|
end
|
||||||
|
include("find.jl")
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
##
|
##
|
||||||
|
|
27
src/edit.jl
27
src/edit.jl
|
@ -12,10 +12,11 @@ where ``m`` is the number of matching characters and
|
||||||
``t`` is half the number of transpositions.
|
``t`` is half the number of transpositions.
|
||||||
"""
|
"""
|
||||||
struct Jaro <: SemiMetric end
|
struct Jaro <: SemiMetric end
|
||||||
|
isnormalized(::Jaro) = true
|
||||||
|
|
||||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||||
## accepts any iterator, including AbstractString
|
## accepts any iterator, including AbstractString
|
||||||
function evaluate(dist::Jaro, s1, s2)
|
function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
|
||||||
(ismissing(s1) | ismissing(s2)) && return missing
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
|
@ -87,7 +88,7 @@ struct Levenshtein <: Metric end
|
||||||
# This makes it possible to differentiate a distance equal to max_dist from one strictly higher
|
# This makes it possible to differentiate a distance equal to max_dist from one strictly higher
|
||||||
# This is important for find_all
|
# This is important for find_all
|
||||||
## accepts any iterator, including AbstractString
|
## accepts any iterator, including AbstractString
|
||||||
function evaluate(dist::Levenshtein, s1, s2; max_dist = nothing)
|
function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
|
||||||
(ismissing(s1) | ismissing(s2)) && return missing
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
|
@ -142,7 +143,7 @@ struct DamerauLevenshtein <: SemiMetric end
|
||||||
|
|
||||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||||
## accepts any iterator, including AbstractString
|
## accepts any iterator, including AbstractString
|
||||||
function evaluate(dist::DamerauLevenshtein, s1, s2; max_dist = nothing)
|
function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
|
||||||
(ismissing(s1) | ismissing(s2)) && return missing
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
|
@ -226,20 +227,20 @@ region on either side of the longest common subsequence.
|
||||||
"""
|
"""
|
||||||
struct RatcliffObershelp <: SemiMetric end
|
struct RatcliffObershelp <: SemiMetric end
|
||||||
|
|
||||||
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
|
isnormalized(::RatcliffObershelp) = true
|
||||||
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
|
|
||||||
|
|
||||||
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
|
function evaluate(dist::RatcliffObershelp, s1, s2, max_dist = nothing)
|
||||||
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2)
|
len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2)
|
||||||
end
|
end
|
||||||
|
|
||||||
function matching_blocks(s1::AbstractString, s2::AbstractString)
|
function matching_blocks(s1, s2)
|
||||||
matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1)
|
matching_blocks!(Set{Tuple{Int, Int, Int}}(), s1, s2, length(s1), length(s2), 1, 1)
|
||||||
end
|
end
|
||||||
|
|
||||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString,
|
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2,
|
||||||
len1::Integer, len2::Integer, start1::Integer, start2::Integer)
|
len1::Integer, len2::Integer, start1::Integer, start2::Integer)
|
||||||
a = longest_common_pattern(s1, s2, len1 , len2)
|
a = longest_common_pattern(s1, s2, len1 , len2)
|
||||||
# exit if there is no common substring
|
# exit if there is no common substring
|
||||||
|
@ -247,18 +248,18 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
|
||||||
# add the info of the common to the existing set
|
# add the info of the common to the existing set
|
||||||
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
||||||
# add the longest common substring that happens before
|
# add the longest common substring that happens before
|
||||||
s1before = SubString(s1, 1, nextind(s1, 0, a[1] - 1))
|
s1before = _take(s1, a[1] - 1)
|
||||||
s2before = SubString(s2, 1, nextind(s2, 0, a[2] - 1))
|
s2before = _take(s2, a[2] - 1)
|
||||||
matching_blocks!(x, s1before, s2before, a[1] - 1, a[2] - 1, start1, start2)
|
matching_blocks!(x, s1before, s2before, a[1] - 1, a[2] - 1, start1, start2)
|
||||||
# add the longest common substring that happens after
|
# add the longest common substring that happens after
|
||||||
s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
|
s1after = _drop(s1, a[1] + a[3] - 1)
|
||||||
s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))
|
s2after = _drop(s2, a[2] + a[3] - 1)
|
||||||
matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1,
|
matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1,
|
||||||
len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
|
len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
|
||||||
return x
|
return x
|
||||||
end
|
end
|
||||||
|
|
||||||
function longest_common_pattern(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
function longest_common_pattern(s1, s2, len1::Integer, len2::Integer)
|
||||||
if len1 > len2
|
if len1 > len2
|
||||||
start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
|
start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
|
||||||
else
|
else
|
||||||
|
|
|
@ -1,42 +1,36 @@
|
||||||
"""
|
struct Normalize{S <: SemiMetric} <: SemiMetric
|
||||||
compare(s1, s2, dist)
|
dist::S
|
||||||
|
|
||||||
return a similarity score between 0 and 1 for the strings `s1` and
|
|
||||||
`s2` based on the distance `dist`.
|
|
||||||
|
|
||||||
### Examples
|
|
||||||
```julia-repl
|
|
||||||
julia> compare("martha", "marhta", Levenshtein())
|
|
||||||
0.6666666666666667
|
|
||||||
```
|
|
||||||
"""
|
|
||||||
function compare(s1, s2, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
|
|
||||||
1.0 - evaluate(dist, s1, s2)
|
|
||||||
end
|
end
|
||||||
|
function normalize(dist::SemiMetric)
|
||||||
|
isnormalized(dist) ? dist : Normalize{typeof(dist)}(dist)
|
||||||
|
end
|
||||||
|
isnormalized(dist::Normalize) = true
|
||||||
|
|
||||||
function compare(s1, s2, dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0)
|
|
||||||
|
function evaluate(dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}}, s1, s2, max_dist = 1.0)
|
||||||
(ismissing(s1) | ismissing(s2)) && return missing
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len2 == 0 && return 1.0
|
# `reorder` puts the longer input in s2, so len2 == 0 means both inputs are empty.
# This method returns a normalized *distance*, so identical (empty) inputs yield 0.0.
len2 == 0 && return 0.0
|
||||||
d = evaluate(dist, s1, s2; max_dist = ceil(Int, len2 * (1 - min_score)))
|
d = evaluate(dist.dist, s1, s2, ceil(Int, len2 * max_dist))
|
||||||
out = 1.0 - d / len2
|
out = d / len2
|
||||||
out < min_score ? 0.0 : out
|
out > max_dist ? 1.0 : out
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1, s2, dist::QGramDistance; min_score = 0.0)
|
function evaluate(dist::Normalize{<: QGramDistance}, s1, s2, max_dist = 1.0)
|
||||||
(ismissing(s1) | ismissing(s2)) && return missing
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
# When string length < q for qgram distance, returns s1 == s2
|
# When string length < q for qgram distance, returns s1 == s2
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 <= dist.q - 1 && return convert(Float64, s1 == s2)
|
len1 <= dist.dist.q - 1 && return convert(Float64, !(s1 == s2))
|
||||||
if typeof(dist) <: QGram
|
if typeof(dist.dist) <: QGram
|
||||||
1.0 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
|
evaluate(dist.dist, s1, s2) / (len1 + len2 - 2 * dist.dist.q + 2)
|
||||||
else
|
else
|
||||||
1.0 - evaluate(dist, s1, s2)
|
evaluate(dist.dist, s1, s2)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Winkler(dist; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
|
Winkler(dist; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
|
||||||
|
|
||||||
|
@ -52,19 +46,22 @@ struct Winkler{S <: SemiMetric} <: SemiMetric
|
||||||
p::Float64 # scaling factor. Default to 0.1
|
p::Float64 # scaling factor. Default to 0.1
|
||||||
threshold::Float64 # boost threshold. Default to 0.7
|
threshold::Float64 # boost threshold. Default to 0.7
|
||||||
maxlength::Integer # max length of common prefix. Default to 4
|
maxlength::Integer # max length of common prefix. Default to 4
|
||||||
|
Winkler{S}(dist::S, p, threshold, maxlength) where {S <: SemiMetric} = new(dist, p, threshold, maxlength)
|
||||||
end
|
end
|
||||||
|
|
||||||
function Winkler(dist; p = 0.1, threshold = 0.7, maxlength = 4)
|
function Winkler(dist::SemiMetric; p = 0.1, threshold = 0.7, maxlength = 4)
|
||||||
p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
|
p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
|
||||||
Winkler(dist, 0.1, 0.7, 4)
|
# Forward the caller's keyword arguments (validated above) instead of the hard-coded defaults.
Winkler{typeof(normalize(dist))}(normalize(dist), p, threshold, maxlength)
|
||||||
end
|
end
|
||||||
|
isnormalized(dist::Winkler) = true
|
||||||
|
|
||||||
function compare(s1, s2, dist::Winkler; min_score = 0.0)
|
|
||||||
|
function evaluate(dist::Winkler, s1, s2, max_dist = 1.0)
|
||||||
# cannot do min_score because of boosting threshold
|
# cannot pass max_dist to the inner distance because of the boosting threshold
|
||||||
score = compare(s1, s2, dist.dist)
|
score = evaluate(dist.dist, s1, s2)
|
||||||
if score >= dist.threshold
|
if score <= 1 - dist.threshold
|
||||||
l = common_prefix(s1, s2)[1]
|
l = common_prefix(s1, s2)[1]
|
||||||
score += min(l, dist.maxlength) * dist.p * (1 - score)
|
score -= min(l, dist.maxlength) * dist.p * score
|
||||||
end
|
end
|
||||||
return score
|
return score
|
||||||
end
|
end
|
||||||
|
@ -88,27 +85,30 @@ julia> compare(s1, s2, Partial(RatcliffObershelp()))
|
||||||
"""
|
"""
|
||||||
struct Partial{S <: SemiMetric} <: SemiMetric
|
struct Partial{S <: SemiMetric} <: SemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
|
Partial{S}(dist::S) where {S <: SemiMetric} = new(dist)
|
||||||
end
|
end
|
||||||
|
Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
|
||||||
|
isnormalized(dist::Partial) = true
|
||||||
|
|
||||||
function compare(s1, s2, dist::Partial; min_score = 0.0)
|
function evaluate(dist::Partial, s1, s2, max_dist = 1.0)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
|
len1 == len2 && return evaluate(dist.dist, s1, s2, max_dist)
|
||||||
len1 == 0 && return 1.0
|
len1 == 0 && return 0.0
|
||||||
out = 0.0
|
out = 1.0
|
||||||
for x in qgrams(s2, len1)
|
for x in qgrams(s2, len1)
|
||||||
curr = compare(s1, x, dist.dist; min_score = min_score)
|
curr = evaluate(dist.dist, s1, x, max_dist)
|
||||||
out = max(out, curr)
|
out = min(out, curr)
|
||||||
min_score = max(out, min_score)
|
max_dist = min(out, max_dist)
|
||||||
end
|
end
|
||||||
return out
|
return out
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; min_score = 0.0)
|
function evaluate(dist::Partial{RatcliffObershelp}, s1, s2, max_dist = 1.0)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
len1 == len2 && return evaluate(dist.dist, s1, s2)
|
||||||
out = 0.0
|
out = 1.0
|
||||||
for r in matching_blocks(s1, s2)
|
for r in matching_blocks(s1, s2)
|
||||||
# Make sure the substring of s2 has length len1
|
# Make sure the substring of s2 has length len1
|
||||||
s2_start = r[2] - r[1] + 1
|
s2_start = r[2] - r[1] + 1
|
||||||
|
@ -120,10 +120,9 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffO
|
||||||
s2_start += len2 - s2_end
|
s2_start += len2 - s2_end
|
||||||
s2_end += len2 - s2_end
|
s2_end += len2 - s2_end
|
||||||
end
|
end
|
||||||
i2_start = nextind(s2, 0, s2_start)
|
curr = evaluate(dist.dist, s1, _slice(s2, s2_start - 1, s2_end))
|
||||||
i2_end = nextind(s2, 0, s2_end)
|
|
||||||
curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp())
|
out = min(out, curr)
|
||||||
out = max(out, curr)
|
|
||||||
end
|
end
|
||||||
return out
|
return out
|
||||||
end
|
end
|
||||||
|
@ -147,13 +146,16 @@ julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
|
||||||
"""
|
"""
|
||||||
struct TokenSort{S <: SemiMetric} <: SemiMetric
|
struct TokenSort{S <: SemiMetric} <: SemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
|
TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist)
|
||||||
end
|
end
|
||||||
|
TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist))
|
||||||
|
isnormalized(dist::TokenSort) = true
|
||||||
|
|
||||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||||
function compare(s1, s2, dist::TokenSort; min_score = 0.0)
|
function evaluate(dist::TokenSort, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
|
||||||
s1 = join(sort!(split(s1)), " ")
|
s1 = join(sort!(split(s1)), " ")
|
||||||
s2 = join(sort!(split(s2)), " ")
|
s2 = join(sort!(split(s2)), " ")
|
||||||
compare(s1, s2, dist.dist; min_score = min_score)
|
evaluate(dist.dist, s1, s2, max_dist)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
@ -175,23 +177,26 @@ julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
|
||||||
"""
|
"""
|
||||||
struct TokenSet{S <: SemiMetric} <: SemiMetric
|
struct TokenSet{S <: SemiMetric} <: SemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
|
TokenSet{S}(dist::S) where {S <: SemiMetric} = new(dist)
|
||||||
end
|
end
|
||||||
|
TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
|
||||||
|
isnormalized(dist::TokenSet) = true
|
||||||
|
|
||||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||||
function compare(s1, s2, dist::TokenSet; min_score = 0.0)
|
function evaluate(dist::TokenSet, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
|
||||||
v1 = unique!(sort!(split(s1)))
|
v1 = unique!(sort!(split(s1)))
|
||||||
v2 = unique!(sort!(split(s2)))
|
v2 = unique!(sort!(split(s2)))
|
||||||
v0 = intersect(v1, v2)
|
v0 = intersect(v1, v2)
|
||||||
s0 = join(v0, " ")
|
s0 = join(v0, " ")
|
||||||
s1 = join(v1, " ")
|
s1 = join(v1, " ")
|
||||||
s2 = join(v2, " ")
|
s2 = join(v2, " ")
|
||||||
isempty(s0) && return compare(s1, s2, dist.dist; min_score = min_score)
|
isempty(s0) && return evaluate(dist.dist, s1, s2, max_dist)
|
||||||
score_01 = compare(s0, s1, dist.dist; min_score = min_score)
|
score_01 = evaluate(dist.dist, s0, s1, max_dist)
|
||||||
min_score = max(min_score, score_01)
|
max_dist = min(max_dist, score_01)
|
||||||
score_02 = compare(s0, s2, dist.dist; min_score = min_score)
|
score_02 = evaluate(dist.dist, s0, s2, max_dist)
|
||||||
min_score = max(min_score, score_02)
|
max_dist = min(max_dist, score_02)
|
||||||
score_12 = compare(s1, s2, dist.dist; min_score = min_score)
|
score_12 = evaluate(dist.dist, s1, s2, max_dist)
|
||||||
max(score_01, score_02, score_12)
|
min(score_01, score_02, score_12)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
@ -214,36 +219,35 @@ julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
|
||||||
"""
|
"""
|
||||||
struct TokenMax{S <: SemiMetric} <: SemiMetric
|
struct TokenMax{S <: SemiMetric} <: SemiMetric
|
||||||
dist::S
|
dist::S
|
||||||
|
TokenMax{S}(dist::S) where {S <: SemiMetric} = new(dist)
|
||||||
end
|
end
|
||||||
|
|
||||||
function compare(s1, s2, dist::TokenMax; min_score = 0.0)
|
TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
|
||||||
|
isnormalized(dist::TokenMax) = true
|
||||||
|
|
||||||
|
function evaluate(dist::TokenMax, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
|
||||||
s1, s2 = reorder(s1, s2)
|
s1, s2 = reorder(s1, s2)
|
||||||
len1, len2 = length(s1), length(s2)
|
len1, len2 = length(s1), length(s2)
|
||||||
score = compare(s1, s2, dist.dist; min_score = min_score)
|
score = evaluate(dist.dist, s1, s2, max_dist)
|
||||||
min_score = max(min_score, score)
|
# Tighten the pruning bound with the best (smallest) distance found so far.
max_dist = min(max_dist, score)
|
||||||
unbase_scale = 0.95
|
unbase_scale = 0.95
|
||||||
# if one string is much shorter than the other, use partial
|
# if one string is much shorter than the other, use partial
|
||||||
if length(s2) >= 1.5 * length(s1)
|
if length(s2) >= 1.5 * length(s1)
|
||||||
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
|
partial_scale = length(s2) > (8 * length(s1)) ? 0.6 : 0.9
|
||||||
score_partial = partial_scale * compare(s1, s2, Partial(dist.dist);
|
score_partial = 1 - partial_scale * (1 - evaluate(Partial(dist.dist), s1, s2, 1 - (1 - max_dist) / partial_scale))
|
||||||
min_score = min_score / partial_scale)
|
# Tighten the pruning bound with the partial-match distance before the token-based passes.
max_dist = min(max_dist, score_partial)
|
||||||
min_score = max(min_score, score_partial)
|
score_sort = 1 - unbase_scale * partial_scale *
|
||||||
score_sort = unbase_scale * partial_scale *
|
(1 - evaluate(TokenSort(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
||||||
compare(s1, s2, TokenSort(Partial(dist.dist));
|
max_dist = min(max_dist, score_sort)
|
||||||
min_score = min_score / (unbase_scale * partial_scale))
|
score_set = 1 - unbase_scale * partial_scale *
|
||||||
min_score = max(min_score, score_sort)
|
(1 - evaluate(TokenSet(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / (unbase_scale * partial_scale)))
|
||||||
score_set = unbase_scale * partial_scale *
|
return min(score, score_partial, score_sort, score_set)
|
||||||
compare(s1, s2, TokenSet(Partial(dist.dist));
|
|
||||||
min_score = min_score / (unbase_scale * partial_scale))
|
|
||||||
return max(score, score_partial, score_sort, score_set)
|
|
||||||
else
|
else
|
||||||
score_sort = unbase_scale *
|
score_sort = 1 - unbase_scale *
|
||||||
compare(s1, s2, TokenSort(dist.dist);
|
(1 - evaluate(TokenSort(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
|
||||||
min_score = min_score / unbase_scale)
|
max_dist = min(max_dist, score_sort)
|
||||||
min_score = max(min_score, score_sort)
|
score_set = 1 - unbase_scale *
|
||||||
score_set = unbase_scale *
|
(1 - evaluate(TokenSet(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
|
||||||
compare(s1, s2, TokenSet(dist.dist);
|
return min(score, score_sort, score_set)
|
||||||
min_score = min_score / unbase_scale)
|
|
||||||
return max(score, score_sort, score_set)
|
|
||||||
end
|
end
|
||||||
end
|
end
|
11
src/qgram.jl
11
src/qgram.jl
|
@ -18,12 +18,15 @@ Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
|
||||||
|
|
||||||
|
|
||||||
#q-grams of AbstractVector
|
#q-grams of AbstractVector
|
||||||
|
# Alternatively, I could also use partition in IterTools but it creates a vector for each iteration
|
||||||
|
# so it does not seem to be worth it.
|
||||||
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
|
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
|
||||||
state + qgram.q - 1 > lastindex(qgram.s) && return nothing
|
state + qgram.q - 1 > lastindex(qgram.s) && return nothing
|
||||||
view(qgram.s, state:(state + qgram.q - 1)), state + 1
|
view(qgram.s, state:(state + qgram.q - 1)), state + 1
|
||||||
end
|
end
|
||||||
Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
|
Base.eltype(qgram::QGramIterator{<: AbstractVector}) = typeof(first(qgram))
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Return an iterator on the q-gram of a string
|
Return an iterator on the q-gram of a string
|
||||||
|
|
||||||
|
@ -120,7 +123,7 @@ struct Cosine <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::Cosine, s1, s2)
|
function evaluate(dist::Cosine, s1, s2, max_dist = nothing)
|
||||||
(ismissing(s1) | ismissing(s2)) && return missing
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
norm1, norm2, prodnorm = 0, 0, 0
|
norm1, norm2, prodnorm = 0, 0, 0
|
||||||
|
@ -147,7 +150,7 @@ struct Jaccard <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::Jaccard, s1, s2)
|
function evaluate(dist::Jaccard, s1, s2, max_dist = nothing)
|
||||||
(ismissing(s1) | ismissing(s2)) && return missing
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
|
@ -174,7 +177,7 @@ struct SorensenDice <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::SorensenDice, s1, s2)
|
function evaluate(dist::SorensenDice, s1, s2, max_dist = nothing)
|
||||||
(ismissing(s1) | ismissing(s2)) && return missing
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
|
@ -201,7 +204,7 @@ struct Overlap <: QGramDistance
|
||||||
q::Int
|
q::Int
|
||||||
end
|
end
|
||||||
|
|
||||||
function evaluate(dist::Overlap, s1, s2)
|
function evaluate(dist::Overlap, s1, s2, max_dist = nothing)
|
||||||
(ismissing(s1) | ismissing(s2)) && return missing
|
(ismissing(s1) | ismissing(s2)) && return missing
|
||||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||||
|
|
27
src/utils.jl
27
src/utils.jl
|
@ -21,8 +21,6 @@ function reorder(s1, s2)
|
||||||
(length(s1) <= length(s2)) ? (s1, s2) : (s2, s1)
|
(length(s1) <= length(s2)) ? (s1, s2) : (s2, s1)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
function common_prefix(s1, s2)
|
function common_prefix(s1, s2)
|
||||||
x1 = iterate(s1)
|
x1 = iterate(s1)
|
||||||
x2 = iterate(s2)
|
x2 = iterate(s2)
|
||||||
|
@ -37,3 +35,28 @@ function common_prefix(s1, s2)
|
||||||
end
|
end
|
||||||
return l, x1, x2
|
return l, x1, x2
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
function _take(s, n::Integer)
|
||||||
|
Base.Iterators.take(s, n)
|
||||||
|
end
|
||||||
|
function _take(s::AbstractString, n::Integer)
|
||||||
|
SubString(s, firstindex(s), nextind(s, 0, n))
|
||||||
|
end
|
||||||
|
|
||||||
|
function _drop(s, n::Integer)
|
||||||
|
Base.Iterators.drop(s, n)
|
||||||
|
end
|
||||||
|
function _drop(s::AbstractString, n::Integer)
|
||||||
|
SubString(s, nextind(s, 0, n + 1), lastindex(s))
|
||||||
|
end
|
||||||
|
|
||||||
|
function _slice(s, n1::Integer, n2::Integer)
|
||||||
|
Base.Iterators.take(Base.Iterators.drop(s, n1), n2 - n1)
|
||||||
|
end
|
||||||
|
function _slice(s::AbstractString, n1::Integer, n2::Integer)
|
||||||
|
SubString(s, nextind(s, 0, n1 + 1), nextind(s, 0, n2))
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -24,13 +24,13 @@ using StringDistances, Test
|
||||||
compare("aüa", "aua", DamerauLevenshtein())
|
compare("aüa", "aua", DamerauLevenshtein())
|
||||||
|
|
||||||
# Winkler
|
# Winkler
|
||||||
@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.9611 atol = 1e-4
|
@test compare("martha", "marhta", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.9611 atol = 1e-4
|
||||||
@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.84 atol = 1e-4
|
@test compare("dwayne", "duane", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.84 atol = 1e-4
|
||||||
@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.81333 atol = 1e-4
|
@test compare("dixon", "dicksonx", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.81333 atol = 1e-4
|
||||||
@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.975 atol = 1e-4
|
@test compare("william", "williams", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.975 atol = 1e-4
|
||||||
@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
|
@test compare("", "foo", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.0 atol = 1e-4
|
||||||
@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 1.0 atol = 1e-4
|
@test compare("a", "a", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 1.0 atol = 1e-4
|
||||||
@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
|
@test compare("abc", "xyz", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.0 atol = 1e-4
|
||||||
|
|
||||||
# RatcliffObershelp
|
# RatcliffObershelp
|
||||||
@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0
|
@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0
|
||||||
|
|
Loading…
Reference in New Issue