allow any iterator in. Define evaluate for modifiers.
parent
a949f7bd62
commit
5cbbfc5bde
|
@ -10,9 +10,11 @@ The package is registered in the [`General`](https://github.com/JuliaRegistries/
|
|||
The function `compare` returns a similarity score between two strings. The score is always between 0 and 1, where 0 means the strings are completely different and 1 means they are identical. Its syntax is:
|
||||
|
||||
```julia
|
||||
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
|
||||
compare(s1, s2, dist::StringDistance)
|
||||
```
|
||||
|
||||
where `s1` and `s2` can be any iterator with a `length` method (e.g. `AbstractString`, `GraphemeIterator`, `AbstractVector`...).
|
||||
|
||||
- Edit Distances
|
||||
- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
|
||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
|
||||
|
|
|
@ -3,24 +3,37 @@ module StringDistances
|
|||
using Distances
|
||||
import Distances: evaluate, result_type
|
||||
|
||||
isnormalized(dist::SemiMetric) = false
|
||||
|
||||
|
||||
include("utils.jl")
|
||||
include("edit.jl")
|
||||
include("qgram.jl")
|
||||
include("compare.jl")
|
||||
include("modifier.jl")
|
||||
|
||||
const StringDistance = Union{Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, QGramDistance, Winkler, Partial, TokenSort, TokenSet, TokenMax}
|
||||
include("find.jl")
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Distances API
|
||||
##
|
||||
##############################################################################
|
||||
"""
|
||||
compare(s1, s2, dist)
|
||||
|
||||
return a similarity score between 0 and 1 for the strings `s1` and
|
||||
`s2` based on the distance `dist`.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> compare("martha", "marhta", Levenshtein())
|
||||
0.6666666666666667
|
||||
```
|
||||
"""
|
||||
function compare(s1, s2, dist::StringDistance; min_score = 0.0)
    # A similarity floor of `min_score` corresponds to a distance ceiling of
    # `1 - min_score`; that ceiling is what `evaluate` receives.
    max_dist = 1 - min_score
    # Work with the normalized form of `dist` so that the distance can be
    # turned into a similarity by subtracting it from one.
    normalized = normalize(dist)
    return 1 - evaluate(normalized, s1, s2, max_dist)
end
|
||||
|
||||
# distance API
|
||||
# The result type is obtained by evaluating `dist` on two empty strings;
# `s1` and `s2` themselves are never inspected.
function result_type(dist::StringDistance, s1, s2)
    probe = evaluate(dist, "", "")
    return typeof(probe)
end
|
||||
|
||||
include("find.jl")
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
|
|
27
src/edit.jl
27
src/edit.jl
|
@ -12,10 +12,11 @@ where ``m`` is the number of matching characters and
|
|||
``t`` is half the number of transpositions.
|
||||
"""
|
||||
struct Jaro <: SemiMetric end
|
||||
isnormalized(::Jaro) = true
|
||||
|
||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||
## accepts any iterator, including AbstractString
|
||||
function evaluate(dist::Jaro, s1, s2)
|
||||
function evaluate(dist::Jaro, s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
@ -87,7 +88,7 @@ struct Levenshtein <: Metric end
|
|||
# This makes it possible to differentiate a distance equal to max_dist from one strictly higher
|
||||
# This is important for find_all
|
||||
## accepts any iterator, including AbstractString
|
||||
function evaluate(dist::Levenshtein, s1, s2; max_dist = nothing)
|
||||
function evaluate(dist::Levenshtein, s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
@ -142,7 +143,7 @@ struct DamerauLevenshtein <: SemiMetric end
|
|||
|
||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
## accepts any iterator, including AbstractString
|
||||
function evaluate(dist::DamerauLevenshtein, s1, s2; max_dist = nothing)
|
||||
function evaluate(dist::DamerauLevenshtein, s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
|
@ -226,20 +227,20 @@ region on either side of the longest common subsequence.
|
|||
"""
|
||||
struct RatcliffObershelp <: SemiMetric end
|
||||
|
||||
evaluate(::RatcliffObershelp, ::Missing, ::AbstractString) = missing
|
||||
evaluate(::RatcliffObershelp, ::AbstractString, ::Missing) = missing
|
||||
isnormalized(::RatcliffObershelp) = true
|
||||
|
||||
# Ratcliff/Obershelp distance between `s1` and `s2`.
# Returns `missing` when either input is missing, `0.0` when both inputs are
# empty, and otherwise one minus twice the matched length divided by the
# combined length. `max_dist` is accepted for a uniform signature and unused.
function evaluate(dist::RatcliffObershelp, s1, s2, max_dist = nothing)
    if ismissing(s1) | ismissing(s2)
        return missing
    end
    blocks = matching_blocks(s1, s2)
    # the last entry of every block is summed as the matched length
    n_matched = sum(last.(blocks))
    total = length(s1) + length(s2)
    if total == 0
        return 0.0
    end
    return 1.0 - 2 * n_matched / total
end
|
||||
|
||||
# Entry point of the recursive block search: start from an empty set of
# integer triples and scan both inputs in full, beginning at position 1.
# NOTE(review): each triple looks like (start in s1, start in s2, length);
# confirm against `matching_blocks!`.
function matching_blocks(s1, s2)
    blocks = Set{Tuple{Int, Int, Int}}()
    return matching_blocks!(blocks, s1, s2, length(s1), length(s2), 1, 1)
end
|
||||
|
||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::AbstractString,
|
||||
function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1, s2,
|
||||
len1::Integer, len2::Integer, start1::Integer, start2::Integer)
|
||||
a = longest_common_pattern(s1, s2, len1 , len2)
|
||||
# exit if there is no common substring
|
||||
|
@ -247,18 +248,18 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
|
|||
# add the info of the common to the existing set
|
||||
push!(x, (a[1] + start1 - 1, a[2] + start2 - 1, a[3]))
|
||||
# add the longest common substring that happens before
|
||||
s1before = SubString(s1, 1, nextind(s1, 0, a[1] - 1))
|
||||
s2before = SubString(s2, 1, nextind(s2, 0, a[2] - 1))
|
||||
s1before = _take(s1, a[1] - 1)
|
||||
s2before = _take(s2, a[2] - 1)
|
||||
matching_blocks!(x, s1before, s2before, a[1] - 1, a[2] - 1, start1, start2)
|
||||
# add the longest common substring that happens after
|
||||
s1after = SubString(s1, nextind(s1, 0, a[1] + a[3]), lastindex(s1))
|
||||
s2after = SubString(s2, nextind(s2, 0, a[2] + a[3]), lastindex(s2))
|
||||
s1after = _drop(s1, a[1] + a[3] - 1)
|
||||
s2after = _drop(s2, a[2] + a[3] - 1)
|
||||
matching_blocks!(x, s1after, s2after, len1 - (a[1] + a[3]) + 1,
|
||||
len2 - (a[2] + a[3]) + 1, start1 + a[1] + a[3] - 1, start2 + a[2] + a[3] - 1)
|
||||
return x
|
||||
end
|
||||
|
||||
function longest_common_pattern(s1::AbstractString, s2::AbstractString, len1::Integer, len2::Integer)
|
||||
function longest_common_pattern(s1, s2, len1::Integer, len2::Integer)
|
||||
if len1 > len2
|
||||
start2, start1, len = longest_common_pattern(s2, s1, len2, len1)
|
||||
else
|
||||
|
|
|
@ -1,42 +1,36 @@
|
|||
"""
|
||||
compare(s1, s2, dist)
|
||||
|
||||
return a similarity score between 0 and 1 for the strings `s1` and
|
||||
`s2` based on the distance `dist`.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> compare("martha", "marhta", Levenshtein())
|
||||
0.6666666666666667
|
||||
```
|
||||
"""
|
||||
function compare(s1, s2, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
|
||||
1.0 - evaluate(dist, s1, s2)
|
||||
# Wrapper around a distance `dist`; the `evaluate` methods defined on
# `Normalize{...}` rescale the wrapped distance.
struct Normalize{S <: SemiMetric} <: SemiMetric
    dist::S
end

# Return `dist` unchanged when it already reports itself as normalized,
# otherwise wrap it in `Normalize`.
function normalize(dist::SemiMetric)
    if isnormalized(dist)
        return dist
    end
    return Normalize{typeof(dist)}(dist)
end

# A `Normalize` wrapper is normalized by construction.
isnormalized(::Normalize) = true
|
||||
|
||||
# Normalized (Damerau-)Levenshtein distance: the raw edit distance divided by
# the length of the longer input.
#
# - returns `missing` when either input is missing;
# - returns `0.0` when both inputs are empty (two empty inputs are identical);
# - `max_dist` is a bound on the normalized distance; it is converted to an
#   absolute bound for the wrapped distance, and any result above it is
#   reported as `1.0`.
function evaluate(dist::Normalize{<: Union{Levenshtein, DamerauLevenshtein}}, s1, s2, max_dist = 1.0)
    (ismissing(s1) | ismissing(s2)) && return missing
    # after `reorder`, `s2` is the longer of the two inputs
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # This is a distance, not a similarity: identical (empty) inputs are at
    # distance zero. Returning 1.0 here made `compare` score two empty inputs
    # as completely different.
    len2 == 0 && return 0.0
    d = evaluate(dist.dist, s1, s2, ceil(Int, len2 * max_dist))
    out = d / len2
    return out > max_dist ? 1.0 : out
end
|
||||
|
||||
# Normalized q-gram distance.
# Returns `missing` when either input is missing. `max_dist` is accepted for
# a uniform signature and is not used here.
function evaluate(dist::Normalize{<: QGramDistance}, s1, s2, max_dist = 1.0)
    if ismissing(s1) | ismissing(s2)
        return missing
    end
    inner = dist.dist
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # When the shorter input has fewer than q elements, fall back to an
    # equality check: distance 0.0 if the inputs are equal, 1.0 otherwise.
    if len1 < inner.q
        return convert(Float64, s1 != s2)
    end
    raw = evaluate(inner, s1, s2)
    # Only `QGram` is rescaled; every other q-gram distance is returned as is.
    if inner isa QGram
        return raw / (len1 + len2 - 2 * inner.q + 2)
    end
    return raw
end
|
||||
|
||||
|
||||
"""
|
||||
Winkler(dist; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
|
||||
|
||||
|
@ -52,19 +46,22 @@ struct Winkler{S <: SemiMetric} <: SemiMetric
|
|||
p::Float64 # scaling factor. Default to 0.1
|
||||
threshold::Float64 # boost threshold. Default to 0.7
|
||||
maxlength::Integer # max length of common prefix. Default to 4
|
||||
Winkler{S}(dist::S, p, threshold, maxlength) where {S <: SemiMetric} = new(dist, p, threshold, maxlength)
|
||||
end
|
||||
|
||||
# Keyword constructor for the Winkler modifier.
#
# - `dist`: the distance to modify; it is normalized before being stored.
# - `p`: scaling factor (default 0.1).
# - `threshold`: boost threshold (default 0.7).
# - `maxlength`: maximum length of the common prefix taken into account
#   (default 4).
#
# Throws when `p * maxlength` exceeds one.
function Winkler(dist::SemiMetric; p = 0.1, threshold = 0.7, maxlength = 4)
    p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
    inner = normalize(dist)
    # Forward the user-supplied keywords; hard-coding 0.1, 0.7, 4 here made
    # every keyword argument a silent no-op.
    return Winkler{typeof(inner)}(inner, p, threshold, maxlength)
end
|
||||
isnormalized(dist::Winkler) = true
|
||||
|
||||
# Winkler-modified distance: when the wrapped distance is small enough
# (at most `1 - threshold`), it is reduced in proportion to the length of the
# common prefix of `s1` and `s2`, capped at `maxlength`.
#
# Returns `missing` when either input is missing, like the base distances.
function evaluate(dist::Winkler, s1, s2, max_dist = 1.0)
    # Without this guard a `missing` result from the wrapped distance would
    # reach the `if` below and raise a non-boolean-condition error.
    (ismissing(s1) | ismissing(s2)) && return missing
    # `max_dist` cannot be forwarded to the wrapped distance because of the
    # boosting threshold
    score = evaluate(dist.dist, s1, s2)
    if score <= 1 - dist.threshold
        l = common_prefix(s1, s2)[1]
        score -= min(l, dist.maxlength) * dist.p * score
    end
    return score
end
|
||||
|
@ -88,27 +85,30 @@ julia> compare(s1, s2, Partial(RatcliffObershelp()))
|
|||
"""
|
||||
struct Partial{S <: SemiMetric} <: SemiMetric
|
||||
dist::S
|
||||
Partial{S}(dist::S) where {S <: SemiMetric} = new(dist)
|
||||
end
|
||||
Partial(dist::SemiMetric) = Partial{typeof(normalize(dist))}(normalize(dist))
|
||||
isnormalized(dist::Partial) = true
|
||||
|
||||
# Partial distance: the smallest distance between the shorter input and any
# window of the same length taken from the longer input.
function evaluate(dist::Partial, s1, s2, max_dist = 1.0)
    # after `reorder`, `s1` is the shorter input
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    if len1 == len2
        # equal lengths: there is a single window, the whole input
        return evaluate(dist.dist, s1, s2, max_dist)
    elseif len1 == 0
        return 0.0
    end
    best = 1.0
    for window in qgrams(s2, len1)
        best = min(best, evaluate(dist.dist, s1, window, max_dist))
        # tighten the bound with the best value seen so far
        max_dist = min(best, max_dist)
    end
    return best
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; min_score = 0.0)
|
||||
function evaluate(dist::Partial{RatcliffObershelp}, s1, s2, max_dist = 1.0)
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||
out = 0.0
|
||||
len1 == len2 && return evaluate(dist.dist, s1, s2)
|
||||
out = 1.0
|
||||
for r in matching_blocks(s1, s2)
|
||||
# Make sure the substring of s2 has length len1
|
||||
s2_start = r[2] - r[1] + 1
|
||||
|
@ -120,10 +120,9 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffO
|
|||
s2_start += len2 - s2_end
|
||||
s2_end += len2 - s2_end
|
||||
end
|
||||
i2_start = nextind(s2, 0, s2_start)
|
||||
i2_end = nextind(s2, 0, s2_end)
|
||||
curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp())
|
||||
out = max(out, curr)
|
||||
curr = evaluate(dist.dist, s1, _slice(s2, s2_start - 1, s2_end))
|
||||
|
||||
out = min(out, curr)
|
||||
end
|
||||
return out
|
||||
end
|
||||
|
@ -147,13 +146,16 @@ julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
|
|||
"""
|
||||
struct TokenSort{S <: SemiMetric} <: SemiMetric
|
||||
dist::S
|
||||
TokenSort{S}(dist::S) where {S <: SemiMetric} = new(dist)
|
||||
end
|
||||
TokenSort(dist::SemiMetric) = TokenSort{typeof(normalize(dist))}(normalize(dist))
|
||||
isnormalized(dist::TokenSort) = true
|
||||
|
||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
# Token-sort distance: split each string on whitespace, sort the tokens,
# re-join them with single spaces and evaluate the wrapped distance on the
# two re-ordered strings.
function evaluate(dist::TokenSort, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
    tokens1 = split(s1)
    tokens2 = split(s2)
    sorted1 = join(sort!(tokens1), " ")
    sorted2 = join(sort!(tokens2), " ")
    return evaluate(dist.dist, sorted1, sorted2, max_dist)
end
|
||||
|
||||
|
||||
|
@ -175,23 +177,26 @@ julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
|
|||
"""
|
||||
struct TokenSet{S <: SemiMetric} <: SemiMetric
|
||||
dist::S
|
||||
TokenSet{S}(dist::S) where {S <: SemiMetric} = new(dist)
|
||||
end
|
||||
TokenSet(dist::SemiMetric) = TokenSet{typeof(normalize(dist))}(normalize(dist))
|
||||
isnormalized(dist::TokenSet) = true
|
||||
|
||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
# Token-set distance: build the sorted, de-duplicated token list of each
# string and their intersection, then return the smallest of the three
# pairwise distances (intersection vs. first, intersection vs. second,
# first vs. second). When the intersection is empty, only the distance
# between the two token strings is computed.
function evaluate(dist::TokenSet, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
    tokens1 = unique!(sort!(split(s1)))
    tokens2 = unique!(sort!(split(s2)))
    shared = join(intersect(tokens1, tokens2), " ")
    s1 = join(tokens1, " ")
    s2 = join(tokens2, " ")
    if isempty(shared)
        return evaluate(dist.dist, s1, s2, max_dist)
    end
    # each result tightens the bound handed to the next evaluation
    d_shared_1 = evaluate(dist.dist, shared, s1, max_dist)
    max_dist = min(max_dist, d_shared_1)
    d_shared_2 = evaluate(dist.dist, shared, s2, max_dist)
    max_dist = min(max_dist, d_shared_2)
    d_1_2 = evaluate(dist.dist, s1, s2, max_dist)
    return min(d_shared_1, d_shared_2, d_1_2)
end
|
||||
|
||||
|
||||
|
@ -214,36 +219,35 @@ julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
|
|||
"""
|
||||
struct TokenMax{S <: SemiMetric} <: SemiMetric
|
||||
dist::S
|
||||
TokenMax{S}(dist::S) where {S <: SemiMetric} = new(dist)
|
||||
end
|
||||
|
||||
function compare(s1, s2, dist::TokenMax; min_score = 0.0)
|
||||
TokenMax(dist::SemiMetric) = TokenMax{typeof(normalize(dist))}(normalize(dist))
|
||||
isnormalized(dist::TokenMax) = true
|
||||
|
||||
# Token-max distance: the smallest of the base distance and scaled versions
# of the `TokenSort` / `TokenSet` distances (and of the `Partial` distance
# when one string is much shorter than the other).
#
# Each candidate is computed as `1 - scale * (1 - d)`, i.e. the similarity
# `1 - d` is scaled down before being converted back to a distance. The
# bound passed to each sub-evaluation is the current `max_dist` mapped back
# through the same scaling.
function evaluate(dist::TokenMax, s1::AbstractString, s2::AbstractString, max_dist = 1.0)
    # after `reorder`, `s1` is the shorter string
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    score = evaluate(dist.dist, s1, s2, max_dist)
    # Tighten the bound with the best distance so far. This used to be
    # assigned to a stray `min_score` variable, so the bound never moved.
    max_dist = min(max_dist, score)
    unbase_scale = 0.95
    # if one string is much shorter than the other, use partial
    if len2 >= 1.5 * len1
        partial_scale = len2 > (8 * len1) ? 0.6 : 0.9
        score_partial = 1 - partial_scale *
            (1 - evaluate(Partial(dist.dist), s1, s2, 1 - (1 - max_dist) / partial_scale))
        # same stray-variable fix as above
        max_dist = min(max_dist, score_partial)
        scale = unbase_scale * partial_scale
        score_sort = 1 - scale *
            (1 - evaluate(TokenSort(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / scale))
        max_dist = min(max_dist, score_sort)
        score_set = 1 - scale *
            (1 - evaluate(TokenSet(Partial(dist.dist)), s1, s2, 1 - (1 - max_dist) / scale))
        return min(score, score_partial, score_sort, score_set)
    else
        score_sort = 1 - unbase_scale *
            (1 - evaluate(TokenSort(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
        max_dist = min(max_dist, score_sort)
        score_set = 1 - unbase_scale *
            (1 - evaluate(TokenSet(dist.dist), s1, s2, 1 - (1 - max_dist) / unbase_scale))
        return min(score, score_sort, score_set)
    end
end
|
11
src/qgram.jl
11
src/qgram.jl
|
@ -18,12 +18,15 @@ Base.eltype(qgram::QGramIterator{S}) where {S <: AbstractString} = SubString{S}
|
|||
|
||||
|
||||
#q-grams of AbstractVector
|
||||
# Alternatively, I could also use partition in IterTools but it creates a vector for each iteration
|
||||
# so it does not seem to be worth it.
|
||||
# Iterate over the q-grams of a vector as views of `q` consecutive elements.
# `state` is the index of the first element of the next q-gram; iteration
# stops once a full q-gram no longer fits before `lastindex`.
function Base.iterate(qgram::QGramIterator{<: AbstractVector}, state = firstindex(qgram.s))
    stop = state + qgram.q - 1
    if stop > lastindex(qgram.s)
        return nothing
    end
    return view(qgram.s, state:stop), state + 1
end

# The element type is taken from the first q-gram actually produced.
function Base.eltype(qgram::QGramIterator{<: AbstractVector})
    return typeof(first(qgram))
end
|
||||
|
||||
|
||||
"""
|
||||
Return an iterator on the q-gram of a string
|
||||
|
||||
|
@ -120,7 +123,7 @@ struct Cosine <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Cosine, s1, s2)
|
||||
function evaluate(dist::Cosine, s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
norm1, norm2, prodnorm = 0, 0, 0
|
||||
|
@ -147,7 +150,7 @@ struct Jaccard <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Jaccard, s1, s2)
|
||||
function evaluate(dist::Jaccard, s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
|
@ -174,7 +177,7 @@ struct SorensenDice <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::SorensenDice, s1, s2)
|
||||
function evaluate(dist::SorensenDice, s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
|
@ -201,7 +204,7 @@ struct Overlap <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Overlap, s1, s2)
|
||||
function evaluate(dist::Overlap, s1, s2, max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
itr = values(count_map(qgrams(s1, dist.q), qgrams(s2, dist.q)))
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
|
|
29
src/utils.jl
29
src/utils.jl
|
@ -21,8 +21,6 @@ function reorder(s1, s2)
|
|||
(length(s1) <= length(s2)) ? (s1, s2) : (s2, s1)
|
||||
end
|
||||
|
||||
|
||||
|
||||
function common_prefix(s1, s2)
|
||||
x1 = iterate(s1)
|
||||
x2 = iterate(s2)
|
||||
|
@ -36,4 +34,29 @@ function common_prefix(s1, s2)
|
|||
x2 = iterate(s2, state2)
|
||||
end
|
||||
return l, x1, x2
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
|
||||
# Generic iterators: lazily keep the first `n` elements.
_take(s, n::Integer) = Base.Iterators.take(s, n)

# Strings: return the first `n` characters as a `SubString`. `nextind(s, 0, n)`
# is the index of the n-th character, so multi-byte characters are handled.
function _take(s::AbstractString, n::Integer)
    stop = nextind(s, 0, n)
    return SubString(s, firstindex(s), stop)
end
|
||||
|
||||
# Generic iterators: lazily skip the first `n` elements.
_drop(s, n::Integer) = Base.Iterators.drop(s, n)

# Strings: return everything after the first `n` characters as a `SubString`.
# `nextind(s, 0, n + 1)` is the index of the first character that is kept.
function _drop(s::AbstractString, n::Integer)
    start = nextind(s, 0, n + 1)
    return SubString(s, start, lastindex(s))
end
|
||||
|
||||
# Generic iterators: lazily skip the first `n1` elements, then keep the next
# `n2 - n1`, i.e. the elements at positions `n1 + 1` through `n2`.
function _slice(s, n1::Integer, n2::Integer)
    tail = Base.Iterators.drop(s, n1)
    return Base.Iterators.take(tail, n2 - n1)
end

# Strings: return characters `n1 + 1` through `n2` as a `SubString`, using
# character counts rather than byte indices.
function _slice(s::AbstractString, n1::Integer, n2::Integer)
    start = nextind(s, 0, n1 + 1)
    stop = nextind(s, 0, n2)
    return SubString(s, start, stop)
end
|
||||
|
||||
|
||||
|
|
|
@ -24,13 +24,13 @@ using StringDistances, Test
|
|||
compare("aüa", "aua", DamerauLevenshtein())
|
||||
|
||||
# Winkler
|
||||
@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.9611 atol = 1e-4
|
||||
@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.84 atol = 1e-4
|
||||
@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.81333 atol = 1e-4
|
||||
@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.975 atol = 1e-4
|
||||
@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
|
||||
@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
|
||||
@test compare("martha", "marhta", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.9611 atol = 1e-4
|
||||
@test compare("dwayne", "duane", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.84 atol = 1e-4
|
||||
@test compare("dixon", "dicksonx", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.81333 atol = 1e-4
|
||||
@test compare("william", "williams", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.975 atol = 1e-4
|
||||
@test compare("", "foo", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.0 atol = 1e-4
|
||||
@test compare("a", "a", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("abc", "xyz", Winkler(Jaro(), p = 0.1, threshold = 0.0, maxlength = 4)) ≈ 0.0 atol = 1e-4
|
||||
|
||||
# RatcliffObershelp
|
||||
@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0
|
||||
|
|
Loading…
Reference in New Issue