parent
8f9ab747a4
commit
a575eeab6a
|
@ -1,15 +1,13 @@
|
|||
name = "StringDistances"
|
||||
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
|
||||
version = "0.5.1"
|
||||
version = "0.5.2"
|
||||
|
||||
[deps]
|
||||
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
||||
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
|
||||
|
||||
[compat]
|
||||
julia = "1"
|
||||
DataStructures = "0.14, 0.15, 0.16, 0.17"
|
||||
Distances = "0.2, 0.3, 0.4, 0.4, 0.6, 0.7, 0.8"
|
||||
Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
|
||||
|
||||
[extras]
|
||||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
|
||||
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)
|
||||
|
||||
This Julia package computes various distances between `AbstractString`s
|
||||
This Julia package computes various distances between AbstractStrings
|
||||
|
||||
## Installation
|
||||
The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`.
|
||||
|
@ -46,7 +46,7 @@ compare("martha", "marhta", TokenSet(Jaro()))
|
|||
compare("martha", "marhta", TokenMax(RatcliffObershelp()))
|
||||
```
|
||||
|
||||
In case the word order does not matter, a good distance is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).
|
||||
A good distance to match strings composed of multiple words (like addresses) is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).
|
||||
|
||||
## Find
|
||||
- `findmax` returns the value and index of the element in `itr` with the highest similarity score with `s`. Its syntax is:
|
||||
|
|
|
@ -2,7 +2,6 @@ module StringDistances
|
|||
|
||||
using Distances
|
||||
import Distances: evaluate, result_type
|
||||
using DataStructures # for SortedSet in TokenSort
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
|
|
121
src/compare.jl
121
src/compare.jl
|
@ -1,8 +1,14 @@
|
|||
"""
|
||||
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
|
||||
|
||||
compare returns a similarity score between 0 and 1 for the strings `s1` and
|
||||
`s2` based on the distance `dist`
|
||||
return a similarity score between 0 and 1 for the strings `s1` and
|
||||
`s2` based on the `StringDistance` `dist`
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> compare("martha", "marhta", Levenshtein())
|
||||
0.6666666666666667
|
||||
```
|
||||
"""
|
||||
function compare(s1::AbstractString, s2::AbstractString,
|
||||
dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
|
||||
|
@ -38,46 +44,56 @@ function compare(s1::AbstractString, s2::AbstractString,
|
|||
end
|
||||
|
||||
"""
|
||||
Winkler(dist::StringDistance, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4)
|
||||
Winkler(dist::StringDistance; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
|
||||
|
||||
Winkler is a `StringDistance` modifier that boosts the similarity score between
|
||||
two strings by a scale `p` when the strings share a common prefix with length lower
|
||||
than `l` (the boost is only applied when the similarity score is above `boosting_threshold`)
|
||||
Creates the `Winkler{dist, p, threshold, maxlength}` distance
|
||||
|
||||
`Winkler{dist, p, threshold, maxlength}` modifies the string distance `dist` to boost the
|
||||
similarity score between two strings, when their original similarity score is above some `threshold`.
|
||||
The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the
|
||||
length of their common prefix and `score` denotes the original score
|
||||
"""
|
||||
struct Winkler{T1 <: StringDistance, T2 <: Real, T3 <: Real, T4 <: Integer} <: StringDistance
|
||||
dist::T1
|
||||
p::T2 # scaling factor. Default to 0.1
|
||||
boosting_threshold::T3 # boost threshold. Default to 0.7
|
||||
l::Integer # length of common prefix. Default to 4
|
||||
function Winkler(dist::T1, p::T2, boosting_threshold::T3, l::T4) where {T1, T2, T3, T4}
|
||||
p * l >= 1 && throw("scaling factor times length of common prefix must be lower than one")
|
||||
new{T1, T2, T3, T4}(dist, p, boosting_threshold, l)
|
||||
end
|
||||
struct Winkler{S <: StringDistance} <: StringDistance
|
||||
dist::S
|
||||
p::Float64 # scaling factor. Default to 0.1
|
||||
threshold::Float64 # boost threshold. Default to 0.7
|
||||
maxlength::Integer # max length of common prefix. Default to 4
|
||||
end
|
||||
|
||||
# Keyword constructor for `Winkler`: wraps the base distance `dist` together with
# the scaling factor `p`, the boost `threshold`, and the cap `maxlength` on the
# length of the common prefix that is rewarded.
# Throws when `p * maxlength > 1`.
function Winkler(dist::StringDistance; p = 0.1, threshold = 0.7, maxlength = 4)
    p * maxlength <= 1 || throw("scaling factor times maxlength of common prefix must be lower than one")
    # Forward the caller's values: hard-coding the defaults here would silently
    # discard any customised `p`, `threshold` or `maxlength`.
    # `p` and `threshold` are converted explicitly because the default positional
    # constructor of a parametric struct only accepts the declared `Float64` fields.
    return Winkler(dist, Float64(p), Float64(threshold), maxlength)
end
|
||||
Winkler(x) = Winkler(x, 0.1, 0.7, 4)
|
||||
|
||||
# hard to use min_score because of whether there is boost or not in the end
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_score = 0.0)
|
||||
l = remove_prefix(s1, s2, dist.l)[1]
|
||||
# cannot do min_score because of boosting threshold
|
||||
score = compare(s1, s2, dist.dist)
|
||||
if score >= dist.boosting_threshold
|
||||
score += l * dist.p * (1 - score)
|
||||
if score >= dist.threshold
|
||||
l = common_prefix(s1, s2)[1]
|
||||
score += min(l, dist.maxlength) * dist.p * (1 - score)
|
||||
end
|
||||
return score
|
||||
end
|
||||
|
||||
# Jaro distance with the Winkler prefix boost (p = 0.1, threshold = 0.7, prefix cap = 4).
# All four positional fields are passed: no three-argument `Winkler` method is defined.
JaroWinkler() = Winkler(Jaro(), 0.1, 0.7, 4)
|
||||
|
||||
|
||||
"""
|
||||
Partial(dist::StringDistance)
|
||||
|
||||
Partial is a `StringDistance` modifier that returns the maximal similarity score
|
||||
between the shorter string and substrings of the longer string
|
||||
Creates the `Partial{dist}` distance
|
||||
|
||||
`Partial{dist}` modifies the string distance `dist` to return the
|
||||
maximal similarity score between the shorter string and substrings of the longer string
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> s1 = "New York Mets vs Atlanta Braves"
|
||||
julia> s2 = "Atlanta Braves vs New York Mets"
|
||||
julia> compare(s1, s2, Partial(RatcliffObershelp()))
|
||||
0.4516129032258065
|
||||
```
|
||||
"""
|
||||
struct Partial{T <: StringDistance} <: StringDistance
|
||||
dist::T
|
||||
struct Partial{S <: StringDistance} <: StringDistance
|
||||
dist::S
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_score = 0.0)
|
||||
|
@ -121,8 +137,19 @@ end
|
|||
"""
|
||||
TokenSort(dist::StringDistance)
|
||||
|
||||
TokenSort is a `StringDistance` modifier that adjusts for differences in word orders
|
||||
by reordering words alphabetically.
|
||||
Creates the `TokenSort{dist}` distance
|
||||
|
||||
`TokenSort{dist}` modifies the string distance `dist` to adjust for differences
|
||||
in word orders by reordering words alphabetically.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> s1 = "New York Mets vs Atlanta Braves"
|
||||
julia> s1 = "New York Mets vs Atlanta Braves"
|
||||
julia> s2 = "Atlanta Braves vs New York Mets"
|
||||
julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
|
||||
1.0
|
||||
```
|
||||
"""
|
||||
struct TokenSort{T <: StringDistance} <: StringDistance
|
||||
dist::T
|
||||
|
@ -139,8 +166,18 @@ end
|
|||
"""
|
||||
TokenSet(dist::StringDistance)
|
||||
|
||||
TokenSet is a `StringDistance` modifier that adjusts for differences in word orders
|
||||
and word numbers by comparing the intersection of two strings with each string.
|
||||
Creates the `TokenSet{dist}` distance
|
||||
|
||||
`TokenSet{dist}` modifies the string distance `dist` to adjust for differences
|
||||
in word orders and word numbers, by comparing the intersection of two strings with each string.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> s1 = "New York Mets vs Atlanta"
|
||||
julia> s2 = "Atlanta Braves vs New York Mets"
|
||||
julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
|
||||
1.0
|
||||
```
|
||||
"""
|
||||
struct TokenSet{T <: StringDistance} <: StringDistance
|
||||
dist::T
|
||||
|
@ -148,8 +185,8 @@ end
|
|||
|
||||
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = 0.0)
|
||||
v1 = SortedSet(split(s1))
|
||||
v2 = SortedSet(split(s2))
|
||||
v1 = unique!(sort!(split(s1)))
|
||||
v2 = unique!(sort!(split(s2)))
|
||||
v0 = intersect(v1, v2)
|
||||
s0 = join(v0, " ")
|
||||
s1 = join(v1, " ")
|
||||
|
@ -167,12 +204,22 @@ end
|
|||
"""
|
||||
TokenMax(dist::StringDistance)
|
||||
|
||||
TokenMax is a `StringDistance` modifier that combines similarity scores using the base
|
||||
distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on
|
||||
string lengths.
|
||||
Creates the `TokenMax{dist}` distance
|
||||
|
||||
`TokenMax{dist}` combines similarity scores of the base distance `dist`,
|
||||
its [`Partial`](@ref) modifier, its [`TokenSort`](@ref) modifier, and its
|
||||
[`TokenSet`](@ref) modifier, with penalty terms depending on string lengths.
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> s1 = "New York Mets vs Atlanta"
|
||||
julia> s2 = "Atlanta Braves vs New York Mets"
|
||||
julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
|
||||
0.95
|
||||
```
|
||||
"""
|
||||
struct TokenMax{T <: StringDistance} <: StringDistance
|
||||
dist::T
|
||||
struct TokenMax{S <: StringDistance} <: StringDistance
|
||||
dist::S
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0)
|
||||
|
|
|
@ -89,7 +89,7 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max
|
|||
len1, len2 = length(s1), length(s2)
|
||||
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
||||
# prefix common to both strings can be ignored
|
||||
k, x1, x2start = remove_prefix(s1, s2)
|
||||
k, x1, x2start = common_prefix(s1, s2)
|
||||
x1 == nothing && return len2 - k
|
||||
# distance initialized to first row of matrix
|
||||
# => distance between "" and s2[1:i]
|
||||
|
@ -141,7 +141,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
|||
len1, len2 = length(s1), length(s2)
|
||||
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
||||
# prefix common to both strings can be ignored
|
||||
k, x1, x2start = remove_prefix(s1, s2)
|
||||
k, x1, x2start = common_prefix(s1, s2)
|
||||
(x1 == nothing) && return len2 - k
|
||||
v0 = collect(1:(len2 - k))
|
||||
v2 = similar(v0)
|
||||
|
|
37
src/find.jl
37
src/find.jl
|
@ -5,16 +5,28 @@
|
|||
highest similarity score with `s` according to the distance `dist`.
|
||||
It returns `(nothing, nothing)` if none of the elements has a similarity score
|
||||
higher or equal to `min_score` (default to 0.0).
|
||||
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances
|
||||
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
|
||||
|
||||
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
|
||||
(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> using StringDistances
|
||||
julia> s = "Newark"
|
||||
julia> iter = ["New York", "Princeton", "San Francisco"]
|
||||
julia> findmax(s, iter, Levenshtein())
|
||||
("New York", 1)
|
||||
julia> findmax(s, iter, Levenshtein(); min_score = 0.9)
|
||||
(nothing, nothing)
|
||||
```
|
||||
"""
|
||||
function Base.findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0)
|
||||
min_score = Threads.Atomic{typeof(min_score)}(min_score)
|
||||
min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score)
|
||||
scores = [0.0 for _ in 1:Threads.nthreads()]
|
||||
is = [0 for _ in 1:Threads.nthreads()]
|
||||
Threads.@threads for i in collect(keys(itr))
|
||||
score = compare(s, itr[i], dist; min_score = min_score[])
|
||||
score_old = Threads.atomic_max!(min_score, score)
|
||||
score = compare(s, itr[i], dist; min_score = min_score_atomic[])
|
||||
score_old = Threads.atomic_max!(min_score_atomic, score)
|
||||
if score >= score_old
|
||||
scores[Threads.threadid()] = score
|
||||
is[Threads.threadid()] = i
|
||||
|
@ -30,8 +42,21 @@ end
|
|||
`findall` returns the vector of indices for elements of `itr` that have a
|
||||
similarity score higher than or equal to `min_score` according to the distance `dist`.
|
||||
If there are no such elements, return an empty array.
|
||||
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances
|
||||
|
||||
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
|
||||
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
|
||||
|
||||
### Examples
|
||||
```julia-repl
|
||||
julia> using StringDistances
|
||||
julia> s = "Newark"
|
||||
julia> iter = ["Newwark", "Princeton", "San Francisco"]
|
||||
julia> findall(s, iter, Levenshtein())
|
||||
1-element Array{Int64,1}:
|
||||
1
|
||||
julia> findall(s, iter, Levenshtein(); min_score = 0.9)
|
||||
0-element Array{Int64,1}
|
||||
```
|
||||
"""
|
||||
function Base.findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8)
|
||||
out = [Int[] for _ in 1:Threads.nthreads()]
|
||||
|
|
22
src/qgram.jl
22
src/qgram.jl
|
@ -48,7 +48,7 @@ abstract type QGramDistance <: StringDistance end
|
|||
|
||||
function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
|
||||
x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
|
||||
evaluate(dist, x)
|
||||
evaluate(dist, values(x))
|
||||
end
|
||||
|
||||
# For two iterators x1 and x2, this returns a dictionary which, for each element in x1 or x2,
|
||||
|
@ -98,9 +98,9 @@ struct QGram <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::QGram, count_dict)
|
||||
function evaluate(dist::QGram, itr)
|
||||
n = 0
|
||||
for (n1, n2) in values(count_dict)
|
||||
for (n1, n2) in itr
|
||||
n += abs(n1 - n2)
|
||||
end
|
||||
n
|
||||
|
@ -122,9 +122,9 @@ struct Cosine <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Cosine, count_dict)
|
||||
function evaluate(dist::Cosine, itr)
|
||||
norm1, norm2, prodnorm = 0, 0, 0
|
||||
for (n1, n2) in values(count_dict)
|
||||
for (n1, n2) in itr
|
||||
norm1 += n1^2
|
||||
norm2 += n2^2
|
||||
prodnorm += n1 * n2
|
||||
|
@ -147,9 +147,9 @@ struct Jaccard <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Jaccard, count_dict)
|
||||
function evaluate(dist::Jaccard, itr)
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in values(count_dict)
|
||||
for (n1, n2) in itr
|
||||
ndistinct1 += n1 > 0
|
||||
ndistinct2 += n2 > 0
|
||||
nintersect += (n1 > 0) & (n2 > 0)
|
||||
|
@ -172,9 +172,9 @@ struct SorensenDice <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::SorensenDice, count_dict)
|
||||
function evaluate(dist::SorensenDice, itr)
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in values(count_dict)
|
||||
for (n1, n2) in itr
|
||||
ndistinct1 += n1 > 0
|
||||
ndistinct2 += n2 > 0
|
||||
nintersect += (n1 > 0) & (n2 > 0)
|
||||
|
@ -197,9 +197,9 @@ struct Overlap <: QGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Overlap, count_dict)
|
||||
function evaluate(dist::Overlap, itr)
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in values(count_dict)
|
||||
for (n1, n2) in itr
|
||||
ndistinct1 += n1 > 0
|
||||
ndistinct2 += n2 > 0
|
||||
nintersect += (n1 > 0) & (n2 > 0)
|
||||
|
|
15
src/utils.jl
15
src/utils.jl
|
@ -1,6 +1,5 @@
|
|||
# String with Length
|
||||
# This allows the length to be computed once and only once
|
||||
struct StringWithLength{T<:AbstractString} <: AbstractString
|
||||
# This type allows the length to be computed once and for all
|
||||
struct StringWithLength{T <: AbstractString} <: AbstractString
|
||||
s::T
|
||||
l::Int
|
||||
end
|
||||
|
@ -21,19 +20,17 @@ function reorder(s1::AbstractString, s2::AbstractString)
|
|||
end
|
||||
end
|
||||
|
||||
|
||||
## Find common prefixes (up to lim. -1 means Inf)
|
||||
function remove_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
|
||||
l = 0
|
||||
function common_prefix(s1::AbstractString, s2::AbstractString)
|
||||
x1 = iterate(s1)
|
||||
x2 = iterate(s2)
|
||||
while (x1 !== nothing) & (x2 !== nothing) & (l < lim || lim < 0)
|
||||
l = 0
|
||||
while (x1 !== nothing) & (x2 !== nothing)
|
||||
ch1, state1 = x1
|
||||
ch2, state2 = x2
|
||||
ch1 != ch2 && break
|
||||
l += 1
|
||||
x1 = iterate(s1, state1)
|
||||
x2 = iterate(s2, state2)
|
||||
l += 1
|
||||
end
|
||||
return l, x1, x2
|
||||
end
|
|
@ -97,6 +97,9 @@ using StringDistances, Test
|
|||
|
||||
# check findmax and findall
|
||||
@test findmax("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ("NewYork", 1)
|
||||
@test findmax("New York", ["San Francisco", "NewYork", "Newark"], Levenshtein()) == ("NewYork", 2)
|
||||
@test findmax("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3)
|
||||
|
||||
@test findmax("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.99) == (nothing, nothing)
|
||||
@test findmax("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
|
||||
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]
|
||||
|
|
Loading…
Reference in New Issue