rmv datastructures + add docs

pull/22/head v0.5.2
matthieugomez 2019-12-13 10:33:06 -05:00
parent 8f9ab747a4
commit a575eeab6a
9 changed files with 141 additions and 72 deletions


@ -1,15 +1,13 @@
name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.5.1"
version = "0.5.2"
[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
[compat]
julia = "1"
DataStructures = "0.14, 0.15, 0.16, 0.17"
Distances = "0.2, 0.3, 0.4, 0.4, 0.6, 0.7, 0.8"
Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"


@ -1,7 +1,7 @@
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)
This Julia package computes various distances between `AbstractString`s
This Julia package computes various distances between AbstractStrings
## Installation
The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`.
@ -46,7 +46,7 @@ compare("martha", "marhta", TokenSet(Jaro()))
compare("martha", "marhta", TokenMax(RatcliffObershelp()))
```
In case the word order does not matter, a good distance is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).
A good distance to match strings composed of multiple words (like addresses) is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).
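For example, a minimal sketch (the address strings below are made up purely for illustration):
```julia
using StringDistances

s1 = "5 Avenue Anatole France Paris"
s2 = "Paris 5 Avenue Anatole France"
compare(s1, s2, Levenshtein())            # penalized by the different word order
compare(s1, s2, TokenMax(Levenshtein()))  # word order no longer drives the score down
```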
## Find
- `findmax` returns the element of `itr` with the highest similarity score with `s`, along with its index.
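A minimal sketch of a call (the city names are only illustrative and mirror the docstring example further down):
```julia
using StringDistances

findmax("Newark", ["New York", "Princeton", "San Francisco"], Levenshtein())
# ("New York", 1)
findmax("Newark", ["New York", "Princeton", "San Francisco"], Levenshtein(); min_score = 0.9)
# (nothing, nothing), since no candidate reaches min_score
```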


@ -2,7 +2,6 @@ module StringDistances
using Distances
import Distances: evaluate, result_type
using DataStructures # for SortedSet in TokenSort
##############################################################################
##


@ -1,8 +1,14 @@
"""
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
compare returns a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the distance `dist`
return a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the `StringDistance` `dist`
### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1::AbstractString, s2::AbstractString,
dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
@ -38,46 +44,56 @@ function compare(s1::AbstractString, s2::AbstractString,
end
"""
Winkler(dist::StringDistance, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4)
Winkler(dist::StringDistance; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
Winkler is a `StringDistance` modifier that boosts the similarity score between
two strings by a scale `p` when the strings share a common prefix with length lower
than `l` (the boost is only applied when the similarity score is above `boosting_threshold`)
Creates the `Winkler{dist, p, threshold, maxlength}` distance
`Winkler{dist, p, threshold, maxlength}` modifies the string distance `dist` to boost the
similarity score between two strings, when their original similarity score is above some `threshold`.
The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the
length of their common prefix and `score` denotes the original score
"""
struct Winkler{T1 <: StringDistance, T2 <: Real, T3 <: Real, T4 <: Integer} <: StringDistance
dist::T1
p::T2 # scaling factor. Default to 0.1
boosting_threshold::T3 # boost threshold. Default to 0.7
l::Integer # length of common prefix. Default to 4
function Winkler(dist::T1, p::T2, boosting_threshold::T3, l::T4) where {T1, T2, T3, T4}
p * l >= 1 && throw("scaling factor times length of common prefix must be lower than one")
new{T1, T2, T3, T4}(dist, p, boosting_threshold, l)
end
struct Winkler{S <: StringDistance} <: StringDistance
dist::S
p::Float64 # scaling factor. Default to 0.1
threshold::Float64 # boost threshold. Default to 0.7
maxlength::Integer # max length of common prefix. Default to 4
end
function Winkler(dist::StringDistance; p = 0.1, threshold = 0.7, maxlength = 4)
p * maxlength <= 1 || throw(ArgumentError("scaling factor times maxlength of common prefix must be at most one"))
Winkler(dist, p, threshold, maxlength)
end
Winkler(x) = Winkler(x, 0.1, 0.7, 4)
# min_score is hard to propagate here because the boost may or may not be applied at the end
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_score = 0.0)
l = remove_prefix(s1, s2, dist.l)[1]
# cannot do min_score because of boosting threshold
score = compare(s1, s2, dist.dist)
if score >= dist.boosting_threshold
score += l * dist.p * (1 - score)
if score >= dist.threshold
l = common_prefix(s1, s2)[1]
score += min(l, dist.maxlength) * dist.p * (1 - score)
end
return score
end
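# Illustrative sketch (not part of the original source): for "martha" and "marhta" the
# Jaro score is about 0.944 and the common prefix "mar" has length 3, so with the defaults
# p = 0.1, threshold = 0.7 and maxlength = 4 the boosted score is roughly
#   0.944 + min(3, 4) * 0.1 * (1 - 0.944) ≈ 0.961
# i.e. compare("martha", "marhta", Winkler(Jaro())) ≈ 0.961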
JaroWinkler() = Winkler(Jaro(), 0.1, 0.7)
"""
Partial(dist::StringDistance)
Partial is a `StringDistance` modifier that returns the maximal similarity score
between the shorter string and substrings of the longer string
Creates the `Partial{dist}` distance
`Partial{dist}` modifies the string distance `dist` to return the
maximal similarity score between the shorter string and substrings of the longer string
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, Partial(RatcliffObershelp()))
0.4516129032258065
```
"""
struct Partial{T <: StringDistance} <: StringDistance
dist::T
struct Partial{S <: StringDistance} <: StringDistance
dist::S
end
function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_score = 0.0)
@ -121,8 +137,19 @@ end
"""
TokenSort(dist::StringDistance)
TokenSort is a `StringDistance` modifier that adjusts for differences in word orders
by reordering words alphabetically.
Creates the `TokenSort{dist}` distance
`TokenSort{dist}` modifies the string distance `dist` to adjust for differences
in word orders by reordering words alphabetically.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
1.0
```
"""
struct TokenSort{T <: StringDistance} <: StringDistance
dist::T
@ -139,8 +166,18 @@ end
"""
TokenSet(dist::StringDistance)
TokenSet is a `StringDistance` modifier that adjusts for differences in word orders
and word numbers by comparing the intersection of two strings with each string.
Creates the `TokenSet{dist}` distance
`TokenSet{dist}` modifies the string distance `dist` to adjust for differences
in word orders and word numbers, by comparing the intersection of two strings with each string.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
1.0
```
"""
struct TokenSet{T <: StringDistance} <: StringDistance
dist::T
@ -148,8 +185,8 @@ end
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = 0.0)
v1 = SortedSet(split(s1))
v2 = SortedSet(split(s2))
v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2)))
v0 = intersect(v1, v2)
s0 = join(v0, " ")
s1 = join(v1, " ")
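# Illustrative sketch (not part of the original source): with the docstring example above,
# s1 = "New York Mets vs Atlanta" and s2 = "Atlanta Braves vs New York Mets", one gets
#   s0 = "Atlanta Mets New York vs"          (shared words, sorted)
#   s1 = "Atlanta Mets New York vs"
#   s2 = "Atlanta Braves Mets New York vs"
# and comparing these strings with `dist` yields 1.0, as the docstring example shows.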
@ -167,12 +204,22 @@ end
"""
TokenMax(dist::StringDistance)
TokenMax is a `StringDistance` modifier that combines similarity scores using the base
distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on
string lengths.
Creates the `TokenMax{dist}` distance
`TokenMax{dist}` combines similarity scores of the base distance `dist`,
its [`Partial`](@ref) modifier, its [`TokenSort`](@ref) modifier, and its
[`TokenSet`](@ref) modifier, with penalty terms depending on string lengths.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
0.95
```
"""
struct TokenMax{T <: StringDistance} <: StringDistance
dist::T
struct TokenMax{S <: StringDistance} <: StringDistance
dist::S
end
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0)


@ -89,7 +89,7 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
k, x1, x2start = common_prefix(s1, s2)
x1 == nothing && return len2 - k
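# e.g. (illustrative) "Newark" and "New York" share the prefix "New", so k = 3 and the
# dynamic-programming pass below only runs on "ark" vs " York"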
# distance initialized to first row of matrix
# => distance between "" and s2[1:i]
@ -141,7 +141,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
k, x1, x2start = common_prefix(s1, s2)
(x1 == nothing) && return len2 - k
v0 = collect(1:(len2 - k))
v2 = similar(v0)


@ -5,16 +5,28 @@
highest similarity score with `s` according to the distance `dist`.
It returns `(nothing, nothing)` if none of the elements has a similarity score
higher than or equal to `min_score` (which defaults to 0.0).
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
### Examples
```julia-repl
julia> using StringDistances
julia> s = ""Newark"
julia> iter = ["New York", "Princeton", "San Francisco"]
julia> findmax(s, iter, Levenshtein())
("NewYork", 1)
julia> findmax(s, iter, Levenshtein(); min_score = 0.9)
(nothing, nothing)
```
"""
function Base.findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0)
min_score = Threads.Atomic{typeof(min_score)}(min_score)
min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score)
scores = [0.0 for _ in 1:Threads.nthreads()]
is = [0 for _ in 1:Threads.nthreads()]
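# The atomic holds the best score found so far across all threads; passing it as min_score
# lets compare discard candidates that cannot beat the current best early.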
Threads.@threads for i in collect(keys(itr))
score = compare(s, itr[i], dist; min_score = min_score[])
score_old = Threads.atomic_max!(min_score, score)
score = compare(s, itr[i], dist; min_score = min_score_atomic[])
score_old = Threads.atomic_max!(min_score_atomic, score)
if score >= score_old
scores[Threads.threadid()] = score
is[Threads.threadid()] = i
@ -30,8 +42,21 @@ end
`findall` returns the vector of indices for elements of `itr` that have a
similarity score higher than or equal to `min_score` according to the distance `dist`.
If there are no such elements, it returns an empty array.
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
### Examples
```julia-repl
julia> using StringDistances
julia> s = "Newark"
julia> iter = ["Newwark", "Princeton", "San Francisco"]
julia> findall(s, iter, Levenshtein())
1-element Array{Int64,1}:
1
julia> findall(s, iter, Levenshtein(); min_score = 0.9)
0-element Array{Int64,1}
```
"""
function Base.findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8)
out = [Int[] for _ in 1:Threads.nthreads()]


@ -48,7 +48,7 @@ abstract type QGramDistance <: StringDistance end
function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
evaluate(dist, x)
evaluate(dist, values(x))
end
# For two iterators x1 and x2, this returns a dictionary which, for each element in x1 or x2,
@ -98,9 +98,9 @@ struct QGram <: QGramDistance
q::Int
end
function evaluate(dist::QGram, count_dict)
function evaluate(dist::QGram, itr)
n = 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
n += abs(n1 - n2)
end
n
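# Illustrative sketch (not part of the original source): with q = 2, "night" has the
# qgrams ni, ig, gh, ht and "nacht" has na, ac, ch, ht. Only "ht" appears in both, so six
# (n1, n2) pairs contribute 1 each and evaluate(QGram(2), "night", "nacht") == 6.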
@ -122,9 +122,9 @@ struct Cosine <: QGramDistance
q::Int
end
function evaluate(dist::Cosine, count_dict)
function evaluate(dist::Cosine, itr)
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
norm1 += n1^2
norm2 += n2^2
prodnorm += n1 * n2
@ -147,9 +147,9 @@ struct Jaccard <: QGramDistance
q::Int
end
function evaluate(dist::Jaccard, count_dict)
function evaluate(dist::Jaccard, itr)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
@ -172,9 +172,9 @@ struct SorensenDice <: QGramDistance
q::Int
end
function evaluate(dist::SorensenDice, count_dict)
function evaluate(dist::SorensenDice, itr)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
@ -197,9 +197,9 @@ struct Overlap <: QGramDistance
q::Int
end
function evaluate(dist::Overlap, count_dict)
function evaluate(dist::Overlap, itr)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)


@ -1,6 +1,5 @@
# String with Length
# This allows the length to be computed once and only once
struct StringWithLength{T<:AbstractString} <: AbstractString
# This type allows the length to be computed once and for all
struct StringWithLength{T <: AbstractString} <: AbstractString
s::T
l::Int
end
@ -21,19 +20,17 @@ function reorder(s1::AbstractString, s2::AbstractString)
end
end
## Find the length of the common prefix of two strings
function remove_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
l = 0
function common_prefix(s1::AbstractString, s2::AbstractString)
x1 = iterate(s1)
x2 = iterate(s2)
while (x1 !== nothing) & (x2 !== nothing) & (l < lim || lim < 0)
l = 0
while (x1 !== nothing) & (x2 !== nothing)
ch1, state1 = x1
ch2, state2 = x2
ch1 != ch2 && break
l += 1
x1 = iterate(s1, state1)
x2 = iterate(s2, state2)
l += 1
end
return l, x1, x2
end
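# Illustrative sketch (not part of the original source):
#   common_prefix("martha", "marhta") returns (3, x1, x2) since "mar" is shared;
#   x1 and x2 are the iterate results at the first mismatching characters ('t' vs 'h').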


@ -97,6 +97,9 @@ using StringDistances, Test
# check findmax and findall
@test findmax("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ("NewYork", 1)
@test findmax("New York", ["San Francisco", "NewYork", "Newark"], Levenshtein()) == ("NewYork", 2)
@test findmax("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3)
@test findmax("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.99) == (nothing, nothing)
@test findmax("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]