rmv datastructures + add docs

pull/22/head v0.5.2
matthieugomez 2019-12-13 10:33:06 -05:00
parent 8f9ab747a4
commit a575eeab6a
9 changed files with 141 additions and 72 deletions


@ -1,15 +1,13 @@
name = "StringDistances"
uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.5.1"
version = "0.5.2"
[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
[compat]
julia = "1"
DataStructures = "0.14, 0.15, 0.16, 0.17"
Distances = "0.2, 0.3, 0.4, 0.4, 0.6, 0.7, 0.8"
Distances = "0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8"
[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"


@ -1,7 +1,7 @@
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)
This Julia package computes various distances between `AbstractString`s
This Julia package computes various distances between AbstractStrings
## Installation
The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add StringDistances`.
@ -46,7 +46,7 @@ compare("martha", "marhta", TokenSet(Jaro()))
compare("martha", "marhta", TokenMax(RatcliffObershelp()))
```
In case the word order does not matter, a good distance is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).
A good distance to match strings composed of multiple words (like addresses) is `TokenMax(Levenshtein())` (see [fuzzywuzzy](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/)).
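For example, a minimal sketch (the address strings below are made up purely for illustration):
```julia
using StringDistances

s1 = "5 Avenue Anatole France Paris"
s2 = "Paris 5 Avenue Anatole France"
compare(s1, s2, Levenshtein())            # penalized by the different word order
compare(s1, s2, TokenMax(Levenshtein()))  # word order no longer drives the score down
```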
## Find
- `findmax` returns the element of `itr` with the highest similarity score with `s`, along with its index.
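A minimal sketch of a call (the city names are only illustrative and mirror the docstring example further down):
```julia
using StringDistances

findmax("Newark", ["New York", "Princeton", "San Francisco"], Levenshtein())
# ("New York", 1)
findmax("Newark", ["New York", "Princeton", "San Francisco"], Levenshtein(); min_score = 0.9)
# (nothing, nothing), since no candidate reaches min_score
```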


@ -2,7 +2,6 @@ module StringDistances
using Distances
import Distances: evaluate, result_type
using DataStructures # for SortedSet in TokenSort
##############################################################################
##


@ -1,8 +1,14 @@
"""
compare(s1::AbstractString, s2::AbstractString, dist::StringDistance)
compare returns a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the distance `dist`
return a similarity score between 0 and 1 for the strings `s1` and
`s2` based on the `StringDistance` `dist`
### Examples
```julia-repl
julia> compare("martha", "marhta", Levenshtein())
0.6666666666666667
```
"""
function compare(s1::AbstractString, s2::AbstractString,
dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
@ -38,46 +44,56 @@ function compare(s1::AbstractString, s2::AbstractString,
end
"""
Winkler(dist::StringDistance, p::Real = 0.1, boosting_threshold::Real = 0.7, l::Integer = 4)
Winkler(dist::StringDistance; p::Real = 0.1, threshold::Real = 0.7, maxlength::Integer = 4)
Winkler is a `StringDistance` modifier that boosts the similarity score between
two strings by a scale `p` when the strings share a common prefix with length lower
than `l` (the boost is only applied when the similarity score is above `boosting_threshold`)
Creates the `Winkler{dist, p, threshold, maxlength}` distance
`Winkler{dist, p, threshold, maxlength}` modifies the string distance `dist` to boost the
similarity score between two strings, when their original similarity score is above some `threshold`.
The boost is equal to `min(l, maxlength) * p * (1 - score)` where `l` denotes the
length of their common prefix and `score` denotes the original score
"""
struct Winkler{T1 <: StringDistance, T2 <: Real, T3 <: Real, T4 <: Integer} <: StringDistance
dist::T1
p::T2 # scaling factor. Default to 0.1
boosting_threshold::T3 # boost threshold. Default to 0.7
l::Integer # length of common prefix. Default to 4
function Winkler(dist::T1, p::T2, boosting_threshold::T3, l::T4) where {T1, T2, T3, T4}
p * l >= 1 && throw("scaling factor times length of common prefix must be lower than one")
new{T1, T2, T3, T4}(dist, p, boosting_threshold, l)
end
struct Winkler{S <: StringDistance} <: StringDistance
dist::S
p::Float64 # scaling factor. Default to 0.1
threshold::Float64 # boost threshold. Default to 0.7
maxlength::Integer # max length of common prefix. Default to 4
end
function Winkler(dist::StringDistance; p = 0.1, threshold = 0.7, maxlength = 4)
p * maxlength <= 1 || throw(ArgumentError("scaling factor times maxlength of common prefix must be at most one"))
Winkler(dist, p, threshold, maxlength)
end
Winkler(x) = Winkler(x, 0.1, 0.7, 4)
# min_score is hard to propagate here because the boost may or may not be applied at the end
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler; min_score = 0.0)
l = remove_prefix(s1, s2, dist.l)[1]
# cannot do min_score because of boosting threshold
score = compare(s1, s2, dist.dist)
if score >= dist.boosting_threshold
score += l * dist.p * (1 - score)
if score >= dist.threshold
l = common_prefix(s1, s2)[1]
score += min(l, dist.maxlength) * dist.p * (1 - score)
end
return score
end
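# Illustrative sketch (not part of the original source): for "martha" and "marhta" the
# Jaro score is about 0.944 and the common prefix "mar" has length 3, so with the defaults
# p = 0.1, threshold = 0.7 and maxlength = 4 the boosted score is roughly
#   0.944 + min(3, 4) * 0.1 * (1 - 0.944) ≈ 0.961
# i.e. compare("martha", "marhta", Winkler(Jaro())) ≈ 0.961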
JaroWinkler() = Winkler(Jaro(), 0.1, 0.7)
"""
Partial(dist::StringDistance)
Partial is a `StringDistance` modifier that returns the maximal similarity score
between the shorter string and substrings of the longer string
Creates the `Partial{dist}` distance
`Partial{dist}` modifies the string distance `dist` to return the
maximal similarity score between the shorter string and substrings of the longer string
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, Partial(RatcliffObershelp()))
0.4516129032258065
```
"""
struct Partial{T <: StringDistance} <: StringDistance
dist::T
struct Partial{S <: StringDistance} <: StringDistance
dist::S
end
function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_score = 0.0)
@ -121,8 +137,19 @@ end
"""
TokenSort(dist::StringDistance)
TokenSort is a `StringDistance` modifier that adjusts for differences in word orders
by reordering words alphabetically.
Creates the `TokenSort{dist}` distance
`TokenSort{dist}` modifies the string distance `dist` to adjust for differences
in word orders by reordering words alphabetically.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s1 = "New York Mets vs Atlanta Braves"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, TokenSort(RatcliffObershelp()))
1.0
```
"""
struct TokenSort{T <: StringDistance} <: StringDistance
dist::T
@ -139,8 +166,18 @@ end
"""
TokenSet(dist::StringDistance)
TokenSet is a `StringDistance` modifier that adjusts for differences in word orders
and word numbers by comparing the intersection of two strings with each string.
Creates the `TokenSet{dist}` distance
`TokenSet{dist}` modifies the string distance `dist` to adjust for differences
in word orders and word numbers, by comparing the intersection of two strings with each string.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, TokenSet(RatcliffObershelp()))
1.0
```
"""
struct TokenSet{T <: StringDistance} <: StringDistance
dist::T
@ -148,8 +185,8 @@ end
# http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = 0.0)
v1 = SortedSet(split(s1))
v2 = SortedSet(split(s2))
v1 = unique!(sort!(split(s1)))
v2 = unique!(sort!(split(s2)))
v0 = intersect(v1, v2)
s0 = join(v0, " ")
s1 = join(v1, " ")
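# Illustrative sketch (not part of the original source): with the docstring example above,
# s1 = "New York Mets vs Atlanta" and s2 = "Atlanta Braves vs New York Mets", one gets
#   s0 = "Atlanta Mets New York vs"          (shared words, sorted)
#   s1 = "Atlanta Mets New York vs"
#   s2 = "Atlanta Braves Mets New York vs"
# and comparing these strings with `dist` yields 1.0, as the docstring example shows.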
@ -167,12 +204,22 @@ end
"""
TokenMax(dist::StringDistance)
TokenMax is a `StringDistance` modifier that combines similarity scores using the base
distance, its Partial, TokenSort and TokenSet modifiers, with penalty terms depending on
string lengths.
Creates the `TokenMax{dist}` distance
`TokenMax{dist}` combines similarity scores of the base distance `dist`,
its [`Partial`](@ref) modifier, its [`TokenSort`](@ref) modifier, and its
[`TokenSet`](@ref) modifier, with penalty terms depending on string lengths.
### Examples
```julia-repl
julia> s1 = "New York Mets vs Atlanta"
julia> s2 = "Atlanta Braves vs New York Mets"
julia> compare(s1, s2, TokenMax(RatcliffObershelp()))
0.95
```
"""
struct TokenMax{T <: StringDistance} <: StringDistance
dist::T
struct TokenMax{S <: StringDistance} <: StringDistance
dist::S
end
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0)


@ -89,7 +89,7 @@ function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; max
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
k, x1, x2start = common_prefix(s1, s2)
x1 == nothing && return len2 - k
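# e.g. (illustrative) "Newark" and "New York" share the prefix "New", so k = 3 and the
# dynamic-programming pass below only runs on "ark" vs " York"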
# distance initialized to first row of matrix
# => distance between "" and s2[1:i]
@ -141,7 +141,7 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
# prefix common to both strings can be ignored
k, x1, x2start = remove_prefix(s1, s2)
k, x1, x2start = common_prefix(s1, s2)
(x1 == nothing) && return len2 - k
v0 = collect(1:(len2 - k))
v2 = similar(v0)


@ -5,16 +5,28 @@
highest similarity score with `s` according to the distance `dist`.
It returns `(nothing, nothing)` if none of the elements has a similarity score
higher than or equal to `min_score` (which defaults to 0.0).
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
(as well as their modifications via [`Partial`](@ref), [`TokenSort`](@ref), [`TokenSet`](@ref), or [`TokenMax`](@ref)).
### Examples
```julia-repl
julia> using StringDistances
julia> s = ""Newark"
julia> iter = ["New York", "Princeton", "San Francisco"]
julia> findmax(s, iter, Levenshtein())
("NewYork", 1)
julia> findmax(s, iter, Levenshtein(); min_score = 0.9)
(nothing, nothing)
```
"""
function Base.findmax(s::AbstractString, itr, dist::StringDistance; min_score = 0.0)
min_score = Threads.Atomic{typeof(min_score)}(min_score)
min_score_atomic = Threads.Atomic{typeof(min_score)}(min_score)
scores = [0.0 for _ in 1:Threads.nthreads()]
is = [0 for _ in 1:Threads.nthreads()]
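# The atomic holds the best score found so far across all threads; passing it as min_score
# lets compare discard candidates that cannot beat the current best early.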
Threads.@threads for i in collect(keys(itr))
score = compare(s, itr[i], dist; min_score = min_score[])
score_old = Threads.atomic_max!(min_score, score)
score = compare(s, itr[i], dist; min_score = min_score_atomic[])
score_old = Threads.atomic_max!(min_score_atomic, score)
if score >= score_old
scores[Threads.threadid()] = score
is[Threads.threadid()] = i
@ -30,8 +42,21 @@ end
`findall` returns the vector of indices for elements of `itr` that have a
similarity score higher than or equal to `min_score` according to the distance `dist`.
If there are no such elements, it returns an empty array.
The function is optimized for `Levenshtein` and `DamerauLevenshtein` distances
It is particularly optimized for [`Levenshtein`](@ref) and [`DamerauLevenshtein`](@ref) distances
(as well as their modifications via `Partial`, `TokenSort`, `TokenSet`, or `TokenMax`).
### Examples
```julia-repl
julia> using StringDistances
julia> s = "Newark"
julia> iter = ["Newwark", "Princeton", "San Francisco"]
julia> findall(s, iter, Levenshtein())
1-element Array{Int64,1}:
1
julia> findall(s, iter, Levenshtein(); min_score = 0.9)
0-element Array{Int64,1}
```
"""
function Base.findall(s::AbstractString, itr, dist::StringDistance; min_score = 0.8)
out = [Int[] for _ in 1:Threads.nthreads()]


@ -48,7 +48,7 @@ abstract type QGramDistance <: StringDistance end
function evaluate(dist::QGramDistance, s1::AbstractString, s2::AbstractString)
x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
evaluate(dist, x)
evaluate(dist, values(x))
end
# For two iterators x1 and x2, this returns a dictionary which, for each element in x1 or x2,
@ -98,9 +98,9 @@ struct QGram <: QGramDistance
q::Int
end
function evaluate(dist::QGram, count_dict)
function evaluate(dist::QGram, itr)
n = 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
n += abs(n1 - n2)
end
n
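# Illustrative sketch (not part of the original source): with q = 2, "night" has the
# qgrams ni, ig, gh, ht and "nacht" has na, ac, ch, ht. Only "ht" appears in both, so six
# (n1, n2) pairs contribute 1 each and evaluate(QGram(2), "night", "nacht") == 6.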
@ -122,9 +122,9 @@ struct Cosine <: QGramDistance
q::Int
end
function evaluate(dist::Cosine, count_dict)
function evaluate(dist::Cosine, itr)
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
norm1 += n1^2
norm2 += n2^2
prodnorm += n1 * n2
@ -147,9 +147,9 @@ struct Jaccard <: QGramDistance
q::Int
end
function evaluate(dist::Jaccard, count_dict)
function evaluate(dist::Jaccard, itr)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
@ -172,9 +172,9 @@ struct SorensenDice <: QGramDistance
q::Int
end
function evaluate(dist::SorensenDice, count_dict)
function evaluate(dist::SorensenDice, itr)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
@ -197,9 +197,9 @@ struct Overlap <: QGramDistance
q::Int
end
function evaluate(dist::Overlap, count_dict)
function evaluate(dist::Overlap, itr)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in values(count_dict)
for (n1, n2) in itr
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)


@ -1,6 +1,5 @@
# String with Length
# This allows the length to be computed once and only once
struct StringWithLength{T<:AbstractString} <: AbstractString
# This type allows the length to be computed once and for all
struct StringWithLength{T <: AbstractString} <: AbstractString
s::T
l::Int
end
@ -21,19 +20,17 @@ function reorder(s1::AbstractString, s2::AbstractString)
end
end
## Find the length of the common prefix of two strings
function remove_prefix(s1::AbstractString, s2::AbstractString, lim::Integer = -1)
l = 0
function common_prefix(s1::AbstractString, s2::AbstractString)
x1 = iterate(s1)
x2 = iterate(s2)
while (x1 !== nothing) & (x2 !== nothing) & (l < lim || lim < 0)
l = 0
while (x1 !== nothing) & (x2 !== nothing)
ch1, state1 = x1
ch2, state2 = x2
ch1 != ch2 && break
l += 1
x1 = iterate(s1, state1)
x2 = iterate(s2, state2)
l += 1
end
return l, x1, x2
end
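# Illustrative sketch (not part of the original source):
#   common_prefix("martha", "marhta") returns (3, x1, x2) since "mar" is shared;
#   x1 and x2 are the iterate results at the first mismatching characters ('t' vs 'h').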


@ -97,6 +97,9 @@ using StringDistances, Test
# check findmax and findall
@test findmax("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ("NewYork", 1)
@test findmax("New York", ["San Francisco", "NewYork", "Newark"], Levenshtein()) == ("NewYork", 2)
@test findmax("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3)
@test findmax("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein(); min_score = 0.99) == (nothing, nothing)
@test findmax("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]