clean

2019-08-18 12:52:37 -04:00 · 2019-08-18 12:52:37 -04:00 · 68702d8aa1
parent 6dc8056e37
commit 68702d8aa1
8 changed files with 124 additions and 116 deletions
--- a/README.md
+++ b/README.md
@ -20,10 +20,10 @@ compare("martha", "marhta", Hamming())
 ## Distances

 #### Edit Distances
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
 - [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()`
 - [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
+- [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()`
+- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
 - [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`


--- a/benchmark/.sublime2Terminal.jl
+++ b/benchmark/.sublime2Terminal.jl
@ -1 +1 @@
-@time f(Jaccard(2), x, y)
+@time f(Jaccard(2), x, y)
--- a/benchmark/benchmark.jl
+++ b/benchmark/benchmark.jl
@ -6,13 +6,20 @@ y = map(Random.randstring, rand(5:25,500_000))
 function f(t, x, y)
    [evaluate(t, x[i], y[i]) for i in 1:length(x)]
 end
+@time f(Hamming(), x, y)
+@time f(Jaro(), x, y)
+@time f(Levenshtein(), x, y)
+# 0.3s. A bit faster than StringDist
+@time f(DamerauLevenshtein(), x, y)
+@time f(RatcliffObershelp(), x, y)
+@time f(Jaccard(2), x, y)
+# 1.6s 2-3x slower compared to StringDist

 # a bist faster than StringDist
@time f(Levenshtein(), x, y)
 #  355.984 ms (1500004 allocations: 223.24 MiB)
@time f(RatcliffObershelp(), x, y)

-# 2-3x slower compared to StringDist
@time f(Jaccard(2), x, y)
 # 1.6s

--- a/src/StringDistances.jl
+++ b/src/StringDistances.jl
@ -27,16 +27,16 @@ Partial,
 TokenSort,
 TokenSet,
 TokenMax,
-qgram_iterator
+qgram

 ##############################################################################
 ##
 ## include
 ##
 ##############################################################################
-include("distances/utils.jl")
-include("distances/edit.jl")
-include("distances/qgram.jl")
+include("utils.jl")
+include("edit.jl")
+include("qgram.jl")
 include("compare.jl")

 end
--- a/src/compare.jl
+++ b/src/compare.jl
@ -45,17 +45,14 @@ Winkler is a `PreMetric` modifier that boosts the similarity score between two s
 """
 struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
    dist::T1
-    scaling_factor::T2      # scaling factor. Default to 0.1
+    scaling_factor::T2          # scaling factor. Default to 0.1
    boosting_threshold::T3      # boost threshold. Default to 0.7
 end
-
-# restrict to distance between 0 and 1
 Winkler(x) = Winkler(x, 0.1, 0.7)

 function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
    score = compare(s1, s2, dist.dist)
    l = common_prefix(s1, s2, 4)[1]
-    # common prefix adjustment
    if score >= dist.boosting_threshold
        score += l * dist.scaling_factor * (1 - score)
    end
@ -77,27 +74,24 @@ struct Partial{T <: PreMetric} <: PreMetric
    dist::T
 end

-# general
 function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
    s2, len2, s1, len1 = reorder(s1, s2)
    len1 == len2 && return compare(s1, s2, dist.dist)
    len1 == 0 && return compare("", "", dist.dist)
    out = 0.0
-    for x in qgram_iterator(s2, len1)
+    for x in qgram(s2, len1)
        curr = compare(s1, x, dist.dist)
        out = max(out, curr)
    end
    return out
 end

-# Specialization for RatcliffObershelp distance
-# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
 function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
    s2, len2, s1, len1 = reorder(s1, s2)
    len1 == len2 && return compare(s1, s2, dist.dist)
    out = 0.0
    for r in matching_blocks(s1, s2)
-        # here I difffer from fuzz.py by making sure the substring of s2 has length len1
+        # Make sure the substring of s2 has length len1
        s2_start = r[2] - r[1] + 1
        s2_end = s2_start + len1 - 1
        if s2_start <= 0
@ -183,9 +177,8 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
    dist0 = compare(s1, s2, dist.dist)
    s2, len2, s1, len1 = reorder(s1, s2)
    unbase_scale = 0.95
-    # if one string is much much shorter than the other
+    # if one string is much shorter than the other, use partial
    if len2 >= 1.5 * len1
-        # if strings are of dissimilar length, use partials
        partial = compare(s1, s2, Partial(dist.dist)) 
        ptsor = compare(s1, s2, TokenSort(Partial(dist.dist))) 
        ptser = compare(s1, s2, TokenSet(Partial(dist.dist))) 
--- a/src/distances/edit.jl
+++ b/src/distances/edit.jl
@ -12,10 +12,84 @@ function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
    return current
 end

+##############################################################################
+##
+## Jaro
+##
+##############################################################################
+"""
+    Jaro()
+
+Creates the Jaro metric
+
+The Jaro distance is defined as
+
+
+``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``
+
+where ``m`` is the number of matching characters and 
+``t`` is half the number of transpositions.
+"""
+struct Jaro <: SemiMetric end
+
+## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
+function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
+    s2, len2, s1, len1 = reorder(s1, s2)
+    # If both strings are empty, m = 0 and the Wikipedia formula would give a distance of 1.0; return 0.0 here so that this is not the case.
+    len2 == 0 && return 0.0
+    maxdist = max(0, div(len2, 2) - 1)
+    flag = fill(false, len2)
+    prevstate1 = firstindex(s1)
+    i1_match = prevstate1 * ones(Int, len1)
+    # m counts the number of matching characters
+    m = 0 
+    i1 = 1
+    i2 = 1
+    x1 = iterate(s1)
+    x2 = iterate(s2)
+    while x1 !== nothing
+        ch1, state1 = x1
+        if i2 <= i1 - maxdist - 1
+            ch2, state2 = x2
+            i2 += 1
+            x2 = iterate(s2, state2)
+        end 
+        i2curr = i2
+        x2curr = x2
+        while x2curr !== nothing
+            (i2curr > i1 + maxdist) && break
+            ch2, state2 = x2curr
+            if (ch1 == ch2) & !flag[i2curr] 
+                m += 1
+                flag[i2curr] = true
+                i1_match[m] = prevstate1
+                break
+            end
+            x2curr = iterate(s2, state2) 
+            i2curr += 1
+        end
+        x1 = iterate(s1, state1)
+        i1 += 1
+        prevstate1 = state1
+    end
+    m == 0 && return 1.0
+    # t counts number of transpositions
+    t = 0
+    i1 = 0
+    i2 = 0
+    for ch2 in s2
+        i2 += 1
+        if flag[i2]
+            i1 += 1
+            t += ch2 != iterate(s1, i1_match[i1])[1]
+        end
+    end
+    return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
+end
+
 ##############################################################################
 ##
 ## Levenshtein
-## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
 ##
 ##############################################################################
 """
@ -27,6 +101,7 @@ The Levenshtein distance is the minimum number of operations (consisting of inse
 """
 struct Levenshtein <: SemiMetric end

+## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
 function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
    s2, len2, s1, len1 = reorder(s1, s2)
    # prefix common to both strings can be ignored
@ -64,7 +139,6 @@ end
 ##############################################################################
 ##
 ## Damerau Levenshtein
-## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
 ##
 ##############################################################################
 """
@ -76,6 +150,7 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
 """
 struct DamerauLevenshtein <: SemiMetric end

+## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
 function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
    s2, len2, s1, len1 = reorder(s1, s2)
    # prefix common to both strings can be ignored
@ -132,80 +207,6 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
    return current
 end

-##############################################################################
-##
-## Jaro
-## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
-##
-##############################################################################
-"""
-    Jaro()
-
-Creates the Jaro metric
-
-The Jaro distance is defined as
-
-
-``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``
-
-where ``m`` is the number of matching characters and 
-``t`` is half the number of transpositions.
-"""
-struct Jaro <: SemiMetric end
-
-function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
-    s2, len2, s1, len1 = reorder(s1, s2)
-    # if both are empty, m = 0 so should be 1.0 according to wikipedia. Add this line so that not the case
-    len2 == 0 && return 0.0
-    maxdist = max(0, div(len2, 2) - 1)
-    flag = fill(false, len2)
-    prevstate1 = firstindex(s1)
-    i1_match = prevstate1 * ones(Int, len1)
-    #  m counts number matching characters
-    m = 0 
-    i1 = 1
-    i2 = 1
-    x1 = iterate(s1)
-    x2 = iterate(s2)
-    while x1 !== nothing
-        ch1, state1 = x1
-        if i2 <= i1 - maxdist - 1
-            ch2, state2 = x2
-            i2 += 1
-            x2 = iterate(s2, state2)
-        end 
-        i2curr = i2
-        x2curr = x2
-        while x2curr !== nothing
-            (i2curr > i1 + maxdist) && break
-            ch2, state2 = x2curr
-            if (ch1 == ch2) & !flag[i2curr] 
-                m += 1
-                flag[i2curr] = true
-                i1_match[m] = prevstate1
-                break
-            end
-            x2curr = iterate(s2, state2) 
-            i2curr += 1
-        end
-        x1 = iterate(s1, state1)
-        i1 += 1
-        prevstate1 = state1
-    end
-    m == 0 && return 1.0
-    # t counts number of transpotsitions
-    t = 0
-    i1 = 0
-    i2 = 0
-    for ch2 in s2
-        i2 += 1
-        if flag[i2]
-            i1 += 1
-            t += ch2 != iterate(s1, i1_match[i1])[1]
-        end
-    end
-    return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
-end

 ##############################################################################
 ##
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -32,27 +32,34 @@ Return an iterator that iterates on the QGram of the string
 ## Examples
 ```julia
 using StringDistances
-for x in qgram_iterator("hello", 2)
+for x in qgram("hello", 2)
 	@show x
 end
 ```
 """
-function qgram_iterator(s::AbstractString, q::Integer)
+function qgram(s::AbstractString, q::Integer)
 	QGramIterator{typeof(s)}(s, length(s), q)
 end

 ##############################################################################
 ##
-## For two iterators x1 x2, count_map(x1, x2) returns an iterator 
-## that returns,  for each element in union{x1, x2}, the numbers of 
-## times it appears in x1 and the number of times it appears in x2
+## 
+## 
+## 
 ##
 ##############################################################################
-# I use a faster way to change a dictionary key
-# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
+"""
+    count_map(x1, x2)
+
+For two iterators `x1` and `x2`, `count_map(x1, x2)` returns a dictionary
+that maps each element of `x1` or `x2` to a tuple with the number of
+times it appears in `x1` and the number of times it appears in `x2`.
+"""
 function count_map(s1, s2)
-	K = Union{eltype(s1), eltype(s2)}
-	d = Dict{K, NTuple{2, Int}}()
+	K = promote_type(eltype(s1), eltype(s2))
+	d = Dict{K, Tuple{Int, Int}}()
+	# I use a faster way to change a dictionary key
+	# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
 	sizehint!(d, length(s1) + length(s2))
 	for x1 in s1
 		index = Base.ht_keyindex2!(d, x1)
@ -74,7 +81,7 @@ function count_map(s1, s2)
 			@inbounds Base._setindex!(d, (0, 1), x2, -index)
 		end
 	end
-	return values(d)
+	return d
 end

 #= Trie
@ -126,7 +133,7 @@ end
 abstract type AbstractQGramDistance <: SemiMetric end

 function evaluate(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractString)
-	x = count_map(qgram_iterator(s1, dist.q), qgram_iterator(s2, dist.q))
+	x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
 	evaluate(dist, x)
 end

@ -155,9 +162,9 @@ struct QGram <: AbstractQGramDistance
 	q::Int
 end

-function evaluate(dist::QGram, countiterator)
+function evaluate(dist::QGram, count_dict)
 	n = 0
-	for (n1, n2) in countiterator
+	for (n1, n2) in values(count_dict)
 		n += abs(n1 - n2)
 	end
 	n
@ -184,9 +191,9 @@ struct Cosine <: AbstractQGramDistance
 	q::Int
 end

-function evaluate(dist::Cosine, countiterator)
+function evaluate(dist::Cosine, count_dict)
 	norm1, norm2, prodnorm = 0, 0, 0
-	for (n1, n2) in countiterator
+	for (n1, n2) in values(count_dict)
 		norm1 += n1^2
 		norm2 += n2^2
 		prodnorm += n1 * n2
@ -214,9 +221,9 @@ struct Jaccard <: AbstractQGramDistance
 	q::Int
 end

-function evaluate(dist::Jaccard, countiterator)
+function evaluate(dist::Jaccard, count_dict)
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
-	for (n1, n2) in countiterator
+	for (n1, n2) in values(count_dict)
 		ndistinct1 += n1 > 0
 		ndistinct2 += n2 > 0
 		nintersect += (n1 > 0) & (n2 > 0)
@ -244,9 +251,9 @@ struct SorensenDice <: AbstractQGramDistance
 	q::Int
 end

-function evaluate(dist::SorensenDice, countiterator)
+function evaluate(dist::SorensenDice, count_dict)
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
-	for (n1, n2) in countiterator
+	for (n1, n2) in values(count_dict)
 		ndistinct1 += n1 > 0
 		ndistinct2 += n2 > 0
 		nintersect += (n1 > 0) & (n2 > 0)
@ -275,9 +282,9 @@ struct Overlap <: AbstractQGramDistance
 	q::Int
 end

-function evaluate(dist::Overlap, countiterator)
+function evaluate(dist::Overlap, count_dict)
 	ndistinct1, ndistinct2, nintersect = 0, 0, 0
-	for (n1, n2) in countiterator
+	for (n1, n2) in values(count_dict)
 		ndistinct1 += n1 > 0
 		ndistinct2 += n2 > 0
 		nintersect += (n1 > 0) & (n2 > 0)
--- a/src/distances/utils.jl
+++ b/src/distances/utils.jl