simplify code + allow distance in third arg

2019-08-15 11:07:12 -04:00 · 2019-08-15 11:07:12 -04:00 · 183e83b0b9
parent 4a3a591af6
commit 183e83b0b9
5 changed files with 42 additions and 50 deletions
--- a/README.md
+++ b/README.md
@ -10,9 +10,9 @@ The function `compare` returns  a similarity score between two strings. The func

 ```julia
 using StringDistances
-compare(Hamming(), "martha", "martha")
+compare("martha", "martha", Hamming())
 #> 1.0
-compare(Hamming(), "martha", "marhta")
+compare("martha", "marhta", Hamming())
 #> 0.6666666666666667
 ```

@ -46,14 +46,14 @@ The package includes distance "modifiers", that can be applied to any distance.
 - [Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) boosts the similarity score of strings with common prefixes. The Winkler adjustment was originally defined for the Jaro similarity score but this package defines it for any string distance.

 	```julia
-	compare(Jaro(), "martha", "marhta")
+	compare("martha", "marhta", Jaro())
 	#> 0.9444444444444445
-	compare(Winkler(Jaro()), "martha", "marhta")
+	compare("martha", "marhta", Winkler(Jaro()))
 	#> 0.9611111111111111

-	compare(QGram(2), "william", "williams")
+	compare("william", "williams", QGram(2))
 	#> 0.9230769230769231
-	compare(Winkler(QGram(2)), "william", "williams")
+	compare("william", "williams", Winkler(QGram(2)))
 	#> 0.9538461538461539
 	```

@ -62,27 +62,27 @@ The package includes distance "modifiers", that can be applied to any distance.
 	- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the maximal similarity score between the shorter string and substrings of the longer string.

 		```julia
-		compare(Levenshtein(), "New York Yankees", "Yankees")
+		compare("New York Yankees", "Yankees", Levenshtein())
 		#> 0.4375
-		compare(Partial(Levenshtein()), "New York Yankees", "Yankees")
+		compare("New York Yankees", "Yankees", Partial(Levenshtein()))
 		#> 1.0
 		```

 	- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word order by reordering words alphabetically.

 		```julia
-		compare(RatcliffObershelp(), "mariners vs angels", "angels vs mariners")
+		compare("mariners vs angels", "angels vs mariners", RatcliffObershelp())
 		#> 0.44444
-		compare(TokenSort(RatcliffObershelp()),"mariners vs angels", "angels vs mariners")
+		compare("mariners vs angels", "angels vs mariners", TokenSort(RatcliffObershelp()))
 		#> 1.0
 		```

 	- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string.

 		```julia
-		compare(Jaro(),"mariners vs angels", "los angeles angels at seattle mariners")
+		compare("mariners vs angels", "los angeles angels at seattle mariners", Jaro())
 		#> 0.559904
-		compare(TokenSet(Jaro()),"mariners vs angels", "los angeles angels at seattle mariners")
+		compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Jaro()))
 		#> 0.944444
 		```

@ -90,7 +90,7 @@ The package includes distance "modifiers", that can be applied to any distance.
 	- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) combines scores using the base distance, the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths.

 		```julia
-		compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners")
+		compare("mariners vs angels", "los angeles angels at seattle mariners", TokenMax(RatcliffObershelp()))
 		#> 0.855
 		```
 ## Compare vs Evaluate
@ -98,7 +98,7 @@ The function `compare` returns a similarity score: a value of 0 means completely
 In contrast, the function `evaluate` returns the literal distance between two strings, with a value of 0 being completely similar. Some distances are between 0 and 1. Others are unbounded.

 ```julia
-compare(Levenshtein(), "New York", "New York")
+compare("New York", "New York", Levenshtein())
 #> 1.0
 evaluate(Levenshtein(), "New York", "New York")
 #> 0
--- a/benchmark/.sublime2Terminal.jl
+++ b/benchmark/.sublime2Terminal.jl
@ -1 +1 @@
-@time f(Jaccard(2), x, y)
+@time f(RatcliffObershelp(), x, y)
--- a/benchmark/benchmark.jl
+++ b/benchmark/benchmark.jl
@ -7,15 +7,14 @@ function f(t, x, y)
    [evaluate(t, x[i], y[i]) for i in 1:length(x)]
 end

-# same speed as StringDist
-@time f(Levenshtein(), x, y)
-@time f(Jaro(), x, y)
+# a bit faster than StringDist
+@btime f(Levenshtein(), x, y)
+#  355.984 ms (1500004 allocations: 223.24 MiB)
@time f(RatcliffObershelp(), x, y)

-# 4x slower compared to StringDist
+# 2-3x slower compared to StringDist
@time f(Jaccard(2), x, y)
-@time f(Cosine(2), x, y)
-@time f(QGram(2), x, y)
+# 1.6s

 #

@ -29,7 +28,9 @@ library(stringdist)
 x <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse="")) 
 y <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
 system.time(stringdist(x,y,method='lv', nthread = 1))
+#  0.472
 system.time(stringdist(x,y,method='jaccard', nthread = 1))
+# 0.739
 system.time(stringdist(x,y,method='cosine', nthread = 1))
 system.time(stringdist(x,y,method='qgram', nthread = 1))

--- a/src/compare.jl
+++ b/src/compare.jl
@ -5,6 +5,11 @@
 ##
 ##############################################################################

+function compare(s1::AbstractString, s2::AbstractString, dist::PreMetric)
+    compare(dist, s1, s2)
+end
+
+
 function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
    1.0 - evaluate(dist, s1, s2)
 end
@ -15,12 +20,12 @@ function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
    len == 0 ? 1.0 : 1.0 - evaluate(dist, s1, s2) / len
 end

-function compare(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
+function compare(dist::AbstractQGram{N}, s1::AbstractString, s2::AbstractString) where {N}
    # When string length < q for qgram distance, returns s1 == s2
    len1 = length(s1) ; len2 = length(s2)
-    min(len1, len2) <= (param(dist) - 1) && return convert(Float64, s1 == s2)
+    min(len1, len2) <= (N - 1) && return convert(Float64, s1 == s2)
    if typeof(dist) <: QGram
-        1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * param(dist) + 2)
+        1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * N + 2)
    else
        1 - evaluate(dist, s1, s2)
    end
--- a/src/distances/qgram.jl
+++ b/src/distances/qgram.jl
@ -4,13 +4,11 @@
 ## Define a type that iterates through q-grams of a string
 ##
 ##############################################################################
-
+# N is the number of characters in the QGram
 struct QGramIterator{S <: AbstractString, N}
 	s::S # grapheme
 	l::Int # length of string
 end
-# N is the number of characters in the QGram
-param(x::QGramIterator{S, N}) where {S, N} = N

 function Base.iterate(qgram::QGramIterator{S, N}, 
 	state = (1, qgram.l < N ? ncodeunits(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N}
@ -25,65 +23,53 @@ Base.eltype(qgram::QGramIterator) = String

 ##############################################################################
 ##
-## CountedIterator that use Dictionary
-##
-## For each element in union{v1, v2}, this iterator output numbers of times it appears in v1 and the number of times it appears in v2
-## v1 and v2 must be sorted vectors
+## For two iterators x1, x2, count_map(x1, x2) returns an iterator that yields, for each element in union{x1, x2}, the number of times it appears in x1 and the number of times it appears in x2
 ##
 ##############################################################################
-struct CountIteratorDictionary{T}
-	d::T
-end

+# I use a faster way to change a dictionary key
 # see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
-function CountIteratorDictionary(s1::QGramIterator{S1, N}, s2::QGramIterator{S2, N}) where {S1, S2, N}
-	K = String
+function count_map(s1, s2) where {S1, S2, N}
+	K = Union{eltype(s1), eltype(s2)}
 	d = Dict{K, NTuple{2, Int}}()
 	sizehint!(d, length(s1) + length(s2))
-	for ch10 in s1
-		ch1 = convert(K, ch10)
-		!isequal(ch1, ch10) && throw(ArgumentError("$(limitrepr(ch10)) is not a valid key for type $K"))
+	for ch1 in s1
 		index = Base.ht_keyindex2!(d, ch1)
 		if index > 0
 			d.age += 1
 			@inbounds d.keys[index] = ch1
 			@inbounds d.vals[index] = (d.vals[index][1] + 1, 0)
 		else
-			Base._setindex!(d, (1, 0), ch1, -index)
+			@inbounds Base._setindex!(d, (1, 0), ch1, -index)
 		end
 	end
-	for ch20 in s2
-		ch2 = convert(K, ch20)
-		!isequal(ch2, ch20) && throw(ArgumentError("$(limitrepr(ch20)) is not a valid key for type $K"))
+	for ch2 in s2
 		index = Base.ht_keyindex2!(d, ch2)
 		if index > 0
 			d.age += 1
 			@inbounds d.keys[index] = ch2
 			@inbounds d.vals[index] = (d.vals[index][1], d.vals[index][2] + 1)
 		else
-			Base._setindex!(d, (0, 1), ch2, -index)
+			@inbounds Base._setindex!(d, (0, 1), ch2, -index)
 		end
 	end
 	return values(d)
 end


-
 ##############################################################################
 ##
 ## Distance on strings is computed by set distance on qgram sets
 ##
 ##############################################################################
 abstract type AbstractQGram{N} <: SemiMetric end
-param(x::AbstractQGram{N}) where N = N

-function qgram_iterator(dist::AbstractQGram, s::AbstractString)
-	QGramIterator{typeof(s), param(dist)}(s, length(s))
+function qgram_iterator(dist::AbstractQGram{N}, s::AbstractString) where {N}
+	QGramIterator{typeof(s), N}(s, length(s))
 end

 function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
-	evaluate(dist, 
-		CountIteratorDictionary(qgram_iterator(dist, s1), qgram_iterator(dist, s2)))
+	evaluate(dist, count_map(qgram_iterator(dist, s1), qgram_iterator(dist, s2)))
 end

 ##############################################################################