diff --git a/README.md b/README.md index bd5bb03..19e661d 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ The function `compare` returns a similarity score between two strings. The func ```julia using StringDistances -compare(Hamming(), "martha", "martha") +compare("martha", "martha", Hamming()) #> 1.0 -compare(Hamming(), "martha", "marhta") +compare("martha", "marhta", Hamming()) #> 0.6666666666666667 ``` @@ -46,14 +46,14 @@ The package includes distance "modifiers", that can be applied to any distance. - [Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) boosts the similary score of strings with common prefixes. The Winkler adjustment was originally defined for the Jaro similarity score but this package defines it for any string distance. ```julia - compare(Jaro(), "martha", "marhta") + compare("martha", "marhta", Jaro()) #> 0.9444444444444445 - compare(Winkler(Jaro()), "martha", "marhta") + compare("martha", "marhta", Winkler(Jaro())) #> 0.9611111111111111 - compare(QGram(2), "william", "williams") + compare("william", "williams", QGram(2)) #> 0.9230769230769231 - compare(Winkler(QGram(2)), "william", "williams") + compare("william", "williams", Winkler(QGram(2))) #> 0.9538461538461539 ``` @@ -62,27 +62,27 @@ The package includes distance "modifiers", that can be applied to any distance. - [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the maximal similarity score between the shorter string and substrings of the longer string. ```julia - compare(Levenshtein(), "New York Yankees", "Yankees") + compare("New York Yankees", "Yankees", Levenshtein()) #> 0.4375 - compare(Partial(Levenshtein()), "New York Yankees", "Yankees") + compare("New York Yankees", "Yankees", Partial(Levenshtein())) #> 1.0 ``` - [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders by reording words alphabetically. 
```julia - compare(RatcliffObershelp(), "mariners vs angels", "angels vs mariners") + compare("mariners vs angels", "angels vs mariners", RatcliffObershelp()) #> 0.44444 - compare(TokenSort(RatcliffObershelp()),"mariners vs angels", "angels vs mariners") + compare("mariners vs angels", "angels vs mariners", TokenSort(RatcliffObershelp())) #> 1.0 ``` - [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string. ```julia - compare(Jaro(),"mariners vs angels", "los angeles angels at seattle mariners") + compare("mariners vs angels", "los angeles angels at seattle mariners", Jaro()) #> 0.559904 - compare(TokenSet(Jaro()),"mariners vs angels", "los angeles angels at seattle mariners") + compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Jaro())) #> 0.944444 ``` @@ -90,7 +90,7 @@ The package includes distance "modifiers", that can be applied to any distance. - [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) combines scores using the base distance, the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths. ```julia - compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners") + compare("mariners vs angels", "los angeles angels at seattle mariners", TokenMax(RatcliffObershelp())) #> 0.855 ``` ## Compare vs Evaluate @@ -98,7 +98,7 @@ The function `compare` returns a similarity score: a value of 0 means completely In contrast, the function `evaluate` returns the litteral distance between two strings, with a value of 0 being completely similar. some distances are between 0 and 1. Others are unbouded. 
```julia -compare(Levenshtein(), "New York", "New York") +compare("New York", "New York", Levenshtein()) #> 1.0 evaluate(Levenshtein(), "New York", "New York") #> 0 diff --git a/benchmark/.sublime2Terminal.jl b/benchmark/.sublime2Terminal.jl index 1185d75..055a06b 100644 --- a/benchmark/.sublime2Terminal.jl +++ b/benchmark/.sublime2Terminal.jl @@ -1 +1 @@ -@time f(Jaccard(2), x, y) +@time f(RatcliffObershelp(), x, y) diff --git a/benchmark/benchmark.jl b/benchmark/benchmark.jl index a1117bc..d8108ca 100644 --- a/benchmark/benchmark.jl +++ b/benchmark/benchmark.jl @@ -7,15 +7,14 @@ function f(t, x, y) [evaluate(t, x[i], y[i]) for i in 1:length(x)] end -# same speed as StringDist -@time f(Levenshtein(), x, y) -@time f(Jaro(), x, y) +# a bit faster than StringDist +@btime f(Levenshtein(), x, y) +# 355.984 ms (1500004 allocations: 223.24 MiB) @time f(RatcliffObershelp(), x, y) -# 4x slower compared to StringDist +# 2-3x slower compared to StringDist @time f(Jaccard(2), x, y) -@time f(Cosine(2), x, y) -@time f(QGram(2), x, y) +# 1.6s # @@ -29,7 +28,9 @@ library(stringdist) x <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse="")) y <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse="")) system.time(stringdist(x,y,method='lv', nthread = 1)) +# 0.472 system.time(stringdist(x,y,method='jaccard', nthread = 1)) +# 0.739 system.time(stringdist(x,y,method='cosine', nthread = 1)) system.time(stringdist(x,y,method='qgram', nthread = 1)) diff --git a/src/compare.jl b/src/compare.jl index 1bbf42d..e6365a1 100755 --- a/src/compare.jl +++ b/src/compare.jl @@ -5,6 +5,11 @@ ## ############################################################################## +function compare(s1::AbstractString, s2::AbstractString, dist::PreMetric) + compare(dist, s1, s2) +end + + function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) 1.0 - evaluate(dist, s1, s2) end @@ -15,12 +20,12 
@@ function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein}, len == 0 ? 1.0 : 1.0 - evaluate(dist, s1, s2) / len end -function compare(dist::AbstractQGram, s1::AbstractString, s2::AbstractString) +function compare(dist::AbstractQGram{N}, s1::AbstractString, s2::AbstractString) where {N} # When string length < q for qgram distance, returns s1 == s2 len1 = length(s1) ; len2 = length(s2) - min(len1, len2) <= (param(dist) - 1) && return convert(Float64, s1 == s2) + min(len1, len2) <= (N - 1) && return convert(Float64, s1 == s2) if typeof(dist) <: QGram - 1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * param(dist) + 2) + 1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * N + 2) else 1 - evaluate(dist, s1, s2) end diff --git a/src/distances/qgram.jl b/src/distances/qgram.jl index b90579d..ba93c7f 100755 --- a/src/distances/qgram.jl +++ b/src/distances/qgram.jl @@ -4,13 +4,11 @@ ## Define a type that iterates through q-grams of a string ## ############################################################################## - +# N is the number of characters in the QGram struct QGramIterator{S <: AbstractString, N} s::S # grapheme l::Int # length of string end -# N is the number of characters in the QGram -param(x::QGramIterator{S, N}) where {S, N} = N function Base.iterate(qgram::QGramIterator{S, N}, state = (1, qgram.l < N ? 
ncodeunits(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N} @@ -25,65 +23,53 @@ Base.eltype(qgram::QGramIterator) = String ############################################################################## ## -## CountedIterator that use Dictionary -## -## For each element in union{v1, v2}, this iterator output numbers of times it appears in v1 and the number of times it appears in v2 -## v1 and v2 must be sorted vectors +## For two iterators x1 x2, count_map(x1, x2) returns an iterator that returns, for each element in union{x1, x2}, the numbers of times it appears in x1 and the number of times it appears in x2 ## ############################################################################## -struct CountIteratorDictionary{T} - d::T -end +# I use a faster way to change a dictionary key # see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380 -function CountIteratorDictionary(s1::QGramIterator{S1, N}, s2::QGramIterator{S2, N}) where {S1, S2, N} - K = String +function count_map(s1, s2) where {S1, S2, N} + K = Union{eltype(s1), eltype(s2)} d = Dict{K, NTuple{2, Int}}() sizehint!(d, length(s1) + length(s2)) - for ch10 in s1 - ch1 = convert(K, ch10) - !isequal(ch1, ch10) && throw(ArgumentError("$(limitrepr(ch10)) is not a valid key for type $K")) + for ch1 in s1 index = Base.ht_keyindex2!(d, ch1) if index > 0 d.age += 1 @inbounds d.keys[index] = ch1 @inbounds d.vals[index] = (d.vals[index][1] + 1, 0) else - Base._setindex!(d, (1, 0), ch1, -index) + @inbounds Base._setindex!(d, (1, 0), ch1, -index) end end - for ch20 in s2 - ch2 = convert(K, ch20) - !isequal(ch2, ch20) && throw(ArgumentError("$(limitrepr(ch20)) is not a valid key for type $K")) + for ch2 in s2 index = Base.ht_keyindex2!(d, ch2) if index > 0 d.age += 1 @inbounds d.keys[index] = ch2 @inbounds d.vals[index] = (d.vals[index][1], d.vals[index][2] + 1) else - Base._setindex!(d, (0, 1), ch2, -index) + @inbounds Base._setindex!(d, (0, 1), ch2, -index) end end return values(d) end - 
############################################################################## ## ## Distance on strings is computed by set distance on qgram sets ## ############################################################################## abstract type AbstractQGram{N} <: SemiMetric end -param(x::AbstractQGram{N}) where N = N -function qgram_iterator(dist::AbstractQGram, s::AbstractString) - QGramIterator{typeof(s), param(dist)}(s, length(s)) +function qgram_iterator(dist::AbstractQGram{N}, s::AbstractString) where {N} + QGramIterator{typeof(s), N}(s, length(s)) end function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString) - evaluate(dist, - CountIteratorDictionary(qgram_iterator(dist, s1), qgram_iterator(dist, s2))) + evaluate(dist, count_map(qgram_iterator(dist, s1), qgram_iterator(dist, s2))) end ##############################################################################