simplify code + allow distance in third arg

pull/17/head v0.3.3
matthieugomez 2019-08-15 11:07:12 -04:00
parent 4a3a591af6
commit 183e83b0b9
5 changed files with 42 additions and 50 deletions

View File

@ -10,9 +10,9 @@ The function `compare` returns a similarity score between two strings. The func
```julia
using StringDistances
compare(Hamming(), "martha", "martha")
compare("martha", "martha", Hamming())
#> 1.0
compare(Hamming(), "martha", "marhta")
compare("martha", "marhta", Hamming())
#> 0.6666666666666667
```
@ -46,14 +46,14 @@ The package includes distance "modifiers", that can be applied to any distance.
- [Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) boosts the similarity score of strings with common prefixes. The Winkler adjustment was originally defined for the Jaro similarity score but this package defines it for any string distance.
```julia
compare(Jaro(), "martha", "marhta")
compare("martha", "marhta", Jaro())
#> 0.9444444444444445
compare(Winkler(Jaro()), "martha", "marhta")
compare("martha", "marhta", Winkler(Jaro()))
#> 0.9611111111111111
compare(QGram(2), "william", "williams")
compare("william", "williams", QGram(2))
#> 0.9230769230769231
compare(Winkler(QGram(2)), "william", "williams")
compare("william", "williams", Winkler(QGram(2)))
#> 0.9538461538461539
```
@ -62,27 +62,27 @@ The package includes distance "modifiers", that can be applied to any distance.
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the maximal similarity score between the shorter string and substrings of the longer string.
```julia
compare(Levenshtein(), "New York Yankees", "Yankees")
compare("New York Yankees", "Yankees", Levenshtein())
#> 0.4375
compare(Partial(Levenshtein()), "New York Yankees", "Yankees")
compare("New York Yankees", "Yankees", Partial(Levenshtein()))
#> 1.0
```
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word order by reordering words alphabetically.
```julia
compare(RatcliffObershelp(), "mariners vs angels", "angels vs mariners")
compare("mariners vs angels", "angels vs mariners", RatcliffObershelp())
#> 0.44444
compare(TokenSort(RatcliffObershelp()),"mariners vs angels", "angels vs mariners")
compare("mariners vs angels", "angels vs mariners", TokenSort(RatcliffObershelp()))
#> 1.0
```
- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string.
```julia
compare(Jaro(),"mariners vs angels", "los angeles angels at seattle mariners")
compare("mariners vs angels", "los angeles angels at seattle mariners", Jaro())
#> 0.559904
compare(TokenSet(Jaro()),"mariners vs angels", "los angeles angels at seattle mariners")
compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Jaro()))
#> 0.944444
```
@ -90,7 +90,7 @@ The package includes distance "modifiers", that can be applied to any distance.
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) combines scores using the base distance, the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths.
```julia
compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners")
compare("mariners vs angels", "los angeles angels at seattle mariners", TokenMax(RatcliffObershelp()))
#> 0.855
```
## Compare vs Evaluate
@ -98,7 +98,7 @@ The function `compare` returns a similarity score: a value of 0 means completely
In contrast, the function `evaluate` returns the literal distance between two strings, with a value of 0 being completely similar. Some distances are between 0 and 1. Others are unbounded.
```julia
compare(Levenshtein(), "New York", "New York")
compare("New York", "New York", Levenshtein())
#> 1.0
evaluate(Levenshtein(), "New York", "New York")
#> 0

View File

@ -1 +1 @@
@time f(Jaccard(2), x, y)
@time f(RatcliffObershelp(), x, y)

View File

@ -7,15 +7,14 @@ function f(t, x, y)
[evaluate(t, x[i], y[i]) for i in 1:length(x)]
end
# same speed as StringDist
@time f(Levenshtein(), x, y)
@time f(Jaro(), x, y)
# a bit faster than StringDist
@btime f(Levenshtein(), x, y)
# 355.984 ms (1500004 allocations: 223.24 MiB)
@time f(RatcliffObershelp(), x, y)
# 4x slower compared to StringDist
# 2-3x slower compared to StringDist
@time f(Jaccard(2), x, y)
@time f(Cosine(2), x, y)
@time f(QGram(2), x, y)
# 1.6s
#
@ -29,7 +28,9 @@ library(stringdist)
x <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
y <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
system.time(stringdist(x,y,method='lv', nthread = 1))
# 0.472
system.time(stringdist(x,y,method='jaccard', nthread = 1))
# 0.739
system.time(stringdist(x,y,method='cosine', nthread = 1))
system.time(stringdist(x,y,method='qgram', nthread = 1))

View File

@ -5,6 +5,11 @@
##
##############################################################################
# Convenience method: accepts the distance as the third argument and forwards
# to the `compare(dist, s1, s2)` method.
compare(s1::AbstractString, s2::AbstractString, dist::PreMetric) = compare(dist, s1, s2)
# Generic fallback: the similarity score is one minus the value returned by
# `evaluate(dist, s1, s2)`.
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
    distance = evaluate(dist, s1, s2)
    return 1.0 - distance
end
@ -15,12 +20,12 @@ function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
len == 0 ? 1.0 : 1.0 - evaluate(dist, s1, s2) / len
end
function compare(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
function compare(dist::AbstractQGram{N}, s1::AbstractString, s2::AbstractString) where {N}
# When string length < q for qgram distance, returns s1 == s2
len1 = length(s1) ; len2 = length(s2)
min(len1, len2) <= (param(dist) - 1) && return convert(Float64, s1 == s2)
min(len1, len2) <= (N - 1) && return convert(Float64, s1 == s2)
if typeof(dist) <: QGram
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * param(dist) + 2)
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * N + 2)
else
1 - evaluate(dist, s1, s2)
end

View File

@ -4,13 +4,11 @@
## Define a type that iterates through q-grams of a string
##
##############################################################################
# N is the number of characters in the QGram
# Iterator over the q-grams of a string. The type parameter S is the string
# type and N is the number of characters in each q-gram.
struct QGramIterator{S <: AbstractString, N}
s::S # string whose q-grams are iterated
l::Int # length of the string (qgram_iterator stores length(s) here)
end
# N is the number of characters in the QGram
# Return the q-gram size N carried in the iterator's type parameters.
function param(::QGramIterator{S, N}) where {S, N}
    return N
end
function Base.iterate(qgram::QGramIterator{S, N},
state = (1, qgram.l < N ? ncodeunits(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N}
@ -25,65 +23,53 @@ Base.eltype(qgram::QGramIterator) = String
##############################################################################
##
## CountedIterator that use Dictionary
##
## For each element in union{v1, v2}, this iterator output numbers of times it appears in v1 and the number of times it appears in v2
## v1 and v2 must be sorted vectors
## For two iterators x1 x2, count_map(x1, x2) returns an iterator that returns, for each element in union{x1, x2}, the numbers of times it appears in x1 and the number of times it appears in x2
##
##############################################################################
struct CountIteratorDictionary{T}
d::T
end
# I use a faster way to change a dictionary key
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
function CountIteratorDictionary(s1::QGramIterator{S1, N}, s2::QGramIterator{S2, N}) where {S1, S2, N}
K = String
function count_map(s1, s2) where {S1, S2, N}
K = Union{eltype(s1), eltype(s2)}
d = Dict{K, NTuple{2, Int}}()
sizehint!(d, length(s1) + length(s2))
for ch10 in s1
ch1 = convert(K, ch10)
!isequal(ch1, ch10) && throw(ArgumentError("$(limitrepr(ch10)) is not a valid key for type $K"))
for ch1 in s1
index = Base.ht_keyindex2!(d, ch1)
if index > 0
d.age += 1
@inbounds d.keys[index] = ch1
@inbounds d.vals[index] = (d.vals[index][1] + 1, 0)
else
Base._setindex!(d, (1, 0), ch1, -index)
@inbounds Base._setindex!(d, (1, 0), ch1, -index)
end
end
for ch20 in s2
ch2 = convert(K, ch20)
!isequal(ch2, ch20) && throw(ArgumentError("$(limitrepr(ch20)) is not a valid key for type $K"))
for ch2 in s2
index = Base.ht_keyindex2!(d, ch2)
if index > 0
d.age += 1
@inbounds d.keys[index] = ch2
@inbounds d.vals[index] = (d.vals[index][1], d.vals[index][2] + 1)
else
Base._setindex!(d, (0, 1), ch2, -index)
@inbounds Base._setindex!(d, (0, 1), ch2, -index)
end
end
return values(d)
end
##############################################################################
##
## Distance on strings is computed by set distance on qgram sets
##
##############################################################################
# Abstract supertype of the q-gram based distances. The type parameter N is
# the q-gram size that qgram_iterator passes on to QGramIterator.
abstract type AbstractQGram{N} <: SemiMetric end
# Return the q-gram size N carried in the distance's type parameter.
function param(::AbstractQGram{N}) where {N}
    return N
end
function qgram_iterator(dist::AbstractQGram, s::AbstractString)
QGramIterator{typeof(s), param(dist)}(s, length(s))
function qgram_iterator(dist::AbstractQGram{N}, s::AbstractString) where {N}
QGramIterator{typeof(s), N}(s, length(s))
end
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
evaluate(dist,
CountIteratorDictionary(qgram_iterator(dist, s1), qgram_iterator(dist, s2)))
evaluate(dist, count_map(qgram_iterator(dist, s1), qgram_iterator(dist, s2)))
end
##############################################################################