simplify code + allow distance in third arg

pull/17/head v0.3.3
matthieugomez 2019-08-15 11:07:12 -04:00
parent 4a3a591af6
commit 183e83b0b9
5 changed files with 42 additions and 50 deletions

View File

@ -10,9 +10,9 @@ The function `compare` returns a similarity score between two strings. The func
using StringDistances
compare(Hamming(), "martha", "martha")
compare("martha", "martha", Hamming())
#> 1.0
compare(Hamming(), "martha", "marhta")
compare("martha", "marhta", Hamming())
#> 0.6666666666666667
@ -46,14 +46,14 @@ The package includes distance "modifiers", that can be applied to any distance.
- [Winkler]( boosts the similarity score of strings with common prefixes. The Winkler adjustment was originally defined for the Jaro similarity score but this package defines it for any string distance.
compare(Jaro(), "martha", "marhta")
compare("martha", "marhta", Jaro())
#> 0.9444444444444445
compare(Winkler(Jaro()), "martha", "marhta")
compare("martha", "marhta", Winkler(Jaro()))
#> 0.9611111111111111
compare(QGram(2), "william", "williams")
compare("william", "williams", QGram(2))
#> 0.9230769230769231
compare(Winkler(QGram(2)), "william", "williams")
compare("william", "williams", Winkler(QGram(2)))
#> 0.9538461538461539
@ -62,27 +62,27 @@ The package includes distance "modifiers", that can be applied to any distance.
- [Partial]( returns the maximal similarity score between the shorter string and substrings of the longer string.
compare(Levenshtein(), "New York Yankees", "Yankees")
compare("New York Yankees", "Yankees", Levenshtein())
#> 0.4375
compare(Partial(Levenshtein()), "New York Yankees", "Yankees")
compare("New York Yankees", "Yankees", Partial(Levenshtein()))
#> 1.0
- [TokenSort]( adjusts for differences in word orders by reordering words alphabetically.
compare(RatcliffObershelp(), "mariners vs angels", "angels vs mariners")
compare("mariners vs angels", "angels vs mariners", RatcliffObershelp())
#> 0.44444
compare(TokenSort(RatcliffObershelp()),"mariners vs angels", "angels vs mariners")
compare("mariners vs angels", "angels vs mariners", TokenSort(RatcliffObershelp()))
#> 1.0
- [TokenSet]( adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string.
compare(Jaro(),"mariners vs angels", "los angeles angels at seattle mariners")
compare("mariners vs angels", "los angeles angels at seattle mariners", Jaro())
#> 0.559904
compare(TokenSet(Jaro()),"mariners vs angels", "los angeles angels at seattle mariners")
compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Jaro()))
#> 0.944444
@ -90,7 +90,7 @@ The package includes distance "modifiers", that can be applied to any distance.
- [TokenMax]( combines scores using the base distance, the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths.
compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners")
compare("mariners vs angels", "los angeles angels at seattle mariners", TokenMax(RatcliffObershelp()))
#> 0.855
## Compare vs Evaluate
@ -98,7 +98,7 @@ The function `compare` returns a similarity score: a value of 0 means completely
In contrast, the function `evaluate` returns the literal distance between two strings, with a value of 0 being completely similar. Some distances are between 0 and 1. Others are unbounded.
compare(Levenshtein(), "New York", "New York")
compare("New York", "New York", Levenshtein())
#> 1.0
evaluate(Levenshtein(), "New York", "New York")
#> 0

View File

@ -1 +1 @@
@time f(Jaccard(2), x, y)
@time f(RatcliffObershelp(), x, y)

View File

@ -7,15 +7,14 @@ function f(t, x, y)
[evaluate(t, x[i], y[i]) for i in 1:length(x)]
# same speed as StringDist
@time f(Levenshtein(), x, y)
@time f(Jaro(), x, y)
# a bit faster than StringDist
@btime f(Levenshtein(), x, y)
# 355.984 ms (1500004 allocations: 223.24 MiB)
@time f(RatcliffObershelp(), x, y)
# 4x slower compared to StringDist
# 2-3x slower compared to StringDist
@time f(Jaccard(2), x, y)
@time f(Cosine(2), x, y)
@time f(QGram(2), x, y)
# 1.6s
@ -29,7 +28,9 @@ library(stringdist)
x <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
y <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
system.time(stringdist(x,y,method='lv', nthread = 1))
# 0.472
system.time(stringdist(x,y,method='jaccard', nthread = 1))
# 0.739
system.time(stringdist(x,y,method='cosine', nthread = 1))
system.time(stringdist(x,y,method='qgram', nthread = 1))

View File

@ -5,6 +5,11 @@
function compare(s1::AbstractString, s2::AbstractString, dist::PreMetric)
compare(dist, s1, s2)
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
1.0 - evaluate(dist, s1, s2)
@ -15,12 +20,12 @@ function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
len == 0 ? 1.0 : 1.0 - evaluate(dist, s1, s2) / len
function compare(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
function compare(dist::AbstractQGram{N}, s1::AbstractString, s2::AbstractString) where {N}
# When string length < q for qgram distance, returns s1 == s2
len1 = length(s1) ; len2 = length(s2)
min(len1, len2) <= (param(dist) - 1) && return convert(Float64, s1 == s2)
min(len1, len2) <= (N - 1) && return convert(Float64, s1 == s2)
if typeof(dist) <: QGram
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * param(dist) + 2)
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * N + 2)
1 - evaluate(dist, s1, s2)

View File

@ -4,13 +4,11 @@
## Define a type that iterates through q-grams of a string
# N is the number of characters in the QGram
struct QGramIterator{S <: AbstractString, N}
s::S # grapheme
l::Int # length of string
# N is the number of characters in the QGram
param(x::QGramIterator{S, N}) where {S, N} = N
function Base.iterate(qgram::QGramIterator{S, N},
state = (1, qgram.l < N ? ncodeunits(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N}
@ -25,65 +23,53 @@ Base.eltype(qgram::QGramIterator) = String
## CountedIterator that uses a Dictionary
## For each element in union{v1, v2}, this iterator outputs the number of times it appears in v1 and the number of times it appears in v2
## v1 and v2 must be sorted vectors
## For two iterators x1 and x2, count_map(x1, x2) returns an iterator that yields, for each element in union{x1, x2}, the number of times it appears in x1 and the number of times it appears in x2
struct CountIteratorDictionary{T}
# I use a faster way to change a dictionary key
# see setindex! in
function CountIteratorDictionary(s1::QGramIterator{S1, N}, s2::QGramIterator{S2, N}) where {S1, S2, N}
K = String
function count_map(s1, s2) where {S1, S2, N}
K = Union{eltype(s1), eltype(s2)}
d = Dict{K, NTuple{2, Int}}()
sizehint!(d, length(s1) + length(s2))
for ch10 in s1
ch1 = convert(K, ch10)
!isequal(ch1, ch10) && throw(ArgumentError("$(limitrepr(ch10)) is not a valid key for type $K"))
for ch1 in s1
index = Base.ht_keyindex2!(d, ch1)
if index > 0
d.age += 1
@inbounds d.keys[index] = ch1
@inbounds d.vals[index] = (d.vals[index][1] + 1, 0)
Base._setindex!(d, (1, 0), ch1, -index)
@inbounds Base._setindex!(d, (1, 0), ch1, -index)
for ch20 in s2
ch2 = convert(K, ch20)
!isequal(ch2, ch20) && throw(ArgumentError("$(limitrepr(ch20)) is not a valid key for type $K"))
for ch2 in s2
index = Base.ht_keyindex2!(d, ch2)
if index > 0
d.age += 1
@inbounds d.keys[index] = ch2
@inbounds d.vals[index] = (d.vals[index][1], d.vals[index][2] + 1)
Base._setindex!(d, (0, 1), ch2, -index)
@inbounds Base._setindex!(d, (0, 1), ch2, -index)
return values(d)
## Distance on strings is computed by set distance on qgram sets
abstract type AbstractQGram{N} <: SemiMetric end
param(x::AbstractQGram{N}) where N = N
function qgram_iterator(dist::AbstractQGram, s::AbstractString)
QGramIterator{typeof(s), param(dist)}(s, length(s))
function qgram_iterator(dist::AbstractQGram{N}, s::AbstractString) where {N}
QGramIterator{typeof(s), N}(s, length(s))
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
CountIteratorDictionary(qgram_iterator(dist, s1), qgram_iterator(dist, s2)))
evaluate(dist, count_map(qgram_iterator(dist, s1), qgram_iterator(dist, s2)))