parent
4a3a591af6
commit
183e83b0b9
28
README.md
28
README.md
|
@ -10,9 +10,9 @@ The function `compare` returns a similarity score between two strings. The func
|
|||
|
||||
```julia
|
||||
using StringDistances
|
||||
compare(Hamming(), "martha", "martha")
|
||||
compare("martha", "martha", Hamming())
|
||||
#> 1.0
|
||||
compare(Hamming(), "martha", "marhta")
|
||||
compare("martha", "marhta", Hamming())
|
||||
#> 0.6666666666666667
|
||||
```
|
||||
|
||||
|
@ -46,14 +46,14 @@ The package includes distance "modifiers", that can be applied to any distance.
|
|||
- [Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) boosts the similarity score of strings with common prefixes. The Winkler adjustment was originally defined for the Jaro similarity score but this package defines it for any string distance.
|
||||
|
||||
```julia
|
||||
compare(Jaro(), "martha", "marhta")
|
||||
compare("martha", "marhta", Jaro())
|
||||
#> 0.9444444444444445
|
||||
compare(Winkler(Jaro()), "martha", "marhta")
|
||||
compare("martha", "marhta", Winkler(Jaro()))
|
||||
#> 0.9611111111111111
|
||||
|
||||
compare(QGram(2), "william", "williams")
|
||||
compare("william", "williams", QGram(2))
|
||||
#> 0.9230769230769231
|
||||
compare(Winkler(QGram(2)), "william", "williams")
|
||||
compare("william", "williams", Winkler(QGram(2)))
|
||||
#> 0.9538461538461539
|
||||
```
|
||||
|
||||
|
@ -62,27 +62,27 @@ The package includes distance "modifiers", that can be applied to any distance.
|
|||
- [Partial](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) returns the maximal similarity score between the shorter string and substrings of the longer string.
|
||||
|
||||
```julia
|
||||
compare(Levenshtein(), "New York Yankees", "Yankees")
|
||||
compare("New York Yankees", "Yankees", Levenshtein())
|
||||
#> 0.4375
|
||||
compare(Partial(Levenshtein()), "New York Yankees", "Yankees")
|
||||
compare("New York Yankees", "Yankees", Partial(Levenshtein()))
|
||||
#> 1.0
|
||||
```
|
||||
|
||||
- [TokenSort](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word order by reordering words alphabetically.
|
||||
|
||||
```julia
|
||||
compare(RatcliffObershelp(), "mariners vs angels", "angels vs mariners")
|
||||
compare("mariners vs angels", "angels vs mariners", RatcliffObershelp())
|
||||
#> 0.44444
|
||||
compare(TokenSort(RatcliffObershelp()),"mariners vs angels", "angels vs mariners")
|
||||
compare("mariners vs angels", "angels vs mariners", TokenSort(RatcliffObershelp()))
|
||||
#> 1.0
|
||||
```
|
||||
|
||||
- [TokenSet](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) adjusts for differences in word orders and word numbers by comparing the intersection of two strings with each string.
|
||||
|
||||
```julia
|
||||
compare(Jaro(),"mariners vs angels", "los angeles angels at seattle mariners")
|
||||
compare("mariners vs angels", "los angeles angels at seattle mariners", Jaro())
|
||||
#> 0.559904
|
||||
compare(TokenSet(Jaro()),"mariners vs angels", "los angeles angels at seattle mariners")
|
||||
compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Jaro()))
|
||||
#> 0.944444
|
||||
```
|
||||
|
||||
|
@ -90,7 +90,7 @@ The package includes distance "modifiers", that can be applied to any distance.
|
|||
- [TokenMax](http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/) combines scores using the base distance, the `Partial`, `TokenSort` and `TokenSet` modifiers, with penalty terms depending on string lengths.
|
||||
|
||||
```julia
|
||||
compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners")
|
||||
compare("mariners vs angels", "los angeles angels at seattle mariners", TokenMax(RatcliffObershelp()))
|
||||
#> 0.855
|
||||
```
|
||||
## Compare vs Evaluate
|
||||
|
@ -98,7 +98,7 @@ The function `compare` returns a similarity score: a value of 0 means completely
|
|||
In contrast, the function `evaluate` returns the literal distance between two strings, with a value of 0 being completely similar. Some distances are between 0 and 1. Others are unbounded.
|
||||
|
||||
```julia
|
||||
compare(Levenshtein(), "New York", "New York")
|
||||
compare("New York", "New York", Levenshtein())
|
||||
#> 1.0
|
||||
evaluate(Levenshtein(), "New York", "New York")
|
||||
#> 0
|
||||
|
|
|
@ -1 +1 @@
|
|||
@time f(Jaccard(2), x, y)
|
||||
@time f(RatcliffObershelp(), x, y)
|
||||
|
|
|
@ -7,15 +7,14 @@ function f(t, x, y)
|
|||
[evaluate(t, x[i], y[i]) for i in 1:length(x)]
|
||||
end
|
||||
|
||||
# same speed as StringDist
|
||||
@time f(Levenshtein(), x, y)
|
||||
@time f(Jaro(), x, y)
|
||||
# a bit faster than StringDist
|
||||
@btime f(Levenshtein(), x, y)
|
||||
# 355.984 ms (1500004 allocations: 223.24 MiB)
|
||||
@time f(RatcliffObershelp(), x, y)
|
||||
|
||||
# 4x slower compared to StringDist
|
||||
# 2-3x slower compared to StringDist
|
||||
@time f(Jaccard(2), x, y)
|
||||
@time f(Cosine(2), x, y)
|
||||
@time f(QGram(2), x, y)
|
||||
# 1.6s
|
||||
|
||||
#
|
||||
|
||||
|
@ -29,7 +28,9 @@ library(stringdist)
|
|||
x <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
|
||||
y <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
|
||||
system.time(stringdist(x,y,method='lv', nthread = 1))
|
||||
# 0.472
|
||||
system.time(stringdist(x,y,method='jaccard', nthread = 1))
|
||||
# 0.739
|
||||
system.time(stringdist(x,y,method='cosine', nthread = 1))
|
||||
system.time(stringdist(x,y,method='qgram', nthread = 1))
|
||||
|
||||
|
|
|
@ -5,6 +5,11 @@
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
"""
    compare(s1::AbstractString, s2::AbstractString, dist::PreMetric)

Strings-first form of `compare`; forwards to `compare(dist, s1, s2)`.
"""
compare(s1::AbstractString, s2::AbstractString, dist::PreMetric) = compare(dist, s1, s2)
|
||||
|
||||
|
||||
"""
    compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)

Return `1.0 - evaluate(dist, s1, s2)`.
"""
function compare(dist::PreMetric, s1::AbstractString, s2::AbstractString)
    distance = evaluate(dist, s1, s2)
    return 1.0 - distance
end
|
||||
|
@ -15,12 +20,12 @@ function compare(dist::Union{Hamming, Levenshtein, DamerauLevenshtein},
|
|||
len == 0 ? 1.0 : 1.0 - evaluate(dist, s1, s2) / len
|
||||
end
|
||||
|
||||
function compare(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
|
||||
function compare(dist::AbstractQGram{N}, s1::AbstractString, s2::AbstractString) where {N}
|
||||
# When string length < q for qgram distance, returns s1 == s2
|
||||
len1 = length(s1) ; len2 = length(s2)
|
||||
min(len1, len2) <= (param(dist) - 1) && return convert(Float64, s1 == s2)
|
||||
min(len1, len2) <= (N - 1) && return convert(Float64, s1 == s2)
|
||||
if typeof(dist) <: QGram
|
||||
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * param(dist) + 2)
|
||||
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * N + 2)
|
||||
else
|
||||
1 - evaluate(dist, s1, s2)
|
||||
end
|
||||
|
|
|
@ -4,13 +4,11 @@
|
|||
## Define a type that iterates through q-grams of a string
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
# N is the number of characters in the QGram
|
||||
"""
    QGramIterator{S <: AbstractString, N}

Type that iterates through the q-grams of a string. The type parameter `N` is
the number of characters in each q-gram.
"""
struct QGramIterator{S <: AbstractString, N}
    s::S    # grapheme
    l::Int  # length of string
end
|
||||
# N is the number of characters in the QGram
|
||||
param(x::QGramIterator{S, N}) where {S, N} = N
|
||||
|
||||
function Base.iterate(qgram::QGramIterator{S, N},
|
||||
state = (1, qgram.l < N ? ncodeunits(qgram.s) + 1 : nextind(qgram.s, 0, N))) where {S, N}
|
||||
|
@ -25,65 +23,53 @@ Base.eltype(qgram::QGramIterator) = String
|
|||
|
||||
##############################################################################
|
||||
##
|
||||
## CountedIterator that use Dictionary
|
||||
##
|
||||
## For each element in union{v1, v2}, this iterator outputs the number of times it appears in v1 and the number of times it appears in v2
|
||||
## v1 and v2 must be sorted vectors
|
||||
## For two iterators x1 x2, count_map(x1, x2) returns an iterator that returns, for each element in union{x1, x2}, the number of times it appears in x1 and the number of times it appears in x2
|
||||
##
|
||||
##############################################################################
|
||||
struct CountIteratorDictionary{T}
|
||||
d::T
|
||||
end
|
||||
|
||||
# I use a faster way to change a dictionary key
|
||||
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
|
||||
function CountIteratorDictionary(s1::QGramIterator{S1, N}, s2::QGramIterator{S2, N}) where {S1, S2, N}
|
||||
K = String
|
||||
function count_map(s1, s2) where {S1, S2, N}
|
||||
K = Union{eltype(s1), eltype(s2)}
|
||||
d = Dict{K, NTuple{2, Int}}()
|
||||
sizehint!(d, length(s1) + length(s2))
|
||||
for ch10 in s1
|
||||
ch1 = convert(K, ch10)
|
||||
!isequal(ch1, ch10) && throw(ArgumentError("$(limitrepr(ch10)) is not a valid key for type $K"))
|
||||
for ch1 in s1
|
||||
index = Base.ht_keyindex2!(d, ch1)
|
||||
if index > 0
|
||||
d.age += 1
|
||||
@inbounds d.keys[index] = ch1
|
||||
@inbounds d.vals[index] = (d.vals[index][1] + 1, 0)
|
||||
else
|
||||
Base._setindex!(d, (1, 0), ch1, -index)
|
||||
@inbounds Base._setindex!(d, (1, 0), ch1, -index)
|
||||
end
|
||||
end
|
||||
for ch20 in s2
|
||||
ch2 = convert(K, ch20)
|
||||
!isequal(ch2, ch20) && throw(ArgumentError("$(limitrepr(ch20)) is not a valid key for type $K"))
|
||||
for ch2 in s2
|
||||
index = Base.ht_keyindex2!(d, ch2)
|
||||
if index > 0
|
||||
d.age += 1
|
||||
@inbounds d.keys[index] = ch2
|
||||
@inbounds d.vals[index] = (d.vals[index][1], d.vals[index][2] + 1)
|
||||
else
|
||||
Base._setindex!(d, (0, 1), ch2, -index)
|
||||
@inbounds Base._setindex!(d, (0, 1), ch2, -index)
|
||||
end
|
||||
end
|
||||
return values(d)
|
||||
end
|
||||
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Distance on strings is computed by set distance on qgram sets
|
||||
##
|
||||
##############################################################################
|
||||
abstract type AbstractQGram{N} <: SemiMetric end
|
||||
param(x::AbstractQGram{N}) where N = N
|
||||
|
||||
function qgram_iterator(dist::AbstractQGram, s::AbstractString)
|
||||
QGramIterator{typeof(s), param(dist)}(s, length(s))
|
||||
function qgram_iterator(dist::AbstractQGram{N}, s::AbstractString) where {N}
|
||||
QGramIterator{typeof(s), N}(s, length(s))
|
||||
end
|
||||
|
||||
function evaluate(dist::AbstractQGram, s1::AbstractString, s2::AbstractString)
|
||||
evaluate(dist,
|
||||
CountIteratorDictionary(qgram_iterator(dist, s1), qgram_iterator(dist, s2)))
|
||||
evaluate(dist, count_map(qgram_iterator(dist, s1), qgram_iterator(dist, s2)))
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
|
|
Loading…
Reference in New Issue