pull/17/head
matthieugomez 2019-08-18 12:52:37 -04:00
parent 6dc8056e37
commit 68702d8aa1
8 changed files with 124 additions and 116 deletions

View File

@ -20,10 +20,10 @@ compare("martha", "marhta", Hamming())
## Distances
#### Edit Distances
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
- [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()`
- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
- [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()`
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`

View File

@ -1 +1 @@
@time f(Jaccard(2), x, y)
@time f(Jaccard(2), x, y)

View File

@ -6,13 +6,20 @@ y = map(Random.randstring, rand(5:25,500_000))
# Benchmark helper: evaluate the distance `t` on every pair `(x[i], y[i])`,
# for `i` running from 1 to `length(x)`, and collect the results in an array.
function f(t, x, y)
    return map(i -> evaluate(t, x[i], y[i]), 1:length(x))
end
@time f(Hamming(), x, y)
@time f(Jaro(), x, y)
@time f(Levenshtein(), x, y)
# 0.3s. A bit faster than StringDist
@time f(DamerauLevenshtein(), x, y)
@time f(RatcliffObershelp(), x, y)
@time f(Jaccard(2), x, y)
# 1.6s 2-3x slower compared to StringDist
# a bit faster than StringDist
@time f(Levenshtein(), x, y)
# 355.984 ms (1500004 allocations: 223.24 MiB)
@time f(RatcliffObershelp(), x, y)
# 2-3x slower compared to StringDist
@time f(Jaccard(2), x, y)
# 1.6s

View File

@ -27,16 +27,16 @@ Partial,
TokenSort,
TokenSet,
TokenMax,
qgram_iterator
qgram
##############################################################################
##
## include
##
##############################################################################
include("distances/utils.jl")
include("distances/edit.jl")
include("distances/qgram.jl")
include("utils.jl")
include("edit.jl")
include("qgram.jl")
include("compare.jl")
end

View File

@ -45,17 +45,14 @@ Winkler is a `PreMetric` modifier that boosts the similarity score between two s
"""
struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
dist::T1
scaling_factor::T2 # scaling factor. Default to 0.1
scaling_factor::T2 # scaling factor. Default to 0.1
boosting_threshold::T3 # boost threshold. Default to 0.7
end
# restrict to distance between 0 and 1
Winkler(x) = Winkler(x, 0.1, 0.7)
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
score = compare(s1, s2, dist.dist)
l = common_prefix(s1, s2, 4)[1]
# common prefix adjustment
if score >= dist.boosting_threshold
score += l * dist.scaling_factor * (1 - score)
end
@ -77,27 +74,24 @@ struct Partial{T <: PreMetric} <: PreMetric
dist::T
end
# general
function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
s2, len2, s1, len1 = reorder(s1, s2)
len1 == len2 && return compare(s1, s2, dist.dist)
len1 == 0 && return compare("", "", dist.dist)
out = 0.0
for x in qgram_iterator(s2, len1)
for x in qgram(s2, len1)
curr = compare(s1, x, dist.dist)
out = max(out, curr)
end
return out
end
# Specialization for RatcliffObershelp distance
# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
s2, len2, s1, len1 = reorder(s1, s2)
len1 == len2 && return compare(s1, s2, dist.dist)
out = 0.0
for r in matching_blocks(s1, s2)
# here I differ from fuzz.py by making sure the substring of s2 has length len1
# Make sure the substring of s2 has length len1
s2_start = r[2] - r[1] + 1
s2_end = s2_start + len1 - 1
if s2_start <= 0
@ -183,9 +177,8 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
dist0 = compare(s1, s2, dist.dist)
s2, len2, s1, len1 = reorder(s1, s2)
unbase_scale = 0.95
# if one string is much much shorter than the other
# if one string is much shorter than the other, use partial
if len2 >= 1.5 * len1
# if strings are of dissimilar length, use partials
partial = compare(s1, s2, Partial(dist.dist))
ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)))
ptser = compare(s1, s2, TokenSet(Partial(dist.dist)))

View File

@ -12,10 +12,84 @@ function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
return current
end
##############################################################################
##
## Jaro
##
##############################################################################
"""
    Jaro()

Creates the Jaro metric.

The Jaro distance is defined as

``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``

where ``m`` is the number of matching characters and
``t`` is half the number of transpositions.
"""
struct Jaro <: SemiMetric end
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
# Jaro distance between `s1` and `s2`:
#   * 0.0 when `len2 == 0` after reordering,
#   * 1.0 when no character of `s1` finds a match in `s2`,
#   * otherwise `1 - (m / len1 + m / len2 + (m - t/2) / m) / 3`, where `m` is the
#     number of matched characters and `t` the number of matched positions whose
#     characters disagree once both match sequences are read in order.
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
    # NOTE(review): `reorder` is defined elsewhere; presumably it puts the longer
    # string in `s2` (the window and `taken` are sized from `len2`) — confirm.
    s2, len2, s1, len1 = reorder(s1, s2)
    # With both strings empty the formula has m = 0, which would yield 1.0
    # (per Wikipedia); return 0.0 explicitly instead.
    if len2 == 0
        return 0.0
    end
    maxdist = max(0, div(len2, 2) - 1)
    # taken[j]: the j-th character of s2 is already paired with a character of s1
    taken = zeros(Bool, len2)
    # matched_at[k]: iteration state of s1 that yields its k-th matched character
    index1 = firstindex(s1)
    matched_at = fill(index1, len1)

    # First pass: count the matching characters.
    matches = 0
    pos1 = 1
    window_start = 1
    next1 = iterate(s1)
    window_head = iterate(s2)
    while next1 !== nothing
        c1, after1 = next1
        # Move the start of the search window forward by one position once it
        # lags more than `maxdist` behind the current position in s1.
        if window_start < pos1 - maxdist
            window_head = iterate(s2, window_head[2])
            window_start += 1
        end
        # Scan s2 from the window start up to position `pos1 + maxdist`.
        pos2 = window_start
        probe = window_head
        while probe !== nothing && pos2 <= pos1 + maxdist
            c2, after2 = probe
            if !taken[pos2] && c1 == c2
                matches += 1
                taken[pos2] = true
                matched_at[matches] = index1
                break
            end
            probe = iterate(s2, after2)
            pos2 += 1
        end
        next1 = iterate(s1, after1)
        pos1 += 1
        index1 = after1
    end
    matches == 0 && return 1.0

    # Second pass: walk the matched characters of s2 in order and compare each
    # with the matched character of s1 of the same rank.
    mismatched = 0
    rank = 0
    for (pos2, c2) in enumerate(s2)
        taken[pos2] || continue
        rank += 1
        c1, _ = iterate(s1, matched_at[rank])
        if c2 != c1
            mismatched += 1
        end
    end
    similarity = matches / len1 + matches / len2 + (matches - mismatched / 2) / matches
    return 1.0 - similarity / 3.0
end
##############################################################################
##
## Levenshtein
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
##
##############################################################################
"""
@ -27,6 +101,7 @@ The Levenshtein distance is the minimum number of operations (consisting of inse
"""
struct Levenshtein <: SemiMetric end
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
s2, len2, s1, len1 = reorder(s1, s2)
# prefix common to both strings can be ignored
@ -64,7 +139,6 @@ end
##############################################################################
##
## Damerau Levenshtein
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
##
##############################################################################
"""
@ -76,6 +150,7 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
"""
struct DamerauLevenshtein <: SemiMetric end
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
s2, len2, s1, len1 = reorder(s1, s2)
# prefix common to both strings can be ignored
@ -132,80 +207,6 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
return current
end
##############################################################################
##
## Jaro
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
##
##############################################################################
"""
    Jaro()

Creates the Jaro metric.

The Jaro distance is defined as

``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``

where ``m`` is the number of matching characters and
``t`` is half the number of transpositions.
"""
struct Jaro <: SemiMetric end
# Jaro distance between `s1` and `s2`: returns 0.0 when `len2 == 0` after
# reordering, 1.0 when no character matches, and otherwise
# `1 - (m / len1 + m / len2 + (m - t/2) / m) / 3`.
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
# NOTE(review): `reorder` is defined elsewhere; presumably it puts the longer
# string in `s2` (the window and `flag` are sized from `len2`) — confirm.
s2, len2, s1, len1 = reorder(s1, s2)
# With both strings empty the formula has m = 0, which would yield 1.0
# (per Wikipedia); return 0.0 explicitly instead.
len2 == 0 && return 0.0
# maximum offset between the positions of two characters counted as a match
maxdist = max(0, div(len2, 2) - 1)
# flag[j] is true once the j-th character of s2 has been matched
flag = fill(false, len2)
# prevstate1 is the iteration state of s1 that yields the current character
prevstate1 = firstindex(s1)
# i1_match[k] stores the state of s1 that yields its k-th matched character
i1_match = prevstate1 * ones(Int, len1)
# m counts the number of matching characters
m = 0
# i1 / i2 are 1-based character positions in s1 / s2; x1 / x2 are the
# corresponding `iterate` results (x2 marks the start of the search window)
i1 = 1
i2 = 1
x1 = iterate(s1)
x2 = iterate(s2)
while x1 !== nothing
ch1, state1 = x1
# advance the window start by one once it lags more than maxdist behind i1
if i2 <= i1 - maxdist - 1
ch2, state2 = x2
i2 += 1
x2 = iterate(s2, state2)
end
# scan s2 from the window start up to position i1 + maxdist
i2curr = i2
x2curr = x2
while x2curr !== nothing
(i2curr > i1 + maxdist) && break
ch2, state2 = x2curr
# first not-yet-matched equal character wins; record it and stop scanning
if (ch1 == ch2) & !flag[i2curr]
m += 1
flag[i2curr] = true
i1_match[m] = prevstate1
break
end
x2curr = iterate(s2, state2)
i2curr += 1
end
x1 = iterate(s1, state1)
i1 += 1
prevstate1 = state1
end
m == 0 && return 1.0
# t counts the number of transpositions: matched characters of s2, read in
# order, that differ from the matched character of s1 of the same rank
t = 0
i1 = 0
i2 = 0
for ch2 in s2
i2 += 1
if flag[i2]
i1 += 1
# re-read the i1-th matched character of s1 from its stored state
t += ch2 != iterate(s1, i1_match[i1])[1]
end
end
# t is halved here, matching the "half the number of transpositions" term
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
end
##############################################################################
##

View File

@ -32,27 +32,34 @@ Return an iterator that iterates on the QGram of the string
## Examples
```julia
using StringDistances
for x in qgram_iterator("hello", 2)
for x in qgram("hello", 2)
@show x
end
```
"""
function qgram_iterator(s::AbstractString, q::Integer)
function qgram(s::AbstractString, q::Integer)
QGramIterator{typeof(s)}(s, length(s), q)
end
##############################################################################
##
## For two iterators x1 x2, count_map(x1, x2) returns an iterator
## that returns, for each element in union{x1, x2}, the numbers of
## times it appears in x1 and the number of times it appears in x2
##
##
##
##
##############################################################################
# I use a faster way to change a dictionary key
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
"""
count_map(x1, x2)
For two iterators `x1` and `x2`, `count_map(x1, x2)` returns a dictionary
that maps each element of `x1` or `x2` to a tuple with the number of
times it appears in `x1` and the number of times it appears in `x2`
"""
function count_map(s1, s2)
K = Union{eltype(s1), eltype(s2)}
d = Dict{K, NTuple{2, Int}}()
K = promote_type(eltype(s1), eltype(s2))
d = Dict{K, Tuple{Int, Int}}()
# I use a faster way to change a dictionary key
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
sizehint!(d, length(s1) + length(s2))
for x1 in s1
index = Base.ht_keyindex2!(d, x1)
@ -74,7 +81,7 @@ function count_map(s1, s2)
@inbounds Base._setindex!(d, (0, 1), x2, -index)
end
end
return values(d)
return d
end
#= Trie
@ -126,7 +133,7 @@ end
abstract type AbstractQGramDistance <: SemiMetric end
function evaluate(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractString)
x = count_map(qgram_iterator(s1, dist.q), qgram_iterator(s2, dist.q))
x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
evaluate(dist, x)
end
@ -155,9 +162,9 @@ struct QGram <: AbstractQGramDistance
q::Int
end
function evaluate(dist::QGram, countiterator)
function evaluate(dist::QGram, count_dict)
n = 0
for (n1, n2) in countiterator
for (n1, n2) in values(count_dict)
n += abs(n1 - n2)
end
n
@ -184,9 +191,9 @@ struct Cosine <: AbstractQGramDistance
q::Int
end
function evaluate(dist::Cosine, countiterator)
function evaluate(dist::Cosine, count_dict)
norm1, norm2, prodnorm = 0, 0, 0
for (n1, n2) in countiterator
for (n1, n2) in values(count_dict)
norm1 += n1^2
norm2 += n2^2
prodnorm += n1 * n2
@ -214,9 +221,9 @@ struct Jaccard <: AbstractQGramDistance
q::Int
end
function evaluate(dist::Jaccard, countiterator)
function evaluate(dist::Jaccard, count_dict)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in countiterator
for (n1, n2) in values(count_dict)
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
@ -244,9 +251,9 @@ struct SorensenDice <: AbstractQGramDistance
q::Int
end
function evaluate(dist::SorensenDice, countiterator)
function evaluate(dist::SorensenDice, count_dict)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in countiterator
for (n1, n2) in values(count_dict)
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)
@ -275,9 +282,9 @@ struct Overlap <: AbstractQGramDistance
q::Int
end
function evaluate(dist::Overlap, countiterator)
function evaluate(dist::Overlap, count_dict)
ndistinct1, ndistinct2, nintersect = 0, 0, 0
for (n1, n2) in countiterator
for (n1, n2) in values(count_dict)
ndistinct1 += n1 > 0
ndistinct2 += n2 > 0
nintersect += (n1 > 0) & (n2 > 0)