clean
parent
6dc8056e37
commit
68702d8aa1
|
@ -20,10 +20,10 @@ compare("martha", "marhta", Hamming())
|
|||
## Distances
|
||||
|
||||
#### Edit Distances
|
||||
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
|
||||
- [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()`
|
||||
- [Jaro Distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) `Jaro()`
|
||||
- [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) `Levenshtein()`
|
||||
- [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) `Hamming()`
|
||||
- [Damerau-Levenshtein Distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) `DamerauLevenshtein()`
|
||||
- [RatcliffObershelp Distance](https://xlinux.nist.gov/dads/HTML/ratcliffObershelp.html) `RatcliffObershelp()`
|
||||
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
@time f(Jaccard(2), x, y)
|
||||
@time f(Jaccard(2), x, y)
|
|
@ -6,13 +6,20 @@ y = map(Random.randstring, rand(5:25,500_000))
|
|||
function f(t, x, y)
|
||||
[evaluate(t, x[i], y[i]) for i in 1:length(x)]
|
||||
end
|
||||
@time f(Hamming(), x, y)
|
||||
@time f(Jaro(), x, y)
|
||||
@time f(Levenshtein(), x, y)
|
||||
# 0.3s. A bit faster than StringDist
|
||||
@time f(DamerauLevenshtein(), x, y)
|
||||
@time f(RatcliffObershelp(), x, y)
|
||||
@time f(Jaccard(2), x, y)
|
||||
# 1.6s 2-3x slower compared to StringDist
|
||||
|
||||
# a bit faster than StringDist
|
||||
@time f(Levenshtein(), x, y)
|
||||
# 355.984 ms (1500004 allocations: 223.24 MiB)
|
||||
@time f(RatcliffObershelp(), x, y)
|
||||
|
||||
# 2-3x slower compared to StringDist
|
||||
@time f(Jaccard(2), x, y)
|
||||
# 1.6s
|
||||
|
||||
|
|
|
@ -27,16 +27,16 @@ Partial,
|
|||
TokenSort,
|
||||
TokenSet,
|
||||
TokenMax,
|
||||
qgram_iterator
|
||||
qgram
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## include
|
||||
##
|
||||
##############################################################################
|
||||
include("distances/utils.jl")
|
||||
include("distances/edit.jl")
|
||||
include("distances/qgram.jl")
|
||||
include("utils.jl")
|
||||
include("edit.jl")
|
||||
include("qgram.jl")
|
||||
include("compare.jl")
|
||||
|
||||
end
|
||||
|
|
|
@ -45,17 +45,14 @@ Winkler is a `PreMetric` modifier that boosts the similarity score between two s
|
|||
"""
|
||||
struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
|
||||
dist::T1
|
||||
scaling_factor::T2 # scaling factor. Default to 0.1
|
||||
scaling_factor::T2 # scaling factor. Default to 0.1
|
||||
boosting_threshold::T3 # boost threshold. Default to 0.7
|
||||
end
|
||||
|
||||
# restrict to distance between 0 and 1
|
||||
Winkler(x) = Winkler(x, 0.1, 0.7)
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
|
||||
score = compare(s1, s2, dist.dist)
|
||||
l = common_prefix(s1, s2, 4)[1]
|
||||
# common prefix adjustment
|
||||
if score >= dist.boosting_threshold
|
||||
score += l * dist.scaling_factor * (1 - score)
|
||||
end
|
||||
|
@ -77,27 +74,24 @@ struct Partial{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
# general
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||
len1 == 0 && return compare("", "", dist.dist)
|
||||
out = 0.0
|
||||
for x in qgram_iterator(s2, len1)
|
||||
for x in qgram(s2, len1)
|
||||
curr = compare(s1, x, dist.dist)
|
||||
out = max(out, curr)
|
||||
end
|
||||
return out
|
||||
end
|
||||
|
||||
# Specialization for RatcliffObershelp distance
|
||||
# Code follows https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/fuzz.py
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp})
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||
out = 0.0
|
||||
for r in matching_blocks(s1, s2)
|
||||
# here I differ from fuzz.py by making sure the substring of s2 has length len1
|
||||
# Make sure the substring of s2 has length len1
|
||||
s2_start = r[2] - r[1] + 1
|
||||
s2_end = s2_start + len1 - 1
|
||||
if s2_start <= 0
|
||||
|
@ -183,9 +177,8 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax)
|
|||
dist0 = compare(s1, s2, dist.dist)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
unbase_scale = 0.95
|
||||
# if one string is much much shorter than the other
|
||||
# if one string is much shorter than the other, use partial
|
||||
if len2 >= 1.5 * len1
|
||||
# if strings are of dissimilar length, use partials
|
||||
partial = compare(s1, s2, Partial(dist.dist))
|
||||
ptsor = compare(s1, s2, TokenSort(Partial(dist.dist)))
|
||||
ptser = compare(s1, s2, TokenSet(Partial(dist.dist)))
|
||||
|
|
|
@ -12,10 +12,84 @@ function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
|
|||
return current
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Jaro
|
||||
##
|
||||
##############################################################################
|
||||
"""
|
||||
Jaro()
|
||||
|
||||
Creates the Jaro metric
|
||||
|
||||
The Jaro distance is defined as
|
||||
|
||||
|
||||
``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``
|
||||
|
||||
where ``m`` is the number of matching characters and
|
||||
``t`` is half the number of transpositions.
|
||||
"""
|
||||
struct Jaro <: SemiMetric end
|
||||
|
||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
# If both strings are empty, m = 0, so the distance would be 1.0 according to Wikipedia. Return 0.0 here so that this is not the case.
|
||||
len2 == 0 && return 0.0
|
||||
maxdist = max(0, div(len2, 2) - 1)
|
||||
flag = fill(false, len2)
|
||||
prevstate1 = firstindex(s1)
|
||||
i1_match = prevstate1 * ones(Int, len1)
|
||||
# m counts number matching characters
|
||||
m = 0
|
||||
i1 = 1
|
||||
i2 = 1
|
||||
x1 = iterate(s1)
|
||||
x2 = iterate(s2)
|
||||
while x1 !== nothing
|
||||
ch1, state1 = x1
|
||||
if i2 <= i1 - maxdist - 1
|
||||
ch2, state2 = x2
|
||||
i2 += 1
|
||||
x2 = iterate(s2, state2)
|
||||
end
|
||||
i2curr = i2
|
||||
x2curr = x2
|
||||
while x2curr !== nothing
|
||||
(i2curr > i1 + maxdist) && break
|
||||
ch2, state2 = x2curr
|
||||
if (ch1 == ch2) & !flag[i2curr]
|
||||
m += 1
|
||||
flag[i2curr] = true
|
||||
i1_match[m] = prevstate1
|
||||
break
|
||||
end
|
||||
x2curr = iterate(s2, state2)
|
||||
i2curr += 1
|
||||
end
|
||||
x1 = iterate(s1, state1)
|
||||
i1 += 1
|
||||
prevstate1 = state1
|
||||
end
|
||||
m == 0 && return 1.0
|
||||
# t counts number of transpositions
|
||||
t = 0
|
||||
i1 = 0
|
||||
i2 = 0
|
||||
for ch2 in s2
|
||||
i2 += 1
|
||||
if flag[i2]
|
||||
i1 += 1
|
||||
t += ch2 != iterate(s1, i1_match[i1])[1]
|
||||
end
|
||||
end
|
||||
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Levenshtein
|
||||
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
||||
##
|
||||
##############################################################################
|
||||
"""
|
||||
|
@ -27,6 +101,7 @@ The Levenshtein distance is the minimum number of operations (consisting of inse
|
|||
"""
|
||||
struct Levenshtein <: SemiMetric end
|
||||
|
||||
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
||||
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
# prefix common to both strings can be ignored
|
||||
|
@ -64,7 +139,6 @@ end
|
|||
##############################################################################
|
||||
##
|
||||
## Damerau Levenshtein
|
||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
##
|
||||
##############################################################################
|
||||
"""
|
||||
|
@ -76,6 +150,7 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
|
|||
"""
|
||||
struct DamerauLevenshtein <: SemiMetric end
|
||||
|
||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
# prefix common to both strings can be ignored
|
||||
|
@ -132,80 +207,6 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractStri
|
|||
return current
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Jaro
|
||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||
##
|
||||
##############################################################################
|
||||
"""
|
||||
Jaro()
|
||||
|
||||
Creates the Jaro metric
|
||||
|
||||
The Jaro distance is defined as
|
||||
|
||||
|
||||
``1 - (m / |s1| + m / |s2| + (m - t) / m) / 3``
|
||||
|
||||
where ``m`` is the number of matching characters and
|
||||
``t`` is half the number of transpositions.
|
||||
"""
|
||||
struct Jaro <: SemiMetric end
|
||||
|
||||
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
||||
s2, len2, s1, len1 = reorder(s1, s2)
|
||||
# If both strings are empty, m = 0, so the distance would be 1.0 according to Wikipedia. Return 0.0 here so that this is not the case.
|
||||
len2 == 0 && return 0.0
|
||||
maxdist = max(0, div(len2, 2) - 1)
|
||||
flag = fill(false, len2)
|
||||
prevstate1 = firstindex(s1)
|
||||
i1_match = prevstate1 * ones(Int, len1)
|
||||
# m counts number matching characters
|
||||
m = 0
|
||||
i1 = 1
|
||||
i2 = 1
|
||||
x1 = iterate(s1)
|
||||
x2 = iterate(s2)
|
||||
while x1 !== nothing
|
||||
ch1, state1 = x1
|
||||
if i2 <= i1 - maxdist - 1
|
||||
ch2, state2 = x2
|
||||
i2 += 1
|
||||
x2 = iterate(s2, state2)
|
||||
end
|
||||
i2curr = i2
|
||||
x2curr = x2
|
||||
while x2curr !== nothing
|
||||
(i2curr > i1 + maxdist) && break
|
||||
ch2, state2 = x2curr
|
||||
if (ch1 == ch2) & !flag[i2curr]
|
||||
m += 1
|
||||
flag[i2curr] = true
|
||||
i1_match[m] = prevstate1
|
||||
break
|
||||
end
|
||||
x2curr = iterate(s2, state2)
|
||||
i2curr += 1
|
||||
end
|
||||
x1 = iterate(s1, state1)
|
||||
i1 += 1
|
||||
prevstate1 = state1
|
||||
end
|
||||
m == 0 && return 1.0
|
||||
# t counts number of transpositions
|
||||
t = 0
|
||||
i1 = 0
|
||||
i2 = 0
|
||||
for ch2 in s2
|
||||
i2 += 1
|
||||
if flag[i2]
|
||||
i1 += 1
|
||||
t += ch2 != iterate(s1, i1_match[i1])[1]
|
||||
end
|
||||
end
|
||||
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
##
|
|
@ -32,27 +32,34 @@ Return an iterator that iterates on the QGram of the string
|
|||
## Examples
|
||||
```julia
|
||||
using StringDistances
|
||||
for x in qgram_iterator("hello", 2)
|
||||
for x in qgram("hello", 2)
|
||||
@show x
|
||||
end
|
||||
```
|
||||
"""
|
||||
function qgram_iterator(s::AbstractString, q::Integer)
|
||||
function qgram(s::AbstractString, q::Integer)
|
||||
QGramIterator{typeof(s)}(s, length(s), q)
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## For two iterators x1 x2, count_map(x1, x2) returns an iterator
|
||||
## that returns, for each element in union{x1, x2}, the numbers of
|
||||
## times it appears in x1 and the number of times it appears in x2
|
||||
##
|
||||
##
|
||||
##
|
||||
##
|
||||
##############################################################################
|
||||
# I use a faster way to change a dictionary key
|
||||
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
|
||||
"""
|
||||
count_map(x1, x2)
|
||||
|
||||
For two iterators `x1` and `x2`, `count_map(x1, x2)` returns a dictionary
|
||||
that returns, for each element in `x1` or `x2`, a tuple with the numbers of
|
||||
times it appears in `x1` and the number of times it appears in `x2`
|
||||
"""
|
||||
function count_map(s1, s2)
|
||||
K = Union{eltype(s1), eltype(s2)}
|
||||
d = Dict{K, NTuple{2, Int}}()
|
||||
K = promote_type(eltype(s1), eltype(s2))
|
||||
d = Dict{K, Tuple{Int, Int}}()
|
||||
# I use a faster way to change a dictionary key
|
||||
# see setindex! in https://github.com/JuliaLang/julia/blob/master/base/dict.jl#L380
|
||||
sizehint!(d, length(s1) + length(s2))
|
||||
for x1 in s1
|
||||
index = Base.ht_keyindex2!(d, x1)
|
||||
|
@ -74,7 +81,7 @@ function count_map(s1, s2)
|
|||
@inbounds Base._setindex!(d, (0, 1), x2, -index)
|
||||
end
|
||||
end
|
||||
return values(d)
|
||||
return d
|
||||
end
|
||||
|
||||
#= Trie
|
||||
|
@ -126,7 +133,7 @@ end
|
|||
abstract type AbstractQGramDistance <: SemiMetric end
|
||||
|
||||
function evaluate(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractString)
|
||||
x = count_map(qgram_iterator(s1, dist.q), qgram_iterator(s2, dist.q))
|
||||
x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
|
||||
evaluate(dist, x)
|
||||
end
|
||||
|
||||
|
@ -155,9 +162,9 @@ struct QGram <: AbstractQGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::QGram, countiterator)
|
||||
function evaluate(dist::QGram, count_dict)
|
||||
n = 0
|
||||
for (n1, n2) in countiterator
|
||||
for (n1, n2) in values(count_dict)
|
||||
n += abs(n1 - n2)
|
||||
end
|
||||
n
|
||||
|
@ -184,9 +191,9 @@ struct Cosine <: AbstractQGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Cosine, countiterator)
|
||||
function evaluate(dist::Cosine, count_dict)
|
||||
norm1, norm2, prodnorm = 0, 0, 0
|
||||
for (n1, n2) in countiterator
|
||||
for (n1, n2) in values(count_dict)
|
||||
norm1 += n1^2
|
||||
norm2 += n2^2
|
||||
prodnorm += n1 * n2
|
||||
|
@ -214,9 +221,9 @@ struct Jaccard <: AbstractQGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Jaccard, countiterator)
|
||||
function evaluate(dist::Jaccard, count_dict)
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in countiterator
|
||||
for (n1, n2) in values(count_dict)
|
||||
ndistinct1 += n1 > 0
|
||||
ndistinct2 += n2 > 0
|
||||
nintersect += (n1 > 0) & (n2 > 0)
|
||||
|
@ -244,9 +251,9 @@ struct SorensenDice <: AbstractQGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::SorensenDice, countiterator)
|
||||
function evaluate(dist::SorensenDice, count_dict)
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in countiterator
|
||||
for (n1, n2) in values(count_dict)
|
||||
ndistinct1 += n1 > 0
|
||||
ndistinct2 += n2 > 0
|
||||
nintersect += (n1 > 0) & (n2 > 0)
|
||||
|
@ -275,9 +282,9 @@ struct Overlap <: AbstractQGramDistance
|
|||
q::Int
|
||||
end
|
||||
|
||||
function evaluate(dist::Overlap, countiterator)
|
||||
function evaluate(dist::Overlap, count_dict)
|
||||
ndistinct1, ndistinct2, nintersect = 0, 0, 0
|
||||
for (n1, n2) in countiterator
|
||||
for (n1, n2) in values(count_dict)
|
||||
ndistinct1 += n1 > 0
|
||||
ndistinct2 += n2 > 0
|
||||
nintersect += (n1 > 0) & (n2 > 0)
|
Loading…
Reference in New Issue