# Benchmark script: times string-comparison routines from the StringDistances
# package on randomly generated strings.
using StringDistances, Random
# Seed the default RNG so the randomly generated inputs below are repeatable.
Random.seed!(2)
# Two collections of 500_000 random strings, each with a random length in 5:25.
x = map(Random.randstring, rand(5:25, 500_000))
y = map(Random.randstring, rand(5:25, 500_000))

"""
    f(t, x, y; min_score = 0.0)

Return an array whose entry at index `i` is
`compare(x[i], y[i], t; min_score = min_score)`, for every index `i` of `x`.

`y` must be indexable at every index of `x`; any further elements of `y` are
ignored. `t` and `min_score` are forwarded unchanged to `compare`.
"""
function f(t, x, y; min_score = 0.0)
    # `eachindex(x)` rather than `1:length(x)`, so `x` need not be 1-based.
    return [compare(x[i], y[i], t; min_score = min_score) for i in eachindex(x)]
end

"""
    g(dist, x, y)

Return an array whose entry at index `i` is `dist(x[i], y[i])`, for every index
`i` of `x`.

`dist` is any callable taking two arguments. `y` must be indexable at every index
of `x`; any further elements of `y` are ignored.
"""
function g(dist, x, y)
    # `eachindex(x)` rather than `1:length(x)`, so `x` need not be 1-based.
    return [dist(x[i], y[i]) for i in eachindex(x)]
end

# Time `f` (element-wise `compare`) over the string pairs in `x` and `y` for
# several distances. The comment after each call is a recorded timing
# (presumably seconds from earlier runs; "now" marks a more recent measurement).
@time f(Jaro(), x, y)
# 0.3s (now 0.37s)
@time f(Levenshtein(), x, y)
# 0.48s
@time f(Levenshtein(), x, y, min_score = 0.8)
# 0.11 (now 0.14)
@time f(DamerauLevenshtein(), x, y)
# 0.61s.
@time f(DamerauLevenshtein(), x, y, min_score = 0.8)
# 0.08 (now 0.09)
@time f(RatcliffObershelp(), x, y)
# 1.52s

# Time `findnearest` for the first string of `x` against all of `y`.
# The comment after each call is a recorded timing (presumably seconds).
@time findnearest(x[1], y, Levenshtein())
# 0.14
@time findnearest(x[1], y, DamerauLevenshtein())
# 0.15

@time findnearest(x[1], y, QGram(2))
# 0.75

# Time `findall` for the first string of `x` against all of `y`.
@time findall(x[1], y, Levenshtein())
# 0.05
@time findall(x[1], y, DamerauLevenshtein())
# 0.05
@time findall(x[1], y, Partial(DamerauLevenshtein()))
# 0.96
@time findall(x[1], y, QGram(2))
# 0.81
@time findall(x[1], y, TokenSort(DamerauLevenshtein()))
# 0.27 (now 0.32)
@time findall(x[1], y, TokenSet(DamerauLevenshtein()))
# 0.55
@time findall(x[1], y, TokenMax(DamerauLevenshtein()))
# 2.25 (now 3.6)
# NOTE(review): repeats the DamerauLevenshtein `findnearest` timing from above.
@time findnearest(x[1], y, DamerauLevenshtein())
# 0.15

# Smaller inputs (1000 random strings each) for the pairwise timings; this
# rebinds the globals `x` and `y` used by the earlier benchmarks.
x = map(Random.randstring, rand(5:25, 1000))
y = map(Random.randstring, rand(5:25, 1000))
# The comment after each call is a recorded timing (presumably seconds).
@time pairwise(Levenshtein(), x, y)
# 0.25 seconds
@time pairwise(QGram(2), x, y, preprocess = false)
# 2.126829
@time pairwise(QGram(2), x, y, preprocess = true)
# 0.12

#= R code (uses the stringdist package)
library(stringdist)
x <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
y <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
system.time(stringdist(x,y,method='lv', nthread = 1))
system.time(stringdist(x,y,method='dl', nthread = 1))
# 0.472
system.time(stringdist(x,y,method='jaccard', nthread = 1))
# 0.739
system.time(stringdist(x,y,method='cosine', nthread = 1))
system.time(stringdist(x,y,method='qgram', nthread = 1))
=#