clean tests

pull/44/head
matthieugomez 2020-11-14 12:37:04 -08:00
parent f9675fd110
commit d650b62a40
6 changed files with 148 additions and 51 deletions

1
.gitignore vendored
View File

@ -1,4 +1,3 @@
benchmark/
PC25
Manifest.toml
draft

85
benchmark/benchmark.jl Normal file
View File

@ -0,0 +1,85 @@
using StringDistances, Random
# Fix the global RNG seed so every run benchmarks the same random strings.
Random.seed!(2)
# Two samples of 500_000 random strings each; the string lengths are drawn from 5:25
# first, then one string of each drawn length is generated in order.
x = [Random.randstring(len) for len in rand(5:25, 500_000)]
y = [Random.randstring(len) for len in rand(5:25, 500_000)]
"""
    f(t, x, y; min_score = 0.0)

Return a vector whose `i`-th entry is `compare(x[i], y[i], t; min_score = min_score)`,
for every index `i` of `x`.

# Arguments
- `t`: forwarded unchanged as the third positional argument of `compare`.
- `x`, `y`: indexable collections; `y` is indexed with the indices of `x`, so it must
  contain at least those indices.

# Keywords
- `min_score`: forwarded unchanged to `compare` (default `0.0`).
"""
function f(t, x, y; min_score = 0.0)
    # `eachindex(x)` rather than `1:length(x)`: same indices for a `Vector`, and it
    # stays correct for collections whose indices do not start at 1.
    return [compare(x[i], y[i], t; min_score = min_score) for i in eachindex(x)]
end
"""
    g(dist, x, y)

Return a vector whose `i`-th entry is `dist(x[i], y[i])`, for every index `i` of `x`.

# Arguments
- `dist`: any callable accepting two positional arguments.
- `x`, `y`: indexable collections; `y` is indexed with the indices of `x`, so it must
  contain at least those indices (extra trailing elements of `y` are ignored).
"""
function g(dist, x, y)
    # `eachindex(x)` rather than `1:length(x)`: same indices for a `Vector`, and it
    # stays correct for collections whose indices do not start at 1.
    return [dist(x[i], y[i]) for i in eachindex(x)]
end
# ---------------------------------------------------------------------------
# Timing section. Each `@time` call is followed by a comment recording a
# previously observed timing (units appear to be seconds — TODO confirm; the
# machine and package version behind the numbers are not recorded here).
# NOTE: `@time` on the first call of a method also measures compilation.
# ---------------------------------------------------------------------------

# Element-wise `compare` over the string pairs in `x` and `y`, via the helper `f`.
@time f(Jaro(), x, y)
#0.3s (now 0.37s)
@time f(Levenshtein(), x, y)
# 0.48s
@time f(Levenshtein(), x, y, min_score = 0.8)
# 0.11 (now 0.14)
@time f(DamerauLevenshtein(), x, y)
# 0.61s.
@time f(DamerauLevenshtein(), x, y, min_score = 0.8)
# 0.08 (now 0.09)
@time f(RatcliffObershelp(), x, y)
# 1.52s

# `findnearest` of the first string of `x` against all of `y`.
@time findnearest(x[1], y, Levenshtein())
# 0.14
@time findnearest(x[1], y, DamerauLevenshtein())
# 0.15
@time findnearest(x[1], y, QGram(2))
# 0.75

# `findall` of the first string of `x` against all of `y`, for plain and
# modifier-wrapped (Partial / TokenSort / TokenSet / TokenMax) distances.
@time findall(x[1], y, Levenshtein())
# 0.05
@time findall(x[1], y, DamerauLevenshtein())
# 0.05
@time findall(x[1], y, Partial(DamerauLevenshtein()))
# 0.96
@time findall(x[1], y, QGram(2))
# 0.81
@time findall(x[1], y, TokenSort(DamerauLevenshtein()))
# 0.27 (now 0.32)
@time findall(x[1], y, TokenSet(DamerauLevenshtein()))
# 0.55
@time findall(x[1], y, TokenMax(DamerauLevenshtein()))
# 2.25 (now 3.6)
# NOTE(review): the next call repeats the findnearest/DamerauLevenshtein
# benchmark already timed above — confirm whether the duplicate is intentional.
@time findnearest(x[1], y, DamerauLevenshtein())
# 0.15

# Rebind `x` and `y` to smaller samples (1000 random strings each, lengths drawn
# from 5:25) before timing `pairwise`.
x = map(Random.randstring, rand(5:25,1000))
y = map(Random.randstring, rand(5:25,1000))
@time pairwise(Levenshtein(), x, y)
# 0.25 seconds
# Same QGram(2) pairwise computation with `preprocess` switched off and on.
@time pairwise(QGram(2), x, y, preprocess = false)
# 2.126829
@time pairwise(QGram(2), x, y, preprocess = true)
# 0.12
#= R code
library(stringdist)
x <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
y <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse=""))
system.time(stringdist(x,y,method='lv', nthread = 1))
system.time(stringdist(x,y,method='dl', nthread = 1))
# 0.472
system.time(stringdist(x,y,method='jaccard', nthread = 1))
# 0.739
system.time(stringdist(x,y,method='cosine', nthread = 1))
system.time(stringdist(x,y,method='qgram', nthread = 1))
=#

View File

@ -2,7 +2,33 @@
using StringDistances, Unicode, Test
# Tests for the distance modifiers Partial, TokenSort, TokenSet and TokenMax.
# Each modifier is called directly on the pair ("martha", "marhta") with several
# wrapped distances, and with `missing` as the second argument, where the result
# is expected to be `missing`.
# Exact (`==`) comparison is used where the expected value is an integer;
# floating-point expectations are compared with `≈` (isapprox), with an absolute
# tolerance where the literal is a truncated decimal.
@testset "Modifiers" begin
    # Partial
    @test Partial(QGram(2))("martha", "marhta") == 6
    @test Partial(QGram(2))("martha", missing) === missing
    @test Partial(Levenshtein())("martha", "marhta") == 2
    @test Partial(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5
    @test Partial(RatcliffObershelp())("martha", missing) === missing

    # TokenSort
    @test TokenSort(QGram(2))("martha", "marhta") == 6
    @test TokenSort(QGram(2))("martha", missing) === missing
    @test TokenSort(Levenshtein())("martha", "marhta") == 2
    @test TokenSort(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5

    # TokenSet
    @test TokenSet(QGram(2))("martha", "marhta") == 6
    @test TokenSet(QGram(2))("martha", missing) === missing
    @test TokenSet(Levenshtein())("martha", "marhta") == 2
    @test TokenSet(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5

    # TokenMax
    @test TokenMax(QGram(2))("martha", "marhta") ≈ 0.6
    @test TokenMax(QGram(2))("martha", missing) === missing
    @test TokenMax(Levenshtein())("martha", "marhta") ≈ 1/3
    @test TokenMax(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5
end
@testset "Compare" begin
# Qgram
@test compare("", "abc", QGram(1)) ≈ 0.0 atol = 1e-4
@test compare("abc", "cba", QGram(1)) ≈ 1.0 atol = 1e-4
@ -19,13 +45,9 @@ using StringDistances, Unicode, Test
# Jaro
@test compare("aüa", "aua", Hamming()) ≈ 2/3
@test compare("aüa", "aua", Jaro()) ≈ 0.77777777 atol = 1e-4
@test compare("New York Yankees", "", Partial(Jaro())) ≈ 0.0
#Levenshtein
compare("aüa", "aua", Levenshtein())
compare("aüa", "aua", DamerauLevenshtein())
@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
@test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
# Winkler
# JaroWinkler
@test compare("martha", "marhta", JaroWinkler()) ≈ 0.9611 atol = 1e-4
@test compare("dwayne", "duane", JaroWinkler()) ≈ 0.84 atol = 1e-4
@test compare("dixon", "dicksonx", JaroWinkler()) ≈ 0.81333 atol = 1e-4
@ -34,12 +56,17 @@ using StringDistances, Unicode, Test
@test compare("a", "a", JaroWinkler()) ≈ 1.0 atol = 1e-4
@test compare("abc", "xyz", JaroWinkler()) ≈ 0.0 atol = 1e-4
#Levenshtein
compare("aüa", "aua", Levenshtein())
@test compare("ok", missing, Levenshtein()) === missing
compare("aüa", "aua", DamerauLevenshtein())
@test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0
@test compare("ab", "de", Partial(DamerauLevenshtein())) == 0
# RatcliffObershelp
@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
compare("aüa", "aua", TokenMax(RatcliffObershelp()))
@test compare("New York Yankees", "", Partial(Jaro())) ≈ 0.0
@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
@test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0
#@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444
@ -47,28 +74,26 @@ using StringDistances, Unicode, Test
@test compare("HSINCHUANG", "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8
@test compare("HSINCHUANG", "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8
@test compare("HSINCHUANG", "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888
@test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0
@test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0
@test compare(graphemes("New York Mets vs Atlanta Braves"), graphemes("Atlanta Braves vs New York Mets"), Partial(RatcliffObershelp())) ≈ compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", Partial(RatcliffObershelp()))
@test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094
@test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0
@test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0
@test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0
@test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", Partial(RatcliffObershelp()))) == 7
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 79
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", Partial(RatcliffObershelp()))) == 88
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSort(RatcliffObershelp()))) == 11
@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", RatcliffObershelp())) == 39
@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", Partial(RatcliffObershelp()))) == 88
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(RatcliffObershelp()))) == 39
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(Partial(RatcliffObershelp())))) == 88
@test compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp()) ≈ 5 / 100 atol = 1e-2
@test compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", Partial(RatcliffObershelp())) ≈ 7 / 100 atol = 1e-2
@test compare("mariners", "mariner are playing tomorrow", TokenMax(RatcliffObershelp())) ≈ 79 / 100 atol = 1e-2
@test compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", Partial(RatcliffObershelp())) ≈ 88 / 100 atol = 1e-2
@test compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSort(RatcliffObershelp())) ≈ 11 / 100 atol = 1e-2
@test compare("mariners", "are mariner playing tomorrow", RatcliffObershelp()) ≈ 39 / 100 atol = 1e-2
@test compare("mariners", "are mariner playing tomorrow", Partial(RatcliffObershelp())) ≈ 88 / 100 atol = 1e-2
@test compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(RatcliffObershelp())) ≈ 39 / 100 atol = 1e-2
@test compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(Partial(RatcliffObershelp()))) ≈ 88 / 100 atol = 1e-2
# not exactly the same because TokenMax uses the max of the rounded TokenSet etc.
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 52
@test compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenMax(RatcliffObershelp())) ≈ 52 / 100 atol = 1e-2
# check missing
@test compare("ok", missing, Levenshtein()) === missing
# check min
strings = [
@ -98,21 +123,21 @@ using StringDistances, Unicode, Test
end
end
end
end
# check find_best and find_all
@testset "Find*" begin
# findnearest
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ("NewYork", 1)
@test findnearest("New York", ["San Francisco", "NewYork", "Newark"], Levenshtein()) == ("NewYork", 2)
@test findnearest("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3)
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1)
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], StringDistances.normalize(QGram(2))) == ("NewYork", 1)
@test findnearest("New York", ["NewYork", "Newark", "San Francisco"], QGram(2)) == ("NewYork", 1)
# findall
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1]
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == [1, 2]
@test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro(); min_score = 0.99) == Int[]
@test findall("New York", ["NewYork", "Newark", "San Francisco"], QGram(2); min_score = 0.99) == Int[]
@ -123,12 +148,4 @@ using StringDistances, Unicode, Test
@test findall("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == [1]
@test findall("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == []
end
end
#= Python code
from fuzzywuzzy import fuzz
fuzz.ratio("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。")
fuzz.partial_ratio("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。")
=#
end

View File

@ -1,17 +1,15 @@
using StringDistances, Unicode, Test, Random
@testset "pairwise" begin
TestStrings1 = ["", "abc", "bc", "kitten"]
TestStrings2 = ["mew", "ab"]
@testset "Pairwise" begin
TestStrings1missing = ["", "abc", "bc", missing]
TestStrings2missing = ["mew", missing]
TestStrings1 = ["", "abc", "bc", "kitten"]
TestStrings2 = ["mew", "ab"]
TestStrings1missing = ["", "abc", "bc", missing]
TestStrings2missing = ["mew", missing]
@testset "pairwise" begin
for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp,
QGram, Cosine, Jaccard, SorensenDice, Overlap]
for d in [Jaro(), Levenshtein(), DamerauLevenshtein(), RatcliffObershelp(),
QGram(2), Cosine(2), Jaccard(2), SorensenDice(2), Overlap(2)]
d = (DT <: AbstractQGramDistance) ? DT(2) : DT()
R = pairwise(d, TestStrings1)
@test size(R) == (4, 4)
@ -70,7 +68,7 @@ TestStrings2missing = ["mew", missing]
end
# Ensure same result if preprocessing for QGramDistances
if DT <: AbstractQGramDistance
if d isa AbstractQGramDistance
R4 = pairwise(d, TestStrings1; preprocess = true)
@test typeof(R4) == typeof(R)
@test size(R4) == size(R)
@ -84,6 +82,4 @@ TestStrings2missing = ["mew", missing]
R5 = pairwise(d, TestStrings1missing; preprocess = true)
@test eltype(R5) == Union{result_type(d, String, String), Missing}
end
end
end
end

View File

@ -2,5 +2,5 @@ using StringDistances
using Test
include("distances.jl")
include("pairwise.jl")
include("modifiers.jl")
include("pairwise.jl")