diff --git a/.gitignore b/.gitignore index d8fa608..485acc6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ -benchmark/ PC25 Manifest.toml draft diff --git a/benchmark/benchmark.jl b/benchmark/benchmark.jl new file mode 100644 index 0000000..f6ad543 --- /dev/null +++ b/benchmark/benchmark.jl @@ -0,0 +1,85 @@ + +using StringDistances, Random +Random.seed!(2) +x = map(Random.randstring, rand(5:25,500_000)) +y = map(Random.randstring, rand(5:25,500_000)) + +function f(t, x, y; min_score = 0.0) + [compare(x[i], y[i], t; min_score = min_score) for i in 1:length(x)] +end + +function g(dist, x, y) + [dist(x[i], y[i]) for i in 1:length(x)] +end + + + +@time f(Jaro(), x, y) +#0.3s (now 0.37s) +@time f(Levenshtein(), x, y) +# 0.48s +@time f(Levenshtein(), x, y, min_score = 0.8) +# 0.11 (now 0.14) +@time f(DamerauLevenshtein(), x, y) +# 0.61s. +@time f(DamerauLevenshtein(), x, y, min_score = 0.8) +# 0.08 (now 0.09) +@time f(RatcliffObershelp(), x, y) +# 1.52s + + + + +@time findnearest(x[1], y, Levenshtein()) +# 0.14 +@time findnearest(x[1], y, DamerauLevenshtein()) +# 0.15 + +@time findnearest(x[1], y, QGram(2)) +# 0.75 + + + +@time findall(x[1], y, Levenshtein()) +# 0.05 +@time findall(x[1], y, DamerauLevenshtein()) +# 0.05 +@time findall(x[1], y, Partial(DamerauLevenshtein())) +# 0.96 +@time findall(x[1], y, QGram(2)) +# 0.81 +@time findall(x[1], y, TokenSort(DamerauLevenshtein())) +# 0.27 (now 0.32) +@time findall(x[1], y, TokenSet(DamerauLevenshtein())) +# 0.55 +@time findall(x[1], y, TokenMax(DamerauLevenshtein())) +# 2.25 (now 3.6) +@time findnearest(x[1], y, DamerauLevenshtein()) +# 0.15 + +x = map(Random.randstring, rand(5:25,1000)) +y = map(Random.randstring, rand(5:25,1000)) +@time pairwise(Levenshtein(), x, y) +# 0.25 seconds +@time pairwise(QGram(2), x, y, preprocess = false) +# 2.126829 +@time pairwise(QGram(2), x, y, preprocess = true) +# 0.12 + + + + +#= Rcode +library(stringdist) +x <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse="")) +y <- sapply(sample(5:25,5 * 1e5,replace=TRUE), function(n) paste(sample(letters,n,replace=TRUE),collapse="")) +system.time(stringdist(x,y,method='lv', nthread = 1)) +system.time(stringdist(x,y,method='dl', nthread = 1)) +# 0.472 +system.time(stringdist(x,y,method='jaccard', nthread = 1)) +# 0.739 +system.time(stringdist(x,y,method='cosine', nthread = 1)) +system.time(stringdist(x,y,method='qgram', nthread = 1)) + +=# + diff --git a/test/performance/pairwise.jl b/benchmark/pairwise.jl similarity index 100% rename from test/performance/pairwise.jl rename to benchmark/pairwise.jl diff --git a/test/modifiers.jl b/test/modifiers.jl index 7c2f73d..9267a71 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -2,7 +2,33 @@ using StringDistances, Unicode, Test @testset "Modifiers" begin + # Partial + @test Partial(QGram(2))("martha", "marhta") == 6 + @test Partial(QGram(2))("martha", missing) === missing + @test Partial(Levenshtein())("martha", "marhta") == 2 + @test Partial(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5 + @test Partial(RatcliffObershelp())("martha", missing) === missing + # TokenSort + @test TokenSort(QGram(2))("martha", "marhta") == 6 + @test TokenSort(QGram(2))("martha", missing) === missing + @test TokenSort(Levenshtein())("martha", "marhta") == 2 + @test TokenSort(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5 + + # TokenSet + @test TokenSet(QGram(2))("martha", "marhta") == 6 + @test TokenSet(QGram(2))("martha", missing) === missing + @test TokenSet(Levenshtein())("martha", "marhta") == 2 + @test TokenSet(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5 + + # TokenMax + @test TokenMax(QGram(2))("martha", "marhta") ≈ 0.6 + @test TokenMax(QGram(2))("martha", missing) === missing + @test TokenMax(Levenshtein())("martha", "marhta") ≈ 1/3 + @test TokenMax(RatcliffObershelp())("martha", "marhta") ≈ 0.16666666 atol = 1e-5 +end + +@testset "Compare" begin # Qgram @test compare("", "abc", QGram(1)) ≈ 0.0 atol = 1e-4 @test compare("abc", "cba", QGram(1)) ≈ 1.0 atol = 1e-4 @@ -19,13 +45,9 @@ using StringDistances, Unicode, Test # Jaro @test compare("aüa", "aua", Hamming()) ≈ 2/3 @test compare("aüa", "aua", Jaro()) ≈ 0.77777777 atol = 1e-4 + @test compare("New York Yankees", "", Partial(Jaro())) ≈ 0.0 - #Levenshtein - compare("aüa", "aua", Levenshtein()) - compare("aüa", "aua", DamerauLevenshtein()) - @test compare("ab", "de", Partial(DamerauLevenshtein())) == 0 - @test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0 - # Winkler + # JaroWinkler @test compare("martha", "marhta", JaroWinkler()) ≈ 0.9611 atol = 1e-4 @test compare("dwayne", "duane", JaroWinkler()) ≈ 0.84 atol = 1e-4 @test compare("dixon", "dicksonx", JaroWinkler()) ≈ 0.81333 atol = 1e-4 @@ -34,12 +56,17 @@ using StringDistances, Unicode, Test @test compare("a", "a", JaroWinkler()) ≈ 1.0 atol = 1e-4 @test compare("abc", "xyz", JaroWinkler()) ≈ 0.0 atol = 1e-4 + #Levenshtein + compare("aüa", "aua", Levenshtein()) + @test compare("ok", missing, Levenshtein()) === missing + compare("aüa", "aua", DamerauLevenshtein()) + @test StringDistances.normalize(Partial(DamerauLevenshtein()))("ab", "cde") == 1.0 + @test compare("ab", "de", Partial(DamerauLevenshtein())) == 0 + # RatcliffObershelp @test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0 @test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5 compare("aüa", "aua", TokenMax(RatcliffObershelp())) - - @test compare("New York Yankees", "", Partial(Jaro())) ≈ 0.0 @test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0 @test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0 #@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444 @@ -47,28 +74,26 @@ using StringDistances, Unicode, Test @test compare("HSINCHUANG", "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8 @test compare("HSINCHUANG", "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8 @test compare("HSINCHUANG", "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888 - @test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0 + @test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0 @test compare(graphemes("New York Mets vs Atlanta Braves"), graphemes("Atlanta Braves vs New York Mets"), Partial(RatcliffObershelp())) ≈ compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", Partial(RatcliffObershelp())) @test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094 @test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0 @test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0 @test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0 @test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333 - @test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5 - @test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", Partial(RatcliffObershelp()))) == 7 - @test round(Int, 100 * compare("mariners", "mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 79 - @test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", Partial(RatcliffObershelp()))) == 88 - @test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSort(RatcliffObershelp()))) == 11 - @test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", RatcliffObershelp())) == 39 - @test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", Partial(RatcliffObershelp()))) == 88 - @test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(RatcliffObershelp()))) == 39 - @test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(Partial(RatcliffObershelp())))) == 88 + @test compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp()) ≈ 5 / 100 atol = 1e-2 + @test compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", Partial(RatcliffObershelp())) ≈ 7 / 100 atol = 1e-2 + @test compare("mariners", "mariner are playing tomorrow", TokenMax(RatcliffObershelp())) ≈ 79 / 100 atol = 1e-2 + @test compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", Partial(RatcliffObershelp())) ≈ 88 / 100 atol = 1e-2 + @test compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSort(RatcliffObershelp())) ≈ 11 / 100 atol = 1e-2 + @test compare("mariners", "are mariner playing tomorrow", RatcliffObershelp()) ≈ 39 / 100 atol = 1e-2 + @test compare("mariners", "are mariner playing tomorrow", Partial(RatcliffObershelp())) ≈ 88 / 100 atol = 1e-2 + @test compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(RatcliffObershelp())) ≈ 39 / 100 atol = 1e-2 + @test compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(Partial(RatcliffObershelp()))) ≈ 88 / 100 atol = 1e-2 # not exactly the same because tokenmax has uses the max of rounded tokenset etc - @test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 52 + @test compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenMax(RatcliffObershelp())) ≈ 52 / 100 atol = 1e-2 - # check missing - @test compare("ok", missing, Levenshtein()) === missing # check min strings = [ @@ -98,21 +123,21 @@ using StringDistances, Unicode, Test end end end +end - # check find_best and find_all + +@testset "Find*" begin + # findnearest @test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ("NewYork", 1) @test findnearest("New York", ["San Francisco", "NewYork", "Newark"], Levenshtein()) == ("NewYork", 2) @test findnearest("New York", ["Newark", "San Francisco", "NewYork"], Levenshtein()) == ("NewYork", 3) - - @test findnearest("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ("NewYork", 1) - @test findnearest("New York", ["NewYork", "Newark", "San Francisco"], StringDistances.normalize(QGram(2))) == ("NewYork", 1) - + @test findnearest("New York", ["NewYork", "Newark", "San Francisco"], QGram(2)) == ("NewYork", 1) + # findall @test findall("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == [1] @test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == [1, 2] @test findall("New York", ["NewYork", "Newark", "San Francisco"], Jaro(); min_score = 0.99) == Int[] - @test findall("New York", ["NewYork", "Newark", "San Francisco"], QGram(2); min_score = 0.99) == Int[] @@ -123,12 +148,4 @@ using StringDistances, Unicode, Test @test findall("New York", skipmissing(["NewYork", "Newark", missing]), Levenshtein()) == [1] @test findall("New York", skipmissing(Union{AbstractString, Missing}[missing, missing]), Levenshtein()) == [] end - -end - - -#= Python code -from fuzzywuzzy import fuzz -fuzz.ratio("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。") -fuzz.partial_ratio("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。") -=# \ No newline at end of file +end \ No newline at end of file diff --git a/test/pairwise.jl b/test/pairwise.jl index ae4010e..f7a8583 100644 --- a/test/pairwise.jl +++ b/test/pairwise.jl @@ -1,17 +1,15 @@ using StringDistances, Unicode, Test, Random -@testset "pairwise" begin -TestStrings1 = ["", "abc", "bc", "kitten"] -TestStrings2 = ["mew", "ab"] +@testset "Pairwise" begin -TestStrings1missing = ["", "abc", "bc", missing] -TestStrings2missing = ["mew", missing] + TestStrings1 = ["", "abc", "bc", "kitten"] + TestStrings2 = ["mew", "ab"] + TestStrings1missing = ["", "abc", "bc", missing] + TestStrings2missing = ["mew", missing] -@testset "pairwise" begin - for DT in [Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp, - QGram, Cosine, Jaccard, SorensenDice, Overlap] + for d in [Jaro(), Levenshtein(), DamerauLevenshtein(), RatcliffObershelp(), + QGram(2), Cosine(2), Jaccard(2), SorensenDice(2), Overlap(2)] - d = (DT <: AbstractQGramDistance) ? DT(2) : DT() R = pairwise(d, TestStrings1) @test size(R) == (4, 4) @@ -70,7 +68,7 @@ TestStrings2missing = ["mew", missing] end # Ensure same result if preprocessing for QGramDistances - if DT <: AbstractQGramDistance + if d isa AbstractQGramDistance R4 = pairwise(d, TestStrings1; preprocess = true) @test typeof(R4) == typeof(R) @test size(R4) == size(R) @@ -84,6 +82,4 @@ TestStrings2missing = ["mew", missing] R5 = pairwise(d, TestStrings1missing; preprocess = true) @test eltype(R5) == Union{result_type(d, String, String), Missing} end -end - -end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 42e8f0a..f8a7f7d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,5 +2,5 @@ using StringDistances using Test include("distances.jl") +include("pairwise.jl") include("modifiers.jl") -include("pairwise.jl") \ No newline at end of file