using StringDistances, Unicode, Test, Random @testset "Pairwise" begin TestStrings1 = ["", "abc", "bc", "kitten"] TestStrings2 = ["mew", "ab"] TestStrings1missing = ["", "abc", "bc", missing] TestStrings2missing = ["mew", missing] for d in [Jaro(), Levenshtein(), DamerauLevenshtein(), RatcliffObershelp(), QGram(2), Cosine(2), Jaccard(2), SorensenDice(2), Overlap(2)] R = pairwise(d, TestStrings1) @test size(R) == (4, 4) # No distance on the diagonal, since comparing strings to themselves @test R[1, 1] == 0.0 @test R[2, 2] == 0.0 @test R[3, 3] == 0.0 @test R[4, 4] == 0.0 # Since the distance might be NaN: equalorNaN(x, y) = (x == y) || (isnan(x) && isnan(y)) # First row is comparing "" to the other strings, so: @test equalorNaN(R[1, 2], evaluate(d, "", "abc")) @test equalorNaN(R[1, 3], evaluate(d, "", "bc")) @test equalorNaN(R[1, 4], evaluate(d, "", "kitten")) # Second row is comparing "abc" to the other strings, so: @test equalorNaN(R[2, 3], evaluate(d, "abc", "bc")) @test equalorNaN(R[2, 4], evaluate(d, "abc", "kitten")) # Third row row is comparing "bc" to the other strings, so: @test equalorNaN(R[3, 4], evaluate(d, "bc", "kitten")) # Matrix is symmetric for i in 1:4 for j in (i+1):4 @test equalorNaN(R[i, j], R[j, i]) end end # Test also the assymetric version R2 = pairwise(d, TestStrings1, TestStrings2) @test size(R2) == (4, 2) @test equalorNaN(R2[1, 1], evaluate(d, "", "mew")) @test equalorNaN(R2[1, 2], evaluate(d, "", "ab")) @test equalorNaN(R2[2, 1], evaluate(d, "abc", "mew")) @test equalorNaN(R2[2, 2], evaluate(d, "abc", "ab")) @test equalorNaN(R2[3, 1], evaluate(d, "bc", "mew")) @test equalorNaN(R2[3, 2], evaluate(d, "bc", "ab")) @test equalorNaN(R2[4, 1], evaluate(d, "kitten", "mew")) @test equalorNaN(R2[4, 2], evaluate(d, "kitten", "ab")) R3 = pairwise(d, TestStrings2, TestStrings1) @test size(R3) == (2, 4) for i in 1:length(TestStrings1) for j in 1:length(TestStrings2) @test equalorNaN(R2[i, j], R3[j, i]) end end # Ensure same result if preprocessing for QGramDistances if d isa AbstractQGramDistance R4 = pairwise(d, TestStrings1; preprocess = true) @test typeof(R4) == typeof(R) @test size(R4) == size(R) for i in 1:size(R4, 1) for j in 1:size(R4, 2) @test equalorNaN(R4[i, j], R[i, j]) end end end # ensures missing R5 = pairwise(d, TestStrings1missing; preprocess = true) @test eltype(R5) == Union{result_type(d, String, String), Missing} end end