From 16cf5abb94e172ad4ff90773caef87a64f46afc1 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Thu, 12 Dec 2019 09:38:20 -0500 Subject: [PATCH] improve support for missings --- .github/workflows/CompatHelper.yml | 24 +++ Project.toml | 16 +- src/StringDistances.jl | 11 +- src/compare.jl | 63 +++---- src/edit.jl | 24 ++- src/find.jl | 1 - src/qgram.jl | 10 +- src/utils.jl | 5 +- test/distances.jl | 269 +++++++++++++++-------------- test/modifiers.jl | 255 +++++++++++---------------- 10 files changed, 326 insertions(+), 352 deletions(-) create mode 100644 .github/workflows/CompatHelper.yml diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..68dbe39 --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,24 @@ +name: CompatHelper + +on: + schedule: + - cron: '00 * * * *' + +jobs: + CompatHelper: + runs-on: ${{ matrix.os }} + strategy: + matrix: + julia-version: [1.2.0] + julia-arch: [x86] + os: [ubuntu-latest] + steps: + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.julia-version }} + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: julia -e 'using CompatHelper; CompatHelper.main()' diff --git a/Project.toml b/Project.toml index e6451d6..394c5e8 100644 --- a/Project.toml +++ b/Project.toml @@ -3,12 +3,18 @@ uuid = "88034a9c-02f8-509d-84a9-84ec65e18404" version = "0.5.0" [deps] -DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" [compat] -DataStructures = "0.17" -Distances = "0.8" julia = "1" +DataStructures = "0.14, 0.15, 0.16, 0.17" +Distances = "0.2, 0.3, 0.4, 0.4, 0.6, 0.7, 0.8" + +[extras] +Test = 
"8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] + + diff --git a/src/StringDistances.jl b/src/StringDistances.jl index fe33542..40b72d4 100755 --- a/src/StringDistances.jl +++ b/src/StringDistances.jl @@ -1,13 +1,17 @@ module StringDistances + + +using Distances +import Distances: evaluate, result_type +using DataStructures # for SortedSet in TokenSort + ############################################################################## ## ## Export ## ############################################################################## -using DataStructures -import Base: eltype, length, iterate, ==, hash, isless, convert, show, @deprecate -import Distances: evaluate, Hamming, hamming, PreMetric, result_type, SemiMetric + export evaluate, compare, @@ -46,7 +50,6 @@ function result_type(m::Union{Hamming, Jaro, Levenshtein, DamerauLevenshtein, Ra typeof(evaluate(m, oneunit(a), oneunit(b))) end - end ############################################################################## diff --git a/src/compare.jl b/src/compare.jl index 9176554..3a26796 100755 --- a/src/compare.jl +++ b/src/compare.jl @@ -10,31 +10,34 @@ compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist` """ -function compare(s1::AbstractString, s2::AbstractString, dist::Hamming; min_score = 0.0) +function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Hamming; min_score = 0.0) + (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len2 == 0 && return 1.0 1.0 - evaluate(dist, s1, s2) / len2 end -function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0) +function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0) + (ismissing(s1) | ismissing(s2)) && return missing 1.0 - evaluate(dist, s1, s2) end -function 
compare(s1::AbstractString, s2::AbstractString, dist::AbstractQGramDistance; min_score = 0.0) +function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::AbstractQGramDistance; min_score = 0.0) + (ismissing(s1) | ismissing(s2)) && return missing # When string length < q for qgram distance, returns s1 == s2 s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len1 <= dist.q - 1 && return convert(Float64, s1 == s2) if typeof(dist) <: QGram - 1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2) + 1.0 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2) else - 1 - evaluate(dist, s1, s2) + 1.0 - evaluate(dist, s1, s2) end end - -function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0) +function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0) + (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len2 == 0 && return 1.0 @@ -48,10 +51,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtei end end - - - -@deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist) +@deprecate compare(dist::PreMetric, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) compare(s1, s2, dist) ############################################################################## ## @@ -76,7 +76,8 @@ end Winkler(x) = Winkler(x, 0.1, 0.7, 4) # hard to use min_score because of whether there is boost or not in the end -function compare(s1::AbstractString, s2::AbstractString, dist::Winkler) +function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Winkler; min_score = 0.0) + (ismissing(s1) | ismissing(s2)) && return missing l = remove_prefix(s1, s2, dist.l)[1] # cannot do min_score because of boosting threshold score 
= compare(s1, s2, dist.dist) @@ -103,7 +104,8 @@ struct Partial{T <: PreMetric} <: PreMetric dist::T end -function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_score = 0.0) +function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Partial; min_score = 0.0) + (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score) @@ -117,8 +119,8 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_scor return out end -function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp}; - min_score = 0.0) +function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Partial{RatcliffObershelp}; min_score = 0.0) + (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) len1 == len2 && return compare(s1, s2, dist.dist) @@ -134,7 +136,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffO s2_start += len2 - s2_end s2_end += len2 - s2_end end - i2_start = nextind(s2, 0, s2_start) + i2_start = nextind(s2, 0, s2_start) i2_end = nextind(s2, 0, s2_end) curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp()) out = max(out, curr) @@ -157,7 +159,8 @@ struct TokenSort{T <: PreMetric} <: PreMetric dist::T end -function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort; min_score = 0.0) +function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::TokenSort; min_score = 0.0) + (ismissing(s1) | ismissing(s2)) && return missing s1 = join(sort!(split(s1)), " ") s2 = join(sort!(split(s2)), " ") compare(s1, s2, dist.dist; min_score = min_score) @@ -178,7 +181,8 @@ struct TokenSet{T <: PreMetric} <: PreMetric dist::T end -function compare(s1::AbstractString, s2::AbstractString, 
dist::TokenSet; min_score = 0.0) +function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::TokenSet; min_score = 0.0) + (ismissing(s1) | ismissing(s2)) && return missing v1 = SortedSet(split(s1)) v2 = SortedSet(split(s2)) v0 = intersect(v1, v2) @@ -209,7 +213,8 @@ struct TokenMax{T <: PreMetric} <: PreMetric dist::T end -function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0) +function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::TokenMax; min_score = 0.0) + (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) dist0 = compare(s1, s2, dist.dist; min_score = min_score) @@ -239,22 +244,4 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_sco min_score = min_score / unbase_scale) return max(dist0, dist1, dist2) end -end - - -############################################################################## -## -## Missing Values -## -############################################################################## - -function compare(s1::AbstractString, ::Missing, dist::PreMetric; min_score = nothing) - missing -end -function compare(::Missing, s2::AbstractString, dist::PreMetric; min_score = nothing) - missing -end -function compare(::Missing, ::Missing, dist::PreMetric; min_score = nothing) - missing -end - +end \ No newline at end of file diff --git a/src/edit.jl b/src/edit.jl index 5384f90..1ceb7b3 100755 --- a/src/edit.jl +++ b/src/edit.jl @@ -4,7 +4,7 @@ ## Hamming ## ############################################################################## -function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString) +function evaluate(dist::Hamming, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) current = abs(length(s2) - length(s1)) for (ch1, ch2) in zip(s1, s2) current += ch1 != ch2 @@ -12,6 +12,11 @@ function evaluate(dist::Hamming, 
s1::AbstractString, s2::AbstractString) return current end +evaluate(dist::Hamming, s1::Missing, s2::AbstractString) = missing +evaluate(dist::Hamming, s1::AbstractString, s2::Missing) = missing + + + ############################################################################## ## ## Jaro @@ -33,7 +38,8 @@ where ``m`` is the number of matching characters and struct Jaro <: SemiMetric end ## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html -function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) +function evaluate(dist::Jaro, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) + (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) # if both are empty, m = 0 so should be 1.0 according to wikipedia. Add this line so that not the case @@ -85,8 +91,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString) t += ch2 != iterate(s1, i1_match[i1])[1] end end - current = 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0 - return current + return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0 end ############################################################################## @@ -108,8 +113,8 @@ The Levenshtein distance is the minimum number of operations (consisting of inse struct Levenshtein <: SemiMetric end ## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html -function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString; - max_dist = nothing) +function evaluate(dist::Levenshtein, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing) + (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 @@ -163,8 +168,8 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting struct DamerauLevenshtein <: SemiMetric 
end ## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html -function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString; - max_dist = nothing) +function evaluate(dist::DamerauLevenshtein, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing) + (ismissing(s1) | ismissing(s2)) && return missing s1, s2 = reorder(s1, s2) len1, len2 = length(s1), length(s2) max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1 @@ -250,7 +255,8 @@ The distance between two strings is defined as one minus the number of matching """ struct RatcliffObershelp <: PreMetric end -function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString) +function evaluate(dist::RatcliffObershelp, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) + (ismissing(s1) | ismissing(s2)) && return missing n_matched = sum(last.(matching_blocks(s1, s2))) len1, len2 = length(s1), length(s2) len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2) diff --git a/src/find.jl b/src/find.jl index 3519a6d..52aeb04 100755 --- a/src/find.jl +++ b/src/find.jl @@ -18,7 +18,6 @@ function find_best(s1::AbstractString, iter_s2, dist::PreMetric; min_score = 0.0 end - """ find_all(s1::AbstractString, iter, dist::PreMetric; min_score = 0.8) `find_all` returns the vector with all the elements of `iter` that have a similarity score higher than `min_score` according to the distance `dist`. 
diff --git a/src/qgram.jl b/src/qgram.jl index 355d8b4..8e47dbc 100755 --- a/src/qgram.jl +++ b/src/qgram.jl @@ -32,7 +32,7 @@ Return an iterator that iterates on the QGram of the string ```julia using StringDistances for x in qgram("hello", 2) - @show x + println(x) end ``` """ @@ -131,7 +131,8 @@ end ############################################################################## abstract type AbstractQGramDistance <: SemiMetric end -function evaluate(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractString) +function evaluate(dist::AbstractQGramDistance, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) + (ismissing(s1) | ismissing(s2)) && return missing x = count_map(qgram(s1, dist.q), qgram(s2, dist.q)) evaluate(dist, x) end @@ -141,11 +142,6 @@ end ## q-gram ## ############################################################################## -""" -For an AbstractString s, denote v(s) the vector on the space of q-grams of length N, that contains the number of times a q-gram appears in s -The q-gram distance is ||v(s1) - v(s2)|| -""" - """ QGram(q::Int) diff --git a/src/utils.jl b/src/utils.jl index 6acc138..3c6a018 100755 --- a/src/utils.jl +++ b/src/utils.jl @@ -5,14 +5,13 @@ struct StringWithLength{T<:AbstractString} <: AbstractString l::Int end string_with_length(s::AbstractString) = StringWithLength(s, length(s)) -string_with_length(s::StringWithLength) = s Base.length(s::StringWithLength) = s.l Base.iterate(s::StringWithLength, i::Integer = firstindex(s.s)) = iterate(s.s, i) -Base.isequal(s1::StringWithLength, s2::AbstractString) = isequal(s.s1, s2) -Base.isequal(s1::AbstractString, s2::StringWithLength) = isequal(s1, s2.s) Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n) Base.ncodeunits(s::StringWithLength) = ncodeunits(s.s) Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i) + + function reorder(s1::AbstractString, s2::AbstractString) s1 = string_with_length(s1) s2 = string_with_length(s2) 
diff --git a/test/distances.jl b/test/distances.jl index b746992..58b0d23 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -3,147 +3,162 @@ using StringDistances, Test @testset "Distances" begin -@testset "Levenshtein" begin - @test evaluate(Levenshtein(), "", "") == 0 - @test evaluate(Levenshtein(), "abc", "") == 3 - @test evaluate(Levenshtein(), "", "abc") == 3 - @test evaluate(Levenshtein(), "bc", "abc") == 1 - @test evaluate(Levenshtein(), "kitten", "sitting") == 3 - @test evaluate(Levenshtein(), "saturday", "sunday") == 3 - @test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4 - @test evaluate(Levenshtein(), "alborgów", "amoniak") == 6 - @test result_type(Levenshtein(), "hello", "world") == Int - @inferred evaluate(Levenshtein(), "", "") -end + @testset "Levenshtein" begin + @test evaluate(Levenshtein(), "", "") == 0 + @test evaluate(Levenshtein(), "abc", "") == 3 + @test evaluate(Levenshtein(), "", "abc") == 3 + @test evaluate(Levenshtein(), "bc", "abc") == 1 + @test evaluate(Levenshtein(), "kitten", "sitting") == 3 + @test evaluate(Levenshtein(), "saturday", "sunday") == 3 + @test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4 + @test evaluate(Levenshtein(), "alborgów", "amoniak") == 6 + @test result_type(Levenshtein(), "hello", "world") == Int + @test ismissing(evaluate(Levenshtein(), "", missing)) + @inferred evaluate(Levenshtein(), "", "") + end -@testset "DamerauLevenshtein" begin - @test evaluate(DamerauLevenshtein(), "", "") == 0 - @test evaluate(DamerauLevenshtein(), "abc", "") == 3 - @test evaluate(DamerauLevenshtein(), "bc", "abc") == 1 - @test evaluate(DamerauLevenshtein(), "fuor", "four") == 1 - @test evaluate(DamerauLevenshtein(), "abcd", "acb") == 2 - @test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17 - @test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2 - @test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2 - @test result_type(DamerauLevenshtein(), 
"hello", "world") == Int - @inferred evaluate(DamerauLevenshtein(), "", "") -end + @testset "DamerauLevenshtein" begin + @test evaluate(DamerauLevenshtein(), "", "") == 0 + @test evaluate(DamerauLevenshtein(), "abc", "") == 3 + @test evaluate(DamerauLevenshtein(), "bc", "abc") == 1 + @test evaluate(DamerauLevenshtein(), "fuor", "four") == 1 + @test evaluate(DamerauLevenshtein(), "abcd", "acb") == 2 + @test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17 + @test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2 + @test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2 + @test result_type(DamerauLevenshtein(), "hello", "world") == Int + @test ismissing(evaluate(DamerauLevenshtein(), "", missing)) + @inferred evaluate(DamerauLevenshtein(), "", "") + end -@testset "Hamming" begin - @test evaluate(Hamming(), "", "") == 0 - @test evaluate(Hamming(), "", "abc") == 3 - @test evaluate(Hamming(), "abc", "abc") == 0 - @test evaluate(Hamming(), "acc", "abc") == 1 - @test evaluate(Hamming(), "abcd", "abc") == 1 - @test evaluate(Hamming(), "abc", "abcd") == 1 - @test evaluate(Hamming(), "testing", "this is a test") == 13 - @test evaluate(Hamming(), "saturday", "sunday") == 7 - @test result_type(Hamming(), "hello", "world") == Int - @inferred evaluate(Hamming(), "", "") -end + @testset "Hamming" begin + @test evaluate(Hamming(), "", "") == 0 + @test evaluate(Hamming(), "", "abc") == 3 + @test evaluate(Hamming(), "abc", "abc") == 0 + @test evaluate(Hamming(), "acc", "abc") == 1 + @test evaluate(Hamming(), "abcd", "abc") == 1 + @test evaluate(Hamming(), "abc", "abcd") == 1 + @test evaluate(Hamming(), "testing", "this is a test") == 13 + @test evaluate(Hamming(), "saturday", "sunday") == 7 + @test result_type(Hamming(), "hello", "world") == Int + @test ismissing(evaluate(Hamming(), "", missing)) + @inferred evaluate(Hamming(), "", "") + end -@testset "QGram" begin - @test evaluate(QGram(1), "abc", "abc") == 0 - @test evaluate(QGram(1), 
"", "abc") == 3 - @test evaluate(QGram(1), "abc", "cba") == 0 - @test evaluate(QGram(1), "abc", "ccc") == 4 - @test result_type(QGram(1), "hello", "world") == Int - @inferred evaluate(QGram(1), "", "") -end + @testset "QGram" begin + @test evaluate(QGram(1), "abc", "abc") == 0 + @test evaluate(QGram(1), "", "abc") == 3 + @test evaluate(QGram(1), "abc", "cba") == 0 + @test evaluate(QGram(1), "abc", "ccc") == 4 + @test evaluate(QGram(4), "aü☃", "aüaüafs") == 4 + @test evaluate( QGram(2), SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2 + @test result_type(QGram(1), "hello", "world") == Int + @test ismissing(evaluate(QGram(1), "", missing)) + @inferred evaluate(QGram(1), "", "") + end -@testset "Cosine" begin - @test isnan(evaluate(Cosine(2), "", "abc")) - @test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4 - @test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4 - @test result_type(Cosine(2), "hello", "world") == typeof(float(1)) - @inferred evaluate(Cosine(2), "", "") - @inferred evaluate(Cosine(2), "abc", "ccc") -end + @testset "Cosine" begin + @test isnan(evaluate(Cosine(2), "", "abc")) + @test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4 + @test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4 + @test result_type(Cosine(2), "hello", "world") == typeof(float(1)) + @test ismissing(evaluate(Cosine(2), "", missing)) + @inferred evaluate(Cosine(2), "", "") + end -@testset "Jaccard" begin - @test evaluate(Jaccard(1), "", "abc") ≈ 1.0 - @test evaluate(Jaccard(1), "abc", "ccc") ≈ .666666 atol = 1e-4 - @test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4 - @test result_type(Jaccard(1), "hello", "world") == typeof(float(1)) - @inferred evaluate(Jaccard(1), "", "") -end + @testset "Jaccard" begin + @test evaluate(Jaccard(1), "", "abc") ≈ 1.0 + @test evaluate(Jaccard(1), "abc", "ccc") ≈ .666666 atol = 1e-4 + @test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4 + @test result_type(Jaccard(1), "hello", "world") 
== typeof(float(1)) + @test ismissing(evaluate(Jaccard(1), "", missing)) + @inferred evaluate(Jaccard(1), "", "") + end -@testset "SorensenDice" begin - @test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4 - @test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4 - @test result_type(SorensenDice(1), "hello", "world") == typeof(float(1)) - @inferred evaluate(SorensenDice(1), "", "") -end + @testset "SorensenDice" begin + @test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4 + @test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4 + @test result_type(SorensenDice(1), "hello", "world") == typeof(float(1)) + @test ismissing(evaluate(SorensenDice(1), "", missing)) + @inferred evaluate(SorensenDice(1), "", "") + end -@testset "Overlap" begin - @test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4 - @test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4 - @test result_type(Overlap(1), "hello", "world") == typeof(float(1)) - @inferred evaluate(Overlap(1), "", "") -end + @testset "Overlap" begin + @test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4 + @test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4 + @test result_type(Overlap(1), "hello", "world") == typeof(float(1)) + @test ismissing(evaluate(Overlap(1), "", missing)) + @inferred evaluate(Overlap(1), "", "") + end -@testset "RatcliffObershelp" begin - @test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154 - @test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579 - @test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666 - @test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0 - @test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963 - @test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869 - @test evaluate(RatcliffObershelp(), "New York Mets", "New York 
Yankees") ≈ 0.24137931034482762 - @test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1)) - @inferred evaluate(RatcliffObershelp(), "", "") -end + @testset "RatcliffObershelp" begin + @test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154 + @test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579 + @test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666 + @test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0 + @test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963 + @test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869 + @test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762 + @test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1)) + @test ismissing(evaluate(RatcliffObershelp(), "", missing)) + @inferred evaluate(RatcliffObershelp(), "", "") + end -@testset "Jaro" begin - @test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547 - @test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777 - @test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777 - @test result_type(Jaro(), "hello", "world") == typeof(float(1)) -end + @testset "Jaro" begin + @test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547 + @test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777 + @test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777 + @test result_type(Jaro(), "hello", "world") == typeof(float(1)) + end -strings = [ -("martha", "marhta"), -("dwayne", "duane") , -("dixon", "dicksonx"), -("william", "williams"), -("", "foo"), -("a", "a"), -("abc", "xyz"), -("abc", "ccc"), -("kitten", "sitting"), -("saturday", "sunday"), -("hi, my name is", "my name is"), -("alborgów", "amoniak"), -("cape sand recycling ", "edith ann graham"), -( "jellyifhs", "jellyfish"), -("ifhs", "fish"), -("leia", "leela"), -] + 
strings = [ + ("martha", "marhta"), + ("dwayne", "duane") , + ("dixon", "dicksonx"), + ("william", "williams"), + ("", "foo"), + ("a", "a"), + ("abc", "xyz"), + ("abc", "ccc"), + ("kitten", "sitting"), + ("saturday", "sunday"), + ("hi, my name is", "my name is"), + ("alborgów", "amoniak"), + ("cape sand recycling ", "edith ann graham"), + ( "jellyifhs", "jellyfish"), + ("ifhs", "fish"), + ("leia", "leela"), + ] -solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]), - (DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]), - (Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.2805556 0.2285714 0.48809524 0.3916667 0.07407407 0.16666667 0.21666667]), - (QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]), - (QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]), - (Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]), - (Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]), - (Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249])) -# Test with R package StringDist -for x in solutions - t, solution = x - for i in 1:length(solution) - if isnan(evaluate(t, strings[i]...)) - @test isnan(solution[i]) - else - @test evaluate(t, strings[i]...) 
≈ solution[i] atol = 1e-4 + solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]), + (DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]), + (Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.2805556 0.2285714 0.48809524 0.3916667 0.07407407 0.16666667 0.21666667]), + (QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]), + (QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]), + (Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]), + (Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]), + (Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249])) + # Test with R package StringDist + for x in solutions + t, solution = x + for i in 1:length(solution) + if isnan(evaluate(t, strings[i]...)) + @test isnan(solution[i]) + else + @test evaluate(t, strings[i]...) 
≈ solution[i] atol = 1e-4 + end end end + # test RatcliffObershelp + solution = [83, 73, 62, 93, 0, 100, 0, 33, 62, 71, 83, 27, 33, 78, 50, 67] + for i in eachindex(strings) + @test round(Int, (1 - evaluate(RatcliffObershelp(), strings[i]...)) * 100) ≈ solution[i] atol = 1e-4 + end end - #= R test library(stringdist) strings = matrix(data = c( @@ -174,13 +189,6 @@ stringdist(strings[1,], strings[2,], method = "qgram", q = 1) - -# test RatcliffObershelp -solution = [83, 73, 62, 93, 0, 100, 0, 33, 62, 71, 83, 27, 33, 78, 50, 67] -for i in eachindex(strings) - @test round(Int, (1 - evaluate(RatcliffObershelp(), strings[i]...)) * 100) ≈ solution[i] atol = 1e-4 -end - #= Fuzzywuzzy usesRatcliffObershelp if python-Levenshtein not installed, fuzzywuzzy uses RatcliffObershelp) from fuzzywuzzy import fuzz strings = [ @@ -205,4 +213,3 @@ for x in strings: print(fuzz.ratio(x[0], x[1])) =# -end diff --git a/test/modifiers.jl b/test/modifiers.jl index 74063da..f75c4a6 100644 --- a/test/modifiers.jl +++ b/test/modifiers.jl @@ -3,170 +3,117 @@ using StringDistances, Test @testset "Modifiers" begin -# Compare -@test compare("", "abc", Hamming()) ≈ 0.0 atol = 1e-4 -@test compare("acc", "abc", Hamming()) ≈ 2/3 atol = 1e-4 -@test compare("saturday", "sunday", Hamming()) ≈ 1/8 atol = 1e-4 + # Hamming + @test compare("", "abc", Hamming()) ≈ 0.0 atol = 1e-4 + @test compare("acc", "abc", Hamming()) ≈ 2/3 atol = 1e-4 + @test compare("saturday", "sunday", Hamming()) ≈ 1/8 atol = 1e-4 + @test compare("New York Yankees", "Yankees", Partial(Hamming())) ≈ 1 + @test compare("New York Yankees", "", Partial(Hamming())) ≈ 1 + compare("aüa", "aua", Hamming()) -@test compare("", "abc", QGram(1)) ≈ 0.0 atol = 1e-4 -@test compare("abc", "cba", QGram(1)) ≈ 1.0 atol = 1e-4 -@test compare("abc", "ccc", QGram(1)) ≈ 1/3 atol = 1e-4 + # Qgram + @test compare("", "abc", QGram(1)) ≈ 0.0 atol = 1e-4 + @test compare("abc", "cba", QGram(1)) ≈ 1.0 atol = 1e-4 + @test compare("abc", "ccc", QGram(1)) ≈ 1/3 atol = 
1e-4 + compare("aüa", "aua", TokenMax(QGram(2))) + @test compare("", "abc", Jaccard(2)) ≈ 0.0 atol = 1e-4 + @test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4 + @test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4 + @test compare("aa", "aa ", Partial(Jaccard(2))) ≈ 1.0 + @test compare("martha", "martha", Cosine(2)) ≈ 1.0 atol = 1e-4 + @test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol = 1e-4 + @test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol = 1e-4 -@test compare("", "abc", Jaccard(2)) ≈ 0.0 atol = 1e-4 + # Jaro + compare("aüa", "aua", Jaro()) -@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4 -@test compare("martha", "martha", Cosine(2)) ≈ 1.0 atol = 1e-4 -@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4 -@test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol = 1e-4 -@test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol = 1e-4 + #Levenshtein + compare("aüa", "aua", Levenshtein()) + compare("aüa", "aua", DamerauLevenshtein()) + + # Winkler + @test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.9611 atol = 1e-4 + @test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.84 atol = 1e-4 + @test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.81333 atol = 1e-4 + @test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.975 atol = 1e-4 + @test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4 + @test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 1.0 atol = 1e-4 + @test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4 + + # RatcliffObershelp + @test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0 + @test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5 + compare("aüa", "aua", TokenMax(RatcliffObershelp())) + + @test compare("New York Yankees", "Yankees", 
Partial(RatcliffObershelp())) ≈ 1.0 + @test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0 + @test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444 + @test compare("HSINCHUANG", "SINJHUAN", Partial(RatcliffObershelp())) ≈ 0.875 + @test compare("HSINCHUANG", "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8 + @test compare("HSINCHUANG", "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8 + @test compare("HSINCHUANG", "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888 + @test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0 + @test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094 + @test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0 + @test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0 + @test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0 + @test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333 + @test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5 + @test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", Partial(RatcliffObershelp()))) == 7 + @test round(Int, 100 * compare("mariners", "mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 79 + @test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", Partial(RatcliffObershelp()))) == 88 + @test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are 
playing tomorrow", TokenSort(RatcliffObershelp()))) == 11 + @test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", RatcliffObershelp())) == 39 + @test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", Partial(RatcliffObershelp()))) == 88 + @test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(RatcliffObershelp()))) == 39 + @test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(Partial(RatcliffObershelp())))) == 88 + # not exactly the same because tokenmax uses the max of rounded tokenset etc + @test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 52 -# Winkler -@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.9611 atol = 1e-4 -@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.84 atol = 1e-4 -@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.81333 atol = 1e-4 -@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.975 atol = 1e-4 -@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4 -@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 1.0 atol = 1e-4 -@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4 + # check missing + @test compare("ok", missing, Levenshtein()) === missing + + # check min + strings = [ + ("martha", "marhta"), + ("dwayne", "duane") , + ("dixon", "dicksonx"), + ("william", "williams"), + ("", "foo"), + ("a", "a"), + ("abc", "xyz"), + ("abc", "ccc"), + ("kitten", "sitting"), + ("saturday", "sunday"), + ("hi, my name is", "my name is"), + ("alborgów", "amoniak"), + ("cape sand recycling ", 
"edith ann graham"), + ( "jellyifhs", "jellyfish"), + ("ifhs", "fish"), + ("leia", "leela"), + ] + for dist in (Levenshtein, DamerauLevenshtein) + for i in eachindex(strings) + if compare(strings[i]..., dist()) < 1 / 3 + @test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ 0.0 + else + @test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ compare(strings[i]..., dist()) + end + end + end + + # check find_best and find_all + @test find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork" + @test find_best("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == "NewYork" + @test find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ["NewYork"] + @test find_all("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ["NewYork", "Newark"] -strings = [ -("martha", "marhta"), -("dwayne", "duane") , -("dixon", "dicksonx"), -("william", "williams"), -("", "foo") -] -solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000] -for i in 1:length(solutions) - @test compare(strings[i]..., Winkler(Jaro(), 0.1, 0.0, 4)) ≈ (1 - solutions[i]) atol = 1e-4 end - - - -# Partial -@test compare("aa", "aa ", Partial(Jaccard(2))) ≈ 1.0 - -@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0 -@test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0 -@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444 - - -s = "HSINCHUANG" -@test compare(s, "SINJHUAN", Partial(RatcliffObershelp())) ≈ 0.875 -@test compare(s, "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8 -@test compare(s, "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8 -@test compare(s, "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888 - -@test compare("New York Yankees", "Yankees", Partial(Hamming())) ≈ 1 -@test compare("New York Yankees", "", Partial(Hamming())) ≈ 1 - - - -# Token -@test compare("New York 
Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0 -@test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094 - - -@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0 - - -@test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0 - -# ADD AGAIN -@test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0 - - - - -@test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333 - - - -#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0 -#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094 -#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("")) 0.0 -#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("")) 0.0 - - - -@test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0 - - -@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5 - - -# test with fuzz ratio -@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5 -@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", Partial(RatcliffObershelp()))) == 7 -@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 79 -@test 
round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", Partial(RatcliffObershelp()))) == 88 -@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSort(RatcliffObershelp()))) == 11 -@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", RatcliffObershelp())) == 39 -@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", Partial(RatcliffObershelp()))) == 88 -@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(RatcliffObershelp()))) == 39 -@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(Partial(RatcliffObershelp())))) == 88 -# not exactly the same because tokenmax has uses the max of rounded tokenset etc -@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 52 - #= Python code from fuzzywuzzy import fuzz fuzz.ratio("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。") fuzz.partial_ratio("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。") -fuzz.WRatio("mariners", "mariner are playing tomorrow") -fuzz.partial_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow") -fuzz.token_sort_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow") -fuzz.token_set_ratio("mariners", "mariner are playing tomorrow mariner 
are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow") -fuzz.partial_token_set_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow") -fuzz.WRatio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow") -fuzz.WRatio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow") -fuzz.WRatio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow") -=# - - -using StringDistances, Test - -# check with weird utf8 strings -compare("aüa", "aua", TokenMax(RatcliffObershelp())) -compare("aüa", "aua", TokenMax(QGram(2))) -compare("aüa", "aua", DamerauLevenshtein()) -compare("aüa", "aua", Hamming()) -compare("aüa", "aua", Jaro()) -compare("aüa", "aua", Levenshtein()) - - -s1 = "aü☃" -s2 = "aüaüafs" -dist = QGram(4) -@test evaluate(dist, s1, s2) == 4 - -# check Substrings work -s1 = SubString(s1, 1, 4) -s2 = SubString(s2, 1, 4) -dist = QGram(2) -@test evaluate(dist, s1, s2) == 2 - - -# check missing -@test compare(s1, missing, Levenshtein()) === missing - -# check min -for dist in (Levenshtein, DamerauLevenshtein) - for i in eachindex(strings) - if compare(strings[i]..., dist()) < 1 / 3 - @test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ 0.0 - else - @test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ compare(strings[i]..., dist()) - end - end -end - -# check find_best and find_all -@test find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork" -@test find_best("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == "NewYork" -@test find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ["NewYork"] -@test find_all("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == 
["NewYork", "Newark"] - -end +=# \ No newline at end of file