improve support for missings

pull/22/head
matthieugomez 2019-12-12 09:38:20 -05:00
parent e0cc4f6bea
commit 16cf5abb94
10 changed files with 326 additions and 352 deletions

24
.github/workflows/CompatHelper.yml vendored Normal file
View File

@ -0,0 +1,24 @@
# Scheduled workflow that installs CompatHelper and runs `CompatHelper.main()`.
name: CompatHelper

on:
  schedule:
    # Minute 0 of every hour.
    - cron: '00 * * * *'

jobs:
  CompatHelper:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        julia-version: [1.2.0]
        # NOTE(review): `julia-arch` is declared but no step below references it.
        julia-arch: [x86]
        os: [ubuntu-latest]
    steps:
      # Install the Julia version selected by the matrix.
      - uses: julia-actions/setup-julia@latest
        with:
          version: ${{ matrix.julia-version }}
      - name: Pkg.add("CompatHelper")
        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
      - name: CompatHelper.main()
        env:
          # Repository token exposed to the CompatHelper process via the environment.
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: julia -e 'using CompatHelper; CompatHelper.main()'

View File

@ -3,12 +3,18 @@ uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.5.0"
[deps]
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
[compat]
DataStructures = "0.17"
Distances = "0.8"
julia = "1"
DataStructures = "0.14, 0.15, 0.16, 0.17"
Distances = "0.2, 0.3, 0.4, 0.4, 0.6, 0.7, 0.8"
[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
[targets]
test = ["Test"]

View File

@ -1,13 +1,17 @@
module StringDistances
using Distances
import Distances: evaluate, result_type
using DataStructures # for SortedSet in TokenSort
##############################################################################
##
## Export
##
##############################################################################
using DataStructures
import Base: eltype, length, iterate, ==, hash, isless, convert, show, @deprecate
import Distances: evaluate, Hamming, hamming, PreMetric, result_type, SemiMetric
export
evaluate,
compare,
@ -46,7 +50,6 @@ function result_type(m::Union{Hamming, Jaro, Levenshtein, DamerauLevenshtein, Ra
typeof(evaluate(m, oneunit(a), oneunit(b)))
end
end
##############################################################################

View File

@ -10,31 +10,34 @@
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
"""
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Hamming; min_score = 0.0)
    # A missing input yields a missing score.
    (ismissing(s1) || ismissing(s2)) && return missing
    # NOTE(review): `reorder` presumably returns the shorter string first, so that
    # `len2` is the larger of the two lengths -- confirm against its definition.
    s1, s2 = reorder(s1, s2)
    len2 = length(s2)
    # Avoid dividing by zero: an empty second string scores 1.0.
    len2 == 0 && return 1.0
    # `min_score` is accepted for parity with the other `compare` methods but is
    # not used by this method.
    return 1.0 - evaluate(dist, s1, s2) / len2
end
# Similarity for Jaro and RatcliffObershelp: one minus the distance returned by
# `evaluate`, or `missing` if either input is `missing`. `min_score` is accepted
# but not used by this method.
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
    (ismissing(s1) || ismissing(s2)) && return missing
    return 1.0 - evaluate(dist, s1, s2)
end
# Similarity for q-gram based distances, or `missing` if either input is `missing`.
# `min_score` is accepted but not used by this method.
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::AbstractQGramDistance; min_score = 0.0)
    (ismissing(s1) || ismissing(s2)) && return missing
    s1, s2 = reorder(s1, s2)
    len1, len2 = length(s1), length(s2)
    # When the first string has fewer than q characters, return s1 == s2 as 1.0 or 0.0.
    len1 <= dist.q - 1 && return convert(Float64, s1 == s2)
    if dist isa QGram
        # The denominator equals (len1 - q + 1) + (len2 - q + 1); presumably the
        # combined number of q-grams in both strings -- confirm.
        return 1.0 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
    else
        return 1.0 - evaluate(dist, s1, s2)
    end
end
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0)
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len2 == 0 && return 1.0
@ -48,10 +51,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtei
end
end
@deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist)
@deprecate compare(dist::PreMetric, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) compare(s1, s2, dist)
##############################################################################
##
@ -76,7 +76,8 @@ end
Winkler(x) = Winkler(x, 0.1, 0.7, 4)
# hard to use min_score because of whether there is boost or not in the end
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Winkler; min_score = 0.0)
(ismissing(s1) | ismissing(s2)) && return missing
l = remove_prefix(s1, s2, dist.l)[1]
# cannot do min_score because of boosting threshold
score = compare(s1, s2, dist.dist)
@ -103,7 +104,8 @@ struct Partial{T <: PreMetric} <: PreMetric
dist::T
end
function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_score = 0.0)
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Partial; min_score = 0.0)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
@ -117,8 +119,8 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_scor
return out
end
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp};
min_score = 0.0)
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Partial{RatcliffObershelp}; min_score = 0.0)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
len1 == len2 && return compare(s1, s2, dist.dist)
@ -134,7 +136,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffO
s2_start += len2 - s2_end
s2_end += len2 - s2_end
end
i2_start = nextind(s2, 0, s2_start)
i2_start = nextind(s2, 0, s2_start)
i2_end = nextind(s2, 0, s2_end)
curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp())
out = max(out, curr)
@ -157,7 +159,8 @@ struct TokenSort{T <: PreMetric} <: PreMetric
dist::T
end
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort; min_score = 0.0)
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::TokenSort; min_score = 0.0)
(ismissing(s1) | ismissing(s2)) && return missing
s1 = join(sort!(split(s1)), " ")
s2 = join(sort!(split(s2)), " ")
compare(s1, s2, dist.dist; min_score = min_score)
@ -178,7 +181,8 @@ struct TokenSet{T <: PreMetric} <: PreMetric
dist::T
end
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = 0.0)
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::TokenSet; min_score = 0.0)
(ismissing(s1) | ismissing(s2)) && return missing
v1 = SortedSet(split(s1))
v2 = SortedSet(split(s2))
v0 = intersect(v1, v2)
@ -209,7 +213,8 @@ struct TokenMax{T <: PreMetric} <: PreMetric
dist::T
end
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0)
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::TokenMax; min_score = 0.0)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
dist0 = compare(s1, s2, dist.dist; min_score = min_score)
@ -239,22 +244,4 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_sco
min_score = min_score / unbase_scale)
return max(dist0, dist1, dist2)
end
end
##############################################################################
##
## Missing Values
##
##############################################################################
function compare(s1::AbstractString, ::Missing, dist::PreMetric; min_score = nothing)
missing
end
function compare(::Missing, s2::AbstractString, dist::PreMetric; min_score = nothing)
missing
end
function compare(::Missing, ::Missing, dist::PreMetric; min_score = nothing)
missing
end
end

View File

@ -4,7 +4,7 @@
## Hamming
##
##############################################################################
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
function evaluate(dist::Hamming, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
current = abs(length(s2) - length(s1))
for (ch1, ch2) in zip(s1, s2)
current += ch1 != ch2
@ -12,6 +12,11 @@ function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
return current
end
# `missing` propagates: the Hamming distance is `missing` as soon as one argument is.
# The both-`missing` case needs its own method; otherwise the call dispatches to the
# `Union{AbstractString, Missing}` method, which calls `length` on its arguments.
evaluate(dist::Hamming, s1::Missing, s2::AbstractString) = missing
evaluate(dist::Hamming, s1::AbstractString, s2::Missing) = missing
evaluate(dist::Hamming, s1::Missing, s2::Missing) = missing
##############################################################################
##
## Jaro
@ -33,7 +38,8 @@ where ``m`` is the number of matching characters and
struct Jaro <: SemiMetric end
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
function evaluate(dist::Jaro, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
# if both are empty, m = 0 so should be 1.0 according to wikipedia. Add this line so that not the case
@ -85,8 +91,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
t += ch2 != iterate(s1, i1_match[i1])[1]
end
end
current = 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
return current
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
end
##############################################################################
@ -108,8 +113,8 @@ The Levenshtein distance is the minimum number of operations (consisting of inse
struct Levenshtein <: SemiMetric end
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
max_dist = nothing)
function evaluate(dist::Levenshtein, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
@ -163,8 +168,8 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
struct DamerauLevenshtein <: SemiMetric end
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString;
max_dist = nothing)
function evaluate(dist::DamerauLevenshtein, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing)
(ismissing(s1) | ismissing(s2)) && return missing
s1, s2 = reorder(s1, s2)
len1, len2 = length(s1), length(s2)
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
@ -250,7 +255,8 @@ The distance between two strings is defined as one minus the number of matching
"""
struct RatcliffObershelp <: PreMetric end
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
function evaluate(dist::RatcliffObershelp, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
(ismissing(s1) | ismissing(s2)) && return missing
n_matched = sum(last.(matching_blocks(s1, s2)))
len1, len2 = length(s1), length(s2)
len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2)

View File

@ -18,7 +18,6 @@ function find_best(s1::AbstractString, iter_s2, dist::PreMetric; min_score = 0.0
end
"""
find_all(s1::AbstractString, iter, dist::PreMetric; min_score = 0.8)
`find_all` returns the vector with all the elements of `iter` that have a similarity score higher than `min_score` according to the distance `dist`.

View File

@ -32,7 +32,7 @@ Return an iterator that iterates on the QGram of the string
```julia
using StringDistances
for x in qgram("hello", 2)
@show x
println(x)
end
```
"""
@ -131,7 +131,8 @@ end
##############################################################################
abstract type AbstractQGramDistance <: SemiMetric end
function evaluate(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractString)
function evaluate(dist::AbstractQGramDistance, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
(ismissing(s1) | ismissing(s2)) && return missing
x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
evaluate(dist, x)
end
@ -141,11 +142,6 @@ end
## q-gram
##
##############################################################################
"""
For an AbstractString s, denote v(s) the vector on the space of q-grams of length N, that contains the number of times a q-gram appears in s
The q-gram distance is ||v(s1) - v(s2)||
"""
"""
QGram(q::Int)

View File

@ -5,14 +5,13 @@ struct StringWithLength{T<:AbstractString} <: AbstractString
l::Int
end
string_with_length(s::AbstractString) = StringWithLength(s, length(s))
string_with_length(s::StringWithLength) = s
Base.length(s::StringWithLength) = s.l
Base.iterate(s::StringWithLength, i::Integer = firstindex(s.s)) = iterate(s.s, i)
# Equality is delegated to the wrapped string stored in field `s`.
Base.isequal(s1::StringWithLength, s2::AbstractString) = isequal(s1.s, s2)
Base.isequal(s1::AbstractString, s2::StringWithLength) = isequal(s1, s2.s)
# Resolves the dispatch ambiguity between the two methods above when both
# arguments are wrapped.
Base.isequal(s1::StringWithLength, s2::StringWithLength) = isequal(s1.s, s2.s)
Base.nextind(s::StringWithLength, i::Int, n::Int = 1) = nextind(s.s, i, n)
Base.ncodeunits(s::StringWithLength) = ncodeunits(s.s)
Base.isvalid(s::StringWithLength, i::Int) = isvalid(s.s, i)
function reorder(s1::AbstractString, s2::AbstractString)
s1 = string_with_length(s1)
s2 = string_with_length(s2)

View File

@ -3,147 +3,162 @@ using StringDistances, Test
@testset "Distances" begin
@testset "Levenshtein" begin
@test evaluate(Levenshtein(), "", "") == 0
@test evaluate(Levenshtein(), "abc", "") == 3
@test evaluate(Levenshtein(), "", "abc") == 3
@test evaluate(Levenshtein(), "bc", "abc") == 1
@test evaluate(Levenshtein(), "kitten", "sitting") == 3
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
@test result_type(Levenshtein(), "hello", "world") == Int
@inferred evaluate(Levenshtein(), "", "")
end
@testset "Levenshtein" begin
@test evaluate(Levenshtein(), "", "") == 0
@test evaluate(Levenshtein(), "abc", "") == 3
@test evaluate(Levenshtein(), "", "abc") == 3
@test evaluate(Levenshtein(), "bc", "abc") == 1
@test evaluate(Levenshtein(), "kitten", "sitting") == 3
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
@test result_type(Levenshtein(), "hello", "world") == Int
@test ismissing(evaluate(Levenshtein(), "", missing))
@inferred evaluate(Levenshtein(), "", "")
end
@testset "DamerauLevenshtein" begin
@test evaluate(DamerauLevenshtein(), "", "") == 0
@test evaluate(DamerauLevenshtein(), "abc", "") == 3
@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1
@test evaluate(DamerauLevenshtein(), "abcd", "acb") == 2
@test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
@test result_type(DamerauLevenshtein(), "hello", "world") == Int
@inferred evaluate(DamerauLevenshtein(), "", "")
end
@testset "DamerauLevenshtein" begin
@test evaluate(DamerauLevenshtein(), "", "") == 0
@test evaluate(DamerauLevenshtein(), "abc", "") == 3
@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1
@test evaluate(DamerauLevenshtein(), "abcd", "acb") == 2
@test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
@test result_type(DamerauLevenshtein(), "hello", "world") == Int
@test ismissing(evaluate(DamerauLevenshtein(), "", missing))
@inferred evaluate(DamerauLevenshtein(), "", "")
end
@testset "Hamming" begin
@test evaluate(Hamming(), "", "") == 0
@test evaluate(Hamming(), "", "abc") == 3
@test evaluate(Hamming(), "abc", "abc") == 0
@test evaluate(Hamming(), "acc", "abc") == 1
@test evaluate(Hamming(), "abcd", "abc") == 1
@test evaluate(Hamming(), "abc", "abcd") == 1
@test evaluate(Hamming(), "testing", "this is a test") == 13
@test evaluate(Hamming(), "saturday", "sunday") == 7
@test result_type(Hamming(), "hello", "world") == Int
@inferred evaluate(Hamming(), "", "")
end
@testset "Hamming" begin
@test evaluate(Hamming(), "", "") == 0
@test evaluate(Hamming(), "", "abc") == 3
@test evaluate(Hamming(), "abc", "abc") == 0
@test evaluate(Hamming(), "acc", "abc") == 1
@test evaluate(Hamming(), "abcd", "abc") == 1
@test evaluate(Hamming(), "abc", "abcd") == 1
@test evaluate(Hamming(), "testing", "this is a test") == 13
@test evaluate(Hamming(), "saturday", "sunday") == 7
@test result_type(Hamming(), "hello", "world") == Int
@test ismissing(evaluate(Hamming(), "", missing))
@inferred evaluate(Hamming(), "", "")
end
@testset "QGram" begin
@test evaluate(QGram(1), "abc", "abc") == 0
@test evaluate(QGram(1), "", "abc") == 3
@test evaluate(QGram(1), "abc", "cba") == 0
@test evaluate(QGram(1), "abc", "ccc") == 4
@test result_type(QGram(1), "hello", "world") == Int
@inferred evaluate(QGram(1), "", "")
end
@testset "QGram" begin
@test evaluate(QGram(1), "abc", "abc") == 0
@test evaluate(QGram(1), "", "abc") == 3
@test evaluate(QGram(1), "abc", "cba") == 0
@test evaluate(QGram(1), "abc", "ccc") == 4
@test evaluate(QGram(4), "aü☃", "aüaüafs") == 4
@test evaluate( QGram(2), SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2
@test result_type(QGram(1), "hello", "world") == Int
@test ismissing(evaluate(QGram(1), "", missing))
@inferred evaluate(QGram(1), "", "")
end
@testset "Cosine" begin
@test isnan(evaluate(Cosine(2), "", "abc"))
@test evaluate(Cosine(2), "abc", "ccc") 1 atol = 1e-4
@test evaluate(Cosine(2), "leia", "leela") 0.7113249 atol = 1e-4
@test result_type(Cosine(2), "hello", "world") == typeof(float(1))
@inferred evaluate(Cosine(2), "", "")
@inferred evaluate(Cosine(2), "abc", "ccc")
end
# Cosine distance on 2-grams: NaN case, approximate reference values, result
# type, missing propagation and type inference.
@testset "Cosine" begin
    @test isnan(evaluate(Cosine(2), "", "abc"))
    @test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
    @test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
    @test result_type(Cosine(2), "hello", "world") == typeof(float(1))
    @test ismissing(evaluate(Cosine(2), "", missing))
    @inferred evaluate(Cosine(2), "", "")
end
@testset "Jaccard" begin
@test evaluate(Jaccard(1), "", "abc") 1.0
@test evaluate(Jaccard(1), "abc", "ccc") .666666 atol = 1e-4
@test evaluate(Jaccard(2), "leia", "leela") 0.83333 atol = 1e-4
@test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
@inferred evaluate(Jaccard(1), "", "")
end
# Jaccard distance: approximate reference values, result type, missing
# propagation and type inference.
@testset "Jaccard" begin
    @test evaluate(Jaccard(1), "", "abc") ≈ 1.0
    @test evaluate(Jaccard(1), "abc", "ccc") ≈ .666666 atol = 1e-4
    @test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
    @test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
    @test ismissing(evaluate(Jaccard(1), "", missing))
    @inferred evaluate(Jaccard(1), "", "")
end
@testset "SorensenDice" begin
@test evaluate(SorensenDice(1), "night", "nacht") 0.4 atol = 1e-4
@test evaluate(SorensenDice(2), "night", "nacht") 0.75 atol = 1e-4
@test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
@inferred evaluate(SorensenDice(1), "", "")
end
# SorensenDice distance: approximate reference values, result type, missing
# propagation and type inference.
@testset "SorensenDice" begin
    @test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4
    @test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
    @test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
    @test ismissing(evaluate(SorensenDice(1), "", missing))
    @inferred evaluate(SorensenDice(1), "", "")
end
@testset "Overlap" begin
@test evaluate(Overlap(1), "night", "nacht") 0.4 atol = 1e-4
@test evaluate(Overlap(1), "context", "contact") .2 atol = 1e-4
@test result_type(Overlap(1), "hello", "world") == typeof(float(1))
@inferred evaluate(Overlap(1), "", "")
end
# Overlap distance: approximate reference values, result type, missing
# propagation and type inference.
@testset "Overlap" begin
    @test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
    @test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4
    @test result_type(Overlap(1), "hello", "world") == typeof(float(1))
    @test ismissing(evaluate(Overlap(1), "", missing))
    @inferred evaluate(Overlap(1), "", "")
end
@testset "RatcliffObershelp" begin
@test evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154
@test evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579
@test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666
@test evaluate(RatcliffObershelp(), "", "pencilvaneya") 1.0
@test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963
@test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") 0.3913043478260869
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") 0.24137931034482762
@test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
@inferred evaluate(RatcliffObershelp(), "", "")
end
# RatcliffObershelp distance: approximate reference values, result type,
# missing propagation and type inference.
@testset "RatcliffObershelp" begin
    @test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
    @test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
    @test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
    @test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
    @test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
    @test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
    @test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
    @test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
    @test ismissing(evaluate(RatcliffObershelp(), "", missing))
    @inferred evaluate(RatcliffObershelp(), "", "")
end
@testset "Jaro" begin
@test evaluate(Jaro(), "martha", "marhta") 0.05555555555555547
@test evaluate(Jaro(), "es an ", " vs an") 0.2777777777777777
@test evaluate(Jaro(), " vs an", "es an ") 0.2777777777777777
@test result_type(Jaro(), "hello", "world") == typeof(float(1))
end
# Jaro distance: approximate reference values (including both argument orders
# for one pair) and result type.
@testset "Jaro" begin
    @test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
    @test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
    @test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
    @test result_type(Jaro(), "hello", "world") == typeof(float(1))
end
strings = [
("martha", "marhta"),
("dwayne", "duane") ,
("dixon", "dicksonx"),
("william", "williams"),
("", "foo"),
("a", "a"),
("abc", "xyz"),
("abc", "ccc"),
("kitten", "sitting"),
("saturday", "sunday"),
("hi, my name is", "my name is"),
("alborgów", "amoniak"),
("cape sand recycling ", "edith ann graham"),
( "jellyifhs", "jellyfish"),
("ifhs", "fish"),
("leia", "leela"),
]
strings = [
("martha", "marhta"),
("dwayne", "duane") ,
("dixon", "dicksonx"),
("william", "williams"),
("", "foo"),
("a", "a"),
("abc", "xyz"),
("abc", "ccc"),
("kitten", "sitting"),
("saturday", "sunday"),
("hi, my name is", "my name is"),
("alborgów", "amoniak"),
("cape sand recycling ", "edith ann graham"),
( "jellyifhs", "jellyfish"),
("ifhs", "fish"),
("leia", "leela"),
]
solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
(DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.2805556 0.2285714 0.48809524 0.3916667 0.07407407 0.16666667 0.21666667]),
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
(Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]),
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
# Test with R package StringDist
for x in solutions
t, solution = x
for i in 1:length(solution)
if isnan(evaluate(t, strings[i]...))
@test isnan(solution[i])
else
@test evaluate(t, strings[i]...) solution[i] atol = 1e-4
solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
(DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.2805556 0.2285714 0.48809524 0.3916667 0.07407407 0.16666667 0.21666667]),
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
(Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]),
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
# Test with R package StringDist
# For every (distance, expected row) pair, check each string pair against its
# reference value; NaN results are checked with `isnan` because NaN never
# compares approximately equal.
for (t, solution) in solutions
    for i in eachindex(solution)
        d = evaluate(t, strings[i]...)
        if isnan(d)
            @test isnan(solution[i])
        else
            @test d ≈ solution[i] atol = 1e-4
        end
    end
end
# test RatcliffObershelp
# Expected similarity percentages, one per entry of `strings`.
solution = [83, 73, 62, 93, 0, 100, 0, 33, 62, 71, 83, 27, 33, 78, 50, 67]
for i in eachindex(strings)
    @test round(Int, (1 - evaluate(RatcliffObershelp(), strings[i]...)) * 100) ≈ solution[i] atol = 1e-4
end
end
#= R test
library(stringdist)
strings = matrix(data = c(
@ -174,13 +189,6 @@ stringdist(strings[1,], strings[2,], method = "qgram", q = 1)
# test RatcliffObershelp
solution = [83, 73, 62, 93, 0, 100, 0, 33, 62, 71, 83, 27, 33, 78, 50, 67]
for i in eachindex(strings)
@test round(Int, (1 - evaluate(RatcliffObershelp(), strings[i]...)) * 100) solution[i] atol = 1e-4
end
#= fuzzywuzzy uses RatcliffObershelp when python-Levenshtein is not installed
from fuzzywuzzy import fuzz
strings = [
@ -205,4 +213,3 @@ for x in strings:
print(fuzz.ratio(x[0], x[1]))
=#
end

View File

@ -3,170 +3,117 @@ using StringDistances, Test
@testset "Modifiers" begin
# Compare
@test compare("", "abc", Hamming()) 0.0 atol = 1e-4
@test compare("acc", "abc", Hamming()) 2/3 atol = 1e-4
@test compare("saturday", "sunday", Hamming()) 1/8 atol = 1e-4
# Hamming
@test compare("", "abc", Hamming()) 0.0 atol = 1e-4
@test compare("acc", "abc", Hamming()) 2/3 atol = 1e-4
@test compare("saturday", "sunday", Hamming()) 1/8 atol = 1e-4
@test compare("New York Yankees", "Yankees", Partial(Hamming())) 1
@test compare("New York Yankees", "", Partial(Hamming())) 1
compare("aüa", "aua", Hamming())
@test compare("", "abc", QGram(1)) 0.0 atol = 1e-4
@test compare("abc", "cba", QGram(1)) 1.0 atol = 1e-4
@test compare("abc", "ccc", QGram(1)) 1/3 atol = 1e-4
# Qgram
@test compare("", "abc", QGram(1)) 0.0 atol = 1e-4
@test compare("abc", "cba", QGram(1)) 1.0 atol = 1e-4
@test compare("abc", "ccc", QGram(1)) 1/3 atol = 1e-4
compare("aüa", "aua", TokenMax(QGram(2)))
@test compare("", "abc", Jaccard(2)) 0.0 atol = 1e-4
@test compare("martha", "martha", Jaccard(2)) 1.0 atol = 1e-4
@test compare("martha", "martha", Jaccard(2)) 1.0 atol = 1e-4
@test compare("aa", "aa ", Partial(Jaccard(2))) 1.0
@test compare("martha", "martha", Cosine(2)) 1.0 atol = 1e-4
@test compare("martha", "martha", Overlap(2)) 1.0 atol = 1e-4
@test compare("martha", "martha", SorensenDice(2)) 1.0 atol = 1e-4
@test compare("", "abc", Jaccard(2)) 0.0 atol = 1e-4
# Jaro
compare("aüa", "aua", Jaro())
@test compare("martha", "martha", Jaccard(2)) 1.0 atol = 1e-4
@test compare("martha", "martha", Cosine(2)) 1.0 atol = 1e-4
@test compare("martha", "martha", Jaccard(2)) 1.0 atol = 1e-4
@test compare("martha", "martha", Overlap(2)) 1.0 atol = 1e-4
@test compare("martha", "martha", SorensenDice(2)) 1.0 atol = 1e-4
#Levenshtein
compare("aüa", "aua", Levenshtein())
compare("aüa", "aua", DamerauLevenshtein())
# Winkler
@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) 0.9611 atol = 1e-4
@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) 0.84 atol = 1e-4
@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) 0.81333 atol = 1e-4
@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) 0.975 atol = 1e-4
@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) 0.0 atol = 1e-4
@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) 1.0 atol = 1e-4
@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) 0.0 atol = 1e-4
# RatcliffObershelp
@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) 0.0
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
compare("aüa", "aua", TokenMax(RatcliffObershelp()))
@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) 1.0
@test compare("New York Yankees", "", Partial(RatcliffObershelp())) 0.0
@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) 0.444444444444
@test compare("HSINCHUANG", "SINJHUAN", Partial(RatcliffObershelp())) 0.875
@test compare("HSINCHUANG", "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) 0.8
# RatcliffObershelp similarity combined with the Partial / TokenSort / TokenSet /
# TokenMax modifiers. Floating-point scores are compared with `≈`.
@test compare("HSINCHUANG", "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8
@test compare("HSINCHUANG", "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888
@test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0
@test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094
# an empty second string yields a score of zero
@test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0
@test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0
@test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0
@test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333
# scores below are checked as integer percentages (rounded 100 * score)
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", Partial(RatcliffObershelp()))) == 7
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 79
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", Partial(RatcliffObershelp()))) == 88
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSort(RatcliffObershelp()))) == 11
@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", RatcliffObershelp())) == 39
@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", Partial(RatcliffObershelp()))) == 88
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(RatcliffObershelp()))) == 39
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(Partial(RatcliffObershelp())))) == 88
# not exactly the same because tokenmax uses the max of rounded tokenset etc
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 52
# Winkler
# Jaro similarity wrapped in Winkler(Jaro(), 0.1, 0.0, 4), checked against
# reference scores to an absolute tolerance of 1e-4.
@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.9611 atol = 1e-4
@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.84 atol = 1e-4
@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.81333 atol = 1e-4
@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.975 atol = 1e-4
# boundary cases: empty input, identical strings, nothing in common
@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 1.0 atol = 1e-4
@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
# a `missing` argument must make the whole comparison `missing`
@test ismissing(compare("ok", missing, Levenshtein()))
# check min
# With `min_score = 1/3`, a pair whose plain score is below the threshold must
# report 0.0; every other pair must report its plain score unchanged.
strings = [
    ("martha", "marhta"),
    ("dwayne", "duane"),
    ("dixon", "dicksonx"),
    ("william", "williams"),
    ("", "foo"),
    ("a", "a"),
    ("abc", "xyz"),
    ("abc", "ccc"),
    ("kitten", "sitting"),
    ("saturday", "sunday"),
    ("hi, my name is", "my name is"),
    ("alborgów", "amoniak"),
    ("cape sand recycling ", "edith ann graham"),
    ("jellyifhs", "jellyfish"),
    ("ifhs", "fish"),
    ("leia", "leela"),
]
for dist in (Levenshtein, DamerauLevenshtein)
    for i in eachindex(strings)
        if compare(strings[i]..., dist()) < 1 / 3
            @test compare(strings[i]..., dist(); min_score = 1 / 3) ≈ 0.0
        else
            @test compare(strings[i]..., dist(); min_score = 1 / 3) ≈ compare(strings[i]..., dist())
        end
    end
end
# find_best / find_all against one shared candidate list
let query = "New York", candidates = ["NewYork", "Newark", "San Francisco"]
    # best single match is the same under both distances
    @test find_best(query, candidates, Levenshtein()) == "NewYork"
    @test find_best(query, candidates, Jaro()) == "NewYork"
    # find_all keeps one candidate under Levenshtein and two under Jaro
    @test find_all(query, candidates, Levenshtein()) == ["NewYork"]
    @test find_all(query, candidates, Jaro()) == ["NewYork", "Newark"]
end
# Winkler(Jaro) score for each pair, checked against a table: `solutions[i]` is
# compared as `1 - solutions[i]`, i.e. the table holds the complement of the score.
strings = [
    ("martha", "marhta"),
    ("dwayne", "duane"),
    ("dixon", "dicksonx"),
    ("william", "williams"),
    ("", "foo")
]
solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000]
for i in eachindex(solutions)
    @test compare(strings[i]..., Winkler(Jaro(), 0.1, 0.0, 4)) ≈ (1 - solutions[i]) atol = 1e-4
end
# Partial
# Scores for the Partial modifier wrapped around Jaccard, RatcliffObershelp and
# Hamming; floating-point results are compared with `≈`.
@test compare("aa", "aa ", Partial(Jaccard(2))) ≈ 1.0
@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
@test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0
@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444
s = "HSINCHUANG"
@test compare(s, "SINJHUAN", Partial(RatcliffObershelp())) ≈ 0.875
@test compare(s, "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8
@test compare(s, "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8
@test compare(s, "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888
@test compare("New York Yankees", "Yankees", Partial(Hamming())) ≈ 1
@test compare("New York Yankees", "", Partial(Hamming())) ≈ 1
# Token
# TokenSort / TokenSet / TokenMax modifiers around RatcliffObershelp;
# floating-point scores are compared with `≈`.
@test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0
@test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094
@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0
@test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0
# ADD AGAIN
@test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0
@test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("")) 0.0
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("")) 0.0
@test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
# test with fuzz ratio
# scores below are checked as integer percentages (rounded 100 * score)
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。", Partial(RatcliffObershelp()))) == 7
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 79
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", Partial(RatcliffObershelp()))) == 88
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSort(RatcliffObershelp()))) == 11
@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", RatcliffObershelp())) == 39
@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", Partial(RatcliffObershelp()))) == 88
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(RatcliffObershelp()))) == 39
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(Partial(RatcliffObershelp())))) == 88
# not exactly the same because tokenmax uses the max of rounded tokenset etc
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 52
#= Python code
from fuzzywuzzy import fuzz
fuzz.ratio("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。")
fuzz.partial_ratio("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称中美经贸高级别磋商双方牵头人通话中方就美拟9月1日加征关税进行了严正交涉。")
fuzz.WRatio("mariners", "mariner are playing tomorrow")
fuzz.partial_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
fuzz.token_sort_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
fuzz.token_set_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
fuzz.partial_token_set_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
fuzz.WRatio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
fuzz.WRatio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
fuzz.WRatio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
=#
using StringDistances, Test
# Smoke-run `compare` on strings containing a multi-byte character under several
# distances; only the absence of an exception is checked here.
for d in (
    TokenMax(RatcliffObershelp()),
    TokenMax(QGram(2)),
    DamerauLevenshtein(),
    Hamming(),
    Jaro(),
    Levenshtein(),
)
    compare("aüa", "aua", d)
end
# QGram distance on strings mixing single- and multi-byte characters
s1, s2 = "aü☃", "aüaüafs"
dist = QGram(4)
@test evaluate(dist, s1, s2) == 4
# the same kind of check, but on SubString views of the two strings
s1, s2 = SubString(s1, 1, 4), SubString(s2, 1, 4)
dist = QGram(2)
@test evaluate(dist, s1, s2) == 2
# a `missing` argument must make the comparison `missing`
@test ismissing(compare(s1, missing, Levenshtein()))
# check min
# With `min_score = 1/3`, a pair whose plain score is below the threshold must
# report 0.0; every other pair must report its plain score unchanged.
for dist in (Levenshtein, DamerauLevenshtein)
    for i in eachindex(strings)
        if compare(strings[i]..., dist()) < 1 / 3
            @test compare(strings[i]..., dist(); min_score = 1 / 3) ≈ 0.0
        else
            @test compare(strings[i]..., dist(); min_score = 1 / 3) ≈ compare(strings[i]..., dist())
        end
    end
end
# find_best / find_all against one shared candidate list
let query = "New York", candidates = ["NewYork", "Newark", "San Francisco"]
    # best single match is the same under both distances
    @test find_best(query, candidates, Levenshtein()) == "NewYork"
    @test find_best(query, candidates, Jaro()) == "NewYork"
    # find_all keeps one candidate under Levenshtein and two under Jaro
    @test find_all(query, candidates, Levenshtein()) == ["NewYork"]
    @test find_all(query, candidates, Jaro()) == ["NewYork", "Newark"]
end
end
=#