improve support for missings
parent
e0cc4f6bea
commit
16cf5abb94
|
@ -0,0 +1,24 @@
|
|||
name: CompatHelper
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '00 * * * *'
|
||||
|
||||
jobs:
|
||||
CompatHelper:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
julia-version: [1.2.0]
|
||||
julia-arch: [x86]
|
||||
os: [ubuntu-latest]
|
||||
steps:
|
||||
- uses: julia-actions/setup-julia@latest
|
||||
with:
|
||||
version: ${{ matrix.julia-version }}
|
||||
- name: Pkg.add("CompatHelper")
|
||||
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
|
||||
- name: CompatHelper.main()
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: julia -e 'using CompatHelper; CompatHelper.main()'
|
16
Project.toml
16
Project.toml
|
@ -3,12 +3,18 @@ uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
|
|||
version = "0.5.0"
|
||||
|
||||
[deps]
|
||||
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
|
||||
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
|
||||
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
|
||||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
|
||||
|
||||
[compat]
|
||||
DataStructures = "0.17"
|
||||
Distances = "0.8"
|
||||
julia = "1"
|
||||
DataStructures = "0.14, 0.15, 0.16, 0.17"
|
||||
Distances = "0.2, 0.3, 0.4, 0.4, 0.6, 0.7, 0.8"
|
||||
|
||||
[extras]
|
||||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||
|
||||
[targets]
|
||||
test = ["Test"]
|
||||
|
||||
|
||||
|
|
|
@ -1,13 +1,17 @@
|
|||
module StringDistances
|
||||
|
||||
|
||||
|
||||
using Distances
|
||||
import Distances: evaluate, result_type
|
||||
using DataStructures # for SortedSet in TokenSort
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Export
|
||||
##
|
||||
##############################################################################
|
||||
using DataStructures
|
||||
import Base: eltype, length, iterate, ==, hash, isless, convert, show, @deprecate
|
||||
import Distances: evaluate, Hamming, hamming, PreMetric, result_type, SemiMetric
|
||||
|
||||
export
|
||||
evaluate,
|
||||
compare,
|
||||
|
@ -46,7 +50,6 @@ function result_type(m::Union{Hamming, Jaro, Levenshtein, DamerauLevenshtein, Ra
|
|||
typeof(evaluate(m, oneunit(a), oneunit(b)))
|
||||
end
|
||||
|
||||
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
|
|
|
@ -10,31 +10,34 @@
|
|||
|
||||
compare returns a similarity score between the strings `s1` and `s2` based on the distance `dist`
|
||||
"""
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Hamming; min_score = 0.0)
|
||||
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Hamming; min_score = 0.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len2 == 0 && return 1.0
|
||||
1.0 - evaluate(dist, s1, s2) / len2
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
|
||||
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Union{Jaro, RatcliffObershelp}; min_score = 0.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
1.0 - evaluate(dist, s1, s2)
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::AbstractQGramDistance; min_score = 0.0)
|
||||
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::AbstractQGramDistance; min_score = 0.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
# When string length < q for qgram distance, returns s1 == s2
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 <= dist.q - 1 && return convert(Float64, s1 == s2)
|
||||
if typeof(dist) <: QGram
|
||||
1 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
|
||||
1.0 - evaluate(dist, s1, s2) / (len1 + len2 - 2 * dist.q + 2)
|
||||
else
|
||||
1 - evaluate(dist, s1, s2)
|
||||
1.0 - evaluate(dist, s1, s2)
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0)
|
||||
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Union{Levenshtein, DamerauLevenshtein}; min_score = 0.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len2 == 0 && return 1.0
|
||||
|
@ -48,10 +51,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Union{Levenshtei
|
|||
end
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
@deprecate compare(dist::PreMetric, s1::AbstractString, s2::AbstractString) compare(s1, s2, dist)
|
||||
@deprecate compare(dist::PreMetric, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}) compare(s1, s2, dist)
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
|
@ -76,7 +76,8 @@ end
|
|||
# Convenience constructor: wraps `x` using the fixed extra arguments 0.1, 0.7 and 4.
function Winkler(x)
    return Winkler(x, 0.1, 0.7, 4)
end
|
||||
|
||||
# hard to use min_score because of whether there is boost or not in the end
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Winkler)
|
||||
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Winkler; min_score = 0.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
l = remove_prefix(s1, s2, dist.l)[1]
|
||||
# cannot do min_score because of boosting threshold
|
||||
score = compare(s1, s2, dist.dist)
|
||||
|
@ -103,7 +104,8 @@ struct Partial{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_score = 0.0)
|
||||
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Partial; min_score = 0.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return compare(s1, s2, dist.dist; min_score = min_score)
|
||||
|
@ -117,8 +119,8 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial; min_scor
|
|||
return out
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffObershelp};
|
||||
min_score = 0.0)
|
||||
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::Partial{RatcliffObershelp}; min_score = 0.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 == len2 && return compare(s1, s2, dist.dist)
|
||||
|
@ -134,7 +136,7 @@ function compare(s1::AbstractString, s2::AbstractString, dist::Partial{RatcliffO
|
|||
s2_start += len2 - s2_end
|
||||
s2_end += len2 - s2_end
|
||||
end
|
||||
i2_start = nextind(s2, 0, s2_start)
|
||||
i2_start = nextind(s2, 0, s2_start)
|
||||
i2_end = nextind(s2, 0, s2_end)
|
||||
curr = compare(s1, SubString(s2, i2_start, i2_end), RatcliffObershelp())
|
||||
out = max(out, curr)
|
||||
|
@ -157,7 +159,8 @@ struct TokenSort{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSort; min_score = 0.0)
|
||||
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::TokenSort; min_score = 0.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1 = join(sort!(split(s1)), " ")
|
||||
s2 = join(sort!(split(s2)), " ")
|
||||
compare(s1, s2, dist.dist; min_score = min_score)
|
||||
|
@ -178,7 +181,8 @@ struct TokenSet{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenSet; min_score = 0.0)
|
||||
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::TokenSet; min_score = 0.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
v1 = SortedSet(split(s1))
|
||||
v2 = SortedSet(split(s2))
|
||||
v0 = intersect(v1, v2)
|
||||
|
@ -209,7 +213,8 @@ struct TokenMax{T <: PreMetric} <: PreMetric
|
|||
dist::T
|
||||
end
|
||||
|
||||
function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_score = 0.0)
|
||||
function compare(s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}, dist::TokenMax; min_score = 0.0)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
dist0 = compare(s1, s2, dist.dist; min_score = min_score)
|
||||
|
@ -239,22 +244,4 @@ function compare(s1::AbstractString, s2::AbstractString, dist::TokenMax; min_sco
|
|||
min_score = min_score / unbase_scale)
|
||||
return max(dist0, dist1, dist2)
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Missing Values
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
function compare(s1::AbstractString, ::Missing, dist::PreMetric; min_score = nothing)
|
||||
missing
|
||||
end
|
||||
function compare(::Missing, s2::AbstractString, dist::PreMetric; min_score = nothing)
|
||||
missing
|
||||
end
|
||||
function compare(::Missing, ::Missing, dist::PreMetric; min_score = nothing)
|
||||
missing
|
||||
end
|
||||
|
||||
end
|
24
src/edit.jl
24
src/edit.jl
|
@ -4,7 +4,7 @@
|
|||
## Hamming
|
||||
##
|
||||
##############################################################################
|
||||
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
|
||||
function evaluate(dist::Hamming, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
current = abs(length(s2) - length(s1))
|
||||
for (ch1, ch2) in zip(s1, s2)
|
||||
current += ch1 != ch2
|
||||
|
@ -12,6 +12,11 @@ function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
|
|||
return current
|
||||
end
|
||||
|
||||
# A missing operand makes the Hamming distance itself `missing`.
# The (Missing, Missing) method is needed as well: without it that call is
# dispatched to the general string method, which calls `length` on its
# arguments and therefore fails on `missing`.
evaluate(dist::Hamming, s1::Missing, s2::AbstractString) = missing
evaluate(dist::Hamming, s1::AbstractString, s2::Missing) = missing
evaluate(dist::Hamming, s1::Missing, s2::Missing) = missing
|
||||
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Jaro
|
||||
|
@ -33,7 +38,8 @@ where ``m`` is the number of matching characters and
|
|||
struct Jaro <: SemiMetric end
|
||||
|
||||
## http://alias-i.com/lingpipe/docs/api/com/aliasi/spell/JaroWinklerDistance.html
|
||||
function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
||||
function evaluate(dist::Jaro, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
# If both strings are empty, m = 0, so the distance should be 1.0 according to Wikipedia. The next line is added so that this is not the case.
|
||||
|
@ -85,8 +91,7 @@ function evaluate(dist::Jaro, s1::AbstractString, s2::AbstractString)
|
|||
t += ch2 != iterate(s1, i1_match[i1])[1]
|
||||
end
|
||||
end
|
||||
current = 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
|
||||
return current
|
||||
return 1.0 - (m / len1 + m / len2 + (m - t/2) / m) / 3.0
|
||||
end
|
||||
|
||||
##############################################################################
|
||||
|
@ -108,8 +113,8 @@ The Levenshtein distance is the minimum number of operations (consisting of inse
|
|||
struct Levenshtein <: SemiMetric end
|
||||
|
||||
## Source: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
||||
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString;
|
||||
max_dist = nothing)
|
||||
function evaluate(dist::Levenshtein, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
||||
|
@ -163,8 +168,8 @@ The DamerauLevenshtein distance is the minimum number of operations (consisting
|
|||
struct DamerauLevenshtein <: SemiMetric end
|
||||
|
||||
## http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString;
|
||||
max_dist = nothing)
|
||||
function evaluate(dist::DamerauLevenshtein, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing}; max_dist = nothing)
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
s1, s2 = reorder(s1, s2)
|
||||
len1, len2 = length(s1), length(s2)
|
||||
max_dist !== nothing && len2 - len1 > max_dist && return max_dist + 1
|
||||
|
@ -250,7 +255,8 @@ The distance between two strings is defined as one minus the number of matching
|
|||
"""
|
||||
struct RatcliffObershelp <: PreMetric end
|
||||
|
||||
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
|
||||
function evaluate(dist::RatcliffObershelp, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
n_matched = sum(last.(matching_blocks(s1, s2)))
|
||||
len1, len2 = length(s1), length(s2)
|
||||
len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2)
|
||||
|
|
|
@ -18,7 +18,6 @@ function find_best(s1::AbstractString, iter_s2, dist::PreMetric; min_score = 0.0
|
|||
end
|
||||
|
||||
|
||||
|
||||
"""
|
||||
find_all(s1::AbstractString, iter, dist::PreMetric; min_score = 0.8)
|
||||
`find_all` returns the vector with all the elements of `iter` that have a similarity score higher than `min_score` according to the distance `dist`.
|
||||
|
|
10
src/qgram.jl
10
src/qgram.jl
|
@ -32,7 +32,7 @@ Return an iterator that iterates on the QGram of the string
|
|||
```julia
|
||||
using StringDistances
|
||||
for x in qgram("hello", 2)
|
||||
@show x
|
||||
println(x)
|
||||
end
|
||||
```
|
||||
"""
|
||||
|
@ -131,7 +131,8 @@ end
|
|||
##############################################################################
|
||||
abstract type AbstractQGramDistance <: SemiMetric end
|
||||
|
||||
function evaluate(dist::AbstractQGramDistance, s1::AbstractString, s2::AbstractString)
|
||||
function evaluate(dist::AbstractQGramDistance, s1::Union{AbstractString, Missing}, s2::Union{AbstractString, Missing})
|
||||
(ismissing(s1) | ismissing(s2)) && return missing
|
||||
x = count_map(qgram(s1, dist.q), qgram(s2, dist.q))
|
||||
evaluate(dist, x)
|
||||
end
|
||||
|
@ -141,11 +142,6 @@ end
|
|||
## q-gram
|
||||
##
|
||||
##############################################################################
|
||||
"""
|
||||
For an AbstractString s, denote v(s) the vector on the space of q-grams of length N, that contains the number of times a q-gram appears in s
|
||||
The q-gram distance is ||v(s1) - v(s2)||
|
||||
"""
|
||||
|
||||
"""
|
||||
QGram(q::Int)
|
||||
|
||||
|
|
|
@ -5,14 +5,13 @@ struct StringWithLength{T<:AbstractString} <: AbstractString
|
|||
l::Int
|
||||
end
|
||||
# Pair `s` with its length, computed once here, inside a StringWithLength.
function string_with_length(s::AbstractString)
    return StringWithLength(s, length(s))
end
|
||||
# A value that is already a StringWithLength is handed back unchanged.
function string_with_length(s::StringWithLength)
    return s
end
|
||||
# The length is read from the stored field `l`.
function Base.length(s::StringWithLength)
    return s.l
end
|
||||
# Iteration is forwarded to the wrapped string `s.s`; the default state is
# that string's first index.
function Base.iterate(s::StringWithLength, i::Integer = firstindex(s.s))
    return iterate(s.s, i)
end
|
||||
# Compare a StringWithLength with a plain string by unwrapping the former.
# The wrapped string lives in the field `s` of the first argument `s1`.
Base.isequal(s1::StringWithLength, s2::AbstractString) = isequal(s1.s, s2)
|
||||
# Compare a plain string with a StringWithLength by unwrapping the latter.
function Base.isequal(s1::AbstractString, s2::StringWithLength)
    return isequal(s1, s2.s)
end
|
||||
# Index arithmetic is forwarded to the wrapped string `s.s`.
function Base.nextind(s::StringWithLength, i::Int, n::Int = 1)
    return nextind(s.s, i, n)
end
|
||||
# The number of code units is that of the wrapped string `s.s`.
function Base.ncodeunits(s::StringWithLength)
    return ncodeunits(s.s)
end
|
||||
# Index validity is decided by the wrapped string `s.s`.
function Base.isvalid(s::StringWithLength, i::Int)
    return isvalid(s.s, i)
end
|
||||
|
||||
|
||||
function reorder(s1::AbstractString, s2::AbstractString)
|
||||
s1 = string_with_length(s1)
|
||||
s2 = string_with_length(s2)
|
||||
|
|
|
@ -3,147 +3,162 @@ using StringDistances, Test
|
|||
|
||||
@testset "Distances" begin
|
||||
|
||||
@testset "Levenshtein" begin
|
||||
@test evaluate(Levenshtein(), "", "") == 0
|
||||
@test evaluate(Levenshtein(), "abc", "") == 3
|
||||
@test evaluate(Levenshtein(), "", "abc") == 3
|
||||
@test evaluate(Levenshtein(), "bc", "abc") == 1
|
||||
@test evaluate(Levenshtein(), "kitten", "sitting") == 3
|
||||
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
|
||||
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
|
||||
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
|
||||
@test result_type(Levenshtein(), "hello", "world") == Int
|
||||
@inferred evaluate(Levenshtein(), "", "")
|
||||
end
|
||||
@testset "Levenshtein" begin
|
||||
@test evaluate(Levenshtein(), "", "") == 0
|
||||
@test evaluate(Levenshtein(), "abc", "") == 3
|
||||
@test evaluate(Levenshtein(), "", "abc") == 3
|
||||
@test evaluate(Levenshtein(), "bc", "abc") == 1
|
||||
@test evaluate(Levenshtein(), "kitten", "sitting") == 3
|
||||
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
|
||||
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
|
||||
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
|
||||
@test result_type(Levenshtein(), "hello", "world") == Int
|
||||
@test ismissing(evaluate(Levenshtein(), "", missing))
|
||||
@inferred evaluate(Levenshtein(), "", "")
|
||||
end
|
||||
|
||||
@testset "DamerauLevenshtein" begin
|
||||
@test evaluate(DamerauLevenshtein(), "", "") == 0
|
||||
@test evaluate(DamerauLevenshtein(), "abc", "") == 3
|
||||
@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
|
||||
@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1
|
||||
@test evaluate(DamerauLevenshtein(), "abcd", "acb") == 2
|
||||
@test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
|
||||
@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
|
||||
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
|
||||
@test result_type(DamerauLevenshtein(), "hello", "world") == Int
|
||||
@inferred evaluate(DamerauLevenshtein(), "", "")
|
||||
end
|
||||
@testset "DamerauLevenshtein" begin
|
||||
@test evaluate(DamerauLevenshtein(), "", "") == 0
|
||||
@test evaluate(DamerauLevenshtein(), "abc", "") == 3
|
||||
@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
|
||||
@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1
|
||||
@test evaluate(DamerauLevenshtein(), "abcd", "acb") == 2
|
||||
@test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
|
||||
@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
|
||||
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
|
||||
@test result_type(DamerauLevenshtein(), "hello", "world") == Int
|
||||
@test ismissing(evaluate(DamerauLevenshtein(), "", missing))
|
||||
@inferred evaluate(DamerauLevenshtein(), "", "")
|
||||
end
|
||||
|
||||
@testset "Hamming" begin
|
||||
@test evaluate(Hamming(), "", "") == 0
|
||||
@test evaluate(Hamming(), "", "abc") == 3
|
||||
@test evaluate(Hamming(), "abc", "abc") == 0
|
||||
@test evaluate(Hamming(), "acc", "abc") == 1
|
||||
@test evaluate(Hamming(), "abcd", "abc") == 1
|
||||
@test evaluate(Hamming(), "abc", "abcd") == 1
|
||||
@test evaluate(Hamming(), "testing", "this is a test") == 13
|
||||
@test evaluate(Hamming(), "saturday", "sunday") == 7
|
||||
@test result_type(Hamming(), "hello", "world") == Int
|
||||
@inferred evaluate(Hamming(), "", "")
|
||||
end
|
||||
@testset "Hamming" begin
|
||||
@test evaluate(Hamming(), "", "") == 0
|
||||
@test evaluate(Hamming(), "", "abc") == 3
|
||||
@test evaluate(Hamming(), "abc", "abc") == 0
|
||||
@test evaluate(Hamming(), "acc", "abc") == 1
|
||||
@test evaluate(Hamming(), "abcd", "abc") == 1
|
||||
@test evaluate(Hamming(), "abc", "abcd") == 1
|
||||
@test evaluate(Hamming(), "testing", "this is a test") == 13
|
||||
@test evaluate(Hamming(), "saturday", "sunday") == 7
|
||||
@test result_type(Hamming(), "hello", "world") == Int
|
||||
@test ismissing(evaluate(Hamming(), "", missing))
|
||||
@inferred evaluate(Hamming(), "", "")
|
||||
end
|
||||
|
||||
@testset "QGram" begin
|
||||
@test evaluate(QGram(1), "abc", "abc") == 0
|
||||
@test evaluate(QGram(1), "", "abc") == 3
|
||||
@test evaluate(QGram(1), "abc", "cba") == 0
|
||||
@test evaluate(QGram(1), "abc", "ccc") == 4
|
||||
@test result_type(QGram(1), "hello", "world") == Int
|
||||
@inferred evaluate(QGram(1), "", "")
|
||||
end
|
||||
@testset "QGram" begin
|
||||
@test evaluate(QGram(1), "abc", "abc") == 0
|
||||
@test evaluate(QGram(1), "", "abc") == 3
|
||||
@test evaluate(QGram(1), "abc", "cba") == 0
|
||||
@test evaluate(QGram(1), "abc", "ccc") == 4
|
||||
@test evaluate(QGram(4), "aü☃", "aüaüafs") == 4
|
||||
@test evaluate( QGram(2), SubString("aü☃", 1, 4), SubString("aüaüafs", 1, 4)) == 2
|
||||
@test result_type(QGram(1), "hello", "world") == Int
|
||||
@test ismissing(evaluate(QGram(1), "", missing))
|
||||
@inferred evaluate(QGram(1), "", "")
|
||||
end
|
||||
|
||||
@testset "Cosine" begin
|
||||
@test isnan(evaluate(Cosine(2), "", "abc"))
|
||||
@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
|
||||
@test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
|
||||
@test result_type(Cosine(2), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(Cosine(2), "", "")
|
||||
@inferred evaluate(Cosine(2), "abc", "ccc")
|
||||
end
|
||||
@testset "Cosine" begin
|
||||
@test isnan(evaluate(Cosine(2), "", "abc"))
|
||||
@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
|
||||
@test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
|
||||
@test result_type(Cosine(2), "hello", "world") == typeof(float(1))
|
||||
@test ismissing(evaluate(Cosine(2), "", missing))
|
||||
@inferred evaluate(Cosine(2), "", "")
|
||||
end
|
||||
|
||||
@testset "Jaccard" begin
|
||||
@test evaluate(Jaccard(1), "", "abc") ≈ 1.0
|
||||
@test evaluate(Jaccard(1), "abc", "ccc") ≈ .666666 atol = 1e-4
|
||||
@test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
|
||||
@test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(Jaccard(1), "", "")
|
||||
end
|
||||
@testset "Jaccard" begin
|
||||
@test evaluate(Jaccard(1), "", "abc") ≈ 1.0
|
||||
@test evaluate(Jaccard(1), "abc", "ccc") ≈ .666666 atol = 1e-4
|
||||
@test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
|
||||
@test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
|
||||
@test ismissing(evaluate(Jaccard(1), "", missing))
|
||||
@inferred evaluate(Jaccard(1), "", "")
|
||||
end
|
||||
|
||||
@testset "SorensenDice" begin
|
||||
@test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
|
||||
@test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(SorensenDice(1), "", "")
|
||||
end
|
||||
@testset "SorensenDice" begin
|
||||
@test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
|
||||
@test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
|
||||
@test ismissing(evaluate(SorensenDice(1), "", missing))
|
||||
@inferred evaluate(SorensenDice(1), "", "")
|
||||
end
|
||||
|
||||
@testset "Overlap" begin
|
||||
@test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4
|
||||
@test result_type(Overlap(1), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(Overlap(1), "", "")
|
||||
end
|
||||
@testset "Overlap" begin
|
||||
@test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4
|
||||
@test result_type(Overlap(1), "hello", "world") == typeof(float(1))
|
||||
@test ismissing(evaluate(Overlap(1), "", missing))
|
||||
@inferred evaluate(Overlap(1), "", "")
|
||||
end
|
||||
|
||||
@testset "RatcliffObershelp" begin
|
||||
@test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
|
||||
@test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
|
||||
@test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
|
||||
@test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
|
||||
@test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
|
||||
@test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
|
||||
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
|
||||
@test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
|
||||
@inferred evaluate(RatcliffObershelp(), "", "")
|
||||
end
|
||||
@testset "RatcliffObershelp" begin
|
||||
@test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
|
||||
@test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
|
||||
@test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
|
||||
@test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
|
||||
@test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
|
||||
@test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
|
||||
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
|
||||
@test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
|
||||
@test ismissing(evaluate(RatcliffObershelp(), "", missing))
|
||||
@inferred evaluate(RatcliffObershelp(), "", "")
|
||||
end
|
||||
|
||||
@testset "Jaro" begin
|
||||
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
||||
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
|
||||
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
|
||||
@test result_type(Jaro(), "hello", "world") == typeof(float(1))
|
||||
end
|
||||
@testset "Jaro" begin
|
||||
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
|
||||
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
|
||||
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
|
||||
@test result_type(Jaro(), "hello", "world") == typeof(float(1))
|
||||
end
|
||||
|
||||
strings = [
|
||||
("martha", "marhta"),
|
||||
("dwayne", "duane") ,
|
||||
("dixon", "dicksonx"),
|
||||
("william", "williams"),
|
||||
("", "foo"),
|
||||
("a", "a"),
|
||||
("abc", "xyz"),
|
||||
("abc", "ccc"),
|
||||
("kitten", "sitting"),
|
||||
("saturday", "sunday"),
|
||||
("hi, my name is", "my name is"),
|
||||
("alborgów", "amoniak"),
|
||||
("cape sand recycling ", "edith ann graham"),
|
||||
( "jellyifhs", "jellyfish"),
|
||||
("ifhs", "fish"),
|
||||
("leia", "leela"),
|
||||
]
|
||||
strings = [
|
||||
("martha", "marhta"),
|
||||
("dwayne", "duane") ,
|
||||
("dixon", "dicksonx"),
|
||||
("william", "williams"),
|
||||
("", "foo"),
|
||||
("a", "a"),
|
||||
("abc", "xyz"),
|
||||
("abc", "ccc"),
|
||||
("kitten", "sitting"),
|
||||
("saturday", "sunday"),
|
||||
("hi, my name is", "my name is"),
|
||||
("alborgów", "amoniak"),
|
||||
("cape sand recycling ", "edith ann graham"),
|
||||
( "jellyifhs", "jellyfish"),
|
||||
("ifhs", "fish"),
|
||||
("leia", "leela"),
|
||||
]
|
||||
|
||||
solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
|
||||
(DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
|
||||
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.2805556 0.2285714 0.48809524 0.3916667 0.07407407 0.16666667 0.21666667]),
|
||||
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
|
||||
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
|
||||
(Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]),
|
||||
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
|
||||
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
|
||||
# Test against the R package stringdist
|
||||
for x in solutions
|
||||
t, solution = x
|
||||
for i in 1:length(solution)
|
||||
if isnan(evaluate(t, strings[i]...))
|
||||
@test isnan(solution[i])
|
||||
else
|
||||
@test evaluate(t, strings[i]...) ≈ solution[i] atol = 1e-4
|
||||
solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
|
||||
(DamerauLevenshtein(), [1 2 4 1 3 0 3 2 3 3 4 6 17 2 2 2]),
|
||||
(Jaro(), [0.05555556 0.17777778 0.23333333 0.04166667 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.2805556 0.2285714 0.48809524 0.3916667 0.07407407 0.16666667 0.21666667]),
|
||||
(QGram(1), [0 3 3 1 3 0 6 4 5 4 4 11 14 0 0 3]),
|
||||
(QGram(2), [ 6 7 7 1 2 0 4 4 7 8 4 13 32 8 6 5]),
|
||||
(Jaccard(1), [0.0 0.4285714 0.3750000 0.1666667 1.0 0.0 1.0000000 0.6666667 0.5714286 0.3750000 0.2000000 0.8333333 0.5000000 0.0 0.0 0.2500000]),
|
||||
(Jaccard(2), [ 0.7500000 0.8750000 0.7777778 0.1428571 1.0 NaN 1.0000000 1.0000000 0.7777778 0.8000000 0.3076923 1.0000000 0.9696970 0.6666667 1.0000000 0.8333333]),
|
||||
(Cosine(2), [0.6000000 0.7763932 0.6220355 0.0741799 NaN NaN 1.0000000 1.0000000 0.6348516 0.6619383 0.1679497 1.0000000 0.9407651 0.5000000 1.0000000 0.7113249]))
|
||||
# Test against the R package stringdist
|
||||
for x in solutions
|
||||
t, solution = x
|
||||
for i in 1:length(solution)
|
||||
if isnan(evaluate(t, strings[i]...))
|
||||
@test isnan(solution[i])
|
||||
else
|
||||
@test evaluate(t, strings[i]...) ≈ solution[i] atol = 1e-4
|
||||
end
|
||||
end
|
||||
end
|
||||
# test RatcliffObershelp
|
||||
solution = [83, 73, 62, 93, 0, 100, 0, 33, 62, 71, 83, 27, 33, 78, 50, 67]
|
||||
for i in eachindex(strings)
|
||||
@test round(Int, (1 - evaluate(RatcliffObershelp(), strings[i]...)) * 100) ≈ solution[i] atol = 1e-4
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#= R test
|
||||
library(stringdist)
|
||||
strings = matrix(data = c(
|
||||
|
@ -174,13 +189,6 @@ stringdist(strings[1,], strings[2,], method = "qgram", q = 1)
|
|||
|
||||
|
||||
|
||||
|
||||
# test RatcliffObershelp
|
||||
solution = [83, 73, 62, 93, 0, 100, 0, 33, 62, 71, 83, 27, 33, 78, 50, 67]
|
||||
for i in eachindex(strings)
|
||||
@test round(Int, (1 - evaluate(RatcliffObershelp(), strings[i]...)) * 100) ≈ solution[i] atol = 1e-4
|
||||
end
|
||||
|
||||
#= Fuzzywuzzy uses RatcliffObershelp if python-Levenshtein is not installed
|
||||
from fuzzywuzzy import fuzz
|
||||
strings = [
|
||||
|
@ -205,4 +213,3 @@ for x in strings:
|
|||
print(fuzz.ratio(x[0], x[1]))
|
||||
=#
|
||||
|
||||
end
|
||||
|
|
|
@ -3,170 +3,117 @@ using StringDistances, Test
|
|||
|
||||
@testset "Modifiers" begin
|
||||
|
||||
# Compare
|
||||
@test compare("", "abc", Hamming()) ≈ 0.0 atol = 1e-4
|
||||
@test compare("acc", "abc", Hamming()) ≈ 2/3 atol = 1e-4
|
||||
@test compare("saturday", "sunday", Hamming()) ≈ 1/8 atol = 1e-4
|
||||
# Hamming
|
||||
@test compare("", "abc", Hamming()) ≈ 0.0 atol = 1e-4
|
||||
@test compare("acc", "abc", Hamming()) ≈ 2/3 atol = 1e-4
|
||||
@test compare("saturday", "sunday", Hamming()) ≈ 1/8 atol = 1e-4
|
||||
@test compare("New York Yankees", "Yankees", Partial(Hamming())) ≈ 1
|
||||
@test compare("New York Yankees", "", Partial(Hamming())) ≈ 1
|
||||
compare("aüa", "aua", Hamming())
|
||||
|
||||
@test compare("", "abc", QGram(1)) ≈ 0.0 atol = 1e-4
|
||||
@test compare("abc", "cba", QGram(1)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("abc", "ccc", QGram(1)) ≈ 1/3 atol = 1e-4
|
||||
# Qgram
|
||||
@test compare("", "abc", QGram(1)) ≈ 0.0 atol = 1e-4
|
||||
@test compare("abc", "cba", QGram(1)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("abc", "ccc", QGram(1)) ≈ 1/3 atol = 1e-4
|
||||
compare("aüa", "aua", TokenMax(QGram(2)))
|
||||
@test compare("", "abc", Jaccard(2)) ≈ 0.0 atol = 1e-4
|
||||
@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("aa", "aa ", Partial(Jaccard(2))) ≈ 1.0
|
||||
@test compare("martha", "martha", Cosine(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol = 1e-4
|
||||
|
||||
@test compare("", "abc", Jaccard(2)) ≈ 0.0 atol = 1e-4
|
||||
# Jaro
|
||||
compare("aüa", "aua", Jaro())
|
||||
|
||||
@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", Cosine(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", Jaccard(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", Overlap(2)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("martha", "martha", SorensenDice(2)) ≈ 1.0 atol = 1e-4
|
||||
#Levenshtein
|
||||
compare("aüa", "aua", Levenshtein())
|
||||
compare("aüa", "aua", DamerauLevenshtein())
|
||||
|
||||
# Winkler
|
||||
@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.9611 atol = 1e-4
|
||||
@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.84 atol = 1e-4
|
||||
@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.81333 atol = 1e-4
|
||||
@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.975 atol = 1e-4
|
||||
@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
|
||||
@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
|
||||
|
||||
# RatcliffObershelp
|
||||
@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0
|
||||
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
|
||||
compare("aüa", "aua", TokenMax(RatcliffObershelp()))
|
||||
|
||||
@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
|
||||
@test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0
|
||||
@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444
|
||||
@test compare("HSINCHUANG", "SINJHUAN", Partial(RatcliffObershelp())) ≈ 0.875
|
||||
@test compare("HSINCHUANG", "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8
|
||||
@test compare("HSINCHUANG", "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8
|
||||
@test compare("HSINCHUANG", "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888
|
||||
@test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0
|
||||
@test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094
|
||||
@test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0
|
||||
@test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0
|
||||
@test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0
|
||||
@test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333
|
||||
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
|
||||
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", Partial(RatcliffObershelp()))) == 7
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 79
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", Partial(RatcliffObershelp()))) == 88
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSort(RatcliffObershelp()))) == 11
|
||||
@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", RatcliffObershelp())) == 39
|
||||
@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", Partial(RatcliffObershelp()))) == 88
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(RatcliffObershelp()))) == 39
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(Partial(RatcliffObershelp())))) == 88
|
||||
# not exactly the same because tokenmax uses the max of rounded tokenset etc
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 52
|
||||
|
||||
|
||||
# Winkler
|
||||
@test compare("martha", "marhta", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.9611 atol = 1e-4
|
||||
@test compare("dwayne", "duane", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.84 atol = 1e-4
|
||||
@test compare("dixon", "dicksonx", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.81333 atol = 1e-4
|
||||
@test compare("william", "williams", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.975 atol = 1e-4
|
||||
@test compare("", "foo", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
|
||||
@test compare("a", "a", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 1.0 atol = 1e-4
|
||||
@test compare("abc", "xyz", Winkler(Jaro(), 0.1, 0.0, 4)) ≈ 0.0 atol = 1e-4
|
||||
# check missing
|
||||
@test compare("ok", missing, Levenshtein()) === missing
|
||||
|
||||
# check min
|
||||
strings = [
|
||||
("martha", "marhta"),
|
||||
("dwayne", "duane") ,
|
||||
("dixon", "dicksonx"),
|
||||
("william", "williams"),
|
||||
("", "foo"),
|
||||
("a", "a"),
|
||||
("abc", "xyz"),
|
||||
("abc", "ccc"),
|
||||
("kitten", "sitting"),
|
||||
("saturday", "sunday"),
|
||||
("hi, my name is", "my name is"),
|
||||
("alborgów", "amoniak"),
|
||||
("cape sand recycling ", "edith ann graham"),
|
||||
( "jellyifhs", "jellyfish"),
|
||||
("ifhs", "fish"),
|
||||
("leia", "leela"),
|
||||
]
|
||||
for dist in (Levenshtein, DamerauLevenshtein)
|
||||
for i in eachindex(strings)
|
||||
if compare(strings[i]..., dist()) < 1 / 3
|
||||
@test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ 0.0
|
||||
else
|
||||
@test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ compare(strings[i]..., dist())
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# check find_best and find_all
|
||||
@test find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork"
|
||||
@test find_best("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == "NewYork"
|
||||
@test find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ["NewYork"]
|
||||
@test find_all("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ["NewYork", "Newark"]
|
||||
|
||||
strings = [
|
||||
("martha", "marhta"),
|
||||
("dwayne", "duane") ,
|
||||
("dixon", "dicksonx"),
|
||||
("william", "williams"),
|
||||
("", "foo")
|
||||
]
|
||||
solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000]
|
||||
for i in 1:length(solutions)
|
||||
@test compare(strings[i]..., Winkler(Jaro(), 0.1, 0.0, 4)) ≈ (1 - solutions[i]) atol = 1e-4
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Partial
|
||||
@test compare("aa", "aa ", Partial(Jaccard(2))) ≈ 1.0
|
||||
|
||||
@test compare("New York Yankees", "Yankees", Partial(RatcliffObershelp())) ≈ 1.0
|
||||
@test compare("New York Yankees", "", Partial(RatcliffObershelp())) ≈ 0.0
|
||||
@test compare("mariners vs angels", "los angeles angels at seattle mariners", Partial(RatcliffObershelp())) ≈ 0.444444444444
|
||||
|
||||
|
||||
s = "HSINCHUANG"
|
||||
@test compare(s, "SINJHUAN", Partial(RatcliffObershelp())) ≈ 0.875
|
||||
@test compare(s, "LSINJHUANG DISTRIC", Partial(RatcliffObershelp())) ≈ 0.8
|
||||
@test compare(s, "SINJHUANG DISTRICT", Partial(RatcliffObershelp())) ≈ 0.8
|
||||
@test compare(s, "SINJHUANG", Partial(RatcliffObershelp())) ≈ 0.8888888888888
|
||||
|
||||
@test compare("New York Yankees", "Yankees", Partial(Hamming())) ≈ 1
|
||||
@test compare("New York Yankees", "", Partial(Hamming())) ≈ 1
|
||||
|
||||
|
||||
|
||||
# Token
|
||||
@test compare("New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets", TokenSort(RatcliffObershelp())) ≈ 1.0
|
||||
@test compare("mariners vs angels", "los angeles angels of anaheim at seattle mariners", TokenSet(RatcliffObershelp())) ≈ 1.0 - 0.09090909090909094
|
||||
|
||||
|
||||
@test compare("New York Mets vs Atlanta Braves", "", RatcliffObershelp()) ≈ 0.0
|
||||
|
||||
|
||||
@test compare("New York Mets vs Atlanta Braves", "", TokenSort(RatcliffObershelp())) ≈ 0.0
|
||||
|
||||
# ADD AGAIN
|
||||
@test compare("mariners vs angels", "", TokenSet(RatcliffObershelp())) ≈ 0.0
|
||||
|
||||
|
||||
|
||||
|
||||
@test compare("mariners", "mariner", TokenMax(RatcliffObershelp())) ≈ 0.933333333333333
|
||||
|
||||
|
||||
|
||||
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("Atlanta Braves vs New York Mets")) 1.0
|
||||
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("los angeles angels of anaheim at seattle mariners")) 1.0 - 0.09090909090909094
|
||||
#@test_approx_eq compare(TokenSort(RatcliffObershelp()), graphemeiterator("New York Mets vs Atlanta Braves"), graphemeiterator("")) 0.0
|
||||
#@test_approx_eq compare(TokenSet(RatcliffObershelp()),graphemeiterator("mariners vs angels"), graphemeiterator("")) 0.0
|
||||
|
||||
|
||||
|
||||
@test compare("mariners vs angels", "los angeles angels at seattle mariners", TokenSet(Partial(RatcliffObershelp()))) ≈ 1.0
|
||||
|
||||
|
||||
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
|
||||
|
||||
|
||||
# test with fuzz ratio
|
||||
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", RatcliffObershelp())) == 5
|
||||
@test round(Int, 100 * compare("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。", Partial(RatcliffObershelp()))) == 7
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 79
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", Partial(RatcliffObershelp()))) == 88
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSort(RatcliffObershelp()))) == 11
|
||||
@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", RatcliffObershelp())) == 39
|
||||
@test round(Int, 100 * compare("mariners", "are mariner playing tomorrow", Partial(RatcliffObershelp()))) == 88
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(RatcliffObershelp()))) == 39
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenSet(Partial(RatcliffObershelp())))) == 88
|
||||
# not exactly the same because tokenmax uses the max of rounded tokenset etc
|
||||
@test round(Int, 100 * compare("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow", TokenMax(RatcliffObershelp()))) == 52
|
||||
|
||||
#= Python code
|
||||
from fuzzywuzzy import fuzz
|
||||
fuzz.ratio("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。")
|
||||
fuzz.partial_ratio("为人子女者要堂堂正正做人,千万不可作奸犯科,致使父母蒙羞", "此前稍早些时候中国商务部发布消息称,中美经贸高级别磋商双方牵头人通话,中方就美拟9月1日加征关税进行了严正交涉。")
|
||||
fuzz.WRatio("mariners", "mariner are playing tomorrow")
|
||||
fuzz.partial_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
fuzz.token_sort_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
fuzz.token_set_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
fuzz.partial_token_set_ratio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
fuzz.WRatio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
fuzz.WRatio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
fuzz.WRatio("mariners", "mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow mariner are playing tomorrow")
|
||||
=#
|
||||
|
||||
|
||||
using StringDistances, Test
|
||||
|
||||
# check with weird utf8 strings
|
||||
compare("aüa", "aua", TokenMax(RatcliffObershelp()))
|
||||
compare("aüa", "aua", TokenMax(QGram(2)))
|
||||
compare("aüa", "aua", DamerauLevenshtein())
|
||||
compare("aüa", "aua", Hamming())
|
||||
compare("aüa", "aua", Jaro())
|
||||
compare("aüa", "aua", Levenshtein())
|
||||
|
||||
|
||||
s1 = "aü☃"
|
||||
s2 = "aüaüafs"
|
||||
dist = QGram(4)
|
||||
@test evaluate(dist, s1, s2) == 4
|
||||
|
||||
# check Substrings work
|
||||
s1 = SubString(s1, 1, 4)
|
||||
s2 = SubString(s2, 1, 4)
|
||||
dist = QGram(2)
|
||||
@test evaluate(dist, s1, s2) == 2
|
||||
|
||||
|
||||
# check missing
|
||||
@test compare(s1, missing, Levenshtein()) === missing
|
||||
|
||||
# check min
|
||||
for dist in (Levenshtein, DamerauLevenshtein)
|
||||
for i in eachindex(strings)
|
||||
if compare(strings[i]..., dist()) < 1 / 3
|
||||
@test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ 0.0
|
||||
else
|
||||
@test compare(strings[i]..., dist() ; min_score = 1/ 3) ≈ compare(strings[i]..., dist())
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# check find_best and find_all
|
||||
@test find_best("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == "NewYork"
|
||||
@test find_best("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == "NewYork"
|
||||
@test find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ["NewYork"]
|
||||
@test find_all("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ["NewYork", "Newark"]
|
||||
|
||||
end
|
||||
=#
|
Loading…
Reference in New Issue