result_type for str metrics; fix type instability in RatcliffObershelp

pull/22/head
Dillon Daudert 2019-12-11 14:45:58 -05:00 committed by matthieugomez
parent fc5587a60c
commit 55221b5794
8 changed files with 117 additions and 74 deletions

1
.gitignore vendored
View File

@ -1,2 +1,3 @@
benchmark/benchmark.md
PC25
Manifest.toml

View File

@ -3,12 +3,12 @@ uuid = "88034a9c-02f8-509d-84a9-84ec65e18404"
version = "0.5.0"
[deps]
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
[compat]
julia = "1"
Distances = "0"
DataStructures = "0"
Distances = "0"
julia = "1"

View File

@ -7,10 +7,11 @@ module StringDistances
##############################################################################
using DataStructures
import Base: eltype, length, iterate, ==, hash, isless, convert, show, @deprecate
import Distances: evaluate, Hamming, hamming, PreMetric, SemiMetric
import Distances: evaluate, Hamming, hamming, PreMetric, result_type, SemiMetric
export
evaluate,
compare,
result_type,
Hamming,
Levenshtein,
DamerauLevenshtein,
@ -55,4 +56,4 @@ end
# nextind(s, i): return the index of the start of the character whose encoding starts after index i
# nextind(s, 0, N): return the index of the Nth character of s (or, if there are fewer than N characters, return ncodeunits(s) + (N - length(s)))
##############################################################################
##############################################################################

View File

@ -251,9 +251,9 @@ The distance between two strings is defined as one minus the number of matching
struct RatcliffObershelp <: PreMetric end
# Distance for `RatcliffObershelp`: one minus twice the summed last components of
# `matching_blocks(s1, s2)` divided by the combined length of `s1` and `s2`.
function evaluate(dist::RatcliffObershelp, s1::AbstractString, s2::AbstractString)
# NOTE(review): this assignment appears twice in a row (it looks like the old and new
# sides of a diff shown together); the second one simply overwrites the first.
n_matched = sum(last.(matching_blocks(s1, s2)))
n_matched = sum(last.(matching_blocks(s1, s2)))
len1, len2 = length(s1), length(s2)
# The value of this ternary is discarded. Its empty-input branch is the integer literal
# `0`, while the other branch is a floating-point expression.
len1 + len2 == 0 ? 0 : 1.0 - 2 * n_matched / (len1 + len2)
# This is the value actually returned. The empty-input branch is the float literal `0.`,
# so both branches are floating-point.
len1 + len2 == 0 ? 0. : 1.0 - 2 * n_matched / (len1 + len2)
end
function matching_blocks(s1::AbstractString, s2::AbstractString)
@ -277,3 +277,9 @@ function matching_blocks!(x::Set{Tuple{Int, Int, Int}}, s1::AbstractString, s2::
return x
end
# Metric types covered by the string `result_type` method defined right after these constants.
const string_metrics = (Hamming, Jaro, Levenshtein, DamerauLevenshtein, RatcliffObershelp)
# Union of every type listed in `string_metrics`, so one method signature can cover them all.
const UnionStringMetrics = Union{string_metrics...}
"""
    result_type(m::UnionStringMetrics, a::AbstractString, b::AbstractString)

Return the type of `evaluate(m, oneunit(a), oneunit(b))`, i.e. the type obtained by
evaluating the metric `m` on the unit values of `a` and `b`.
"""
result_type(m::UnionStringMetrics, a::AbstractString, b::AbstractString) =
    typeof(evaluate(m, oneunit(a), oneunit(b)))

View File

@ -291,3 +291,6 @@ function evaluate(dist::Overlap, count_dict)
1.0 - nintersect / min(ndistinct1, ndistinct2)
end
# result types for QGram distances
# Evaluate the q-gram distance on the unit values of `a` and `b` and report the type
# of that result.
function result_type(m::AbstractQGramDistance, a::AbstractString, b::AbstractString)
    probe = evaluate(m, oneunit(a), oneunit(b))
    return typeof(probe)
end

View File

@ -1,64 +1,106 @@
using StringDistances, Test
@testset "Distances" begin
@test evaluate(Levenshtein(), "", "") == 0
@test evaluate(Levenshtein(), "abc", "") == 3
@test evaluate(Levenshtein(), "", "abc") == 3
@test evaluate(Levenshtein(), "bc", "abc") == 1
@test evaluate(Levenshtein(), "kitten", "sitting") == 3
@test evaluate(Levenshtein(), "saturday", "sunday") == 3
@testset "Levenshtein" begin
    # Each entry is (first string, second string, expected distance).
    cases = [
        ("", "", 0),
        ("abc", "", 3),
        ("", "abc", 3),
        ("bc", "abc", 1),
        ("kitten", "sitting", 3),
        ("saturday", "sunday", 3),
        ("hi, my name is", "my name is", 4),
        ("alborgów", "amoniak", 6),
    ]
    for (left, right, expected) in cases
        @test evaluate(Levenshtein(), left, right) == expected
    end
    # The reported result type is `Int`, and `evaluate` must be concretely inferred.
    @test result_type(Levenshtein(), "hello", "world") == Int
    @inferred evaluate(Levenshtein(), "", "")
end
@test evaluate(Levenshtein(), "hi, my name is", "my name is") == 4
@test evaluate(Levenshtein(), "alborgów", "amoniak") == 6
@testset "DamerauLevenshtein" begin
    # Each entry is (first string, second string, expected distance).
    cases = [
        ("", "", 0),
        ("abc", "", 3),
        ("bc", "abc", 1),
        ("fuor", "four", 1),
        ("abcd", "acb", 2),
        ("cape sand recycling ", "edith ann graham", 17),
        ("jellyifhs", "jellyfish", 2),
        ("ifhs", "fish", 2),
    ]
    for (left, right, expected) in cases
        @test evaluate(DamerauLevenshtein(), left, right) == expected
    end
    # The reported result type is `Int`, and `evaluate` must be concretely inferred.
    @test result_type(DamerauLevenshtein(), "hello", "world") == Int
    @inferred evaluate(DamerauLevenshtein(), "", "")
end
@test evaluate(DamerauLevenshtein(), "", "") == 0
@test evaluate(DamerauLevenshtein(), "abc", "") == 3
@test evaluate(DamerauLevenshtein(), "bc", "abc") == 1
@test evaluate(DamerauLevenshtein(), "fuor", "four") == 1
@test evaluate(DamerauLevenshtein(), "abcd", "acb") == 2
@test evaluate(DamerauLevenshtein(), "cape sand recycling ", "edith ann graham") == 17
@test evaluate(DamerauLevenshtein(), "jellyifhs", "jellyfish") == 2
@test evaluate(DamerauLevenshtein(), "ifhs", "fish") == 2
@testset "Hamming" begin
    # Each entry is (first string, second string, expected distance).
    cases = [
        ("", "", 0),
        ("", "abc", 3),
        ("abc", "abc", 0),
        ("acc", "abc", 1),
        ("abcd", "abc", 1),
        ("abc", "abcd", 1),
        ("testing", "this is a test", 13),
        ("saturday", "sunday", 7),
    ]
    for (left, right, expected) in cases
        @test evaluate(Hamming(), left, right) == expected
    end
    # The reported result type is `Int`, and `evaluate` must be concretely inferred.
    @test result_type(Hamming(), "hello", "world") == Int
    @inferred evaluate(Hamming(), "", "")
end
@test evaluate(Hamming(), "", "") == 0
@test evaluate(Hamming(), "", "abc") == 3
@test evaluate(Hamming(), "abc", "abc") == 0
@test evaluate(Hamming(), "acc", "abc") == 1
@test evaluate(Hamming(), "abcd", "abc") == 1
@test evaluate(Hamming(), "abc", "abcd") == 1
@test evaluate(Hamming(), "testing", "this is a test") == 13
@test evaluate(Hamming(), "saturday", "sunday") == 7
@testset "QGram" begin
    # Each entry is (first string, second string, expected distance) for q = 1.
    cases = [
        ("abc", "abc", 0),
        ("", "abc", 3),
        ("abc", "cba", 0),
        ("abc", "ccc", 4),
    ]
    for (left, right, expected) in cases
        @test evaluate(QGram(1), left, right) == expected
    end
    # The reported result type is `Int`, and `evaluate` must be concretely inferred.
    @test result_type(QGram(1), "hello", "world") == Int
    @inferred evaluate(QGram(1), "", "")
end
@test evaluate(QGram(1), "abc", "abc") == 0
@test evaluate(QGram(1), "", "abc") == 3
@test evaluate(QGram(1), "abc", "cba") == 0
@test evaluate(QGram(1), "abc", "ccc") == 4
@test isnan(evaluate(Cosine(2), "", "abc"))
@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
@test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
@test evaluate(Jaccard(1), "", "abc") ≈ 1.0
@test evaluate(Jaccard(1), "abc", "ccc") ≈ .666666 atol = 1e-4
@test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
@test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4
@test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
@test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
@test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4
@testset "Cosine" begin
    # With an empty first string the distance is expected to be NaN.
    @test isnan(evaluate(Cosine(2), "", "abc"))
    # Float results are compared approximately (`≈` with an absolute tolerance).
    @test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
    @test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
    # The reported result type is the default float type.
    @test result_type(Cosine(2), "hello", "world") == typeof(float(1))
    # `evaluate` must be concretely inferred for both empty and non-empty input.
    @inferred evaluate(Cosine(2), "", "")
    @inferred evaluate(Cosine(2), "abc", "ccc")
end
@testset "Jaccard" begin
    # Float results are compared approximately (`≈`, with an absolute tolerance where given).
    @test evaluate(Jaccard(1), "", "abc") ≈ 1.0
    @test evaluate(Jaccard(1), "abc", "ccc") ≈ .666666 atol = 1e-4
    @test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
    # The reported result type is the default float type, and `evaluate` must be inferrable.
    @test result_type(Jaccard(1), "hello", "world") == typeof(float(1))
    @inferred evaluate(Jaccard(1), "", "")
end
@test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
@test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
@test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
@test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
@test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
@test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
@testset "SorensenDice" begin
    # Float results are compared approximately (`≈` with an absolute tolerance).
    @test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4
    @test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
    # The reported result type is the default float type, and `evaluate` must be inferrable.
    @test result_type(SorensenDice(1), "hello", "world") == typeof(float(1))
    @inferred evaluate(SorensenDice(1), "", "")
end
@testset "Overlap" begin
    # Float results are compared approximately (`≈` with an absolute tolerance).
    @test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
    @test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4
    # The reported result type is the default float type, and `evaluate` must be inferrable.
    @test result_type(Overlap(1), "hello", "world") == typeof(float(1))
    @inferred evaluate(Overlap(1), "", "")
end
@test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
@testset "RatcliffObershelp" begin
    # Float results are compared approximately (`≈`); several expected values are
    # written as one minus a similarity score.
    @test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
    @test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
    @test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
    @test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
    @test evaluate(RatcliffObershelp(), "NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
    @test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
    @test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
    # The reported result type is the default float type; the empty-string call must be
    # concretely inferred as well.
    @test result_type(RatcliffObershelp(), "hello", "world") == typeof(float(1))
    @inferred evaluate(RatcliffObershelp(), "", "")
end
@testset "Jaro" begin
    # Float results are compared approximately (`≈`).
    @test evaluate(Jaro(), "martha", "marhta") ≈ 0.05555555555555547
    # The same pair of strings in both argument orders is expected to give the same value.
    @test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
    @test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
    # The reported result type is the default float type.
    @test result_type(Jaro(), "hello", "world") == typeof(float(1))
end
@test evaluate(Jaro(), "es an ", " vs an") ≈ 0.2777777777777777
@test evaluate(Jaro(), " vs an", "es an ") ≈ 0.2777777777777777
strings = [
("martha", "marhta"),
("dwayne", "duane") ,
@ -120,7 +162,7 @@ strings = matrix(data = c(
"cape sand recycling ", "edith ann graham",
"jellyifhs", "jellyfish",
"ifhs", "fish",
"leia", "leela"),
"leia", "leela"),
nrow = 2
)
stringdist(strings[1,], strings[2,], method = "jw", p = 0)
@ -163,4 +205,4 @@ for x in strings:
print(fuzz.ratio(x[0], x[1]))
=#
end

View File

@ -1,6 +1,8 @@
using StringDistances, Test
@testset "Modifiers" begin
# Compare
@test compare("", "abc", Hamming()) ≈ 0.0 atol = 1e-4
@test compare("acc", "abc", Hamming()) ≈ 2/3 atol = 1e-4
@ -167,5 +169,4 @@ end
@test find_all("New York", ["NewYork", "Newark", "San Francisco"], Levenshtein()) == ["NewYork"]
@test find_all("New York", ["NewYork", "Newark", "San Francisco"], Jaro()) == ["NewYork", "Newark"]
end

View File

@ -1,16 +1,5 @@
using StringDistances
using Test
# Test files to include.
tests = ["distances.jl", "modifiers.jl"]
println("Running tests:")
# Include each file in turn and print a PASSED or FAILED line (with ANSI colour escapes).
for test in tests
try
include(test)
println("\t\033[1m\033[32mPASSED\033[0m: $(test)")
catch e
println("\t\033[1m\033[31mFAILED\033[0m: $(test)")
# Show the error together with `backtrace()`, then propagate it so the run still fails.
showerror(stdout, e, backtrace())
rethrow(e)
end
end
# NOTE(review): these two includes repeat the files already run by the loop above — this
# looks like the old and new sides of a diff shown together; confirm which one is intended.
include("distances.jl")
include("modifiers.jl")