update to 0.6

pull/3/head
matthieugomez 2017-05-12 17:41:56 -04:00
parent 90f6865120
commit ed18107c03
11 changed files with 80 additions and 75 deletions

View File

@ -1,6 +1,6 @@
language: julia
julia:
- 0.4
- 0.6
- nightly
script:
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi

View File

@ -97,7 +97,7 @@ The package includes distance "modifiers", that can be applied to any distance.
It depends on your specific problem. As a rule of thumb,
- standardize strings before comparing them (lowercase, punctuation, whitespace, accents, abbreviations...)
- if the order of words does not matter, avoid edit distances.
- if word order does not matter, avoid edit distances.
## References

View File

@ -1,3 +1,3 @@
julia 0.4.1
julia 0.6-
Distances
Iterators

View File

@ -39,9 +39,9 @@ graphemeiterator
##
##############################################################################
typealias GraphemeIterator Base.UTF8proc.GraphemeIterator
#typealias AbstractStringorGraphemeIterator Union{AbstractString, Base.UTF8proc.GraphemeIterator}
typealias AbstractStringorGraphemeIterator AbstractString
const GraphemeIterator = Base.UTF8proc.GraphemeIterator
#const AbstractStringorGraphemeIterator = Union{AbstractString, Base.UTF8proc.GraphemeIterator}
const AbstractStringorGraphemeIterator = AbstractString
##############################################################################
##

View File

@ -49,7 +49,8 @@ function matching_blocks(s1::AbstractStringorGraphemeIterator, s2::AbstractStrin
return x
end
type RatcliffObershelp <: PreMetric end
struct RatcliffObershelp <: PreMetric end
function evaluate(dist::RatcliffObershelp, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
result = matching_blocks(s1, s2)
matched = 0

View File

@ -41,16 +41,16 @@ end
##############################################################################
type Levenshtein <: SemiMetric end
function evaluate(dist::Levenshtein, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
struct Levenshtein <: SemiMetric end
function evaluate(dist::Levenshtein, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
# prefix common to both strings can be ignored
k, start1, start2 = common_prefix(s1, s2)
done(s1, start1) && return len2 - k
# distance initialized to first row of matrix
# => distance between "" and s2[1:i]
v0 = Array(Int, len2 - k)
v0 = Array{Int}(len2 - k)
@inbounds for i2 in 1:(len2 - k)
v0[i2] = i2
end
@ -88,7 +88,7 @@ end
##
##############################################################################
type DamerauLevenshtein <: SemiMetric end
struct DamerauLevenshtein <: SemiMetric end
function evaluate(dist::DamerauLevenshtein, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
@ -96,11 +96,11 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractStringorGraphemeIterator
k, start1, start2 = common_prefix(s1, s2)
done(s1, start1) && return len2 - k
v0 = Array(Int, len2 - k)
v0 = Array{Int}(len2 - k)
@inbounds for i2 in 1:(len2 - k)
v0[i2] = i2
end
v2 = Array(Int, len2 - k)
v2 = Array{Int}(len2 - k)
ch1, = next(s1, start1)
current = 0
@ -156,7 +156,7 @@ end
##
##############################################################################
type Jaro <: SemiMetric end
struct Jaro <: SemiMetric end
function evaluate(dist::Jaro, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
# if len2 == 0, m = 0 so should be 1.0 according to wikipedia. Nope.

View File

@ -4,7 +4,7 @@
##
##############################################################################
immutable QGramIterator{S <: AbstractStringorGraphemeIterator, T <: Integer}
struct QGramIterator{S <: AbstractStringorGraphemeIterator, T <: Integer}
s::S # grapheme
l::Int # length of string
q::T # length of q-grams
@ -27,7 +27,7 @@ Base.eltype{S <: AbstractString, T}(qgram::QGramIterator{S, T}) = SubString{type
# Element type when the q-grams are drawn from a grapheme iterator: a SubString over the
# type of `qgram.s.s`. Uses `where` because `f{T}(...)` method parameters are deprecated in 0.6.
Base.eltype(qgram::QGramIterator{S, T}) where {S <: GraphemeIterator, T} = SubString{typeof(qgram.s.s)}
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
function Base.collect(qgram::QGramIterator)
x = Array(eltype(qgram), length(qgram))
x = Array{eltype(qgram)}(length(qgram))
i = 0
for q in qgram
i += 1
@ -44,7 +44,7 @@ Base.sort(qgram::QGramIterator) = sort!(collect(qgram))
##
##############################################################################
type CountInterator{T1 <: AbstractVector, T2 <: AbstractVector}
struct CountInterator{T1 <: AbstractVector, T2 <: AbstractVector}
v1::T1
v2::T2
end
@ -79,7 +79,7 @@ end
## Distance on strings is computed by set distance on qgram sets
##
##############################################################################
abstract AbstractQGram <: SemiMetric
abstract type AbstractQGram <: SemiMetric end
function evaluate(dist::AbstractQGram, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
sort1 = sort(QGramIterator(s1, len1, dist.q))
@ -96,7 +96,7 @@ end
##
##############################################################################
immutable QGram{T <: Integer} <: AbstractQGram
struct QGram{T <: Integer} <: AbstractQGram
q::T
end
QGram() = QGram(2)
@ -116,7 +116,7 @@ end
## 1 - v(s1, p).v(s2, p) / (||v(s1, p)|| * ||v(s2, p)||)
##############################################################################
immutable Cosine{T <: Integer} <: AbstractQGram
struct Cosine{T <: Integer} <: AbstractQGram
q::T
end
Cosine() = Cosine(2)
@ -140,7 +140,7 @@ end
##
##############################################################################
immutable Jaccard{T <: Integer} <: AbstractQGram
struct Jaccard{T <: Integer} <: AbstractQGram
q::T
end
Jaccard() = Jaccard(2)
@ -162,7 +162,7 @@ end
## 1 - 2 * |intersect(Q(s1, q), Q(s2, q))| / (|Q(s1, q)| + |Q(s2, q)|)
##############################################################################
immutable SorensenDice{T <: Integer} <: AbstractQGram
struct SorensenDice{T <: Integer} <: AbstractQGram
q::T
end
SorensenDice() = SorensenDice(2)
@ -184,7 +184,7 @@ end
## 1 - |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q)|)
##############################################################################
immutable Overlap{T <: Integer} <: AbstractQGram
struct Overlap{T <: Integer} <: AbstractQGram
q::T
end
Overlap() = Overlap(2)

View File

@ -4,7 +4,7 @@
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
##
##############################################################################
type Partial{T <: PreMetric} <: PreMetric
struct Partial{T <: PreMetric} <: PreMetric
dist::T
end
@ -55,7 +55,7 @@ end
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
##
##############################################################################
type TokenSort{T <: PreMetric} <: PreMetric
struct TokenSort{T <: PreMetric} <: PreMetric
dist::T
end
@ -77,7 +77,7 @@ end
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
##
##############################################################################
type TokenSet{T <: PreMetric} <: PreMetric
struct TokenSet{T <: PreMetric} <: PreMetric
dist::T
end
@ -125,7 +125,7 @@ end
## TokenMax
##
##############################################################################
type TokenMax{T <: PreMetric} <: PreMetric
struct TokenMax{T <: PreMetric} <: PreMetric
dist::T
end

View File

@ -4,7 +4,7 @@
##
##############################################################################
immutable Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
dist::T1
scaling_factor::T2 # scaling factor. Default to 0.1
boosting_limit::T3 # boost threshold. Default to 0.7

View File

@ -34,24 +34,24 @@ using StringDistances, Base.Test
@test evaluate(QGram(1), "abc", "cba") == 0
@test evaluate(QGram(1), "abc", "ccc") == 4
@test isnan(evaluate(Cosine(2), "", "abc"))
@test_approx_eq_eps evaluate(Cosine(2), "abc", "ccc") 1 1e-4
@test_approx_eq_eps evaluate(Cosine(2), "leia", "leela") 0.7113249 1e-4
@test_approx_eq evaluate(Jaccard(1), "", "abc") 1.0
@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4
@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4
@test_approx_eq_eps evaluate(SorensenDice(1), "night", "nacht") 0.4 1e-4
@test_approx_eq_eps evaluate(SorensenDice(2), "night", "nacht") 0.75 1e-4
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
# Q-gram based distances. Compare with `≈` so that `atol` is forwarded to `isapprox`.
@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
@test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
@test evaluate(Jaccard(1), "", "abc") ≈ 1.0
@test evaluate(Jaccard(1), "abc", "ccc") ≈ 0.666666 atol = 1e-4
@test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
@test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4
@test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
@test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
@test evaluate(Overlap(1), "context", "contact") ≈ 0.2 atol = 1e-4
@test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154
@test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579
@test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666
@test_approx_eq evaluate(RatcliffObershelp(), "", "pencilvaneya") 1.0
@test_approx_eq evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963
@test_approx_eq evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") 0.3913043478260869
@test_approx_eq evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") 0.24137931034482762
# Ratcliff/Obershelp distances, checked with `≈` at the default `isapprox` tolerance.
@test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
@test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
@test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
@test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
@test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
@test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
@ -87,7 +87,11 @@ solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
for x in solutions
t, solution = x
for i in 1:length(solution)
@test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4
# Evaluate once. NaN is never approximately equal to anything, so it is checked separately.
d = evaluate(t, strings[i]...)
if isnan(d)
    @test isnan(solution[i])
else
    @test d ≈ solution[i] atol = 1e-4
end
end
end

View File

@ -1,13 +1,13 @@
using StringDistances, Base.Test
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "martha", "marhta") 0.9611 1e-4
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "dwayne", "duane") 0.84 1e-4
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "dixon", "dicksonx") 0.81333 1e-4
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "william", "williams") 0.975 1e-4
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "", "foo") 0.0 1e-4
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "a", "a") 1.0 1e-4
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "abc", "xyz") 0.0 1e-4
# Jaro-Winkler similarity scores. `≈` is needed for the `atol` keyword to take effect.
@test compare(Winkler(Jaro(), 0.1, 0.0), "martha", "marhta") ≈ 0.9611 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "dwayne", "duane") ≈ 0.84 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "dixon", "dicksonx") ≈ 0.81333 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "william", "williams") ≈ 0.975 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "", "foo") ≈ 0.0 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "a", "a") ≈ 1.0 atol = 1e-4
@test compare(Winkler(Jaro(), 0.1, 0.0), "abc", "xyz") ≈ 0.0 atol = 1e-4
strings = [
("martha", "marhta"),
@ -29,47 +29,47 @@ strings = [
]
solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.22250000 0.16190476 0.43928571 0.49166667 0.04444444 0.16666667 0.17333333]
for i in 1:length(solutions)
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), strings[i]...) (1 - solutions[i]) 1e-4
# `solutions` holds distances; `compare` returns a similarity, hence `1 - solutions[i]`.
@test compare(Winkler(Jaro(), 0.1, 0.0), strings[i]...) ≈ (1 - solutions[i]) atol = 1e-4
end
@test_approx_eq_eps compare(Hamming(), "", "abc") 0.0 1e-4
@test_approx_eq_eps compare(Hamming(), "acc", "abc") 2/3 1e-4
@test_approx_eq_eps compare(Hamming(), "saturday", "sunday") 1/8 1e-4
# Hamming similarity, compared approximately within `atol`.
@test compare(Hamming(), "", "abc") ≈ 0.0 atol = 1e-4
@test compare(Hamming(), "acc", "abc") ≈ 2/3 atol = 1e-4
@test compare(Hamming(), "saturday", "sunday") ≈ 1/8 atol = 1e-4
@test_approx_eq_eps compare(QGram(1), "", "abc") 0.0 1e-4
@test_approx_eq_eps compare(QGram(1), "abc", "cba") 1.0 1e-4
@test_approx_eq_eps compare(QGram(1), "abc", "ccc") 1/3 1e-4
# Q-gram similarity, compared approximately within `atol`.
@test compare(QGram(1), "", "abc") ≈ 0.0 atol = 1e-4
@test compare(QGram(1), "abc", "cba") ≈ 1.0 atol = 1e-4
@test compare(QGram(1), "abc", "ccc") ≈ 1/3 atol = 1e-4
@test_approx_eq compare(Partial(RatcliffObershelp()), "New York Yankees", "Yankees") 1.0
@test_approx_eq compare(Partial(RatcliffObershelp()), "New York Yankees", "") 0.0
@test_approx_eq compare(Partial(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners") 0.444444444444
# Partial Ratcliff/Obershelp similarity, checked with `≈` at the default tolerance.
@test compare(Partial(RatcliffObershelp()), "New York Yankees", "Yankees") ≈ 1.0
@test compare(Partial(RatcliffObershelp()), "New York Yankees", "") ≈ 0.0
@test compare(Partial(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 0.444444444444
s = "HSINCHUANG"
@test_approx_eq compare(Partial(RatcliffObershelp()), s, "SINJHUAN") 0.875
@test_approx_eq compare(Partial(RatcliffObershelp()), s, "LSINJHUANG DISTRIC") 0.8
@test_approx_eq compare(Partial(RatcliffObershelp()), s, "SINJHUANG DISTRICT") 0.8
@test_approx_eq compare(Partial(RatcliffObershelp()), s, "SINJHUANG") 0.8888888888888
# Partial matches against `s`, checked with `≈` at the default tolerance.
@test compare(Partial(RatcliffObershelp()), s, "SINJHUAN") ≈ 0.875
@test compare(Partial(RatcliffObershelp()), s, "LSINJHUANG DISTRIC") ≈ 0.8
@test compare(Partial(RatcliffObershelp()), s, "SINJHUANG DISTRICT") ≈ 0.8
@test compare(Partial(RatcliffObershelp()), s, "SINJHUANG") ≈ 0.8888888888888
@test_approx_eq compare(Partial(Hamming()), "New York Yankees", "Yankees") 1
@test_approx_eq compare(Partial(Hamming()), "New York Yankees", "") 1
# Partial Hamming similarity, checked with `≈`.
@test compare(Partial(Hamming()), "New York Yankees", "Yankees") ≈ 1
@test compare(Partial(Hamming()), "New York Yankees", "") ≈ 1
@test_approx_eq compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets") 1.0
@test_approx_eq compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "los angeles angels of anaheim at seattle mariners") 1.0 - 0.09090909090909094
# Token-based modifiers on reordered or overlapping word sets, checked with `≈`.
@test compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets") ≈ 1.0
@test compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "los angeles angels of anaheim at seattle mariners") ≈ 1.0 - 0.09090909090909094
@test_approx_eq compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "") 0.0
@test_approx_eq compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "") 0.0
# Token-based modifiers against an empty string, checked with `≈`.
@test compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "") ≈ 0.0
@test compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "") ≈ 0.0
@test_approx_eq compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "") 0.0
# TokenMax against an empty string, checked with `≈`.
@test compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "") ≈ 0.0
@ -80,5 +80,5 @@ s = "HSINCHUANG"
@test_approx_eq compare(Winkler(Partial(Jaro())),"mariners vs angels", "los angeles angels at seattle mariners") 0.7378917378917379
@test_approx_eq compare(TokenSet(Partial(RatcliffObershelp())),"mariners vs angels", "los angeles angels at seattle mariners") 1.0
# Nested modifiers, checked with `≈` at the default tolerance.
@test compare(Winkler(Partial(Jaro())),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 0.7378917378917379
@test compare(TokenSet(Partial(RatcliffObershelp())),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 1.0