update to 0.6
parent
90f6865120
commit
ed18107c03
|
@ -1,6 +1,6 @@
|
|||
language: julia
|
||||
julia:
|
||||
- 0.4
|
||||
- 0.6
|
||||
- nightly
|
||||
script:
|
||||
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
|
||||
|
|
|
@ -97,7 +97,7 @@ The package includes distance "modifiers", that can be applied to any distance.
|
|||
|
||||
It depends on your specific problem. As a rule of thumb,
|
||||
- standardize strings before comparing them (lowercase, punctuation, whitespace, accents, abbreviations...)
|
||||
- if the order of words does not matter, avoid edit distances.
|
||||
- if word order does not matter, avoid edit distances.
|
||||
|
||||
|
||||
## References
|
||||
|
|
|
@ -39,9 +39,9 @@ graphemeiterator
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
typealias GraphemeIterator Base.UTF8proc.GraphemeIterator
|
||||
#typealias AbstractStringorGraphemeIterator Union{AbstractString, Base.UTF8proc.GraphemeIterator}
|
||||
typealias AbstractStringorGraphemeIterator AbstractString
|
||||
const GraphemeIterator = Base.UTF8proc.GraphemeIterator
|
||||
#const AbstractStringorGraphemeIterator = Union{AbstractString, Base.UTF8proc.GraphemeIterator}
|
||||
const AbstractStringorGraphemeIterator = AbstractString
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
|
|
|
@ -49,7 +49,8 @@ function matching_blocks(s1::AbstractStringorGraphemeIterator, s2::AbstractStrin
|
|||
return x
|
||||
end
|
||||
|
||||
type RatcliffObershelp <: PreMetric end
|
||||
struct RatcliffObershelp <: PreMetric end
|
||||
|
||||
function evaluate(dist::RatcliffObershelp, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
result = matching_blocks(s1, s2)
|
||||
matched = 0
|
||||
|
|
|
@ -41,16 +41,16 @@ end
|
|||
##############################################################################
|
||||
|
||||
|
||||
type Levenshtein <: SemiMetric end
|
||||
function evaluate(dist::Levenshtein, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
struct Levenshtein <: SemiMetric end
|
||||
|
||||
function evaluate(dist::Levenshtein, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
# prefix common to both strings can be ignored
|
||||
k, start1, start2 = common_prefix(s1, s2)
|
||||
done(s1, start1) && return len2 - k
|
||||
|
||||
# distance initialized to first row of matrix
|
||||
# => distance between "" and s2[1:i]
|
||||
v0 = Array(Int, len2 - k)
|
||||
v0 = Array{Int}(len2 - k)
|
||||
@inbounds for i2 in 1:(len2 - k)
|
||||
v0[i2] = i2
|
||||
end
|
||||
|
@ -88,7 +88,7 @@ end
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
type DamerauLevenshtein <: SemiMetric end
|
||||
struct DamerauLevenshtein <: SemiMetric end
|
||||
|
||||
function evaluate(dist::DamerauLevenshtein, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
|
||||
|
@ -96,11 +96,11 @@ function evaluate(dist::DamerauLevenshtein, s1::AbstractStringorGraphemeIterator
|
|||
k, start1, start2 = common_prefix(s1, s2)
|
||||
done(s1, start1) && return len2 - k
|
||||
|
||||
v0 = Array(Int, len2 - k)
|
||||
v0 = Array{Int}(len2 - k)
|
||||
@inbounds for i2 in 1:(len2 - k)
|
||||
v0[i2] = i2
|
||||
end
|
||||
v2 = Array(Int, len2 - k)
|
||||
v2 = Array{Int}(len2 - k)
|
||||
|
||||
ch1, = next(s1, start1)
|
||||
current = 0
|
||||
|
@ -156,7 +156,7 @@ end
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
type Jaro <: SemiMetric end
|
||||
struct Jaro <: SemiMetric end
|
||||
|
||||
function evaluate(dist::Jaro, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
# if len2 == 0, m = 0 so should be 1.0 according to wikipedia. Nope.
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
immutable QGramIterator{S <: AbstractStringorGraphemeIterator, T <: Integer}
|
||||
struct QGramIterator{S <: AbstractStringorGraphemeIterator, T <: Integer}
|
||||
s::S # grapheme
|
||||
l::Int # length of string
|
||||
q::T # length of q-grams
|
||||
|
@ -27,7 +27,7 @@ Base.eltype{S <: AbstractString, T}(qgram::QGramIterator{S, T}) = SubString{type
|
|||
Base.eltype{S <: GraphemeIterator, T}(qgram::QGramIterator{S, T}) = SubString{typeof(qgram.s.s)}
|
||||
Base.length(qgram::QGramIterator) = max(qgram.l - qgram.q + 1, 0)
|
||||
function Base.collect(qgram::QGramIterator)
|
||||
x = Array(eltype(qgram), length(qgram))
|
||||
x = Array{eltype(qgram)}(length(qgram))
|
||||
i = 0
|
||||
for q in qgram
|
||||
i += 1
|
||||
|
@ -44,7 +44,7 @@ Base.sort(qgram::QGramIterator) = sort!(collect(qgram))
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
type CountInterator{T1 <: AbstractVector, T2 <: AbstractVector}
|
||||
struct CountInterator{T1 <: AbstractVector, T2 <: AbstractVector}
|
||||
v1::T1
|
||||
v2::T2
|
||||
end
|
||||
|
@ -79,7 +79,7 @@ end
|
|||
## Distance on strings is computed by set distance on qgram sets
|
||||
##
|
||||
##############################################################################
|
||||
abstract AbstractQGram <: SemiMetric
|
||||
abstract type AbstractQGram <: SemiMetric end
|
||||
|
||||
function evaluate(dist::AbstractQGram, s1::AbstractStringorGraphemeIterator, s2::AbstractStringorGraphemeIterator, len1::Integer, len2::Integer)
|
||||
sort1 = sort(QGramIterator(s1, len1, dist.q))
|
||||
|
@ -96,7 +96,7 @@ end
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
immutable QGram{T <: Integer} <: AbstractQGram
|
||||
struct QGram{T <: Integer} <: AbstractQGram
|
||||
q::T
|
||||
end
|
||||
QGram() = QGram(2)
|
||||
|
@ -116,7 +116,7 @@ end
|
|||
## 1 - v(s1, p).v(s2, p) / (||v(s1, p)|| * ||v(s2, p)||)
|
||||
##############################################################################
|
||||
|
||||
immutable Cosine{T <: Integer} <: AbstractQGram
|
||||
struct Cosine{T <: Integer} <: AbstractQGram
|
||||
q::T
|
||||
end
|
||||
Cosine() = Cosine(2)
|
||||
|
@ -140,7 +140,7 @@ end
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
immutable Jaccard{T <: Integer} <: AbstractQGram
|
||||
struct Jaccard{T <: Integer} <: AbstractQGram
|
||||
q::T
|
||||
end
|
||||
Jaccard() = Jaccard(2)
|
||||
|
@ -162,7 +162,7 @@ end
|
|||
## 1 - 2 * |intersect(Q(s1, q), Q(s2, q))| / (|Q(s1, q)| + |Q(s2, q)|)
|
||||
##############################################################################
|
||||
|
||||
immutable SorensenDice{T <: Integer} <: AbstractQGram
|
||||
struct SorensenDice{T <: Integer} <: AbstractQGram
|
||||
q::T
|
||||
end
|
||||
SorensenDice() = SorensenDice(2)
|
||||
|
@ -184,7 +184,7 @@ end
|
|||
## 1 - |intersect(Q(s1, q), Q(s2, q))| / min(|Q(s1, q)|, |Q(s2, q)|)
|
||||
##############################################################################
|
||||
|
||||
immutable Overlap{T <: Integer} <: AbstractQGram
|
||||
struct Overlap{T <: Integer} <: AbstractQGram
|
||||
q::T
|
||||
end
|
||||
Overlap() = Overlap(2)
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
##
|
||||
##############################################################################
|
||||
type Partial{T <: PreMetric} <: PreMetric
|
||||
struct Partial{T <: PreMetric} <: PreMetric
|
||||
dist::T
|
||||
end
|
||||
|
||||
|
@ -55,7 +55,7 @@ end
|
|||
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
##
|
||||
##############################################################################
|
||||
type TokenSort{T <: PreMetric} <: PreMetric
|
||||
struct TokenSort{T <: PreMetric} <: PreMetric
|
||||
dist::T
|
||||
end
|
||||
|
||||
|
@ -77,7 +77,7 @@ end
|
|||
## http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
|
||||
##
|
||||
##############################################################################
|
||||
type TokenSet{T <: PreMetric} <: PreMetric
|
||||
struct TokenSet{T <: PreMetric} <: PreMetric
|
||||
dist::T
|
||||
end
|
||||
|
||||
|
@ -125,7 +125,7 @@ end
|
|||
## TokenMax
|
||||
##
|
||||
##############################################################################
|
||||
type TokenMax{T <: PreMetric} <: PreMetric
|
||||
struct TokenMax{T <: PreMetric} <: PreMetric
|
||||
dist::T
|
||||
end
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
##
|
||||
##############################################################################
|
||||
|
||||
immutable Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
|
||||
struct Winkler{T1 <: PreMetric, T2 <: Real, T3 <: Real} <: PreMetric
|
||||
dist::T1
|
||||
scaling_factor::T2 # scaling factor. Default to 0.1
|
||||
boosting_limit::T3 # boost threshold. Default to 0.7
|
||||
|
|
|
@ -34,24 +34,24 @@ using StringDistances, Base.Test
|
|||
@test evaluate(QGram(1), "abc", "cba") == 0
|
||||
@test evaluate(QGram(1), "abc", "ccc") == 4
|
||||
@test isnan(evaluate(Cosine(2), "", "abc"))
|
||||
@test_approx_eq_eps evaluate(Cosine(2), "abc", "ccc") 1 1e-4
|
||||
@test_approx_eq_eps evaluate(Cosine(2), "leia", "leela") 0.7113249 1e-4
|
||||
@test_approx_eq evaluate(Jaccard(1), "", "abc") 1.0
|
||||
@test_approx_eq_eps evaluate(Jaccard(1), "abc", "ccc") .666666 1e-4
|
||||
@test_approx_eq_eps evaluate(Jaccard(2), "leia", "leela") 0.83333 1e-4
|
||||
@test_approx_eq_eps evaluate(SorensenDice(1), "night", "nacht") 0.4 1e-4
|
||||
@test_approx_eq_eps evaluate(SorensenDice(2), "night", "nacht") 0.75 1e-4
|
||||
@test_approx_eq_eps evaluate(Overlap(1), "night", "nacht") 0.4 1e-4
|
||||
@test_approx_eq_eps evaluate(Overlap(1), "context", "contact") .2 1e-4
|
||||
@test evaluate(Cosine(2), "abc", "ccc") ≈ 1 atol = 1e-4
|
||||
@test evaluate(Cosine(2), "leia", "leela") ≈ 0.7113249 atol = 1e-4
|
||||
@test evaluate(Jaccard(1), "", "abc") ≈ 1.0
|
||||
@test evaluate(Jaccard(1), "abc", "ccc") ≈ .666666 atol = 1e-4
|
||||
@test evaluate(Jaccard(2), "leia", "leela") ≈ 0.83333 atol = 1e-4
|
||||
@test evaluate(SorensenDice(1), "night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test evaluate(SorensenDice(2), "night", "nacht") ≈ 0.75 atol = 1e-4
|
||||
@test evaluate(Overlap(1), "night", "nacht") ≈ 0.4 atol = 1e-4
|
||||
@test evaluate(Overlap(1), "context", "contact") ≈ .2 atol = 1e-4
|
||||
|
||||
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "dixon", "dicksonx") 1 - 0.6153846153846154
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "alexandre", "aleksander") 1 - 0.7368421052631579
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") 1 - 0.6666666666666
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "", "pencilvaneya") 1.0
|
||||
@test_approx_eq evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") 1 - 0.962962962963
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") 0.3913043478260869
|
||||
@test_approx_eq evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") 0.24137931034482762
|
||||
@test evaluate(RatcliffObershelp(), "dixon", "dicksonx") ≈ 1 - 0.6153846153846154
|
||||
@test evaluate(RatcliffObershelp(), "alexandre", "aleksander") ≈ 1 - 0.7368421052631579
|
||||
@test evaluate(RatcliffObershelp(), "pennsylvania", "pencilvaneya") ≈ 1 - 0.6666666666666
|
||||
@test evaluate(RatcliffObershelp(), "", "pencilvaneya") ≈ 1.0
|
||||
@test evaluate(RatcliffObershelp(),"NEW YORK METS", "NEW YORK MEATS") ≈ 1 - 0.962962962963
|
||||
@test evaluate(RatcliffObershelp(), "Yankees", "New York Yankees") ≈ 0.3913043478260869
|
||||
@test evaluate(RatcliffObershelp(), "New York Mets", "New York Yankees") ≈ 0.24137931034482762
|
||||
|
||||
|
||||
|
||||
|
@ -87,7 +87,11 @@ solutions = ((Levenshtein(), [2 2 4 1 3 0 3 2 3 3 4 6 17 3 3 2]),
|
|||
for x in solutions
|
||||
t, solution = x
|
||||
for i in 1:length(solution)
|
||||
@test_approx_eq_eps evaluate(t, strings[i]...) solution[i] 1e-4
|
||||
if isnan(evaluate(t, strings[i]...))
|
||||
@test isnan(solution[i])
|
||||
else
|
||||
@test evaluate(t, strings[i]...) ≈ solution[i] atol = 1e-4
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
|
||||
using StringDistances, Base.Test
|
||||
|
||||
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "martha", "marhta") 0.9611 1e-4
|
||||
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "dwayne", "duane") 0.84 1e-4
|
||||
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "dixon", "dicksonx") 0.81333 1e-4
|
||||
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "william", "williams") 0.975 1e-4
|
||||
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "", "foo") 0.0 1e-4
|
||||
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "a", "a") 1.0 1e-4
|
||||
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), "abc", "xyz") 0.0 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "martha", "marhta") ≈ 0.9611 atol = 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "dwayne", "duane") ≈ 0.84 atol = 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "dixon", "dicksonx") ≈ 0.81333 atol = 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "william", "williams") ≈ 0.975 atol = 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "", "foo") ≈ 0.0 atol = 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "a", "a") ≈ 1.0 atol = 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), "abc", "xyz") ≈ 0.0 atol = 1e-4
|
||||
|
||||
strings = [
|
||||
("martha", "marhta"),
|
||||
|
@ -29,47 +29,47 @@ strings = [
|
|||
]
|
||||
solutions = [0.03888889 0.16000000 0.18666667 0.02500000 1.00000000 0.00000000 1.00000000 0.44444444 0.25396825 0.22250000 0.16190476 0.43928571 0.49166667 0.04444444 0.16666667 0.17333333]
|
||||
for i in 1:length(solutions)
|
||||
@test_approx_eq_eps compare(Winkler(Jaro(), 0.1, 0.0), strings[i]...) (1 - solutions[i]) 1e-4
|
||||
@test compare(Winkler(Jaro(), 0.1, 0.0), strings[i]...) ≈ (1 - solutions[i]) atol = 1e-4
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
@test_approx_eq_eps compare(Hamming(), "", "abc") 0.0 1e-4
|
||||
@test_approx_eq_eps compare(Hamming(), "acc", "abc") 2/3 1e-4
|
||||
@test_approx_eq_eps compare(Hamming(), "saturday", "sunday") 1/8 1e-4
|
||||
@test compare(Hamming(), "", "abc") ≈ 0.0 atol = 1e-4
|
||||
@test compare(Hamming(), "acc", "abc") ≈ 2/3 atol = 1e-4
|
||||
@test compare(Hamming(), "saturday", "sunday") ≈ 1/8 atol = 1e-4
|
||||
|
||||
@test_approx_eq_eps compare(QGram(1), "", "abc") 0.0 1e-4
|
||||
@test_approx_eq_eps compare(QGram(1), "abc", "cba") 1.0 1e-4
|
||||
@test_approx_eq_eps compare(QGram(1), "abc", "ccc") 1/3 1e-4
|
||||
@test compare(QGram(1), "", "abc") ≈ 0.0 atol = 1e-4
|
||||
@test compare(QGram(1), "abc", "cba") ≈ 1.0 atol = 1e-4
|
||||
@test compare(QGram(1), "abc", "ccc") ≈ 1/3 atol = 1e-4
|
||||
|
||||
|
||||
@test_approx_eq compare(Partial(RatcliffObershelp()), "New York Yankees", "Yankees") 1.0
|
||||
@test_approx_eq compare(Partial(RatcliffObershelp()), "New York Yankees", "") 0.0
|
||||
@test_approx_eq compare(Partial(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners") 0.444444444444
|
||||
@test compare(Partial(RatcliffObershelp()), "New York Yankees", "Yankees") ≈ 1.0
|
||||
@test compare(Partial(RatcliffObershelp()), "New York Yankees", "") ≈ 0.0
|
||||
@test compare(Partial(RatcliffObershelp()),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 0.444444444444
|
||||
|
||||
|
||||
s = "HSINCHUANG"
|
||||
@test_approx_eq compare(Partial(RatcliffObershelp()), s, "SINJHUAN") 0.875
|
||||
@test_approx_eq compare(Partial(RatcliffObershelp()), s, "LSINJHUANG DISTRIC") 0.8
|
||||
@test_approx_eq compare(Partial(RatcliffObershelp()), s, "SINJHUANG DISTRICT") 0.8
|
||||
@test_approx_eq compare(Partial(RatcliffObershelp()), s, "SINJHUANG") 0.8888888888888
|
||||
@test compare(Partial(RatcliffObershelp()), s, "SINJHUAN") ≈ 0.875
|
||||
@test compare(Partial(RatcliffObershelp()), s, "LSINJHUANG DISTRIC") ≈ 0.8
|
||||
@test compare(Partial(RatcliffObershelp()), s, "SINJHUANG DISTRICT") ≈ 0.8
|
||||
@test compare(Partial(RatcliffObershelp()), s, "SINJHUANG") ≈ 0.8888888888888
|
||||
|
||||
@test_approx_eq compare(Partial(Hamming()), "New York Yankees", "Yankees") 1
|
||||
@test_approx_eq compare(Partial(Hamming()), "New York Yankees", "") 1
|
||||
@test compare(Partial(Hamming()), "New York Yankees", "Yankees") ≈ 1
|
||||
@test compare(Partial(Hamming()), "New York Yankees", "") ≈ 1
|
||||
|
||||
|
||||
|
||||
|
||||
@test_approx_eq compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets") 1.0
|
||||
@test_approx_eq compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "los angeles angels of anaheim at seattle mariners") 1.0 - 0.09090909090909094
|
||||
@test compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "Atlanta Braves vs New York Mets") ≈ 1.0
|
||||
@test compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "los angeles angels of anaheim at seattle mariners") ≈ 1.0 - 0.09090909090909094
|
||||
|
||||
|
||||
@test_approx_eq compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "") 0.0
|
||||
@test_approx_eq compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "") 0.0
|
||||
@test compare(TokenSort(RatcliffObershelp()), "New York Mets vs Atlanta Braves", "") ≈ 0.0
|
||||
@test compare(TokenSet(RatcliffObershelp()),"mariners vs angels", "") ≈ 0.0
|
||||
|
||||
|
||||
@test_approx_eq compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "") 0.0
|
||||
@test compare(TokenMax(RatcliffObershelp()),"mariners vs angels", "") ≈ 0.0
|
||||
|
||||
|
||||
|
||||
|
@ -80,5 +80,5 @@ s = "HSINCHUANG"
|
|||
|
||||
|
||||
|
||||
@test_approx_eq compare(Winkler(Partial(Jaro())),"mariners vs angels", "los angeles angels at seattle mariners") 0.7378917378917379
|
||||
@test_approx_eq compare(TokenSet(Partial(RatcliffObershelp())),"mariners vs angels", "los angeles angels at seattle mariners") 1.0
|
||||
@test compare(Winkler(Partial(Jaro())),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 0.7378917378917379
|
||||
@test compare(TokenSet(Partial(RatcliffObershelp())),"mariners vs angels", "los angeles angels at seattle mariners") ≈ 1.0
|
Loading…
Reference in New Issue