From f7cb7f82dcf7c296549e1925dc4a54eb4a6bf997 Mon Sep 17 00:00:00 2001 From: matthieugomez Date: Sat, 24 Oct 2015 12:45:24 -0400 Subject: [PATCH] add unicode support --- README.md | 26 +++--- src/edit_distances.jl | 183 +++++++++++++++++++++------------------ src/qgrams_distances.jl | 66 ++++++++------ src/scoring_distances.jl | 144 ++++++++++++++++++++++++++++++ test/distances.jl | 7 ++ 5 files changed, 305 insertions(+), 121 deletions(-) create mode 100644 src/scoring_distances.jl diff --git a/README.md b/README.md index 3610ae1..6e490f3 100644 --- a/README.md +++ b/README.md @@ -4,25 +4,25 @@ # StringDistances -Edit Distances +ASCII - [x] Hamming Distance -- [x] Jaro Distance -- [x] Jaro-Winkler Distance +- [x] Jaro Distance and Jaro-Winkler Distance - [x] Levenshtein Distance - [x] Damerau-Levenshtein Distance +- [x] Qgram Distance +- [x] Cosine Distance +- [x] Jaccard Distance -Q-gram Distances +AbstractString -- [x] qgram -- [x] cosine -- [x] jaccard - -Type supports - -- [x] ASCIIString -- [x] UTF8String -- [ ] Unicode +- [x] Hamming Distance +- [ ] Jaro Distance and Jaro-Winkler Distance +- [x] Levenshtein Distance +- [x] Damerau-Levenshtein Distance +- [x] Qgram Distance +- [x] Cosine Distance +- [x] Jaccard Distance diff --git a/src/edit_distances.jl b/src/edit_distances.jl index 14f3bbe..0476a4c 100644 --- a/src/edit_distances.jl +++ b/src/edit_distances.jl @@ -5,17 +5,21 @@ ## ############################################################################## -function evaluate{T}(dist::Hamming, s1::T, s2::T) - length(s1) > length(s2) && return evaluate(dist, s2, s1) +function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString) + len1, len2 = length(s1), length(s2) + len1 > len2 && return evaluate(dist, s2, s1) count = 0 - @inbounds for i in 1:length(s1) - count += s1[i] != s2[i] + + state2 = start(s2) + for ch1 in s1 + ch2, state2 = next(s2, state2) + count += ch1 != ch2 end - count += length(s2) - length(s1) + count += len2 - len1 return count 
end -hamming{T}(s1::T, s2::T) = evaluate(Hamming(), s1, s2) +hamming(s1::AbstractString, s2::AbstractString) = evaluate(Hamming(), s1, s2) ############################################################################## ## @@ -24,111 +28,113 @@ hamming{T}(s1::T, s2::T) = evaluate(Hamming(), s1, s2) ## Source DamerauLevenshtein: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html ## ############################################################################## - -function common_suffix{T}(s1::T, s2::T) - len1 = length(s1) - len2 = length(s2) - while ((len1 > 0) && (s1[len1] == s2[len2])) - len1 -= 1 - len2 -= 1 +# prefix common to both strings can be ignored +function common_prefix(s1::AbstractString, s2::AbstractString) + start1 = start(s1) + start2 = start(s2) + while !done(s1, start1) + ch1, nextstart1 = next(s1, start1) + ch2, nextstart2 = next(s2, start2) + ch1 != ch2 && break + start1, start2 = nextstart1, nextstart2 end - return len1, len2 + return start1, start2 end - -function common_prefix{T}(s1::T, s2::T, len1::Int, len2::Int) - start = 0 - len1 == 0 && return len1, len2, start - if (s1[start + 1] == s2[start + 1]) - while ((start < len1) && (s1[start + 1] == s2[start + 1])) - start += 1 - end - len1 -= start - len2 -= start - len1 == 0 && return len1, len2, start - end - return len1, len2, start -end - type Levenshtein end +function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString) + len1, len2 = length(s1), length(s2) -function evaluate{T}(dist::Levenshtein, s1::T, s2::T) - length(s1) > length(s2) && return evaluate(dist, s2, s1) - length(s2) == 0 && return 0 + len1 > len2 && return evaluate(dist, s2, s1) + len2 == 0 && return 0 # common - len1, len2 = common_suffix(s1, s2) - len1, len2, start = common_prefix(s1, s2, len1, len2) - len1 == 0 && return len2 + start1, start2 = common_prefix(s1, s2) + done(s1, start1) && return len2 - len1 - dist = Array(Int, len2) + # distance initialized to first row of matrix + # => distance 
between "" and s2[1:i] + v0 = Array(Int, len2) @inbounds for i2 in 1:len2 - dist[i2] = i2 + v0[i2] = i2 end - current = 0 - for i1 in 1:len1 - ch1 = s1[start + i1] - left = current = i1 - 1 - for i2 in 1:len2 - above = current - current = left - left = dist[i2] - if ch1 != s2[start + i2] - current += 1 - insDel = above + 1 - if insDel < current - current = insDel - end - insDel = left + 1 - if insDel < current - current = insDel - end + current = zero(0) + state1 = start1 + i1 = 0 + while !done(s1, state1) + i1 += 1 + ch1, state1 = next(s1, state1) + left = (i1 - 1) + current = (i1 - 1) + state2 = start2 + i2 = 0 + while !done(s2, state2) + i2 += 1 + ch2, state2 = next(s2, state2) + # update + above, current, left = current, left, v0[i2] + if ch1 != ch2 + # substitution + current = min(current + 1, + above + 1, + left + 1) end - dist[i2] = current + v0[i2] = current end end return current end - -levenshtein{T}(s1::T, s2::T) = evaluate(Levenshtein(), s1, s2) +function levenshtein(s1::AbstractString, s2::AbstractString) + evaluate(Levenshtein(), s1, s2) +end type DamerauLevenshtein end -function evaluate{T}(dist::DamerauLevenshtein, s1::T, s2::T) +function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString) length(s1) > length(s2) && return evaluate(dist, s2, s1) length(s2) == 0 && return 0 # common - len1, len2 = common_suffix(s1, s2) - len1, len2, start = common_prefix(s1, s2, len1, len2) - len1 == 0 && return len2 + len1, len2 = length(s1), length(s2) + start1, start2 = common_prefix(s1, s2) + done(s1, start1) && return len2 - len1 - dist = Array(Int, length(s2)) + v0 = Array(Int, length(s2)) @inbounds for i2 in 1:len2 - dist[i2] = i2 + v0[i2] = i2 end - dist2 = Array(Int, length(s2)) + v2 = Array(Int, length(s2)) - ch1 = s1[1] + ch1, = next(s1, start1) current = 0 - for i1 in 1:len1 + state1 = start1 + i1 = 0 + while !done(s1, state1) + i1 += 1 prevch1 = ch1 - ch1 = s1[start + i1] - ch2 = s2[start + 1] - left = i1 - 1 - current = i1 + ch1, state1 
= next(s1, state1) + ch2, = next(s2, start2) + left = (i1 - 1) + current = i1 nextTransCost = 0 - for i2 in 1:len2 + state2 = start2 + i2 = 0 + while !done(s2, state2) + i2 += 1 + prevch2 = ch2 + ch2, state2 = next(s2, state2) above = current thisTransCost = nextTransCost - nextTransCost = dist2[i2] - dist2[i2] = current = left - left = dist[i2] - prevch2 = ch2 - ch2 = s2[start + i2] + nextTransCost = v2[i2] + # cost of diagonal (substitution) + v2[i2] = current = left + # left now equals current cost (which will be diagonal at next iteration) + left = v0[i2] if ch1 != ch2 + # insertion if left < current current = left end + # deletion if above < current current = above end @@ -140,13 +146,13 @@ function evaluate{T}(dist::DamerauLevenshtein, s1::T, s2::T) end end end - dist[i2] = current + v0[i2] = current end end return current end -damerau_levenshtein{T}(s1::T, s2::T) = evaluate(DamerauLevenshtein(), s1, s2) +damerau_levenshtein(s1::AbstractString, s2::AbstractString) = evaluate(DamerauLevenshtein(), s1, s2) ############################################################################## ## @@ -161,7 +167,7 @@ type JaroWinkler{T1 <: Number, T2 <: Number, T3 <: Integer} end JaroWinkler() = JaroWinkler(0.1, 0.7, 5) -function evaluate{T}(dist::JaroWinkler, s1::T, s2::T) +function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString) length(s1) > length(s2) && return evaluate(dist, s2, s1) length(s2) == 0 && return 1.0 @@ -209,11 +215,24 @@ function evaluate{T}(dist::JaroWinkler, s1::T, s2::T) return 1 - score end -function jaro_winkler{T}(s1::T, s2::T; +function jaro_winkler(s1::AbstractString, s2::AbstractString; scaling_factor = 0.1, boosting_threshold = 0.7, long_threshold = 5) evaluate(JaroWinkler(scaling_factor, boosting_threshold, long_threshold), s1, s2) end -jaro{T}(s1::T, s2::T) = evaluate(JaroWinkler(0.0, 0.0, 0), s1, s2) +jaro(s1::AbstractString, s2::AbstractString) = evaluate(JaroWinkler(0.0, 0.0, 0), s1, s2) + + + + + + + + + + + + + diff 
--git a/src/qgrams_distances.jl b/src/qgrams_distances.jl index e1a1810..3e0ef3f 100644 --- a/src/qgrams_distances.jl +++ b/src/qgrams_distances.jl @@ -1,12 +1,28 @@ - ############################################################################## ## -## Define v(s) a vector on the space of q-uple which contains number of times it appears in s -## For instance v("leila")["il"] =1 -## cosine is 1 - v(s1, p).v(s2, p) / ||v(s1, p)|| * ||v(s2, p)|| -## q-gram is ∑ |v(s1, p) - v(s2, p)| +## QGramIterator iterates through the q-grams (substrings of q characters) of a string +## ############################################################################## +type QGramIterator{S <: AbstractString, T <: Integer} + s::S + q::T +end +function Base.start(qgram::QGramIterator) + len = length(qgram.s) + (1, len == 0 ? 1 : len < qgram.q ? nextind(qgram.s, chr2ind(qgram.s, len)) : chr2ind(qgram.s, qgram.q)) +end +function Base.next{S, T}(qgram::QGramIterator{S, T}, state) + istart, iend = state + convert(S, SubString(qgram.s, istart, iend)), (nextind(qgram.s, istart), nextind(qgram.s, iend)) +end +function Base.done(qgram::QGramIterator, state) + istart, iend = state + done(qgram.s, iend) +end +Base.eltype{S, T}(::QGramIterator{S, T}) = S +Base.length(qgram::QGramIterator) = max(length(qgram.s) - qgram.q + 1, 0) + ############################################################################## ## ## A Bag is like Set that it allows duplicated values @@ -34,14 +50,23 @@ end Base.length(bag::Bag) = convert(Int, sum(values(bag.dict))) -function Bag(s::AbstractString, q::Integer) - bag = Bag{typeof(s), UInt}() - @inbounds for i in 1:(length(s) - q + 1) - push!(bag, s[i:(i + q - 1)]) +function Bag(s) + bag = Bag{eltype(s), UInt}() + for x in s + push!(bag, x) end return bag end + +############################################################################## +## +## Define v(s) a vector on the space of q-uple which contains number of times it appears in s +## For instance v("leila")["il"] =1 +## cosine is 1 - v(s1, p).v(s2, p) / 
||v(s1, p)|| * ||v(s2, p)|| +## q-gram is ∑ |v(s1, p) - v(s2, p)| +############################################################################## + ############################################################################## ## ## q-gram @@ -58,11 +83,10 @@ function evaluate{T}(dist::QGram, s1::T, s2::T) length(s1) > length(s2) && return evaluate(dist, s2, s1) length(s2) == 0 && return 0 n2 = length(s2) - dist.q + 1 - bag = Bag(s2, dist.q) + bag = Bag(QGramIterator(s2, dist.q)) count = 0 n1 = length(s1) - dist.q + 1 - for i1 in 1:n1 - @inbounds ch = s1[i1:(i1 + dist.q - 1)] + for ch in QGramIterator(s1, dist.q) delete!(bag, ch) end # number non matched in s1 : n1 - (n2 - length(bag)) @@ -87,9 +111,8 @@ function evaluate{T}(dist::Cosine, s1::T, s2::T) length(s1) > length(s2) && return evaluate(dist, s2, s1) length(s2) == 0 && return 0.0 - bag2 = Bag(s2, dist.q) - bag1 = Bag(s1, dist.q) - + bag2 = Bag(QGramIterator(s2, dist.q)) + bag1 = Bag(QGramIterator(s1, dist.q)) count = 0 for (k, v1) in bag1.dict count += v1 * get(bag2.dict, k, 0) @@ -119,17 +142,8 @@ function evaluate{T}(dist::Jaccard, s1::T, s2::T) length(s2) == 0 && return 0.0 - set2 = Set{T}() - n2 = length(s2) - dist.q + 1 - @inbounds for i2 in 1:n2 - push!(set2, s2[i2:(i2 + dist.q - 1)]) - end - - set1 = Set{T}() - n1 = length(s1) - dist.q + 1 - @inbounds for i1 in 1:n1 - push!(set1, s1[i1:(i1 + dist.q - 1)]) - end + set2 = Set(QGramIterator(s2, dist.q)) + set1 = Set(QGramIterator(s1, dist.q)) n_intersect = 0 for x in set1 diff --git a/src/scoring_distances.jl b/src/scoring_distances.jl new file mode 100644 index 0000000..e1a1810 --- /dev/null +++ b/src/scoring_distances.jl @@ -0,0 +1,144 @@ + +############################################################################## +## +## Define v(s) a vector on the space of q-uple which contains number of times it appears in s +## For instance v("leila")["il"] =1 +## cosine is 1 - v(s1, p).v(s2, p) / ||v(s1, p)|| * ||v(s2, p)|| +## q-gram is ∑ |v(s1, p) - v(s2, 
p)| +############################################################################## + +############################################################################## +## +## A Bag is like Set that it allows duplicated values +## I implement it as dictionary from elements => number of duplicates +## +############################################################################## + +type Bag{Tv, Ti <: Integer} + dict::Dict{Tv, Ti} + Bag() = new(Dict{Tv, Ti}()) +end + +function Base.push!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv) + bag.dict[x] = get(bag.dict, x, zero(Ti)) + one(Ti) + return bag +end + +function Base.delete!{Tv, Ti}(bag::Bag{Tv, Ti}, x::Tv) + v = get(bag.dict, x, zero(Ti)) + if v > zero(Ti) + bag.dict[x] = v - one(Ti) + end + return x +end + +Base.length(bag::Bag) = convert(Int, sum(values(bag.dict))) + +function Bag(s::AbstractString, q::Integer) + bag = Bag{typeof(s), UInt}() + @inbounds for i in 1:(length(s) - q + 1) + push!(bag, s[i:(i + q - 1)]) + end + return bag +end + +############################################################################## +## +## q-gram +## +############################################################################## + +type QGram{T <: Integer} + q::T +end +QGram() = QGram(2) + + +function evaluate{T}(dist::QGram, s1::T, s2::T) + length(s1) > length(s2) && return evaluate(dist, s2, s1) + length(s2) == 0 && return 0 + n2 = length(s2) - dist.q + 1 + bag = Bag(s2, dist.q) + count = 0 + n1 = length(s1) - dist.q + 1 + for i1 in 1:n1 + @inbounds ch = s1[i1:(i1 + dist.q - 1)] + delete!(bag, ch) + end + # number non matched in s1 : n1 - (n2 - length(bag)) + # number non matched in s2 : length(bag) + return n1 - n2 + 2 * length(bag) +end + +qgram{T}(s1::T, s2::T; q = 2) = evaluate(QGram(q), s1, s2) + +############################################################################## +## +## cosine +## +############################################################################## + +type Cosine{T <: Integer} + q::T +end +Cosine() = 
Cosine(2) + +function evaluate{T}(dist::Cosine, s1::T, s2::T) + length(s1) > length(s2) && return evaluate(dist, s2, s1) + length(s2) == 0 && return 0.0 + + bag2 = Bag(s2, dist.q) + bag1 = Bag(s1, dist.q) + + count = 0 + for (k, v1) in bag1.dict + count += v1 * get(bag2.dict, k, 0) + end + denominator = sqrt(sumabs2(values(bag1.dict))) * sqrt(sumabs2(values(bag2.dict))) + denominator == 0 ? 1.0 : 1.0 - count / denominator +end + +cosine{T}(s1::T, s2::T; q = 2) = evaluate(Cosine(q), s1, s2) + +############################################################################## +## +## Jaccard +## +## Denote Q(s, q) the set of tuple of length q in s +## jaccard(s1, s2, q) = 1 - |intersect(Q(s1, q), Q(s2, q))| / |union(Q(s1, q), Q(s2, q))| +## +############################################################################## + +type Jaccard{T <: Integer} + q::T +end +Jaccard() = Jaccard(2) + +function evaluate{T}(dist::Jaccard, s1::T, s2::T) + length(s1) > length(s2) && return evaluate(dist, s2, s1) + length(s2) == 0 && return 0.0 + + + set2 = Set{T}() + n2 = length(s2) - dist.q + 1 + @inbounds for i2 in 1:n2 + push!(set2, s2[i2:(i2 + dist.q - 1)]) + end + + set1 = Set{T}() + n1 = length(s1) - dist.q + 1 + @inbounds for i1 in 1:n1 + push!(set1, s1[i1:(i1 + dist.q - 1)]) + end + + n_intersect = 0 + for x in set1 + if x in set2 + n_intersect += 1 + end + end + + return 1.0 - n_intersect / (length(set1) + length(set2) - n_intersect) +end + +jaccard{T}(s1::T, s2::T; q = 2) = evaluate(Jaccard(q), s1, s2) \ No newline at end of file diff --git a/test/distances.jl b/test/distances.jl index 304de9e..4097c6b 100644 --- a/test/distances.jl +++ b/test/distances.jl @@ -20,6 +20,13 @@ using StringDistances, Base.Test @test levenshtein("Saturday", "Sunday") == 3 +@test 4 == evaluate(Levenshtein(), "Hi, my name is", "my name is") +@test 21 == evaluate(Levenshtein(), "%^@!^@#^@#!! Snoooooooop", "Dro!p it!!!! 
like it's hot") +@test 7 == evaluate(Levenshtein(), "Alborgów", "amoniak") + + + + @test damerau_levenshtein("", "") == 0 @test damerau_levenshtein("abc", "") == 3 @test damerau_levenshtein("bc", "abc") == 1