add DamerauLevenshtein

pull/1/head
Matthieu Gomez 2015-10-22 21:03:57 -04:00
parent 977a280c00
commit 756a1114db
3 changed files with 138 additions and 19 deletions

View File

@ -1,5 +1,5 @@
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/FixedEffectModels.jl?branch=master)
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)
# StringDistances
@ -7,18 +7,26 @@
String Distances in Julia
- [x] Hamming Distance
- [x] Jaro distance
- [x] Jaro Distance
- [x] Jaro-Winkler Distance
- [x] Levenshtein distance
- [ ] Damerau-Levenshtein Distance
- [x] Levenshtein Distance
- [x] Damerau-Levenshtein Distance
- [ ] Q-gram Distance
Supported types
- [x] ASCIIString
- [x] UTF8String
- [ ] Unicode
Examples
```julia
using StringDistances
hamming("MARTHA", "MARHTA")
levenshtein("MARTHA", "MARHTA")
damerau_levenshtein("MARTHA", "MARHTA")
jaro("MARTHA", "MARHTA")
jaro_winkler("MARTHA", "MARHTA"; scaling_factor = 0.1, boosting_threshold = 0.7, long_threshold = 5)
```

View File

@ -13,8 +13,10 @@ import Distances: evaluate
export Hamming,
Levenshtein,
JaroWinkler,
DamerauLevenshtein,
hamming,
levenshtein,
damerau_levenshtein,
jaro_winkler,
jaro
@ -39,34 +41,133 @@ hamming(s1::AbstractString, s2::AbstractString) = evaluate(Hamming(), s1, s2)
##############################################################################
##
## Levenshtein
## Levenshtein and Damerau Levenshtein
## Source Levenshtein: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
## Source DamerauLevenshtein: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
##
##############################################################################
# Strip the suffix shared by `s1` and `s2`.
#
# Walks both strings backwards from their last position while the characters match and
# returns `(len1, len2)`: how many leading positions of `s1` and of `s2` remain once the
# shared suffix has been removed.
#
# The `len2 > 0` guard makes the helper safe whichever string is longer; without it a
# call where `s2` is a proper suffix of `s1` would go on to index `s2[0]`.
#
# NOTE(review): the positions come from `length` (a character count) but are used as
# string indices. That presumably only lines up for single-byte (ASCII) text -- confirm
# before relying on this for multi-byte strings.
function common_suffix(s1::AbstractString, s2::AbstractString)
    len1 = length(s1)
    len2 = length(s2)
    while len1 > 0 && len2 > 0 && s1[len1] == s2[len2]
        len1 -= 1
        len2 -= 1
    end
    return len1, len2
end
# Strip the prefix shared by the first `len1` positions of `s1` and the first `len2`
# positions of `s2`.
#
# Returns `(len1, len2, start)`: `start` is the number of leading positions that match,
# and the two lengths are the inputs reduced by `start`.
#
# The scan is bounded by both lengths, so it never reads past either window even when
# `len2 < len1` or one window is empty.
#
# NOTE(review): positions are used directly as string indices, which presumably only
# lines up with character counts for single-byte (ASCII) text -- confirm for multi-byte
# strings.
function common_prefix(s1::AbstractString, s2::AbstractString, len1::Int, len2::Int)
    start = 0
    while start < len1 && start < len2 && s1[start + 1] == s2[start + 1]
        start += 1
    end
    return len1 - start, len2 - start, start
end
# Field-less type whose instances select the Levenshtein method of `evaluate`.
type Levenshtein end
# Levenshtein distance between `s1` and `s2`: the number of single-character insertions,
# deletions and substitutions, computed with a single cost vector of length `len2`.
#
# Arguments
# - `dist`: a `Levenshtein` instance, used only for dispatch.
# - `s1`, `s2`: the strings to compare; they are swapped so that `s1` is the shorter one.
#
# Returns an `Int`.
#
# NOTE(review): characters are read with `s[start + i]`, i.e. by position. That
# presumably matches character counts only for single-byte (ASCII) text -- confirm for
# multi-byte strings.
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
    # Keep the shorter string in `s1`.
    length(s1) > length(s2) && return evaluate(dist, s2, s1)
    length(s2) == 0 && return 0
    # Characters shared at either end cannot contribute an edit: drop them first.
    len1, len2 = common_suffix(s1, s2)
    len1, len2, start = common_prefix(s1, s2, len1, len2)
    # Nothing left of `s1`: every remaining character of `s2` is an insertion.
    len1 == 0 && return len2
    # Cost vector, updated in place row by row; before the first row entry i2 is i2.
    # (Kept under its own name so the `dist` argument is not rebound to an array.)
    costs = collect(1:len2)
    current = 0
    for i1 in 1:len1
        ch1 = s1[start + i1]
        left = current = i1 - 1
        for i2 in 1:len2
            above = current
            # Start from the diagonal value; it is kept as-is when the characters match.
            current = left
            # Value stored for this column by the previous row (diagonal of the next column).
            left = costs[i2]
            if ch1 != s2[start + i2]
                # Cheapest of substitution (diagonal + 1) and the two insert/delete moves.
                current = min(current, above, left) + 1
            end
            costs[i2] = current
        end
    end
    return current
end
# Convenience entry point: evaluates `s1` against `s2` with a fresh `Levenshtein()`.
function levenshtein(s1::AbstractString, s2::AbstractString)
    return evaluate(Levenshtein(), s1, s2)
end
# Field-less type whose instances select the Damerau-Levenshtein method of `evaluate`.
type DamerauLevenshtein end
# Damerau-Levenshtein distance between `s1` and `s2`: insertions, deletions,
# substitutions, and swaps of two adjacent characters, computed with two cost vectors of
# length `len2`.
#
# Arguments
# - `dist`: a `DamerauLevenshtein` instance, used only for dispatch.
# - `s1`, `s2`: the strings to compare; they are swapped so that `s1` is the shorter one.
#
# Returns an `Int`.
#
# NOTE(review): only swaps of adjacent characters are considered, so this looks like the
# restricted ("optimal string alignment") variant -- confirm if the unrestricted
# distance is required.
# NOTE(review): characters are read with `s[start + i]`, i.e. by position. That
# presumably matches character counts only for single-byte (ASCII) text -- confirm for
# multi-byte strings.
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
    # Keep the shorter string in `s1`.
    length(s1) > length(s2) && return evaluate(dist, s2, s1)
    length(s2) == 0 && return 0
    # Characters shared at either end cannot contribute an edit: drop them first.
    len1, len2 = common_suffix(s1, s2)
    len1, len2, start = common_prefix(s1, s2, len1, len2)
    # Nothing left of `s1`: every remaining character of `s2` is an insertion.
    len1 == 0 && return len2
    # Cost vector, updated in place row by row; before the first row entry i2 is i2.
    # Only `len2` slots are ever indexed, so that is all that is allocated.
    # (Kept under its own name so the `dist` argument is not rebound to an array.)
    costs = collect(1:len2)
    # Diagonal values saved by each row for the next row's transposition check.
    # Zero-filled because the first row reads it before anything has been written; those
    # values are never used, since a transposition requires i1 != 1.
    transcosts = zeros(Int, len2)
    # Placeholder: `prevch1` is only compared when i1 != 1, after `ch1` has been set.
    ch1 = s1[1]
    current = 0
    for i1 in 1:len1
        prevch1 = ch1
        ch1 = s1[start + i1]
        # Placeholder: `prevch2` is only compared when i2 != 1.
        ch2 = s2[start + 1]
        left = i1 - 1
        current = i1
        nextTransCost = 0
        for i2 in 1:len2
            above = current
            # The value read at the previous column becomes this cell's transposition base.
            thisTransCost = nextTransCost
            nextTransCost = transcosts[i2]
            # Start from the diagonal value and save it for the next row.
            transcosts[i2] = current = left
            # Value stored for this column by the previous row.
            left = costs[i2]
            prevch2 = ch2
            ch2 = s2[start + i2]
            if ch1 != ch2
                # Cheapest of substitution and the two insert/delete moves.
                current = min(current, left, above) + 1
                # Adjacent swap: the two current characters are the previous pair reversed.
                if i1 != 1 && i2 != 1 && ch1 == prevch2 && prevch1 == ch2
                    current = min(current, thisTransCost + 1)
                end
            end
            costs[i2] = current
        end
    end
    return current
end
# Convenience entry point: evaluates `s1` against `s2` with a fresh `DamerauLevenshtein()`.
function damerau_levenshtein(s1::AbstractString, s2::AbstractString)
    return evaluate(DamerauLevenshtein(), s1, s2)
end
##############################################################################
##
## JaroWinkler
@ -82,6 +183,7 @@ end
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
length(s1) > length(s2) && return evaluate(dist, s2, s1)
length(s2) == 0 && return 0.0
maxdist = max(0, div(length(s2), 2) - 1)
m = 0 # matching characters
t = 0 # half number of transpositions

View File

@ -1,5 +1,5 @@
using Base.Test
using StringDistances, Base.Test
# Jaro-Winkler with explicit keyword settings; the result must be within 1e-4 of 0.9611.
@test_approx_eq_eps jaro_winkler("MARTHA", "MARHTA", boosting_threshold = 0.0, long_threshold = 100) 0.9611 1e-4
@ -20,6 +20,15 @@ using Base.Test
# Levenshtein.
@test levenshtein("Saturday", "Sunday") == 3
# Damerau-Levenshtein: empty inputs and a single missing leading character.
@test damerau_levenshtein("", "") == 0
@test damerau_levenshtein("abc", "") == 3
@test damerau_levenshtein("bc", "abc") == 1
# Damerau-Levenshtein: two adjacent characters swapped, expected to cost 1.
@test damerau_levenshtein("fuor", "four") == 1
# Damerau-Levenshtein: longer and mixed cases.
@test damerau_levenshtein("abcd", "acb") == 2
@test damerau_levenshtein("cape sand recycling ", "edith ann graham") == 17
@test damerau_levenshtein("jellyifhs", "jellyfish") == 2
@test damerau_levenshtein("ifhs", "fish") == 2
# Hamming, including inputs of different lengths.
@test hamming("", "") == 0
@test hamming("", "abc") == 3