add DamerauLevenshtein
parent
977a280c00
commit
756a1114db
16
README.md
16
README.md
|
@ -1,5 +1,5 @@
|
|||
[![Build Status](https://travis-ci.org/matthieugomez/StringDistances.jl.svg?branch=master)](https://travis-ci.org/matthieugomez/StringDistances.jl)
|
||||
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/FixedEffectModels.jl?branch=master)
|
||||
[![Coverage Status](https://coveralls.io/repos/matthieugomez/StringDistances.jl/badge.svg?branch=master)](https://coveralls.io/r/matthieugomez/StringDistances.jl?branch=master)
|
||||
|
||||
|
||||
# StringDistances
|
||||
|
@ -7,18 +7,26 @@
|
|||
String Distances in Julia
|
||||
|
||||
- [x] Hamming Distance
|
||||
- [x] Jaro distance
|
||||
- [x] Jaro Distance
|
||||
- [x] Jaro-Winkler Distance
|
||||
- [x] Levenshtein distance
|
||||
- [ ] Damerau-Levenshtein Distance
|
||||
- [x] Levenshtein Distance
|
||||
- [x] Damerau-Levenshtein Distance
|
||||
- [ ] qgram
|
||||
|
||||
Supported types
|
||||
|
||||
- [x] ASCIIString
|
||||
- [x] UTF8String
|
||||
- [ ] Unicode
|
||||
|
||||
|
||||
|
||||
Examples
|
||||
```julia
|
||||
using StringDistances
|
||||
hamming("MARTHA", "MARHTA")
|
||||
levenshtein("MARTHA", "MARHTA")
|
||||
damerau_levenshtein("MARTHA", "MARHTA")
|
||||
jaro("MARTHA", "MARHTA")
|
||||
jaro_winkler("MARTHA", "MARHTA"; scaling_factor = 0.1, boosting_threshold = 0.7, long_threshold = 5)
|
||||
```
|
||||
|
|
|
@ -13,8 +13,10 @@ import Distances: evaluate
|
|||
export Hamming,
|
||||
Levenshtein,
|
||||
JaroWinkler,
|
||||
DamerauLevenshtein,
|
||||
hamming,
|
||||
levenshtein,
|
||||
damerau_levenshtein,
|
||||
jaro_winkler,
|
||||
jaro
|
||||
|
||||
|
@ -39,34 +41,133 @@ hamming(s1::AbstractString, s2::AbstractString) = evaluate(Hamming(), s1, s2)
|
|||
|
||||
##############################################################################
|
||||
##
|
||||
## Levenshtein
|
||||
## Levenshtein and Damerau Levenshtein
|
||||
## Source Levenshtein: http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-c.html
|
||||
## Source DamerauLevenshtein: http://blog.softwx.net/2015/01/optimizing-damerau-levenshtein_15.html
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
# Strip the suffix shared by `s1` and `s2`.
#
# Walks backwards from the end of both strings while the characters match and
# returns `(len1, len2)`: the lengths that remain once the common suffix has
# been removed.
#
# NOTE(review): positions are derived from `length` and used directly as string
# indices, which assumes one index per character (e.g. ASCII) — confirm before
# relying on this with multi-byte strings.
function common_suffix(s1::AbstractString, s2::AbstractString)
    len1 = length(s1)
    len2 = length(s2)
    # Both lengths are guarded so that neither string is indexed at position 0
    # when the other one is longer.
    while len1 > 0 && len2 > 0 && s1[len1] == s2[len2]
        len1 -= 1
        len2 -= 1
    end
    return len1, len2
end
|
||||
|
||||
# Strip the prefix shared by the first `len1` characters of `s1` and the first
# `len2` characters of `s2`.
#
# Returns `(len1, len2, start)` where `start` is the number of leading
# characters the two strings have in common and `len1` / `len2` are the input
# lengths reduced by `start`. Callers index the remaining region as
# `s[start + i]`.
#
# NOTE(review): like the rest of this section, positions are used directly as
# string indices, which assumes one index per character — confirm for
# multi-byte strings.
function common_prefix(s1::AbstractString, s2::AbstractString, len1::Int, len2::Int)
    # The scan must stop at the shorter of the two regions so that neither
    # string is indexed past the length the caller handed in.
    limit = min(len1, len2)
    start = 0
    while start < limit && s1[start + 1] == s2[start + 1]
        start += 1
    end
    return len1 - start, len2 - start, start
end
|
||||
|
||||
# Field-less marker type; passing an instance to `evaluate` selects the
# Levenshtein method.
type Levenshtein end
|
||||
|
||||
# Levenshtein distance between `s1` and `s2`: the minimum number of
# single-character insertions, deletions and substitutions turning one string
# into the other.
#
# The arguments are swapped if needed so that `s1` is the shorter string, the
# shared suffix and prefix are trimmed, and the remaining regions are compared
# with a single-row dynamic-programming buffer of length `len2`.
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
    length(s1) > length(s2) && return evaluate(dist, s2, s1)
    length(s2) == 0 && return 0

    # Trim what both strings share at the end and at the beginning; only the
    # regions s1[start+1 : start+len1] and s2[start+1 : start+len2] differ.
    len1, len2 = common_suffix(s1, s2)
    len1, len2, start = common_prefix(s1, s2, len1, len2)
    # Nothing left of the shorter string: the rest of s2 is pure insertions.
    len1 == 0 && return len2

    # costs[i2] holds the distance for the previous character of s1 against the
    # first i2 characters of s2's region. A separate name is used so that the
    # `dist` argument is not rebound to an array.
    costs = collect(1:len2)
    current = 0
    for i1 in 1:len1
        ch1 = s1[start + i1]
        # Cost of the diagonal / left neighbour at column 0.
        left = current = i1 - 1
        for i2 in 1:len2
            above = current      # value just written for column i2 - 1
            current = left       # diagonal: cost if the characters match
            left = costs[i2]     # previous-row value for column i2
            if ch1 != s2[start + i2]
                # Cheapest of the diagonal and the two orthogonal neighbours, plus one edit.
                current = min(current, above, left) + 1
            end
            costs[i2] = current
        end
    end
    return current
end
|
||||
|
||||
# Convenience wrapper: Levenshtein distance between `s1` and `s2`, computed by
# `evaluate` with a fresh `Levenshtein()` instance.
function levenshtein(s1::AbstractString, s2::AbstractString)
    return evaluate(Levenshtein(), s1, s2)
end
|
||||
|
||||
# Field-less marker type; passing an instance to `evaluate` selects the
# Damerau-Levenshtein method.
type DamerauLevenshtein end
|
||||
|
||||
# Damerau-Levenshtein distance between `s1` and `s2`: like Levenshtein, but a
# swap of two adjacent characters also counts as a single edit.
#
# The arguments are swapped if needed so that `s1` is the shorter string, the
# shared suffix and prefix are trimmed, and the remaining regions are compared
# using two row buffers of length `len2`.
function evaluate(dist::DamerauLevenshtein, s1::AbstractString, s2::AbstractString)
    length(s1) > length(s2) && return evaluate(dist, s2, s1)
    length(s2) == 0 && return 0

    # Trim what both strings share at the end and at the beginning; only the
    # regions s1[start+1 : start+len1] and s2[start+1 : start+len2] differ.
    len1, len2 = common_suffix(s1, s2)
    len1, len2, start = common_prefix(s1, s2, len1, len2)
    # Nothing left of the shorter string: the rest of s2 is pure insertions.
    len1 == 0 && return len2

    # costs: previous row of the edit matrix. Separate names are used so that
    # the `dist` argument is not rebound to an array.
    costs = collect(1:len2)
    # transcosts: values from two rows back, needed to price a transposition.
    # Zero-filled so the first row never reads uninitialised memory.
    transcosts = zeros(Int, len2)

    ch1 = s1[start + 1]
    current = 0
    for i1 in 1:len1
        prevch1 = ch1
        ch1 = s1[start + i1]
        ch2 = s2[start + 1]
        left = i1 - 1
        current = i1
        nextTransCost = 0
        for i2 in 1:len2
            above = current
            thisTransCost = nextTransCost
            nextTransCost = transcosts[i2]
            # Diagonal value: the cost if the characters match; also remembered
            # for the transposition check two rows later.
            transcosts[i2] = current = left
            left = costs[i2]
            prevch2 = ch2
            ch2 = s2[start + i2]
            if ch1 != ch2
                # Cheapest of the diagonal and the two orthogonal neighbours, plus one edit.
                current = min(current, left, above) + 1
                # Adjacent characters swapped: one transposition on top of the
                # cost from two rows / two columns back. Not applicable in the
                # first row or column of the trimmed region.
                if i1 != 1 && i2 != 1 && ch1 == prevch2 && prevch1 == ch2
                    current = min(current, thisTransCost + 1)
                end
            end
            costs[i2] = current
        end
    end
    return current
end
|
||||
# Convenience wrapper: Damerau-Levenshtein distance between `s1` and `s2`,
# computed by `evaluate` with a fresh `DamerauLevenshtein()` instance.
function damerau_levenshtein(s1::AbstractString, s2::AbstractString)
    return evaluate(DamerauLevenshtein(), s1, s2)
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## JaroWinkler
|
||||
|
@ -82,6 +183,7 @@ end
|
|||
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
|
||||
length(s1) > length(s2) && return evaluate(dist, s2, s1)
|
||||
length(s2) == 0 && return 0.0
|
||||
|
||||
maxdist = max(0, div(length(s2), 2) - 1)
|
||||
m = 0 # matching characters
|
||||
t = 0 # half number of transpositions
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
using Base.Test
|
||||
using StringDistances, Base.Test
|
||||
|
||||
|
||||
@test_approx_eq_eps jaro_winkler("MARTHA", "MARHTA", boosting_threshold = 0.0, long_threshold = 100) 0.9611 1e-4
|
||||
|
@ -20,6 +20,15 @@ using Base.Test
|
|||
@test levenshtein("Saturday", "Sunday") == 3
|
||||
|
||||
|
||||
@test damerau_levenshtein("", "") == 0  # both strings empty
|
||||
@test damerau_levenshtein("abc", "") == 3  # longer string passed first; one empty side
|
||||
@test damerau_levenshtein("bc", "abc") == 1  # one leading insertion, rest is a shared suffix
|
||||
@test damerau_levenshtein("fuor", "four") == 1  # one adjacent transposition between shared prefix and suffix
|
||||
@test damerau_levenshtein("abcd", "acb") == 2  # transposition plus one deletion
|
||||
@test damerau_levenshtein("cape sand recycling ", "edith ann graham") == 17  # longer inputs; expected value pinned at 17
|
||||
@test damerau_levenshtein("jellyifhs", "jellyfish") == 2  # two transpositions after a shared prefix
|
||||
@test damerau_levenshtein("ifhs", "fish") == 2  # same two transpositions without the prefix
|
||||
|
||||
|
||||
@test hamming("", "") == 0
|
||||
@test hamming("", "abc") == 3
|
||||
|
|
Loading…
Reference in New Issue