# StringDistances.jl/src/StringDistances.jl
# (web-view metadata: 132 lines, 4.0 KiB, Julia; revision dated 2015-10-22 18:12:44 +02:00)
__precompile__(true)
module StringDistances
##############################################################################
##
## Export
##
##############################################################################
import Distances: evaluate
export Hamming,
Levenshtein,
JaroWinkler,
hamming,
levenshtein,
jaro_winkler,
jaro
##############################################################################
##
## Hamming
##
##############################################################################
# Hamming distance between two strings: the number of character positions at
# which they differ. Positions that exist only in the longer string each count
# as one difference, so strings of unequal length are accepted.
type Hamming end

# evaluate(dist::Hamming, s1, s2) -> Int
#
# Return the number of mismatching character positions between `s1` and `s2`,
# plus the difference between their lengths (in characters).
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
    # Every character past the end of the shorter string is a mismatch.
    ndiff = abs(length(s1) - length(s2))
    # Walk the characters instead of indexing with 1:length(s): string indices
    # are byte offsets, so `s[i]` is wrong (or throws) as soon as a multi-byte
    # character is present. `zip` stops at the end of the shorter string.
    for (c1, c2) in zip(s1, s2)
        ndiff += c1 != c2
    end
    return ndiff
end
# Convenience wrapper: evaluate the `Hamming` distance between `s1` and `s2`
# without having to construct the distance object by hand.
function hamming(s1::AbstractString, s2::AbstractString)
    metric = Hamming()
    return evaluate(metric, s1, s2)
end
##############################################################################
##
## Levenshtein
##
##############################################################################
# Levenshtein (edit) distance: the minimum number of single-character
# insertions, deletions and substitutions needed to turn one string into the
# other.
type Levenshtein end

# evaluate(dist::Levenshtein, s1, s2) -> Int
#
# Return the edit distance between `s1` and `s2`. Only a single row of the
# dynamic-programming table is kept, sized by the shorter string.
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
    # Normalise so that s1 is never the longer string (keeps the row small).
    length(s1) > length(s2) && return evaluate(dist, s2, s1)
    length(s2) == 0 && return 0
    # Materialise the characters of s1: string indices are byte offsets, so
    # `s1[i]` with i in 1:length(s1) is incorrect for multi-byte characters.
    chars1 = collect(s1)
    len1 = length(chars1)
    # row[i1 + 1] is the distance between the first i1 characters of s1 and the
    # prefix of s2 consumed so far; initially that prefix is empty, hence 0:len1.
    row = collect(0:len1)
    for (i2, ch2) in enumerate(s2)
        row[1] = i2
        # Value of the cell diagonally up-left of the one being updated.
        lastdiag = i2 - 1
        for i1 in 1:len1
            olddiag = row[i1 + 1]
            cost = chars1[i1] == ch2 ? 0 : 1
            row[i1 + 1] = min(row[i1 + 1] + 1, row[i1] + 1, lastdiag + cost)
            lastdiag = olddiag
        end
    end
    return row[end]
end
# Convenience wrapper: evaluate the `Levenshtein` distance between `s1` and
# `s2` without having to construct the distance object by hand.
function levenshtein(s1::AbstractString, s2::AbstractString)
    metric = Levenshtein()
    return evaluate(metric, s1, s2)
end
##############################################################################
##
## JaroWinkler
##
##############################################################################
# Parameters of the Jaro-Winkler score. The defaults quoted below are the ones
# used by the `jaro_winkler` convenience function.
type JaroWinkler{T1 <: Number, T2 <: Number, T3 <: Integer}
    scaling_factor::T1      # weight of the common-prefix bonus. Default to 0.1
    boosting_threshold::T2  # minimum score for the prefix bonus. Default to 0.7
    long_threshold::T3      # minimum length for the long-string bonus. Default to 5
end

# evaluate(dist::JaroWinkler, s1, s2) -> Float64 for Float64 parameters
#
# Return the Jaro-Winkler score of `s1` and `s2`. The value grows with
# similarity: it is 0.0 when no characters match (including when both strings
# are empty). Two optional bonuses are applied on top of the base Jaro score:
#   * a common-prefix bonus (up to 4 characters) when `dist.scaling_factor > 0`
#     and the base score is at least `dist.boosting_threshold`;
#   * a long-string bonus when the shorter string has at least
#     `dist.long_threshold` characters and enough characters match beyond the
#     common prefix.
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
    # Normalise so that s1 is never the longer string.
    length(s1) > length(s2) && return evaluate(dist, s2, s1)
    length(s2) == 0 && return 0.0
    # Work on character vectors: string indices are byte offsets, so indexing
    # the strings with 1:length(s) is incorrect for multi-byte characters.
    chars1 = collect(s1)
    chars2 = collect(s2)
    len1 = length(chars1)
    len2 = length(chars2)
    # A character of s1 may only match a character of s2 at most `maxdist`
    # positions away.
    maxdist = max(0, div(len2, 2) - 1)
    m = 0 # matching characters
    t = 0 # half number of transpositions
    flag = fill(false, len2) # flag[i2] is true once chars2[i2] has been matched
    prevpos = 0              # right-most matched position in s2 so far
    for i1 in 1:len1
        ch = chars1[i1]
        i2low = max(1, i1 - maxdist)
        i2high = min(len2, i1 + maxdist)
        for i2 in i2low:i2high
            if ch == chars2[i2] && !flag[i2]
                m += 1
                # A match located before an earlier match is out of order.
                if i2 < prevpos
                    t += 1
                end
                prevpos = max(i2, prevpos)
                flag[i2] = true
                break
            end
        end
    end
    m == 0 && return 0.0
    score = (m / len1 + m / len2 + (m - t) / m) / 3.0
    # Both eligibility checks are made on the base score / lengths, before any
    # bonus modifies `score`.
    boost = dist.scaling_factor > 0 && score >= dist.boosting_threshold
    islong = len1 >= dist.long_threshold
    if boost || islong
        # Length of the common prefix, capped at 4 characters.
        l = 0
        maxprefix = min(4, len1)
        while l < maxprefix && chars1[l + 1] == chars2[l + 1]
            l += 1
        end
        # common prefix adjustment
        if boost
            score += l * (1 - score) * dist.scaling_factor
        end
        # longer string adjustment
        if islong && (m - l >= 2) && ((m - l) >= (len1 - l) / 2)
            score += (1 - score) * (m - (l + 1)) / (len1 + len2 - (2 * (l - 1)))
        end
    end
    return score
end
# Convenience wrapper: Jaro-Winkler score of `s1` and `s2`.
#
# Keyword arguments are forwarded, in order, to the `JaroWinkler` constructor:
#   scaling_factor     -- weight of the common-prefix bonus (default 0.1)
#   boosting_threshold -- minimum score for the prefix bonus (default 0.7)
#   long_threshold     -- minimum length for the long-string bonus (default 5)
function jaro_winkler(s1::AbstractString, s2::AbstractString;
                      scaling_factor = 0.1,
                      boosting_threshold = 0.7,
                      long_threshold = 5)
    metric = JaroWinkler(scaling_factor, boosting_threshold, long_threshold)
    return evaluate(metric, s1, s2)
end
# Convenience wrapper: plain Jaro score of `s1` and `s2`, i.e. the
# `JaroWinkler` score with both bonuses switched off.
#   * scaling_factor = 0.0 disables the common-prefix bonus (it requires
#     `scaling_factor > 0`).
#   * long_threshold = typemax(Int) disables the long-string bonus, which is
#     guarded by `length(s1) >= dist.long_threshold`. A threshold of 0 would
#     make that guard always true and inflate the score.
function jaro(s1::AbstractString, s2::AbstractString)
    metric = JaroWinkler(0.0, 0.0, typemax(Int))
    return evaluate(metric, s1, s2)
end
end