first commit
parent
c1f8b3f781
commit
3ecd33e732
|
@ -0,0 +1,13 @@
|
|||
language: julia
|
||||
julia:
|
||||
- 0.4
|
||||
- nightly
|
||||
script:
|
||||
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
|
||||
- julia --check-bounds=yes -e 'Pkg.clone(pwd()); Pkg.build("StringDistances"); Pkg.test("StringDistances"; coverage=true)'
|
||||
after_success:
|
||||
- julia -e 'cd(Pkg.dir("StringDistances")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
|
||||
notifications:
|
||||
email: false
|
||||
on_success: never
|
||||
on_failure: change
|
|
@ -0,0 +1,22 @@
|
|||
The StringDistances.jl package is licensed under the MIT "Expat" License:
|
||||
|
||||
> Copyright (c) 2015: Matthieu Gomez.
|
||||
>
|
||||
> Permission is hereby granted, free of charge, to any person obtaining
|
||||
> a copy of this software and associated documentation files (the
|
||||
> "Software"), to deal in the Software without restriction, including
|
||||
> without limitation the rights to use, copy, modify, merge, publish,
|
||||
> distribute, sublicense, and/or sell copies of the Software, and to
|
||||
> permit persons to whom the Software is furnished to do so, subject to
|
||||
> the following conditions:
|
||||
>
|
||||
> The above copyright notice and this permission notice shall be
|
||||
> included in all copies or substantial portions of the Software.
|
||||
>
|
||||
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
> IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
> CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
> TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
> SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@ -0,0 +1,132 @@
|
|||
__precompile__(true)
|
||||
|
||||
module StringDistances
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Export
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
import Distances: evaluate
|
||||
|
||||
export Hamming,
|
||||
Levenshtein,
|
||||
JaroWinkler,
|
||||
hamming,
|
||||
levenshtein,
|
||||
jaro_winkler,
|
||||
jaro
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Hamming
|
||||
##
|
||||
##############################################################################
|
||||
# Field-less tag type: passing an instance to `evaluate` selects the Hamming
# distance method.
type Hamming
end
|
||||
|
||||
"""
    evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)

Return the number of character positions at which `s1` and `s2` differ, as an
integer. When the strings have different lengths, each character of the longer
string that has no counterpart in the shorter one counts as one difference.
"""
function evaluate(dist::Hamming, s1::AbstractString, s2::AbstractString)
    # Walk both strings character by character instead of indexing with
    # 1:length(s): string indices are byte offsets, so s[i] is not the i-th
    # character as soon as a multi-byte (non-ASCII) character is present.
    mismatches = 0
    for (ch1, ch2) in zip(s1, s2)
        mismatches += ch1 != ch2
    end
    # zip stops at the end of the shorter string; the unmatched tail of the
    # longer string contributes one difference per character.
    return mismatches + abs(length(s1) - length(s2))
end
|
||||
"""
    hamming(s1::AbstractString, s2::AbstractString)

Convenience wrapper returning `evaluate(Hamming(), s1, s2)`.
"""
function hamming(s1::AbstractString, s2::AbstractString)
    return evaluate(Hamming(), s1, s2)
end
|
||||
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## Levenshtein
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
# Field-less tag type: passing an instance to `evaluate` selects the
# Levenshtein distance method.
type Levenshtein
end
|
||||
|
||||
"""
    evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)

Return the Levenshtein (edit) distance between `s1` and `s2`: the minimum
number of single-character insertions, deletions and substitutions that turn
one string into the other. The result is an integer; two empty strings give 0.
"""
function evaluate(dist::Levenshtein, s1::AbstractString, s2::AbstractString)
    len1 = length(s1)
    len2 = length(s2)
    # Keep the shorter string in `s1` so the working row is as small as possible.
    len1 > len2 && return evaluate(dist, s2, s1)
    len2 == 0 && return 0

    # Single-row dynamic programme: row[i1 + 1] holds the distance between the
    # first i1 characters of s1 and the prefix of s2 consumed so far. A separate
    # buffer is used instead of rebinding the `dist` argument to an array.
    row = collect(0:len1)

    # Both strings are traversed by iteration rather than by s[i]: string
    # indices are byte offsets and do not address the i-th character of a
    # string containing multi-byte characters.
    for (i2, ch2) in enumerate(s2)
        lastdiag = i2 - 1      # value of row[1] for the previous prefix of s2
        row[1] = i2
        for (i1, ch1) in enumerate(s1)
            olddiag = row[i1 + 1]
            row[i1 + 1] = min(olddiag + 1,                           # insertion
                              row[i1] + 1,                           # deletion
                              lastdiag + (ch1 == ch2 ? 0 : 1))       # substitution
            lastdiag = olddiag
        end
    end
    return row[end]
end
|
||||
|
||||
"""
    levenshtein(s1::AbstractString, s2::AbstractString)

Convenience wrapper returning `evaluate(Levenshtein(), s1, s2)`.
"""
function levenshtein(s1::AbstractString, s2::AbstractString)
    return evaluate(Levenshtein(), s1, s2)
end
|
||||
|
||||
##############################################################################
|
||||
##
|
||||
## JaroWinkler
|
||||
##
|
||||
##############################################################################
|
||||
|
||||
# Parameters of the Jaro-Winkler score computed by `evaluate`.
type JaroWinkler{T1 <: Number, T2 <: Number, T3 <: Integer}
    # Scaling factor of the common-prefix adjustment. Default to 0.1.
    p::T1
    # Boost threshold: the prefix adjustment is applied only when the base
    # score is at least this value. Default to 0.7.
    b::T2
    # Long string adjustment: applied only when the shorter string has at
    # least this many characters. Default to 5.
    long::T3
end
|
||||
|
||||
"""
    evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)

Return the Jaro-Winkler score of `s1` and `s2` as a floating-point number.
The score is a similarity: it is 0.0 when either string is empty or when no
characters match, and grows as the strings become more alike.

The base Jaro score is adjusted in two optional steps controlled by `dist`:

* common-prefix adjustment, applied when `dist.p > 0` and the base score is at
  least `dist.b`; it considers at most the first 4 characters;
* long-string adjustment, applied when the shorter string has at least
  `dist.long` characters and enough characters match beyond the common prefix.
"""
function evaluate(dist::JaroWinkler, s1::AbstractString, s2::AbstractString)
    # Keep the shorter string in `s1`.
    length(s1) > length(s2) && return evaluate(dist, s2, s1)

    # Materialise the characters once so that every position used below is a
    # character position. Indexing the strings directly would use byte
    # offsets, which is wrong for strings with multi-byte characters.
    chars1 = collect(s1)
    chars2 = collect(s2)
    len1 = length(chars1)
    len2 = length(chars2)
    len2 == 0 && return 0.0

    # Characters only match when their positions are at most `maxdist` apart.
    maxdist = max(0, div(len2, 2) - 1)
    m = 0                       # matching characters
    t = 0                       # half number of transpositions
    flag = fill(false, len2)    # positions of s2 already matched
    prevpos = 0                 # right-most matched position of s2 so far
    for i1 in 1:len1
        ch = chars1[i1]
        i2low = max(1, i1 - maxdist)
        i2high = min(len2, i1 + maxdist)
        for i2 in i2low:i2high
            if ch == chars2[i2] && !flag[i2]
                m += 1
                # a match located before an earlier match is out of order
                if i2 < prevpos
                    t += 1
                end
                prevpos = max(i2, prevpos)
                flag[i2] = true
                break
            end
        end
    end
    m == 0 && return 0.0

    score = (m / len1 + m / len2 + (m - t) / m) / 3.0

    # Both decisions are taken on the unadjusted base score / input length.
    useprefix = dist.p > 0 && score >= dist.b
    uselong = len1 >= dist.long
    if useprefix || uselong
        # length of the common prefix, capped at 4 characters
        l = 0
        maxprefix = min(4, len1)
        while l < maxprefix && chars1[l + 1] == chars2[l + 1]
            l += 1
        end
        # common prefix adjustment
        if useprefix
            score += l * (1 - score) * dist.p
        end
        # longer string adjustment
        if uselong && (m - l >= 2) && ((m - l) >= (len1 - l) / 2)
            score += (1 - score) * (m - (l + 1)) / (len1 + len2 - (2 * (l - 1)))
        end
    end
    return score
end
|
||||
|
||||
"""
    jaro_winkler(s1::AbstractString, s2::AbstractString; p = 0.1, b = 0.7, long = 5)

Convenience wrapper returning `evaluate(JaroWinkler(p, b, long), s1, s2)`.

`p` is the scaling factor, `b` the boost threshold and `long` the long string
adjustment stored in the `JaroWinkler` parameters.
"""
function jaro_winkler(s1::AbstractString, s2::AbstractString; p = 0.1, b = 0.7, long = 5)
    params = JaroWinkler(p, b, long)
    return evaluate(params, s1, s2)
end
|
||||
"""
    jaro(s1::AbstractString, s2::AbstractString)

Return the plain Jaro score of `s1` and `s2`, i.e. the Jaro-Winkler score with
both Winkler adjustments switched off.
"""
function jaro(s1::AbstractString, s2::AbstractString)
    # p = 0.0 disables the common-prefix adjustment. The long-string adjustment
    # is applied whenever length(s1) >= long, so a threshold of 0 would enable
    # it for every input; typemax(Int) keeps it off.
    return evaluate(JaroWinkler(0.0, 0.0, typemax(Int)), s1, s2)
end
|
||||
|
||||
end # module StringDistances
|
|
@ -0,0 +1,34 @@
|
|||
|
||||
using Base.Test
|
||||
|
||||
|
||||
# Jaro-Winkler reference values. `long = 100` keeps the long string adjustment
# off for these short inputs; `b = 0.0` lets the prefix adjustment apply
# whenever at least one character matches.
for (s1, s2, expected) in [("MARTHA", "MARHTA", 0.9611),
                           ("DWAYNE", "DUANE", 0.84),
                           ("DIXON", "DICKSONX", 0.81333),
                           ("William", "Williams", 0.975),
                           ("", "foo", 0.0),
                           ("a", "a", 1.0),
                           ("abc", "xyz", 0.0)]
    @test_approx_eq_eps jaro_winkler(s1, s2, b = 0.0, long = 100) expected 1e-4
end
|
||||
|
||||
|
||||
|
||||
# Levenshtein distance: empty inputs, both argument orders, classic examples.
for (s1, s2, expected) in [("", "", 0),
                           ("abc", "", 3),
                           ("", "abc", 3),
                           ("bc", "abc", 1),
                           ("kitten", "sitting", 3),
                           ("Saturday", "Sunday", 3)]
    @test levenshtein(s1, s2) == expected
end
|
||||
|
||||
|
||||
|
||||
# Hamming distance: equal-length inputs and inputs of different lengths, where
# every unmatched trailing character counts as one difference.
for (s1, s2, expected) in [("", "", 0),
                           ("", "abc", 3),
                           ("abc", "abc", 0),
                           ("acc", "abc", 1),
                           ("abcd", "abc", 1),
                           ("abc", "abcd", 1),
                           ("testing", "this is a test", 13),
                           ("Saturday", "Sunday", 7)]
    @test hamming(s1, s2) == expected
end
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,17 @@
|
|||
using StringDistances
|
||||
|
||||
# Test files to run, relative to this file.
tests = ["distances.jl"
         ]

println("Running tests:")

# Run every test file, report PASSED/FAILED in colour, and abort the run on
# the first failure by re-raising the error.
for test in tests
    try
        include(test)
        println("\t\033[1m\033[32mPASSED\033[0m: $(test)")
    catch e
        println("\t\033[1m\033[31mFAILED\033[0m: $(test)")
        # catch_backtrace() is the stack of the exception being handled;
        # backtrace() would only describe this catch block itself.
        showerror(STDOUT, e, catch_backtrace())
        rethrow(e)
    end
end
|
Loading…
Reference in New Issue