126 lines
3.7 KiB
Awk
Executable File
126 lines
3.7 KiB
Awk
Executable File
#!/bin/awk -f
|
|
|
|
## Algorithms
|
|
##
|
|
## S = number of files in SOURCE
|
|
## T = number of files in TARGET
|
|
## Td = number of duplicate files in TARGET
|
|
##
|
|
## There are two possible algorithms.
|
|
##
|
|
## 1. We store duplicates in a separate array, then during the scanning process
|
|
## we need to check the array containing the path and the array containing the
|
|
## duplicates.
|
|
##
|
|
## source: 2*S tests
|
|
## target: 2*T tests
|
|
##
|
|
## 2. We set a boolean subscript to true in the same table. During the scanning
|
|
## process, we only need to do one test per entry:
|
|
##
|
|
## source: S tests
|
|
## target: T tests
|
|
##
|
|
## However during the syncing process we need to do more.
|
|
##
|
|
## final = Td splits + Td tests + 2*Td tests + (T-Td)*3 tests = Td splits + 3*T tests
|
|
##
|
|
## On average, the second algorithm seems to be a little slower because of
|
|
## the splits. Besides, it is longer to implement and harder to understand. Thus
|
|
## we go for the first algorithm.
|
|
|
|
## Print the usage message on standard output.
##
## The one-liner shown at the end quotes the command substitution on purpose:
## without the double quotes the shell field-splits the output, the newlines
## between the generated commands are lost, and everything is evaluated as a
## single command line.
function _printhelp() {
    print "Filesystem hierarchy synchronizer"
    print ""
    print "Usage: hsync SOURCE TARGET"
    print ""
    print "Preview how to move files in TARGET so that identical files found in SOURCE and"
    print "TARGET have the same path in both folders. If duplicate files are found either"
    print "in SOURCE or in TARGET, they are ignored as it is up to the user to decide what"
    print "to do."
    print ""
    print "Files are not actually moved, the command is printed to stdout. This way you can"
    print "redirect to a file and preview the changes before actually processing them."
    print ""
    print "You can also process the changes directly from command-line with some shell"
    print "evaluation:"
    print "eval \"$(hsync SOURCE TARGET)\""
}
|
|
|
|
## Write S to standard error as a section header: an empty line, then "==> "
## followed by S. The text is piped through `cat >&2`; closing the pipe right
## away makes awk wait for cat, so the message is out before we return.
function _info(s) {
    s = "\n==> " s
    print s | "cat >&2"
    close("cat >&2")
}
|
|
|
|
## Write S to standard error as a regular message, prefixed with ":: ".
## The text is piped through `cat >&2`; closing the pipe right away makes awk
## wait for cat, so the message is out before we return.
function _msg(s) {
    s = ":: " s
    print s | "cat >&2"
    close("cat >&2")
}
|
|
|
|
## Return S wrapped in single quotes so that a POSIX shell reads it back as one
## literal word. An embedded single quote is emitted as '"'"' (close the quoted
## string, add a double-quoted single quote, reopen the quoted string). The
## replacement deliberately contains neither backslash nor ampersand, so gsub
## inserts it verbatim.
function _shquote(s) {
    gsub(/'/, "'\"'\"'", s)
    return "'" s "'"
}

## Main program: check the arguments, checksum every file in SOURCE (ARGV[1])
## and TARGET (ARGV[2]), then print on stdout one shell command line per file
## of TARGET that has to move to the path its twin has in SOURCE.
##
## Arrays, all indexed by md5 sum:
##   source      path of the file in SOURCE
##   source_dup  set when the sum occurs more than once in SOURCE
##   target      path of the file in TARGET that must be moved
##   target_dup  set when the sum occurs more than once in TARGET
BEGIN {
    ## Both parameters must be existing folders. The folder names are
    ## shell-quoted because they end up inside command lines run by sh.
    invalid = 1
    if (ARGC == 3) {
        src = _shquote(ARGV[1])
        dst = _shquote(ARGV[2])
        ## The shell prints KO only when one of the tests fails, so a
        ## successful getline means "invalid". The command is kept in a
        ## variable: an unparenthesized concatenation on the left of
        ## `| getline` is parsed differently across awk implementations.
        check = "test -d " src " && test -d " dst " || echo KO"
        invalid = ((check | getline) > 0)
        close(check)
    }
    if (invalid) {
        _printhelp()
        exit
    }

    _info("Scanning " ARGV[1] "...")
    scan = "cd -- " src " && find . -type f -exec md5sum {} +"
    while ((scan | getline) > 0) {
        sum = $1
        ## We need to remove sum from line since filename may contain
        ## spaces. substr is faster than sub. The name starts at column 35:
        ## 32 characters of checksum plus a 2-character separator.
        ## NOTE(review): GNU md5sum prefixes the line with a backslash when
        ## the file name contains a backslash or a newline; such lines are
        ## not handled here.
        name = substr($0, 35)

        if (sum in source || sum in source_dup) {
            _msg("Skipping duplicate: " name)
            ## Note: it is not necessary to delete source[sum] since it will
            ## not be used anyway if source_dup[sum] is set to 1.
            source_dup[sum] = 1
            continue
        }
        source[sum] = name
    }
    close(scan)

    _info("Scanning " ARGV[2] "...")
    scan = "cd -- " dst " && find . -type f -exec md5sum {} +"
    while ((scan | getline) > 0) {
        sum = $1
        name = substr($0, 35)

        ## Skip if duplicate in source or nonexistent in source.
        if (sum in source_dup || !(sum in source))
            continue

        ## Already at the right place. Any other copy found in TARGET is
        ## then a duplicate: forget a copy recorded earlier and flag the sum
        ## so that a copy met later is skipped as well. Otherwise we would
        ## emit a move onto a path that is already taken.
        if (source[sum] == name) {
            if (sum in target)
                _msg("Skipping duplicate: " target[sum])
            delete target[sum]
            target_dup[sum] = 1
            continue
        }

        if (sum in target || sum in target_dup) {
            _msg("Skipping duplicate: " name)
            delete target[sum]
            target_dup[sum] = 1
            continue
        }
        target[sum] = name
    }
    close(scan)

    _info("Hierarchy syncing preview of " ARGV[2] " based on " ARGV[1])
    for (i in target) {
        ## Strip the last path component to get the destination folder.
        dirname = source[i]
        sub(/[^\/]+$/, "", dirname)

        ## Every path is relative to TARGET and shell-quoted.
        newdir = _shquote(ARGV[2] "/" dirname)
        from = _shquote(ARGV[2] "/" target[i])
        to = _shquote(ARGV[2] "/" source[i])

        ## Target -> Source. We only create a new folder if necessary; the
        ## test must look at the same path as the one mkdir creates.
        cmd = "[ ! -d " newdir " ] && mkdir -p -- " newdir " ; mv -nv -- " from " " to

        ## We can sort preview by uncommenting the following pipe. This is
        ## not very useful however, and it costs some n*log(n) more
        ## operations.
        print cmd #|"sort"
    }
}
|