ambevar-dotfiles/.scripts/hsync

#!/bin/awk -f

## Algorithms
##
## S = number of files in SOURCE
## T = number of files in TARGET
## Td = number of duplicate files in TARGET
##
## There are two possible algorithms.
##
## 1. We store duplicates in a separate array, then during the scanning process
## we need to check the array containing the path and the array containing the
## duplicates.
##
##   source: 2*S tests
##   target: 2*T tests
##
## 2. We set a boolean subscript to true in the same table. During the scanning
## process, we only need to do one test per entry:
##
##   source: S tests
##   target: T tests
##
## However during the syncing process we need to do more.
##
##   final = Td splits + Td tests + 2*Td tests + (T-Td)*3 tests = Td splits + 3*T tests
##
## On average, the second algorithm seems to be a little slower because of the
## splits. Besides, it takes longer to implement and is harder to understand.
## Thus we go for the first algorithm.

## Print the usage text on stdout.
##
## The one-liner shown at the end quotes the command substitution on purpose:
## unquoted, the shell would field-split the output and eval would join the
## pieces with spaces, merging the one-command-per-line output into a single
## broken command as soon as more than one file has to be moved.
function _printhelp() {
    print "Filesystem hierarchy synchronizer\n\
\n\
Usage: hsync SOURCE TARGET\n\
\n\
Preview how to move files in TARGET so that identical files found in SOURCE and\n\
TARGET have the same path in both folders. If duplicate files are found either\n\
in SOURCE or in TARGET, they are ignored as it is up to the user to decide what\n\
to do.\n\
\n\
Files are not actually moved, the command is printed to stdout. This way you can\n\
redirect to a file and preview the changes before actually processing them.\n\
\n\
You can also process the changes directly from command-line with some shell\n\
evaluation:\n\
  eval \"$(hsync SOURCE TARGET)\""
}

## Write a section heading to stderr: an empty line, then "==> " followed by s.
function _info(s) {
    ## Route the text through a shell redirect so that it lands on stderr and
    ## stays out of the command preview printed on stdout.
    print sprintf("\n==> %s", s) | "cat >&2"
    ## Close the pipe immediately so the message is delivered before going on.
    close("cat >&2")
}

## Write a one-line notice to stderr, prefixed with ":: ".
function _msg(s) {
    ## Same stderr trick as for headings: pipe through a shell redirect.
    print sprintf(":: %s", s) | "cat >&2"
    ## Close the pipe immediately so the message is delivered before going on.
    close("cat >&2")
}

## Quote string s so that a POSIX shell reads it back as one literal word: wrap
## it in single quotes and spell every embedded single quote as '\''.
function _shquote(s) {
    gsub(/'/, "'\\''", s)
    return "'" s "'"
}

## Main program. Everything runs from BEGIN since no input records are read:
##
##   1. Validate that SOURCE (ARGV[1]) and TARGET (ARGV[2]) are folders.
##   2. Scan SOURCE: source[sum] = path, source_dup[sum] = 1 for duplicates.
##   3. Scan TARGET: target[sum] = path for files that also exist in SOURCE
##      under a different path, target_dup[sum] = 1 for duplicates.
##   4. Print one shell command line per file to move inside TARGET.
##
## All paths are relative to the scanned folder and start with "./" because
## they come from "find .".
BEGIN {
    ## Both parameters must be existing folders. The shell prints "KO" when the
    ## test fails, so a successful getline means the arguments are invalid.
    invalid = 1
    if (ARGC == 3)
    {
        src = _shquote(ARGV[1])
        dst = _shquote(ARGV[2])
        check = "test -d " src " && test -d " dst " || echo KO"
        invalid = ((check | getline line) > 0)
        close(check)
    }
    if (invalid)
    {
        _printhelp()
        exit
    }

    _info("Scanning " ARGV[1] "...")
    scan = "cd -- " src " && find . -type f -exec md5sum {} +"
    while ((scan | getline line) > 0)
    {
        ## md5sum starts the line with a backslash and escapes the file name
        ## when it contains a backslash or a newline. The fixed offsets below
        ## do not apply to such lines, so leave those files to the user.
        if (line ~ /^\\/)
        {
            _msg("Skipping unsupported file name: " substr(line, 36))
            continue
        }

        ## A line is the 32-character sum, a 2-character separator, then the
        ## file name, which may contain spaces. substr is faster than sub.
        sum = substr(line, 1, 32)
        name = substr(line, 35)

        if (sum in source || sum in source_dup)
        {
            _msg("Skipping duplicate: " name)
            ## Note: it is not necessary to delete source[sum] since it will not
            ## be used anyway if source_dup[sum] is set to 1.
            source_dup[sum] = 1
            continue
        }
        source[sum] = name
    }
    ## Closing matters when SOURCE and TARGET are the same string: the next
    ## scan would otherwise read from this already exhausted pipe.
    close(scan)

    _info("Scanning " ARGV[2] "...")
    scan = "cd -- " dst " && find . -type f -exec md5sum {} +"
    while ((scan | getline line) > 0)
    {
        if (line ~ /^\\/)
        {
            _msg("Skipping unsupported file name: " substr(line, 36))
            continue
        }

        sum = substr(line, 1, 32)
        name = substr(line, 35)

        ## Skip if duplicate in source, nonexistent in source, or already at
        ## the same path.
        if (sum in source_dup || ! (sum in source) || source[sum] == name)
            continue

        if (sum in target || sum in target_dup)
        {
            _msg("Skipping duplicate: " name)
            delete target[sum]
            target_dup[sum] = 1
            continue
        }
        target[sum] = name
    }
    close(scan)

    _info("Hierarchy syncing preview of " ARGV[2] " based on " ARGV[1])
    for (sum in target)
    {
        ## Folder part of the wanted path, trailing slash included.
        dirname = source[sum]
        sub(/[^/]+$/, "", dirname)
        destdir = _shquote(ARGV[2] "/" dirname)

        ## Target -> Source
        cmd = _shquote(ARGV[2] "/" target[sum]) " " _shquote(ARGV[2] "/" source[sum])

        ## We only create a new folder if necessary. The folder is looked up
        ## inside TARGET, which is where the file is going to be moved.
        cmd = "[ ! -d " destdir " ] && mkdir -p -- " destdir " ; mv -nv -- " cmd

        ## We can sort preview by uncommenting the following pipe. This is
        ## not very useful however, and it costs some n*log(n) more
        ## operations.
        print cmd #|"sort"
    }
}