ambevar-dotfiles/.scripts/titlecase.awk

#!/bin/gawk -f
## This script is inspired by
##   http://www.pement.org/awk/titlecase.awk.txt

##
## function: titlecase("CHANGE TO TITLE CASE") --> "Change to Title Case"
##
## Other Features:
##
##   titlecase() always compresses runs of spaces, tabs and underscores into a
##   single space and trims leading and trailing space: titlecase(string)
##
##   This function tries to implement the "Title Case" constructs specified in
##   the APA Style Manual and the Chicago Manual of Style. Instead of merely
##   capitalizing the first letter of each word and setting everything else in
##   lowercase, this function implements the following conditions:
##
##  - Conjunctions, articles, and prepositions are set lowercase, UNLESS they
##    are the first word of the string or the first word after a colon, a
##    question mark, or an exclamation point.
##  - Compass points (NE, SW, etc.) are set in solid caps.
##  - Roman numerals (II, IV, VII, IX, etc.) are set in solid caps.
##  - Certain abbreviations are always capitalized (AIDS, ASCII, NT, USA, etc.)
##  - Names beginning with D' or O' are set as D'Arcy, O'Reilly, etc.
##  - Hyphenated strings receive internal caps (Smith-Williams, Twenty-Two)
##  - Contractions such as I'll, You've, Don't, etc. are handled properly
##  - Degrees such as Ph.D., M.Div., etc. are properly capitalized
##
## Sample Usage with GNU awk (gawk):
##
##   gawk -f titlecase.awk infile

## TODO: maybe it would be a good idea to implement a preprocessor that would
## search and replace special strings like AC-DC.

## Tests:

## all lowercase words
## ALL UPPERCASE WORDS
## aLl cRaZY cASE WordS
## And with constants in an INTO cd Contre. Feat and Feat. the machine.
## Bad   ,punctuation. here  , should ! not be ?a problem.
## Roman numerals XIV LIV xiv liv. liv. xiv.
## Dashed--machine--ac-dc.
## About mcdonald and o'reilly, but i'll won't say.
## The "final quote" 'on the waterfront'.

BEGIN {
    ## Words whose spelling is fixed, written exactly as they must appear in
    ## the output.  The list is appended to whatever 'constants' already
    ## holds, one string per group, in this order:
    ##   1. English articles, conjunctions and prepositions
    ##   2. French
    ##   3. German
    ##   4. Music (two strings)
    ##   5. Other abbreviations
    ## Every group ends with a blank so the pieces can be glued together.
    constants = constants \
        "a an the and but for nor or so am is are against at between by from in into of on to upon " \
        "un une de du le la les et mais pour ni ou à a où contre entre chez dans sur que qui " \
        "der die das den dem des ein eine einen eines einer von wo an am in für gegen bei aus mit nach seit zu durch ohne um " \
        "feat CD DJ " \
        "KlassX Machine d'Acide BYOB MGMT AC DC JBX RZA DMX " \
        "AIDS ASCII DHTML DNA DVD FBI GNU GPL IBM IRS ISBN ISSN PHP ROM SSN TV FM "

    ## One array element per word; titlecase() walks this array.
    split(constants, constarray, " ")
}

## titlecase(string) -- return a title-cased copy of 'string'.
##
## Parameters:
##   string   the text to convert.
##   All further names in the parameter list are local variables (the usual
##   awk idiom); callers pass 'string' only.
##
## Globals read:
##   constarray   words with a fixed spelling (see the BEGIN block).
##   debug        when true, trace lines starting with "::" are printed.
##   capital      when equal to 1, rule 3 is skipped and words hit by no rule
##                are emitted in lowercase instead of being capitalized.  It
##                is not assigned anywhere in this function -- presumably it
##                is meant to be set with 'gawk -v capital=1'; confirm.
##
## Returns: the converted string, with runs of spaces/tabs/underscores
## collapsed to one space and leading/trailing space removed.
function titlecase(string,    done, rest, hit, pos, seplen, word, head, tail, idx, result) {
    ## 'done' is the part ALREADY converted, 'rest' is what remains to be
    ## processed.
    done = ""
    rest = string

    ## English punctuation: drop the blanks in front of a punctuation run and
    ## leave exactly one behind it. It is quite hard to guess the language, so
    ## French follows the English punctuation rules.
    rest = gensub(/ +([,!:;?.]+) */, "\\1 ", "g", rest)

    ## Compress spaces, tabs and underscores into one space, then trim both
    ## ends.
    gsub(/[_ \t]+/, " ", rest)
    gsub(/^ /, "", rest)
    gsub(/ $/, "", rest)

    ## Capitalize everything for ease of matching.
    rest = toupper(rest)

    do {
        hit = 0

        ## 'pos' is the position of the NEXT run of separators (anything but
        ## letters, digits and apostrophes) and 'seplen' its length. For the
        ## last word of 'rest', pos is 0. Digits count as part of a word.
        pos = match(rest, /[^[:alnum:]']+/)
        seplen = RLENGTH

        ## 'word' is the current word together with the separators behind it;
        ## 'head' is its first character and 'tail' everything after that.
        head = substr(rest, 1, 1)
        if (pos > 0) {
            word = substr(rest, 1, pos + seplen - 1)
            tail = substr(rest, 2, pos + seplen - 2)
        } else {
            word = rest
            tail = substr(rest, 2)
        }

        ## Shorten the rest of the string.
        rest = substr(rest, pos + seplen)

        ## RULE 1 -- Constant strings.
        ##
        ## The pattern is anchored at the start of 'word', so only what
        ## FOLLOWS the constant has to be put back. That remainder is the
        ## separator run, possibly preceded by an apostrophe suffix; it is
        ## lowercased so that "DJ'S" yields "DJ's" and not "DJ'S".
        for (idx in constarray) {
            if (debug)
                print ":: Comparing " word " with " constarray[idx]
            hit = match(word, "^" toupper(constarray[idx]) "\\>")
            if (hit > 0) {
                word = constarray[idx] tolower(substr(word, RSTART + RLENGTH))
                if (debug)
                    print ":: Match constant on [" constarray[idx] "] in string [" word "]"
                break
            }
        }

        ## RULE 2 -- Roman numerals.
        ##
        ## Note: this match cannot distinguish between LIV (54 in Roman
        ## numerals) and a personal name like "Liv Ullman". The Roman numerals
        ## C (100), D (500), and M (1000) are omitted to avoid false matches on
        ## words like civil, did, dim, lid, mid-, mild, Vic, etc. Most uses of
        ## Roman numerals in titles stay in the lower ranges, such as
        ## "Vol. II" or "Pt. XXIV".
        if (hit == 0 && match(word, /^[IVXL]+\>/)) {
            hit = 1
            ## But we can undo I'd, I'll, I'm, I've and Ill.
            if (match(word, /^I'|ILL\>/))
                hit = 0
            if (debug && hit == 1)
                print ":: Match on Roman numerals in [" word "]"
        }

        ## RULE 3 -- Names like D'Arcy or O'Reilly: keep the first three
        ## characters in capitals, lowercase the remainder.
        if (hit == 0 && capital != 1 && match(word, /^[DO]'[[:alpha:]]/)) {
            word = substr(word, 1, 3) tolower(substr(word, 4))
            hit = 1
            if (debug)
                print ":: Match on mixed case: " word
        }

        ## RULE 4 -- Names like MacNeil or McDonald: capital M, lowercase
        ## "ac"/"c", capital consonant, lowercase remainder. RLENGTH is the
        ## length of the "MAC?"/"MC?" prefix matched here.
        if (hit == 0 && match(word, /^MA?C[B-DF-HJ-NP-TV-Z]/)) {
            if (debug)
                print ":: Match on MacX: " substr(word, 1, 1) "-" \
                    tolower(substr(word, 2, RLENGTH - 2)) "-" substr(word, RLENGTH, 1) "-" \
                    tolower(substr(word, RLENGTH + 1))
            word = substr(word, 1, 1) tolower(substr(word, 2, RLENGTH - 2)) \
                substr(word, RLENGTH, 1) tolower(substr(word, RLENGTH + 1))
            hit = 1
        }

        ## If one of the above rules was hit, append its result to 'done';
        ## otherwise capitalize (or, with capital == 1, lowercase) the word.
        if (hit > 0)           done = done word
        else if (capital == 1) done = done tolower(head) tolower(tail)
        else                   done = done toupper(head) tolower(tail)

    } while (pos > 0)

    ## Everything should be converted now.

    ## Double exception 1: set the 1st word of the string in capital case.
    ## Leading single/double quotes are skipped, as in "A Day in the Life" or
    ## 'On the Waterfront'. Digits count as part of a word (1st, 2nd, etc.).
    ## When there is no letter or digit at all, RSTART is 0 and 'done' is left
    ## unchanged.
    match(done, /[[:alnum:]]/)
    done = toupper(substr(done, 1, RSTART)) substr(done, RSTART + 1)

    ## Double exception 2: capitalize the 1st ASCII letter after a colon,
    ## question mark, exclamation point, double quote or dash. Each such
    ## letter is first tagged with \a (the BEL or CTRL-G character); the loop
    ## then removes the tags one by one and upcases the letter that follows.
    result = gensub(/([:?!"-][^[:alnum:]]*)([a-zA-Z])/, "\\1\a\\2", "g", done)
    while (match(result, /\a/)) {
        result = substr(result, 1, RSTART - 1) \
            toupper(substr(result, RSTART + 1, 1)) \
            substr(result, RSTART + 2)
    }

    return result
}

## Main rule: print every input record converted to title case.
{
    print titlecase($0)
}

## End of script