ambevar-dotfiles/.scripts/titlecase.awk

#!/usr/bin/gawk -f
## This script is inspired by
##   http://www.pement.org/awk/titlecase.awk.txt
##
## function: titlecase("CHANGE TO TITLE CASE") --> "Change to Title Case"
##
## Features:
##
##   titlecase() always compresses runs of spaces, tabs and underscores into a
##   single space and trims both ends of the string: titlecase(string)
##
##   This function tries to implement the "Title Case" constructs specified in
##   the APA Style Manual and the Chicago Manual of Style. Instead of merely
##   capitalizing the first letter of each word and setting everything else in
##   lowercase, this function implements the following conditions:
##
##  - Conjunctions, articles, and prepositions are set lowercase, UNLESS they
##    are the first word of the string or the first word after a colon, a
##    question mark, or an exclamation point.
##  - Compass points (NE, SW, etc.) are set in solid caps only if they are added
##    to the constants list; none are listed by default.
##  - Roman numerals (II, IV, VII, IX, etc.) are set in solid caps.
##  - Certain abbreviations are always capitalized (AIDS, ASCII, DNA, GNU, etc.)
##  - Names beginning with D' or O' are set as D'Arcy, O'Reilly, etc.
##  - Hyphenated strings receive internal caps (Smith-Williams, Twenty-Two)
##  - Contractions such as I'll, You've, Don't, etc. are handled properly
##  - Degrees such as Ph.D., M.Div., etc. are properly capitalized
##
## Sample Usage with GNU awk (gawk):
##
##   gawk -f titlecase.awk infile

## TODO: maybe it would be a good idea to implement a preprocessor that would
## search and replace special strings like AC-DC.

## Tests:

## all lowercase words
## ALL UPPERCASE WORDS
## aLl cRaZY cASE WordS
## And with constants in an INTO cd Contre. Feat and Feat. the machine.
## Bad   ,punctuation. here  , should ! not be ?a problem.
## Roman numerals XIV LIV xiv liv. liv. xiv.
## Dashed--machine--ac-dc.
## About mcdonald and o'reilly, but i'll won't say.
## The "final quote" 'on the waterfront'.

BEGIN {
    ## Words whose spelling is fixed regardless of the input's case. The list
    ## is appended to whatever 'constants' already contains. The five groups
    ## below are, in order: English function words, French function words,
    ## German function words, music-related names (two lines), and
    ## miscellaneous acronyms. Each group ends with a blank so the groups can
    ## be concatenated directly.
    constants = constants \
        "a an the and but for nor or so am is are against at between by from in into of on to upon " \
        "un une de du le la les et mais pour ni ou à a où contre entre chez dans sur que qui " \
        "der die das den dem des ein eine einen eines einer von wo an am in für gegen bei aus mit nach seit zu durch ohne um " \
        "feat CD DJ " \
        "KlassX Machine d'Acide BYOB MGMT AC DC JBX RZA DMX " \
        "AIDS ASCII DHTML DNA DVD FBI GNU GPL IBM IRS ISBN ISSN PHP ROM SSN TV FM "

    ## Break the blank-separated list into constarray[1..n], one word per
    ## element, for lookup by titlecase().
    split(constants, constarray, " ")
}

## titlecase(string) -- return 'string' converted to title case.
##
## Only the first parameter is meant to be supplied by callers. The remaining
## names in the parameter list are awk-style local variables; any extra
## argument a caller passes lands in one of them and is overwritten before
## use.
##
## Global inputs:
##   constarray  array of words with a fixed spelling (built in BEGIN).
##   capital     when equal to 1, words not caught by a rule are set entirely
##               in lowercase instead of being capitalized, and RULE 3 is
##               skipped. The first alphanumeric character of the string and
##               the first letter after some punctuation are still
##               capitalized.
##   debug       when true, trace messages are printed on standard output.
function titlecase(string,    a, b, hit, pos, word, head, tail, var, rlen, rest)  {
    ## 'a' is the part of the string ALREADY converted, 'b' is the part still
    ## to be processed, so that (string = a b).
    a = ""
    b = string

    ## English punctuation. It is quite hard to guess the language, so French
    ## will follow English punctuation rules: remove blanks before a run of
    ## punctuation marks and leave exactly one blank after it.
    b = gensub(/ +([,!:;?.]+) */, "\\1 ", "g", b)

    ## Compress spaces or tabs. Trim prefix and suffix space. Convert
    ## underscores to spaces.
    gsub(/[_ \t]+/, " ", b)
    gsub(/^ /, "", b)
    gsub(/ $/, "", b)

    ## Capitalize everything for ease of matching.
    b = toupper(b)

    do {
        hit = 0

        ## 'pos' is the position of the NEXT run of punctuation (apostrophe
        ## excluded) after the current word. If this is the last word in b,
        ## pos will be 0. match() automatically sets RLENGTH. WARNING: we
        ## consider digits as part of a word.
        pos = match(b, /[^[:alnum:]']+/)

        ## 'word' is the current word TOGETHER with the punctuation run that
        ## follows it; the remainder stays in 'b'.
        if (pos > 0) {
            word = substr(b, 1, pos + RLENGTH - 1)
            b = substr(b, pos + RLENGTH)
        } else {
            word = b
        }

        ## 1st char of the current word, and everything after it.
        head = substr(word, 1, 1)
        tail = substr(word, 2)

        ## RULE 1 -- Constant strings.

        ## The constant replaces the start of 'word'. Whatever follows it
        ## (punctuation, or a suffix such as 's) is kept, in lowercase, so
        ## that "DJ'S" becomes "DJ's" and not "DJ'S".
        for (var in constarray) {
            if (debug)
                print ":: Comparing " word " with " constarray[var]
            hit = match(word, "^" toupper(constarray[var]) "\\>")
            if (hit > 0) {
                ## The pattern is anchored, so the match starts at 1 and
                ## RLENGTH is the length of the constant.
                word = constarray[var] tolower(substr(word, RLENGTH + 1))
                if (debug)
                    print ":: Match constant on [" constarray[var] "] in string [" word "]"
                break
            }
        }

        ## RULE 2 -- Roman numerals

        ## Note: this match cannot distinguish between LIV (54 in Roman
        ## numerals) and a personal name like "Liv Ullman". The Roman numerals
        ## C (100), D (500), and M (1000) are omitted to avoid false matches on
        ## words like civil, did, dim, lid, mid-, mild, Vic, etc. Most uses of
        ## Roman numerals in titles stay in the lower ranges, such as "Vol. II"
        ## or "Pt. XXIV".
        if (hit == 0 && match(word, /^[IVXL]+\>/)) {
            rlen = RLENGTH
            ## Not a numeral: I'd, I'll, I'm, I've, Ill, and elisions such as
            ## the French l' (l'amour). Those fall through to the rules below.
            if (word !~ /^[IL]'|ILL\>/) {
                ## Keep the numeral in caps; lowercase any suffix (VIII's).
                word = substr(word, 1, rlen) tolower(substr(word, rlen + 1))
                hit = 1
                if (debug)
                    print ":: Match on Roman numerals in [" word "]"
            }
        }

        ## RULE 3 -- Names like D'Arcy or O'Reilly
        if (hit == 0 && capital != 1 && match(word, /^[DO]'[[:alpha:]]/)) {
            word = substr(word, 1, 3) tolower(substr(word, 4))
            hit = 1
            if (debug)
                print ":: Match on mixed case: " word
        }

        ## RULE 4 -- Names like MacNeil or McDonald
        ## RLENGTH is 3 for "McX" and 4 for "MacX"; the character at position
        ## RLENGTH is the consonant that must stay in upper case.
        if (hit == 0 && match(word, /^MA?C[B-DF-HJ-NP-TV-Z]/)) {
            if (debug)
                print  ":: Match on MacX: " substr(word,1,1) "-" \
                    tolower(substr(word,2,RLENGTH-2)) "-" substr(word,RLENGTH,1) "-" \
                    tolower(substr(word,RLENGTH+1))
            word = substr(word, 1, 1) tolower(substr(word, 2, RLENGTH - 2)) \
                substr(word, RLENGTH, 1) tolower(substr(word, RLENGTH + 1))
            hit = 1
        }

        ## If one of the above rules is hit, we append the result to 'a',
        ## otherwise we capitalize the word (or lowercase it in 'capital'
        ## mode).
        if (hit > 0)            a = a word
        else if (capital == 1)  a = a tolower(word)
        else                    a = a toupper(head) tolower(tail)

    } while (pos > 0)

    ## Everything should be converted now.

    ## Double exception 1: Set 1st word of string in capital case. Need to
    ## handle potential internal single/double quotes like "A Day in the Life"
    ## or 'On the Waterfront'. WARNING: here we consider digits as part of a
    ## word (as in 1st, 2nd, etc.).
    if (match(a, /[[:alnum:]]/))
        a = toupper(substr(a, 1, RSTART)) substr(a, RSTART + 1)

    ## Double exception 2: Set the 1st letter after some punctuation marks in
    ## upper case. Each match ends on the letter to capitalize, so the text is
    ## consumed match by match; this handles multiple colons, question marks,
    ## etc. on the line without inserting any marker character.
    rest = a
    a = ""
    while (match(rest, /[:{}\[\]?!"()-][^[:alnum:]]*[[:alpha:]]/)) {
        a = a substr(rest, 1, RSTART + RLENGTH - 2) \
            toupper(substr(rest, RSTART + RLENGTH - 1, 1))
        rest = substr(rest, RSTART + RLENGTH)
    }

    return a rest
}

## Main rule: print the title-cased form of every input record.
{
    print titlecase($0)
}

## End of script
strip-comments: fixed classic mistake 2013-10-30 15:57:24 +01:00			`#!/usr/env/gawk -f`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## This script is inspired by`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`## http://www.pement.org/awk/titlecase.awk.txt`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`##`
			`## function: titlecase("CHANGE TO TITLE CASE") --> "Change to Title Case"`
			`##`
tc-audio-trancode: {}()[] will precede cap in titlecase 2013-08-03 21:51:24 +02:00			`## Features:`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`##`
			`## titlecase() will compress whitespace if a second parameter is passed. It`
			`## is sufficient to use a positive number: titlecase(string,1)`
			`##`
			`## This function tries to implement the "Title Case" constructs specified in`
			`## the APA Style Manual and the Chicago Manual of Style. Instead of merely`
			`## capitalizing the first letter of each word and setting everything else in`
			`## lowercase, this function implements the following conditions:`
			`##`
			`## - Conjunctions, articles, and prepositions are set lowercase, UNLESS they`
			`## are the first word of the string or the first word after a colon, a`
			`## question mark, or an exclamation point.`
			`## - Compass points (NE, SW, etc.) are set in solid caps.`
			`## - Roman numerals (II, IV, VII, IX, etc.) are set in solid caps.`
			`## - Certain abbreviations are always capitalized (AIDS, ASCII, NT, USA, etc.)`
			`## - Names beginning with D' or O' are set as D'Arcy, O'Reilly, etc.`
			`## - Hyphenated strings receive internal caps (Smith-Williams, Twenty-Two)`
			`## - Contractions such as I'll, You've, Don't, etc. are handled properly`
			`## - Degrees such as Ph.D., M.Div., etc. are properly capitalized`
			`##`
			`## Sample Usage with GNU awk (gawk):`
			`##`
			`## gawk -f titlecase.awk infile`

			`## TODO: maybe it would be a good idea to implement a preprocessor that would`
			`## search and replace special strings like AC-DC.`

			`## Tests:`

			`## all lowercase words`
			`## ALL UPPERCASE WORDS`
			`## aLl cRaZY cASE WordS`
			`## And with constants in an INTO cd Contre. Feat and Feat. the machine.`
			`## Bad ,punctuation. here , should ! not be ?a problem.`
			`## Roman numerals XIV LIV xiv liv. liv. xiv.`
			`## Dashed--machine--ac-dc.`
			`## About mcdonald and o'reilly, but i'll won't say.`
			`## The "final quote" 'on the waterfront'.`
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`BEGIN {`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## English`
			`constants = constants "a an the and but for nor or so am is are against at between by from in into of on to upon "`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`## French`
tc-audio-transcode: command-line parameters do not get titelcased anymore. tc-audio-transcode: overwrite option. tc-audio-transcode: capital-case option. tc-audio-transcode: clearer variable name, colored warnings. tc-audio-batch: color support + OGG support. titlecase.awk: 'capital' option support for first letter only (most non-english titles are like this). 2013-05-02 12:34:17 +02:00			`constants = constants "un une de du le la les et mais pour ni ou à a où contre entre chez dans sur que qui "`
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## German`
			`constants = constants "der die das den dem des ein eine einen eines einer von wo an am in für gegen bei aus mit nach seit zu durch ohne um "`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## Music`
			`constants = constants "feat CD DJ "`
			`constants = constants "KlassX Machine d'Acide BYOB MGMT AC DC JBX RZA DMX "`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## Others`
			`constants = constants "AIDS ASCII DHTML DNA DVD FBI GNU GPL IBM IRS ISBN ISSN PHP ROM SSN TV FM "`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## Build array of constant words.`
			`split(constants, constarray, " ")`
			`}`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`function titlecase(string) {`
			`## Initialize variables.`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`a = ""; # a is/will be the string ALREADY converted`
			`b = string; # b is the rest of the string, so that (string = a b)`

Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## English punctuation. It is quite hard to guess the language, so French`
			`## will follow English punctuation rules.`
			`b = gensub(/ +([,!:;?.]+) */, "\\1 ", "g", b)`
tc-audio: FLAC support. Metadata cleasing. titlecase.awk: punctuation support. 2013-04-21 11:24:27 +02:00
tc-audio-transcode: fixes. 2013-04-21 15:16:49 +02:00			`## Compress spaces or tabs. Trim prefix and suffix space. Convert`
			`## underscores to spaces.`
			`gsub(/[_ \t]+/, " ", b)`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`gsub(/^ /, "", b)`
			`gsub(/ $/, "", b)`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## Capitalize everything for ease of matching.`
			`b = toupper(b)`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`do {`
			`## Initialize for later use.`
			`hit = 0;`
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## 'pos' is the position of the NEXT punctuation mark (except`
			`## apostrophe) after the current word. If this is the last word in b,`
			`## pos will be 0. match() automatically sets RLENGTH. WARNING: we`
			`## consider digits as part of a word.`
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`pos = match(b, /[^[:alnum:]']+/)`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
			`if (pos > 0) word = substr(b, 1, pos + RLENGTH - 1)`
			`else word = b`

Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## 1st char of current word.`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`head = substr(b, 1, 1)`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## Tail of current word.`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`if (pos > 0) tail = substr(b, 2, pos + RLENGTH - 2)`
			`else tail = substr(b, 2)`

Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## Shorten the rest of the string.`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`b = substr(b, pos + RLENGTH )`

Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## RULE 1 -- Constant strings.`
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## WARNING: since we match a substring of 'word', we need to prepend and`
			`## append the potentially discarded values, like dashes.`
			`for (var in constarray) {`
			`if (debug)`
			`print ":: Comparing " word " with " constarray[var]`
			`hit = match(word, "^" toupper(constarray[var]) "\\>")`
			`if ( hit > 0 ) {`
			`word = substr(word, 1, RSTART-1) constarray[var] substr(word, RSTART+RLENGTH)`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`if (debug)`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`print ":: Match constant on [" constarray[var] "] in string [" word "]";`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`break;`
			`}`
			`}`

Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## RULE 2 -- Roman numerals`
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## Note: this match cannot distinguish between LIV (54 in Roman`
			`## numerals) and a personal name like "Liv Ullman". The Roman numerals`
			`## C (100), D (500), and M (1000) are omitted to avoid false matches on`
			`## words like civil, did, dim, lid, mid-, mild, Vic, etc. Most uses of`
			`## Roman numerals in titles stays in the lower ranges, such as "Vol. II"`
			`## or "Pt. XXIV".`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`if ( hit == 0 && match(word, /^[IVXL]+\>/) ) {`
			`hit = 1`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## But we can undo I'd, I'll, I'm, I've and Ill.`
			`if (match(word,/^I'\|ILL\>/))`
			`hit = 0`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`if (debug && hit == 1)`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`print ":: Match on Roman numerals in [" word "]"`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`}`

Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## RULE 3 -- Names like D'Arcy or O'Reilly`
tc-audio-transcode: command-line parameters do not get titelcased anymore. tc-audio-transcode: overwrite option. tc-audio-transcode: capital-case option. tc-audio-transcode: clearer variable name, colored warnings. tc-audio-batch: color support + OGG support. titlecase.awk: 'capital' option support for first letter only (most non-english titles are like this). 2013-05-02 12:34:17 +02:00			`if ( hit == 0 && capital != 1 && match(word, /^[DO]'[[:alpha:]]/) ) {`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`word = substr(word,1,3) tolower(substr(word,4))`
			`hit = 1`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`if (debug)`
			`print ":: Match on mixed case: " word`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`}`

Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## RULE 4 -- Names like MacNeil or McDonald`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`if ( hit == 0 && match(word,/^MA?C[B-DF-HJ-NP-TV-Z]/) ) {`
			`if (debug)`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`print ":: Match on MacX: " substr(word,1,1) "-" \`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`tolower(substr(word,2,RLENGTH-2)) "-" substr(word,RLENGTH,1) "-" \`
			`tolower(substr(word,RLENGTH+1))`
			`word = substr(word,1,1) tolower(substr(word,2,RLENGTH-2)) \`
			`substr(word,RLENGTH,1) tolower(substr(word,RLENGTH+1))`
			`hit = 1`
			`}`

Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## If one of the above rule is hit, we append the result to 'a',`
			`## otherwise we capitalize it.`
tc-audio-transcode: command-line parameters do not get titelcased anymore. tc-audio-transcode: overwrite option. tc-audio-transcode: capital-case option. tc-audio-transcode: clearer variable name, colored warnings. tc-audio-batch: color support + OGG support. titlecase.awk: 'capital' option support for first letter only (most non-english titles are like this). 2013-05-02 12:34:17 +02:00			`if (hit > 0 ) a = a word`
			`else if (capital == 1) a = a tolower(head) tolower(tail)`
			`else a = a toupper(head) tolower(tail)`
Shell: audio transcoder working. Needs some tweaks and folder support. 2013-01-27 00:48:05 +01:00
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`} while (pos > 0);`

tc-audio-transcode: track number is now safe, "Machine" does not become "MacHine", metadata entries beyond the first one are discarded. 2013-04-05 14:47:26 +02:00			`## Everything should be converted now.`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
tc-audio-transcode: track number is now safe, "Machine" does not become "MacHine", metadata entries beyond the first one are discarded. 2013-04-05 14:47:26 +02:00			`## Double exception 1: Set 1st word of string in capital case. Need to`
			`## handle potential internal single/double quotes like "A Day in the Life"`
			`## or 'On the Waterfront'. WARNING: here we consider digits as part of a`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## word (as in 1st, 2nd, etc.).`
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`match(a, /[[:alnum:]]/)`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`a = toupper(substr(a, 1, RSTART)) substr(a, RSTART+1)`

Minor fixes. 2013-05-26 20:51:52 +02:00			`## Double exception 2: Set 1st word after a some punctuation marks in title`
			`## case. This kludge handles multiple colons, question marks, etc. on the`
			`## line. \a is the BEL or CTRL-G character.`
tc-audio-trancode: {}()[] will precede cap in titlecase 2013-08-03 21:51:24 +02:00			`result = gensub(/([:{}\[\]?!"()-][^[:alnum:]]*)([a-zA-Z])/, "\\1\a\\2", "g", a)`
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`while (match(result, /\a/)) {`
			`beg = substr(result, 1, RSTART-1)`
			`cap = toupper(substr(result, RSTART+1, 1))`
			`end = substr(result, RSTART+2)`
			`result = beg cap end`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`}`

Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`return result`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`}`

Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`{print titlecase($0)}`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
Titlecase: big overhaul. 2013-04-21 12:50:26 +02:00			`## End of script`