ambevar-dotfiles/.scripts/titlecase.awk

#!/bin/gawk -f
## Original file can be found at
##   http://www.pement.org/awk/titlecase.awk.txt

#
# function: titlecase("CHANGE TO TITLE CASE") --> "Change to Title Case"
#
# Other Features:
#   titlecase() will compress whitespace if a second parameter is passed.
#   It is sufficient to use a positive number: titlecase(string,1)
#
#   This function tries to implement the "Title Case" constructs specified
#   in the APA Style Manual and the Chicago Manual of Style. Instead of
#   merely capitalizing the first letter of each word and setting
#   everything else in lowercase, this function implements the following
#   conditions:
#
#  - Conjunctions, articles, and prepositions are set lowercase, UNLESS they
#    are the first word of the string or the first word after a colon, a
#    question mark, or an exclamation point.
#  - Compass points (NE, SW, etc.) are set in solid caps only when they are
#    added to the solid-caps list in the BEGIN block (none are listed there).
#  - Roman numerals (II, IV, VII, IX, etc.) are set in solid caps.
#  - Certain abbreviations are always capitalized (AIDS, ASCII, DNA, FBI, etc.)
#  - Names beginning with D' or O' are set as D'Arcy, O'Reilly, etc.
#  - Hyphenated strings receive internal caps (Smith-Williams, Twenty-Two)
#  - Contractions such as I'll, You've, Don't, etc. are handled properly
#  - Degrees such as Ph.D., M.Div., etc. are properly capitalized
#
# Sample Usage with GNU awk (gawk):
#
#   gawk -f titlecase.awk infile
#   gawk -v debug=1 -f titlecase.awk infile    # also print DIAG trace lines

## TODO: merge constants (MC, UC, LC) into one array. Use only one loop for matching.
## TODO: get constants from external file. Support: languages, themes (music), etc.
## TODO: rethink algorithm so that it does not need to turn everything to uppercase.
## TODO: rethink algorithm so that it does not include punctuation in 'word'.

BEGIN {

    #-----WORDS TO BE SET IN SOLID CAPS-----
    # Abbreviations that must stay fully uppercase; extend the list as needed.
    # Entries are space-separated and already written in uppercase.
    other = "AIDS ASCII CD DHTML DNA DVD FBI GNU GPL IBM IRS ISBN ISSN " \
            "PHP ROM SSN TV FM BYOB MGMT DJ AC-DC JBX RZA DMX "
    split(other, keep_upper, " ")

    #-----WORDS TO BE SET IN LOWERCASE-----
    articles     = "a an the "
    conjunctions = "and but for nor or so "
    verbs        = "am is are "
    abbrevs      = "feat "

    # Prepositions.
    # Deliberately omitted: over (=finished), under, through, before, after
    preps = "against at between by from in into of on to upon "
    # French prepositions are appended to the same list.
    preps = preps "du "

    # Every list ends with a space, so plain concatenation yields one
    # space-separated string that split() turns into the lookup array.
    split(articles conjunctions preps verbs abbrevs, keep_lower, " ")

    #-----WORDS WITH A FIXED MIXED-CASE SPELLING-----
    # Each entry is written exactly as it must appear in the output.
    mixed = "KlassX Machine d'Acide "
    split(mixed, keep_mixed, " ")
}

# titlecase(string, x) -- return `string` converted to title case.
#
# Parameters:
#   string  text to convert
#   x       optional; when true, every run of spaces/tabs is squeezed to one
#           space and a leading/trailing space is removed before conversion
#
# The names listed after `x` are awk-style local variables, not arguments;
# callers pass at most two values.
#
# Globals read: keep_mixed, keep_upper, keep_lower (word lists) and debug
# (when true, DIAG/trace lines are printed to standard output).
#
# Method: the input is uppercased, then consumed one "word" at a time, where
# a word is a run of alphanumerics/apostrophes plus the punctuation that
# follows it. Each word is checked against the special cases below in order;
# the first case that matches decides its spelling, otherwise the word is
# capitalized normally. Two final passes capitalize the first letter of the
# string and the first letter after a colon, question mark, exclamation
# point or double quote.
function titlecase(string, x,    a, b, pos, word, head, tail, hit, i, rest, done)
{
    a = ""             # part of the string that is ALREADY converted
    b = string         # part still to be processed, so that (string = a b)

    # Compress spaces or tabs if 2nd argument passed. Trim prefix and suffix space.
    if (x) {
        gsub(/[ \t]+/, " ", b)
        sub(/^ /, "", b)
        sub(/ $/, "", b)
        if (debug) print "DIAG: Compress argument passed to function call"
    }

    b = toupper(b)     # Capitalize everything for ease of matching

    do {
        hit = 0        # becomes non-zero once a special case has claimed the word

        if (debug) {
            print "1a=" a
            print "1b=" b
            print "1word=" word
        }

        # pos is the position of the NEXT punctuation run (apostrophe excluded)
        # after the current word; 0 means this is the last word in b.
        # match() also sets RLENGTH to the length of that run.
        ## WARNING: we consider digits as part of a word.
        pos = match(b, /[^[:alnum:]']+/)

        head = substr(b, 1, 1)          # 1st char of current word
        if (pos > 0) {
            # word and tail both include the trailing punctuation run
            word = substr(b, 1, pos + RLENGTH - 1)
            tail = substr(b, 2, pos + RLENGTH - 2)
            b    = substr(b, pos + RLENGTH)     # shorten the rest of the string
        } else {
            word = b
            tail = substr(b, 2)
        }

        if (debug) {
            print "2a=" a
            print "2b=" b
            print "2word=" word
        }

        #----Words to keep mixedcase----
        # Only the leading part of 'word' is matched, so whatever follows the
        # match (trailing punctuation) is appended back unchanged.
        for (i in keep_mixed) {
            if (match(word, "^" toupper(keep_mixed[i]) "\\>")) {
                hit = 1
                word = substr(word, 1, RSTART-1) keep_mixed[i] substr(word, RSTART+RLENGTH)
                if (debug)
                    print "DIAG: Match MC on [" keep_mixed[i] "] in string [" word "]";
                break
            }
        }

        if (debug) {
            print "3a=" a
            print "3b=" b
            print "3word=" word
        }

        #----Words to keep uppercase----
        # Case 1: abbreviations from the keep_upper array.
        # Guarded by hit so that a word already claimed by keep_mixed is not
        # re-examined (and possibly re-cased) by the cases below.
        if (hit == 0) {
            for (i in keep_upper) {
                if (match(word, "^" keep_upper[i] "\\>")) {
                    hit = 1
                    if (debug)
                        print "DIAG: Match UC on [" keep_upper[i] "] in string [" word "]";
                    break
                }
            }
        }

        # Case 2: Roman numerals
        # Note: this match cannot distinguish between LIV (54 in Roman numerals)
        # and a personal name like "Liv Ullman".  The Roman numerals C (100),
        # D (500), and M (1000) are omitted to avoid false matches on words like
        # civil, did, dim, lid, mid-, mild, Vic, etc. Most uses of Roman numerals
        # in titles stay in the lower ranges, such as "Vol. II" or "Pt. XXIV".
        if ( hit == 0 && match(word, /^[IVXL]+\>/) ) {
            hit = 1
            # But we can undo I'd, I'll, I'm, I've and Ill.
            if (match(word,/^I'|ILL\>/)) hit = 0
            if (debug && hit == 1)
                print "DIAG: Match on Roman numerals in [" word "]"
        }

        #----Words to be set in MiXed case----
        # Case 3: Names like D'Arcy or O'Reilly
        if ( hit == 0 && match(word, /^[DO]'[[:alpha:]]/) ) {
            if (debug) print "DIAG: Match on mixed case: " word
            word = substr(word,1,3) tolower(substr(word,4))
            hit = 1
        }

        # Case 4: Names like MacNeil or McDonald
        # RLENGTH (from the match in the condition) is the index of the
        # consonant that follows "MC"/"MAC"; that letter stays uppercase.
        if ( hit == 0 && match(word,/^MA?C[B-DF-HJ-NP-TV-Z]/) ) {
            if (debug)
                print  "DIAG: Match on MacX: " substr(word,1,1) "-" \
                    tolower(substr(word,2,RLENGTH-2)) "-" substr(word,RLENGTH,1) "-" \
                    tolower(substr(word,RLENGTH+1))
            word = substr(word,1,1)       tolower(substr(word,2,RLENGTH-2)) \
                substr(word,RLENGTH,1) tolower(substr(word,RLENGTH+1))
            hit = 1
        }

        #----Words to set in lowercase----
        # Case 5: articles, conjunctions, prepositions from the keep_lower array
        if (hit == 0) {
            for (i in keep_lower) {
                hit = sub("^" toupper(keep_lower[i]) "\\>", keep_lower[i], word);
                if ( hit > 0 ) {
                    if (debug)
                        print "DIAG: Match LC on [" keep_lower[i] "] in string [" word "]";
                    break
                }
            }
        }

        #----Default: Capitalize everything else normally----
        if (hit > 0)    a = a word
        else            a = a toupper(head) tolower(tail)

    } while (pos > 0);

    ## Everything should be converted now.

    ## Double exception 1: Set 1st word of string in capital case. Need to
    ## handle potential leading single/double quotes like "A Day in the Life"
    ## or 'On the Waterfront'. WARNING: here we consider digits as part of a
    ## word (as in 1st, 2nd, etc.)
    if (match(a, /[[:alnum:]]/))
        a = toupper(substr(a, 1, RSTART)) substr(a, RSTART + 1)

    ## Double exception 2: Set 1st letter after a colon, question mark or
    ## exclamation point in capital case, for every occurrence on the line.
    ## WARNING: we also follow double quotes by a capital.
    ## Each match ends on the letter to capitalize; [[:alpha:]] is used so
    ## that, like the word splitting above, non-ASCII letters count as letters.
    rest = a
    done = ""
    while (match(rest, /[:?!"][^[:alpha:]]*[[:alpha:]]/)) {
        done = done substr(rest, 1, RSTART + RLENGTH - 2) \
               toupper(substr(rest, RSTART + RLENGTH - 1, 1))
        rest = substr(rest, RSTART + RLENGTH)
    }
    done = done rest

    return done
}


# Main rule: print every input record in title case, with whitespace compressed.
{
    print titlecase($0, 1)
}

#---end of awk script---
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`#!/bin/gawk -f`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`## Original file can be found at`
			`## http://www.pement.org/awk/titlecase.awk.txt`

			`#`
			`# function: titlecase("CHANGE TO TITLE CASE") --> "Change to Title Case"`
			`#`
			`# Other Features:`
			`# titlecase() will compress whitespace if a second parameter is passed.`
			`# It is sufficient to use a positive number: titlecase(string,1)`
			`#`
			`# This function tries to implement the "Title Case" constructs specified`
			`# in the APA Style Manual and the Chicago Manual of Style. Instead of`
			`# merely capitalizing the first letter of each word and setting`
			`# everything else in lowercase, this function implements the following`
			`# conditions:`
			`#`
			`# - Conjunctions, articles, and prepositions are set lowercase, UNLESS they`
			`# are the first word of the string or the first word after a colon, a`
			`# question mark, or an exclamation point.`
			`# - Compass points (NE, SW, etc.) are set in solid caps.`
			`# - Roman numerals (II, IV, VII, IX, etc.) are set in solid caps.`
			`# - Certain abbreviations are always capitalized (AIDS, ASCII, NT, USA, etc.)`
			`# - Names beginning with D' or O' are set as D'Arcy, O'Reilly, etc.`
			`# - Hyphenated strings receive internal caps (Smith-Williams, Twenty-Two)`
			`# - Contractions such as I'll, You've, Don't, etc. are handled properly`
			`# - Degrees such as Ph.D., M.Div., etc. are properly capitalized`
			`#`
			`# Sample Usage with GNU awk (gawk):`
			`#`
			`# awk -f titlecase.awk infile`

tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`## TODO: merge constants (MC, UC, LC) into one array. Use only one loop for matching.`
			`## TODO: get constants from external file. Support: languages, themes (music), etc.`
			`## TODO: rethink algorithm so that it does not need to turn everything to uppercase.`
			`## TODO: rethink algorithm so that it does not include punctuation in 'word'.`

Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`BEGIN {`

			`#-----ABBREVIATIONS TO BE SET IN MIXEDCASE-----`
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`mixed = "KlassX Machine d'Acide "`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`split(mixed, keep_mixed, " ")`

			`#-----ABBREVIATIONS TO BE SET IN LOWERCASE-----`
			`articles = "a an the "`
			`conjunctions = "and but for nor or so "`
Shell: audio transcoder working. Needs some tweaks and folder support. 2013-01-27 00:48:05 +01:00			`verbs = "am is are "`
			`abbrevs = "feat "`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
			`# Prepositions`
			`# Omitted: over (=finished), under, through, before, after`
Shell: audio transcoder working. Needs some tweaks and folder support. 2013-01-27 00:48:05 +01:00			`preps = "against at between by from in into of on to upon "`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`## French`
			`preps = preps "du "`

Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`# Build array of words to be set lowercased`
Shell: audio transcoder working. Needs some tweaks and folder support. 2013-01-27 00:48:05 +01:00			`split(articles conjunctions preps verbs abbrevs, keep_lower, " ")`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
			`#-----ABBREVIATIONS TO BE SET IN SOLID CAPS-----`
			`# Other abbreviations - add to this list as needed`
			`other = "AIDS ASCII CD DHTML DNA DVD FBI GNU GPL IBM IRS ISBN ISSN "`
Shell: audio tc fixes. 2013-01-27 21:48:05 +01:00			`other = other "PHP ROM SSN TV FM BYOB MGMT DJ AC-DC JBX RZA DMX "`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
			`# build array of words to keep uppercase`
			`split(other, keep_upper, " ")`
			`}`

			`function titlecase(string,x) {`

			`# Initialize variables`
			`a = ""; # a is/will be the string ALREADY converted`
			`b = string; # b is the rest of the string, so that (string = a b)`
			`compress = x; # optional compression argument`

Shell: audio transcoder working. Needs some tweaks and folder support. 2013-01-27 00:48:05 +01:00			`# Compress spaces or tabs if 2nd argument passed. Trim prefix and suffix space.`
			`if (compress) {`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`gsub(/[ \t]+/, " ", b)`
Shell: audio transcoder working. Needs some tweaks and folder support. 2013-01-27 00:48:05 +01:00			`gsub(/^ /, "", b)`
			`gsub(/ $/, "", b)`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`if (debug) print "DIAG: Compress argument passed to function call"`
			`}`

			`b = toupper(b) # Capitalize everything for ease of matching`

			`do {`
			`hit = 0; # Initialize for later use`

tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`if(debug)`
			`{`
			`print "1a=" a`
			`print "1b=" b`
			`print "1word=" word`
			`}`

Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`# pos is the position of the NEXT punctuation mark (except apostrophe)`
			`# after the current word. If this is the last word in b, pos will be 0.`
			`# match() automatically sets RLENGTH`
Shell: Titlecase fixes ("), special char fixes (/). 2013-01-27 23:10:46 +01:00			`## WARNING: we consider digits as part of a word.`
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`pos = match(b, /[^[:alnum:]']+/)`
Shell: audio transcoder working. Needs some tweaks and folder support. 2013-01-27 00:48:05 +01:00			`# pos = match(b, /[^A-Z']+/)`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
			`if (pos > 0) word = substr(b, 1, pos + RLENGTH - 1)`
			`else word = b`

			`# 1st char of current word`
			`head = substr(b, 1, 1)`
			`# tail of current word`
			`if (pos > 0) tail = substr(b, 2, pos + RLENGTH - 2)`
			`else tail = substr(b, 2)`

			`# shorten the rest of the string`
			`b = substr(b, pos + RLENGTH )`

tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`if(debug)`
			`{`
			`print "2a=" a`
			`print "2b=" b`
			`print "2word=" word`
			`}`

			`#----Words to keep mixedcase---- WARNING: since we match a substring of`
			`## 'word', we need to prepend and append the potentially discarded`
			`## values.`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`for (var in keep_mixed) {`
			`mix = match(word, "^" toupper(keep_mixed[var]) "\\>")`
			`if ( mix > 0 ) {`
			`hit = 1`
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`word = substr(word, 1, RSTART-1) keep_mixed[var] substr(word, RSTART+RLENGTH)`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`if (debug)`
			`print "DIAG: Match MC on [" keep_mixed[var] "] in string [" word "]";`
			`break;`
			`}`
			`}`

tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`if(debug)`
			`{`
			`print "3a=" a`
			`print "3b=" b`
			`print "3word=" word`
			`}`

Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`#----Words to keep uppercase----`
			`# Case 1: abbreviations from the keep_upper array.`
			`if ( proect == 0) {`
			`for (var in keep_upper) {`
			`hit = match(word, "^" keep_upper[var] "\\>")`
			`if ( hit > 0 ) {`
			`if (debug)`
			`print "DIAG: Match UC on [" keep_upper[var] "] in string [" word "]";`
			`break;`
			`}`
			`}`
			`}`

			`# Case 2: Roman numerals`
			`# Note: this match cannot distinguish between LIV (54 in Roman numerals)`
			`# and a personal name like "Liv Ullman". The Roman numerals C (100),`
			`# D (500), and M (1000) are omitted to avoid false matches on words like`
			`# civil, did, dim, lid, mid-, mild, Vic, etc. Most uses of Roman numerals`
			`# in titles stays in the lower ranges, such as "Vol. II" or "Pt. XXIV".`
			`if ( hit == 0 && match(word, /^[IVXL]+\>/) ) {`
			`hit = 1`
			`# But we can undo I'd, I'll, I'm, I've and Ill.`
			`if (match(word,/^I'\|ILL\>/)) hit = 0`
			`if (debug && hit == 1)`
			`print "DIAG: Match on Roman numerals in [" word "]"`
			`}`

			`#----Words to be set in MiXed case----`
			`# Case 3: Names like D'Arcy or O'Reilly`
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`if ( hit == 0 && match(word, /^[DO]'[[:alpha:]]/) ) {`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`if (debug) print "DIAG: Match on mixed case: " word`
			`word = substr(word,1,3) tolower(substr(word,4))`
			`hit = 1`
			`}`

			`# Case 4: Names like MacNeil or McDonald`
			`if ( hit == 0 && match(word,/^MA?C[B-DF-HJ-NP-TV-Z]/) ) {`
			`if (debug)`
			`print "DIAG: Match on MacX: " substr(word,1,1) "-" \`
			`tolower(substr(word,2,RLENGTH-2)) "-" substr(word,RLENGTH,1) "-" \`
			`tolower(substr(word,RLENGTH+1))`
			`word = substr(word,1,1) tolower(substr(word,2,RLENGTH-2)) \`
			`substr(word,RLENGTH,1) tolower(substr(word,RLENGTH+1))`
			`hit = 1`
			`}`

			`#----Words to set in lowercase----`
			`# Case 5: articles, conjunctions, prepositions from the keep_lower array`
			`if (hit == 0) {`
			`for (var2 in keep_lower) {`
			`hit = sub("^" toupper(keep_lower[var2]) "\\>", keep_lower[var2], word);`
			`if ( hit > 0 ) {`
			`if (debug)`
			`print "DIAG: Match LC on [" keep_lower[var2] "] in string [" word "]";`
			`break;`
			`}`
			`}`
			`}`

			`#----Default: Capitalize everything else normally----`
			`if (mix > 0) a = a word`
			`else if (hit > 0) a = a word`
			`else a = a toupper(head) tolower(tail)`

Shell: audio transcoder working. Needs some tweaks and folder support. 2013-01-27 00:48:05 +01:00
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`} while (pos > 0);`

tc-audio-transcode: track number is now safe, "Machine" does not become "MacHine", metadata entries beyond the first one are discarded. 2013-04-05 14:47:26 +02:00			`## Everything should be converted now.`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
tc-audio-transcode: track number is now safe, "Machine" does not become "MacHine", metadata entries beyond the first one are discarded. 2013-04-05 14:47:26 +02:00			`## Double exception 1: Set 1st word of string in capital case. Need to`
			`## handle potential internal single/double quotes like "A Day in the Life"`
			`## or 'On the Waterfront'. WARNING: here we consider digits as part of a`
tc-audio-transcode: covers are fetched from current folder only. tc-audio-transcode: genre can now be forced. titlecase: unicode support. titlecase: fixed bug for mixed cased where following punctuation was swallowed. 2013-04-07 12:33:43 +02:00			`## word (as in 1st, 2nd, etc.)`
			`match(a, /[[:alnum:]]/)`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00			`a = toupper(substr(a,1,RSTART)) substr(a,RSTART+1)`

tc-audio-transcode: track number is now safe, "Machine" does not become "MacHine", metadata entries beyond the first one are discarded. 2013-04-05 14:47:26 +02:00			`## Double exception 2: Set 1st word after a colon, question mark or`
			`## exclamation point in title case. This kludge handles multiple colons,`
			`## question marks, etc. on the line. \a is the BEL or CTRL-G character.`
Shell: Titlecase fixes ("), special char fixes (/). 2013-01-27 23:10:46 +01:00			`## WARNING: we also follow double quotes by a capital.`
			`done = gensub(/([:?!"][^a-zA-Z]*)([a-zA-Z])/,"\\1\a\\2", "g", a)`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
			`while (match(done,/\a/)) {`
			`beg = substr(done,1,RSTART-1)`
			`cap = toupper(substr(done,RSTART+1,1))`
			`end = substr(done,RSTART+2)`
			`done = beg cap end`
			`}`

			`return done`
			`}`


Shell: audio transcoder working. Needs some tweaks and folder support. 2013-01-27 00:48:05 +01:00			`{print titlecase($0,1)}`
Shell: begin of audio transcoding. 2013-01-26 19:08:29 +01:00
			`#---end of awk script---`