ambevar-dotfiles/.scripts/titlecase.awk

195 lines
7.4 KiB
Awk
Executable File

#!/bin/gawk -f
## This script is inspired by
## http://www.pement.org/awk/titlecase.awk.txt
##
## function: titlecase("CHANGE TO TITLE CASE") --> "Change to Title Case"
##
## Other Features:
##
## titlecase() always compresses runs of spaces, tabs and underscores into a
## single space and trims leading and trailing spaces from the result.
##
## This function tries to implement the "Title Case" constructs specified in
## the APA Style Manual and the Chicago Manual of Style. Instead of merely
## capitalizing the first letter of each word and setting everything else in
## lowercase, this function implements the following conditions:
##
## - Conjunctions, articles, and prepositions are set lowercase, UNLESS they
## are the first word of the string or the first word after a colon, a
## question mark, or an exclamation point.
## - Compass points (NE, SW, etc.) are set in solid caps.
## - Roman numerals (II, IV, VII, IX, etc.) are set in solid caps.
## - Certain abbreviations are always capitalized (AIDS, ASCII, NT, USA, etc.)
## - Names beginning with D' or O' are set as D'Arcy, O'Reilly, etc.
## - Hyphenated strings receive internal caps (Smith-Williams, Twenty-Two)
## - Contractions such as I'll, You've, Don't, etc. are handled properly
## - Degrees such as Ph.D., M.Div., etc. are properly capitalized
##
## Sample Usage with GNU awk (gawk):
##
## gawk -f titlecase.awk infile
## TODO: maybe it would be a good idea to implement a preprocessor that would
## search and replace special strings like AC-DC.
## Tests:
## all lowercase words
## ALL UPPERCASE WORDS
## aLl cRaZY cASE WordS
## And with constants in an INTO cd Contre. Feat and Feat. the machine.
## Bad ,punctuation. here , should ! not be ?a problem.
## Roman numerals XIV LIV xiv liv. liv. xiv.
## Dashed--machine--ac-dc.
## About mcdonald and o'reilly, but i'll won't say.
## The "final quote" 'on the waterfront'.
## Build the table of "constant" words: words whose spelling in the output
## is fixed to exactly what is written here (lowercase for small words,
## uppercase or mixed case for acronyms and names). The list is appended to
## whatever 'constants' already holds when this block runs.
BEGIN {
    constants = constants \
        "a an the and but for nor or so am is are against at between by from in into of on to upon " \
        "un une de du le la les et mais pour ni ou à a où contre entre chez dans sur que qui " \
        "der die das den dem des ein eine einen eines einer von wo an am in für gegen bei aus mit nach seit zu durch ohne um " \
        "feat CD DJ " \
        "KlassX Machine d'Acide BYOB MGMT AC DC JBX RZA DMX " \
        "AIDS ASCII DHTML DNA DVD FBI GNU GPL IBM IRS ISBN ISSN PHP ROM SSN TV FM "
    ## Order of the groups above: English, French, German, music (two
    ## groups), others.

    ## One array element per constant word.
    split(constants, constarray, " ")
}
## titlecase(string) -- return 'string' converted to title case.
##
## Parameters:
##   string  the text to convert.
##   Everything after 'string' in the parameter list is a local variable
##   (standard awk idiom); callers pass only 'string'.
##
## Globals read (never written):
##   constarray  words with a fixed spelling, built in the BEGIN block.
##   debug       when true, trace messages are printed to standard output.
##   capital     not assigned anywhere in this file; when it equals 1,
##               Rule 3 is skipped and words that match no rule are set
##               entirely in lowercase.
##
## Returns the converted string. Runs of spaces, tabs and underscores are
## collapsed to a single space and leading/trailing spaces are removed.
function titlecase(string,    a, b, hit, pos, word, head, tail, var, numlen, result, beg, cap, rest) {
    ## 'a' is the part ALREADY converted, 'b' is what remains to be done.
    a = ""
    b = string

    ## English punctuation: drop spaces before a run of punctuation marks
    ## and leave exactly one space after it. It is quite hard to guess the
    ## language, so French follows the English rules too.
    b = gensub(/ +([,!:;?.]+) */, "\\1 ", "g", b)

    ## Compress spaces, tabs and underscores; trim both ends.
    gsub(/[_ \t]+/, " ", b)
    gsub(/^ /, "", b)
    gsub(/ $/, "", b)

    ## Capitalize everything for ease of matching.
    b = toupper(b)

    do {
        hit = 0

        ## 'pos' is the position of the NEXT run of punctuation (apostrophe
        ## excluded) after the current word, or 0 for the last word.
        ## match() also sets RLENGTH. Digits count as part of a word.
        pos = match(b, /[^[:alnum:]']+/)
        head = substr(b, 1, 1)
        if (pos > 0) {
            ## The word carries its trailing punctuation with it.
            word = substr(b, 1, pos + RLENGTH - 1)
            tail = substr(b, 2, pos + RLENGTH - 2)
        } else {
            word = b
            tail = substr(b, 2)
        }

        ## Shorten the rest of the string. Must happen before any other
        ## match() call overwrites RLENGTH.
        b = substr(b, pos + RLENGTH)

        ## RULE 1 -- Constant strings.
        ## Only the leading part of 'word' is matched, so whatever follows
        ## it (dashes, punctuation, ...) is re-appended untouched.
        for (var in constarray) {
            if (debug)
                print ":: Comparing " word " with " constarray[var]
            hit = match(word, "^" toupper(constarray[var]) "\\>")
            if (hit > 0) {
                word = substr(word, 1, RSTART-1) constarray[var] substr(word, RSTART+RLENGTH)
                if (debug)
                    print ":: Match constant on [" constarray[var] "] in string [" word "]";
                break
            }
        }

        ## RULE 2 -- Roman numerals.
        ## This cannot tell LIV (54) from a name like "Liv Ullman". C, D
        ## and M are left out to avoid false matches on words like civil,
        ## did, dim, lid, mid-, mild, Vic, etc.
        if (hit == 0 && match(word, /^[IVXL]+\>/)) {
            ## Remember the numeral's length: the next match() resets RLENGTH.
            numlen = RLENGTH
            ## I'd, I'll, I'm, I've and Ill are not numerals.
            if (!match(word, /^I'|ILL\>/)) {
                hit = 1
                ## Only the numeral stays in capitals. Anything after it
                ## (an apostrophe suffix as in XIV's or L'amour, trailing
                ## punctuation) is lowercased instead of being left in
                ## solid caps.
                word = substr(word, 1, numlen) tolower(substr(word, numlen + 1))
                if (debug)
                    print ":: Match on Roman numerals in [" word "]"
            }
        }

        ## RULE 3 -- Names like D'Arcy or O'Reilly.
        if (hit == 0 && capital != 1 && match(word, /^[DO]'[[:alpha:]]/)) {
            word = substr(word,1,3) tolower(substr(word,4))
            hit = 1
            if (debug)
                print ":: Match on mixed case: " word
        }

        ## RULE 4 -- Names like MacNeil or McDonald: keep the M and the
        ## consonant after "Mc"/"Mac" in capitals, lowercase the rest.
        if (hit == 0 && match(word,/^MA?C[B-DF-HJ-NP-TV-Z]/)) {
            if (debug)
                print ":: Match on MacX: " substr(word,1,1) "-" \
                    tolower(substr(word,2,RLENGTH-2)) "-" substr(word,RLENGTH,1) "-" \
                    tolower(substr(word,RLENGTH+1))
            word = substr(word,1,1) tolower(substr(word,2,RLENGTH-2)) \
                substr(word,RLENGTH,1) tolower(substr(word,RLENGTH+1))
            hit = 1
        }

        ## If one of the rules above was hit, append its result to 'a';
        ## otherwise apply the default capitalization.
        if (hit > 0)
            a = a word
        else if (capital == 1)
            a = a tolower(head) tolower(tail)
        else
            a = a toupper(head) tolower(tail)
    } while (pos > 0)

    ## Everything is converted now, except for two special cases.

    ## Exception 1: the first word of the string is always capitalized, even
    ## behind leading quotes as in "A Day in the Life" or 'On the
    ## Waterfront'. Digits count as part of a word (1st, 2nd, ...). If there
    ## is no alphanumeric character, RSTART is 0 and 'a' is left unchanged.
    match(a, /[[:alnum:]]/)
    a = toupper(substr(a, 1, RSTART)) substr(a, RSTART+1)

    ## Exception 2: the first letter after a colon, question mark,
    ## exclamation point, double quote or hyphen is capitalized. Each such
    ## letter is first tagged with \a (BEL, CTRL-G); the loop then removes
    ## the tags one by one, uppercasing the character that follows.
    result = gensub(/([:?!"-][^[:alnum:]]*)([a-zA-Z])/, "\\1\a\\2", "g", a)
    while (match(result, /\a/)) {
        beg = substr(result, 1, RSTART-1)
        cap = toupper(substr(result, RSTART+1, 1))
        rest = substr(result, RSTART+2)
        result = beg cap rest
    }
    return result
}
## Main rule: print the title-cased form of every input record.
{
    print titlecase($0)
}
## End of script