2013-10-30 15:57:24 +01:00
|
|
|
#!/usr/bin/gawk -f
|
2013-04-21 12:50:26 +02:00
|
|
|
## This script is inspired by
|
2013-01-26 19:08:29 +01:00
|
|
|
## http://www.pement.org/awk/titlecase.awk.txt
|
2013-04-21 12:50:26 +02:00
|
|
|
##
|
|
|
|
## function: titlecase("CHANGE TO TITLE CASE") --> "Change to Title Case"
|
|
|
|
##
|
2013-08-03 21:51:24 +02:00
|
|
|
## Features:
|
2013-04-21 12:50:26 +02:00
|
|
|
##
|
|
|
|
## titlecase() takes the string as its only argument. It always compresses
|
|
|
|
## runs of spaces, tabs and underscores into one space and trims both ends.
|
|
|
|
##
|
|
|
|
## This function tries to implement the "Title Case" constructs specified in
|
|
|
|
## the APA Style Manual and the Chicago Manual of Style. Instead of merely
|
|
|
|
## capitalizing the first letter of each word and setting everything else in
|
|
|
|
## lowercase, this function implements the following conditions:
|
|
|
|
##
|
|
|
|
## - Conjunctions, articles, and prepositions are set lowercase, UNLESS they
|
|
|
|
## are the first word of the string or the first word after a colon, a
|
|
|
|
## question mark, or an exclamation point.
|
|
|
|
## - Compass points (NE, SW, etc.) are set in solid caps.
|
|
|
|
## - Roman numerals (II, IV, VII, IX, etc.) are set in solid caps.
|
|
|
|
## - Certain abbreviations are always capitalized (AIDS, ASCII, NT, USA, etc.)
|
|
|
|
## - Names beginning with D' or O' are set as D'Arcy, O'Reilly, etc.
|
|
|
|
## - Hyphenated strings receive internal caps (Smith-Williams, Twenty-Two)
|
|
|
|
## - Contractions such as I'll, You've, Don't, etc. are handled properly
|
|
|
|
## - Degrees such as Ph.D., M.Div., etc. are properly capitalized
|
|
|
|
##
|
|
|
|
## Sample Usage with GNU awk (gawk):
|
|
|
|
##
|
|
|
|
## gawk -f titlecase.awk infile
|
|
|
|
|
|
|
|
## TODO: maybe it would be a good idea to implement a preprocessor that would
|
|
|
|
## search and replace special strings like AC-DC.
|
|
|
|
|
|
|
|
## Tests:
|
|
|
|
|
|
|
|
## all lowercase words
|
|
|
|
## ALL UPPERCASE WORDS
|
|
|
|
## aLl cRaZY cASE WordS
|
|
|
|
## And with constants in an INTO cd Contre. Feat and Feat. the machine.
|
|
|
|
## Bad ,punctuation. here , should ! not be ?a problem.
|
|
|
|
## Roman numerals XIV LIV xiv liv. liv. xiv.
|
|
|
|
## Dashed--machine--ac-dc.
|
|
|
|
## About mcdonald and o'reilly, but i'll won't say.
|
|
|
|
## The "final quote" 'on the waterfront'.
|
2013-04-07 12:33:43 +02:00
|
|
|
|
2013-01-26 19:08:29 +01:00
|
|
|
## Build the table of "constant" words: words that titlecase() emits exactly as
## they are spelled here (lowercase entries stay lowercase, acronyms and mixed
## case names keep their case).
##
## The list is appended to whatever 'constants' already holds when BEGIN runs,
## so a value assigned beforehand (presumably with "gawk -v constants=...")
## is kept in front of the built-in entries. Every group ends with a space so
## that the groups can simply be concatenated. In order, the groups are:
##
##   1. English articles, conjunctions, prepositions and a few verbs
##   2. French
##   3. German
##   4. Music (generic terms, then artist/band names)
##   5. Other abbreviations
BEGIN {
    constants = constants \
        "a an the and but for nor or so am is are against at between by from in into of on to upon " \
        "un une de du le la les et mais pour ni ou à a où contre entre chez dans sur que qui " \
        "der die das den dem des ein eine einen eines einer von wo an am in für gegen bei aus mit nach seit zu durch ohne um " \
        "feat CD DJ " \
        "KlassX Machine d'Acide BYOB MGMT AC DC JBX RZA DMX " \
        "AIDS ASCII DHTML DNA DVD FBI GNU GPL IBM IRS ISBN ISSN PHP ROM SSN TV FM "

    ## One array element per constant word.
    split(constants, constarray, " ")
}
|
2013-01-26 19:08:29 +01:00
|
|
|
|
2013-04-21 12:50:26 +02:00
|
|
|
## titlecase(string) -- return 'string' converted to title case.
##
## Parameters:
##   string   the text to convert.
##   The names after the extra spaces in the parameter list are local
##   variables (the usual awk idiom); callers must not pass them.
##
## Globals read (never assigned here):
##   constarray  words to emit with a fixed spelling, built in BEGIN.
##   debug       when true, trace messages are printed on standard output.
##   capital     when equal to 1, words not matched by any rule are set
##               entirely in lowercase instead of being capitalized, and
##               RULE 3 is skipped. It is not assigned anywhere in this file;
##               presumably it is supplied by the caller (e.g. "-v capital=1")
##               -- confirm.
##
## Returns the converted string. Runs of spaces, tabs and underscores are
## always reduced to a single space and both ends are trimmed.
function titlecase(string,    a, b, hit, pos, word, head, tail, var, result, beg, cap, end) {
    ## Initialize variables.
    a = ""        # a is/will be the part of the string ALREADY converted
    b = string    # b is the rest of the string, so that (string = a b)

    ## Compress spaces or tabs and convert underscores to spaces. This is
    ## done BEFORE the punctuation fix below so that a tab or an underscore
    ## in front of a punctuation mark is handled like a plain space.
    gsub(/[_ \t]+/, " ", b)

    ## English punctuation: no space before the mark, exactly one after it.
    ## It is quite hard to guess the language, so French will follow English
    ## punctuation rules.
    b = gensub(/ +([,!:;?.]+) */, "\\1 ", "g", b)

    ## Trim prefix and suffix space (at most one of each is left by now).
    sub(/^ /, "", b)
    sub(/ $/, "", b)

    ## Capitalize everything for ease of matching.
    b = toupper(b)

    do {
        ## Initialize for later use.
        hit = 0

        ## 'pos' is the position of the NEXT punctuation mark (except
        ## apostrophe) after the current word. If this is the last word in b,
        ## pos will be 0. match() automatically sets RLENGTH. WARNING: we
        ## consider digits as part of a word.
        pos = match(b, /[^[:alnum:]']+/)

        ## 1st char of current word.
        head = substr(b, 1, 1)

        if (pos > 0) {
            ## The word keeps its trailing run of punctuation.
            word = substr(b, 1, pos + RLENGTH - 1)
            tail = substr(b, 2, pos + RLENGTH - 2)
            ## Shorten the rest of the string.
            b = substr(b, pos + RLENGTH)
        } else {
            ## Last word: everything that is left.
            word = b
            tail = substr(b, 2)
            b = ""
        }

        ## RULE 1 -- Constant strings.
        ##
        ## WARNING: since we match a substring of 'word', we need to prepend
        ## and append the potentially discarded values, like dashes.
        for (var in constarray) {
            if (debug)
                print ":: Comparing " word " with " constarray[var]
            hit = match(word, "^" toupper(constarray[var]) "\\>")
            if (hit > 0) {
                word = substr(word, 1, RSTART - 1) constarray[var] substr(word, RSTART + RLENGTH)
                if (debug)
                    print ":: Match constant on [" constarray[var] "] in string [" word "]"
                break
            }
        }

        ## RULE 2 -- Roman numerals
        ##
        ## Note: this match cannot distinguish between LIV (54 in Roman
        ## numerals) and a personal name like "Liv Ullman". The Roman numerals
        ## C (100), D (500), and M (1000) are omitted to avoid false matches on
        ## words like civil, did, dim, lid, mid-, mild, Vic, etc. Most uses of
        ## Roman numerals in titles stay in the lower ranges, such as "Vol. II"
        ## or "Pt. XXIV".
        if (hit == 0 && match(word, /^[IVXL]+\>/)) {
            hit = 1
            ## But we can undo I'd, I'll, I'm, I've and Ill. Note that the
            ## alternation makes this match a leading "I'" OR an "ILL" that
            ## ends a word anywhere in 'word'.
            if (match(word, /^I'|ILL\>/))
                hit = 0
            if (debug && hit == 1)
                print ":: Match on Roman numerals in [" word "]"
        }

        ## RULE 3 -- Names like D'Arcy or O'Reilly
        if (hit == 0 && capital != 1 && match(word, /^[DO]'[[:alpha:]]/)) {
            word = substr(word, 1, 3) tolower(substr(word, 4))
            hit = 1
            if (debug)
                print ":: Match on mixed case: " word
        }

        ## RULE 4 -- Names like MacNeil or McDonald
        ## RLENGTH is the length of the "Mac"/"Mc" prefix plus the consonant
        ## that follows it; that consonant is the letter kept in uppercase.
        if (hit == 0 && match(word, /^MA?C[B-DF-HJ-NP-TV-Z]/)) {
            if (debug)
                print ":: Match on MacX: " substr(word, 1, 1) "-" \
                    tolower(substr(word, 2, RLENGTH - 2)) "-" substr(word, RLENGTH, 1) "-" \
                    tolower(substr(word, RLENGTH + 1))
            word = substr(word, 1, 1) tolower(substr(word, 2, RLENGTH - 2)) \
                substr(word, RLENGTH, 1) tolower(substr(word, RLENGTH + 1))
            hit = 1
        }

        ## If one of the above rules is hit, we append the result to 'a',
        ## otherwise we capitalize it (or lowercase it when capital == 1).
        if (hit > 0)
            a = a word
        else if (capital == 1)
            a = a tolower(head) tolower(tail)
        else
            a = a toupper(head) tolower(tail)
    } while (pos > 0)

    ## Everything should be converted now.

    ## Double exception 1: Set 1st word of string in capital case. Need to
    ## handle potential internal single/double quotes like "A Day in the Life"
    ## or 'On the Waterfront'. WARNING: here we consider digits as part of a
    ## word (as in 1st, 2nd, etc.). Nothing to do when the string holds no
    ## letter or digit at all.
    if (match(a, /[[:alnum:]]/))
        a = toupper(substr(a, 1, RSTART)) substr(a, RSTART + 1)

    ## Double exception 2: Set 1st word after some punctuation marks in title
    ## case. This kludge handles multiple colons, question marks, etc. on the
    ## line. \a is the BEL or CTRL-G character; it marks every letter that has
    ## to be capitalized, then the loop removes the marks one by one.
    result = gensub(/([:{}\[\]?!"()-][^[:alnum:]]*)([[:alpha:]])/, "\\1\a\\2", "g", a)
    while (match(result, /\a/)) {
        beg = substr(result, 1, RSTART - 1)
        cap = toupper(substr(result, RSTART + 1, 1))
        end = substr(result, RSTART + 2)
        result = beg cap end
    }

    return result
}
|
|
|
|
|
2013-04-21 12:50:26 +02:00
|
|
|
## Main rule: print the title-cased form of every input record.
{
    print titlecase($0)
}
|
2013-01-26 19:08:29 +01:00
|
|
|
|
2013-04-21 12:50:26 +02:00
|
|
|
## End of script
|