ambevar-dotfiles/.scripts/titlecase.awk

239 lines
8.6 KiB
Awk
Raw Normal View History

#!/bin/gawk -f
2013-01-26 19:08:29 +01:00
## Original file can be found at
## http://www.pement.org/awk/titlecase.awk.txt
#
# function: titlecase("CHANGE TO TITLE CASE") --> "Change to Title Case"
#
# Other Features:
# titlecase() will compress whitespace if a second parameter is passed.
# It is sufficient to use a positive number: titlecase(string,1)
#
# This function tries to implement the "Title Case" constructs specified
# in the APA Style Manual and the Chicago Manual of Style. Instead of
# merely capitalizing the first letter of each word and setting
# everything else in lowercase, this function implements the following
# conditions:
#
# - Conjunctions, articles, and prepositions are set lowercase, UNLESS they
# are the first word of the string or the first word after a colon, a
# question mark, or an exclamation point.
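# - Compass points (NE, SW, etc.) are set in solid caps, but only if they are
# added to the solid-caps abbreviation list (this variant's list omits them).
# - Roman numerals (II, IV, VII, IX, etc.) are set in solid caps.
# - Certain abbreviations are always capitalized (AIDS, ASCII, DNA, FBI, etc.)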
# - Names beginning with D' or O' are set as D'Arcy, O'Reilly, etc.
# - Hyphenated strings receive internal caps (Smith-Williams, Twenty-Two)
# - Contractions such as I'll, You've, Don't, etc. are handled properly
# - Degrees such as Ph.D., M.Div., etc. are properly capitalized
#
# Sample Usage with GNU awk (gawk):
#
# gawk -f titlecase.awk infile
## TODO: merge constants (MC, UC, LC) into one array. Use only one loop for matching.
## TODO: get constants from external file. Support: languages, themes (music), etc.
## TODO: rethink algorithm so that it does not need to turn everything to uppercase.
## TODO: rethink algorithm so that it does not include punctuation in 'word'.
2013-01-26 19:08:29 +01:00
BEGIN {
    # Set up the three lookup tables consulted by titlecase():
    #   keep_mixed - words reproduced with exactly the spelling given here
    #   keep_lower - words forced to lowercase
    #   keep_upper - abbreviations forced to solid caps
    # Each table is a space-separated word list split into an array.

    #-----WORDS TO BE SET IN MIXED CASE-----
    mixed = "KlassX Machine d'Acide "

    #-----WORDS TO BE SET IN LOWERCASE-----
    articles     = "a an the "
    conjunctions = "and but for nor or so "
    verbs        = "am is are "
    abbrevs      = "feat "
    # Prepositions. Deliberately left out: over (= finished), under,
    # through, before, after. The trailing "du " is French.
    preps = "against at between by from in into of on to upon " "du "

    #-----ABBREVIATIONS TO BE SET IN SOLID CAPS-----
    # Extend this list as needed.
    other = "AIDS ASCII CD DHTML DNA DVD FBI GNU GPL IBM IRS ISBN ISSN " \
            "PHP ROM SSN TV FM BYOB MGMT DJ AC-DC JBX RZA DMX "

    # Turn the word lists into arrays.
    split(mixed, keep_mixed, " ")
    split(articles conjunctions preps verbs abbrevs, keep_lower, " ")
    split(other, keep_upper, " ")
}
# titlecase(string, x) -- return `string` converted to title case.
#
#   string  text to convert
#   x       optional; if true, runs of spaces/tabs are squeezed to a single
#           space and leading/trailing space is trimmed before conversion
#
# Reads the global tables keep_mixed, keep_lower and keep_upper, and the
# global `debug` flag (diagnostics are printed to stdout when it is set).
# Requires gawk (gensub() and the \> word-boundary operator).
#
# Everything after `x` in the parameter list is a local variable (the usual
# awk convention); callers pass at most two arguments.
function titlecase(string, x,    a, b, compress, hit, mix, pos, word, head, tail, var, var2, done, beg, cap, end) {
    # Initialize variables
    a = "";         # a is/will be the string ALREADY converted
    b = string;     # b is the rest of the string, so that (string = a b)
    compress = x;   # optional compression argument

    # Compress spaces or tabs if 2nd argument passed. Trim prefix and suffix space.
    if (compress) {
        gsub(/[ \t]+/, " ", b)
        gsub(/^ /, "", b)
        gsub(/ $/, "", b)
        if (debug) print "DIAG: Compress argument passed to function call"
    }

    b = toupper(b)  # Capitalize everything for ease of matching

    do {
        # Per-word state: hit > 0 means "word is final, append it verbatim";
        # mix > 0 means the word matched an entry of keep_mixed.
        hit = 0;
        mix = 0;

        if (debug) {
            print "1a=" a
            print "1b=" b
            print "1word=" word
        }

        # pos is the position of the NEXT punctuation mark (except apostrophe)
        # after the current word. If this is the last word in b, pos will be 0.
        # match() automatically sets RLENGTH
        ## WARNING: we consider digits as part of a word.
        pos = match(b, /[^[:alnum:]']+/)
        # pos = match(b, /[^A-Z']+/)

        # 'word' includes the trailing run of punctuation/space.
        if (pos > 0) word = substr(b, 1, pos + RLENGTH - 1)
        else word = b

        # 1st char of current word
        head = substr(b, 1, 1)

        # tail of current word
        if (pos > 0) tail = substr(b, 2, pos + RLENGTH - 2)
        else tail = substr(b, 2)

        # shorten the rest of the string
        b = substr(b, pos + RLENGTH )

        if (debug) {
            print "2a=" a
            print "2b=" b
            print "2word=" word
        }

        #----Words to keep mixedcase---- WARNING: since we match a substring of
        ## 'word', we need to prepend and append the potentially discarded
        ## values.
        for (var in keep_mixed) {
            mix = match(word, "^" toupper(keep_mixed[var]) "\\>")
            if ( mix > 0 ) {
                hit = 1
                word = substr(word, 1, RSTART-1) keep_mixed[var] substr(word, RSTART+RLENGTH)
                if (debug)
                    print "DIAG: Match MC on [" keep_mixed[var] "] in string [" word "]";
                break;
            }
        }

        if (debug) {
            print "3a=" a
            print "3b=" b
            print "3word=" word
        }

        #----Words to keep uppercase----
        # Case 1: abbreviations from the keep_upper array.
        # Only tried when the word was not already fixed by keep_mixed;
        # otherwise this scan would reset 'hit' and let the cases below
        # re-process a protected mixed-case word.
        if ( hit == 0 ) {
            for (var in keep_upper) {
                hit = match(word, "^" keep_upper[var] "\\>")
                if ( hit > 0 ) {
                    if (debug)
                        print "DIAG: Match UC on [" keep_upper[var] "] in string [" word "]";
                    break;
                }
            }
        }

        # Case 2: Roman numerals
        # Note: this match cannot distinguish between LIV (54 in Roman numerals)
        # and a personal name like "Liv Ullman". The Roman numerals C (100),
        # D (500), and M (1000) are omitted to avoid false matches on words like
        # civil, did, dim, lid, mid-, mild, Vic, etc. Most uses of Roman numerals
        # in titles stay in the lower ranges, such as "Vol. II" or "Pt. XXIV".
        if ( hit == 0 && match(word, /^[IVXL]+\>/) ) {
            hit = 1
            # But we can undo I'd, I'll, I'm, I've and Ill.
            if (match(word,/^I'|ILL\>/)) hit = 0
            if (debug && hit == 1)
                print "DIAG: Match on Roman numerals in [" word "]"
        }

        #----Words to be set in MiXed case----
        # Case 3: Names like D'Arcy or O'Reilly
        if ( hit == 0 && match(word, /^[DO]'[[:alpha:]]/) ) {
            if (debug) print "DIAG: Match on mixed case: " word
            word = substr(word,1,3) tolower(substr(word,4))
            hit = 1
        }

        # Case 4: Names like MacNeil or McDonald
        # RLENGTH covers "MC"/"MAC" plus the following consonant, which
        # stays uppercase; everything else after the initial M is lowercased.
        if ( hit == 0 && match(word,/^MA?C[B-DF-HJ-NP-TV-Z]/) ) {
            if (debug)
                print "DIAG: Match on MacX: " substr(word,1,1) "-" \
                    tolower(substr(word,2,RLENGTH-2)) "-" substr(word,RLENGTH,1) "-" \
                    tolower(substr(word,RLENGTH+1))
            word = substr(word,1,1) tolower(substr(word,2,RLENGTH-2)) \
                substr(word,RLENGTH,1) tolower(substr(word,RLENGTH+1))
            hit = 1
        }

        #----Words to set in lowercase----
        # Case 5: articles, conjunctions, prepositions from the keep_lower array
        if (hit == 0) {
            for (var2 in keep_lower) {
                hit = sub("^" toupper(keep_lower[var2]) "\\>", keep_lower[var2], word);
                if ( hit > 0 ) {
                    if (debug)
                        print "DIAG: Match LC on [" keep_lower[var2] "] in string [" word "]";
                    break;
                }
            }
        }

        #----Default: Capitalize everything else normally----
        if (mix > 0 || hit > 0) a = a word
        else a = a toupper(head) tolower(tail)

    } while (pos > 0);
    ## Everything should be converted now.

    ## Double exception 1: Set 1st word of string in capital case. Need to
    ## handle potential internal single/double quotes like "A Day in the Life"
    ## or 'On the Waterfront'. WARNING: here we consider digits as part of a
    ## word (as in 1st, 2nd, etc.)
    ## If there is no alphanumeric character, RSTART is 0 and a is unchanged.
    match(a, /[[:alnum:]]/)
    a = toupper(substr(a,1,RSTART)) substr(a,RSTART+1)

    ## Double exception 2: Set 1st word after a colon, question mark or
    ## exclamation point in title case. This kludge handles multiple colons,
    ## question marks, etc. on the line. \a is the BEL or CTRL-G character.
    ## WARNING: we also follow double quotes by a capital.
    done = gensub(/([:?!"][^a-zA-Z]*)([a-zA-Z])/,"\\1\a\\2", "g", a)

    # Replace each BEL marker and the letter after it by the uppercased letter.
    while (match(done,/\a/)) {
        beg = substr(done,1,RSTART-1)
        cap = toupper(substr(done,RSTART+1,1))
        end = substr(done,RSTART+2)
        done = beg cap end
    }

    return done
}
# Main rule: emit every input record in title case, with whitespace compressed.
{
    printf "%s%s", titlecase($0, 1), ORS
}
2013-01-26 19:08:29 +01:00
#---end of awk script---