2013-04-07 12:33:43 +02:00
|
|
|
#!/bin/gawk -f
|
2013-01-26 19:08:29 +01:00
|
|
|
## Original file can be found at
|
|
|
|
## http://www.pement.org/awk/titlecase.awk.txt
|
|
|
|
|
|
|
|
#
|
|
|
|
# function: titlecase("CHANGE TO TITLE CASE") --> "Change to Title Case"
|
|
|
|
#
|
|
|
|
# Other Features:
|
|
|
|
# titlecase() will compress whitespace if a second parameter is passed.
|
|
|
|
# It is sufficient to use a positive number: titlecase(string,1)
|
|
|
|
#
|
|
|
|
# This function tries to implement the "Title Case" constructs specified
|
|
|
|
# in the APA Style Manual and the Chicago Manual of Style. Instead of
|
|
|
|
# merely capitalizing the first letter of each word and setting
|
|
|
|
# everything else in lowercase, this function implements the following
|
|
|
|
# conditions:
|
|
|
|
#
|
|
|
|
# - Conjunctions, articles, and prepositions are set lowercase, UNLESS they
|
|
|
|
# are the first word of the string or the first word after a colon, a
|
|
|
|
# question mark, or an exclamation point.
|
|
|
|
# - Compass points (NE, SW, etc.) are set in solid caps.
|
|
|
|
# - Roman numerals (II, IV, VII, IX, etc.) are set in solid caps.
|
|
|
|
# - Certain abbreviations are always capitalized (AIDS, ASCII, NT, USA, etc.)
|
|
|
|
# - Names beginning with D' or O' are set as D'Arcy, O'Reilly, etc.
|
|
|
|
# - Hyphenated strings receive internal caps (Smith-Williams, Twenty-Two)
|
|
|
|
# - Contractions such as I'll, You've, Don't, etc. are handled properly
|
|
|
|
# - Degrees such as Ph.D., M.Div., etc. are properly capitalized
|
|
|
|
#
|
|
|
|
# Sample Usage with GNU awk (gawk):
|
|
|
|
#
|
|
|
|
# gawk -f titlecase.awk infile
|
|
|
|
|
2013-04-07 12:33:43 +02:00
|
|
|
## TODO: merge constants (MC, UC, LC) into one array. Use only one loop for matching.
|
|
|
|
## TODO: get constants from external file. Support: languages, themes (music), etc.
|
|
|
|
## TODO: rethink algorithm so that it does not need to turn everything to uppercase.
|
|
|
|
## TODO: rethink algorithm so that it does not include punctuation in 'word'.
|
|
|
|
|
2013-01-26 19:08:29 +01:00
|
|
|
# Build the three word tables that titlecase() consults:
#   keep_mixed - words written with exactly the casing listed here
#   keep_lower - words written in lowercase
#   keep_upper - words written in solid capitals
BEGIN {

    #-----WORD LISTS (space-separated, each ends with a space so the
    #     lists can be concatenated safely)-----

    # Words with a fixed mixed-case spelling
    mixed = "KlassX Machine d'Acide "

    # Words to be set in lowercase
    articles = "a an the "
    conjunctions = "and but for nor or so "
    verbs = "am is are "
    abbrevs = "feat "

    # Prepositions
    # Deliberately omitted: over (=finished), under, through, before, after
    preps = "against at between by from in into of on to upon "
    ## French
    preps = preps "du "

    # Abbreviations to be set in solid caps - add to this list as needed
    other = "AIDS ASCII CD DHTML DNA DVD FBI GNU GPL IBM IRS ISBN ISSN "
    other = other "PHP ROM SSN TV FM BYOB MGMT DJ AC-DC JBX RZA DMX "

    #-----LOOKUP ARRAYS-----

    # Mixed-case words
    split(mixed, keep_mixed, " ")

    # Lowercase words, in the order: articles, conjunctions, prepositions,
    # verbs, abbreviations
    split(articles conjunctions preps verbs abbrevs, keep_lower, " ")

    # Uppercase words
    split(other, keep_upper, " ")
}
|
|
|
|
|
|
|
|
# titlecase(string, x) -- return STRING converted to "Title Case".
#
# Parameters:
#   string  the text to convert
#   x       optional; when true, runs of spaces/tabs are squeezed to one
#           space and a leading/trailing space is removed first
#
# Everything after x in the parameter list is a local variable (the usual
# awk idiom); callers pass at most two arguments.
#
# Globals read: keep_mixed, keep_upper, keep_lower (word tables) and
# debug (when true, diagnostic lines are printed to standard output).
#
# Requires gawk: uses gensub(), the \> word-boundary operator and
# bracket character classes.
function titlecase(string, x,    a, b, compress, hit, mix, pos, word, head, tail, var, var2, done, beg, cap, end) {

    # Initialize variables
    a = ""            # a is/will be the part of the string ALREADY converted
    b = string        # b is the rest of the string, so that (string = a b)
    compress = x      # optional compression argument

    # Compress spaces or tabs if 2nd argument passed. Trim prefix and suffix space.
    if (compress) {
        gsub(/[ \t]+/, " ", b)
        gsub(/^ /, "", b)
        gsub(/ $/, "", b)
        if (debug) print "DIAG: Compress argument passed to function call"
    }

    b = toupper(b)    # Capitalize everything for ease of matching

    do {
        # Reset the per-word flags. mix must be cleared here as well,
        # otherwise it keeps its previous value when keep_mixed is empty.
        hit = 0
        mix = 0

        if (debug) {
            print "1a=" a
            print "1b=" b
            print "1word=" word
        }

        # pos is the position of the NEXT punctuation mark (except apostrophe)
        # after the current word. If this is the last word in b, pos will be 0.
        # match() automatically sets RLENGTH (-1 when nothing matched).
        ## WARNING: we consider digits as part of a word.
        pos = match(b, /[^[:alnum:]']+/)

        # 'word' includes the run of punctuation/whitespace that follows it.
        if (pos > 0) word = substr(b, 1, pos + RLENGTH - 1)
        else word = b

        # 1st char of current word
        head = substr(b, 1, 1)
        # tail of current word
        if (pos > 0) tail = substr(b, 2, pos + RLENGTH - 2)
        else tail = substr(b, 2)

        # Shorten the rest of the string. On the last word (pos == 0) b is
        # left as it is; the loop ends anyway.
        b = substr(b, pos + RLENGTH)

        if (debug) {
            print "2a=" a
            print "2b=" b
            print "2word=" word
        }

        #----Words to keep mixedcase----
        ## WARNING: since we match a substring of 'word', we need to prepend
        ## and append the potentially discarded values.
        for (var in keep_mixed) {
            if (match(word, "^" toupper(keep_mixed[var]) "\\>")) {
                mix = 1
                hit = 1
                word = substr(word, 1, RSTART-1) keep_mixed[var] substr(word, RSTART+RLENGTH)
                if (debug)
                    print "DIAG: Match MC on [" keep_mixed[var] "] in string [" word "]";
                break
            }
        }

        if (debug) {
            print "3a=" a
            print "3b=" b
            print "3word=" word
        }

        #----Words to keep uppercase----
        # Case 1: abbreviations from the keep_upper array.
        # Only tried when nothing matched yet, so that a mixed-case hit is
        # not cleared again and then rewritten by the cases below.
        if (hit == 0) {
            for (var in keep_upper) {
                if (match(word, "^" keep_upper[var] "\\>")) {
                    hit = 1
                    if (debug)
                        print "DIAG: Match UC on [" keep_upper[var] "] in string [" word "]";
                    break
                }
            }
        }

        # Case 2: Roman numerals
        # Note: this match cannot distinguish between LIV (54 in Roman numerals)
        # and a personal name like "Liv Ullman". The Roman numerals C (100),
        # D (500), and M (1000) are omitted to avoid false matches on words like
        # civil, did, dim, lid, mid-, mild, Vic, etc. Most uses of Roman numerals
        # in titles stay in the lower ranges, such as "Vol. II" or "Pt. XXIV".
        if ( hit == 0 && match(word, /^[IVXL]+\>/) ) {
            hit = 1
            # But we can undo I'd, I'll, I'm, I've and Ill.
            if (match(word,/^I'|ILL\>/)) hit = 0
            if (debug && hit == 1)
                print "DIAG: Match on Roman numerals in [" word "]"
        }

        #----Words to be set in MiXed case----
        # Case 3: Names like D'Arcy or O'Reilly
        if ( hit == 0 && match(word, /^[DO]'[[:alpha:]]/) ) {
            if (debug) print "DIAG: Match on mixed case: " word
            word = substr(word,1,3) tolower(substr(word,4))
            hit = 1
        }

        # Case 4: Names like MacNeil or McDonald
        # RLENGTH is the length of "MC"/"MAC" plus the consonant after it.
        if ( hit == 0 && match(word,/^MA?C[B-DF-HJ-NP-TV-Z]/) ) {
            if (debug)
                print "DIAG: Match on MacX: " substr(word,1,1) "-" \
                    tolower(substr(word,2,RLENGTH-2)) "-" substr(word,RLENGTH,1) "-" \
                    tolower(substr(word,RLENGTH+1))
            word = substr(word,1,1) tolower(substr(word,2,RLENGTH-2)) \
                substr(word,RLENGTH,1) tolower(substr(word,RLENGTH+1))
            hit = 1
        }

        #----Words to set in lowercase----
        # Case 5: articles, conjunctions, prepositions from the keep_lower array
        if (hit == 0) {
            for (var2 in keep_lower) {
                # sub() returns the number of replacements made (0 or 1).
                hit = sub("^" toupper(keep_lower[var2]) "\\>", keep_lower[var2], word);
                if ( hit > 0 ) {
                    if (debug)
                        print "DIAG: Match LC on [" keep_lower[var2] "] in string [" word "]";
                    break
                }
            }
        }

        #----Default: Capitalize everything else normally----
        # A mixed-case match always sets hit as well, so one test suffices.
        if (hit > 0) a = a word
        else a = a toupper(head) tolower(tail)

    } while (pos > 0);

    ## Everything should be converted now.

    ## Double exception 1: Set 1st word of string in capital case. Need to
    ## handle potential internal single/double quotes like "A Day in the Life"
    ## or 'On the Waterfront'. WARNING: here we consider digits as part of a
    ## word (as in 1st, 2nd, etc.)
    ## When a has no alphanumeric character RSTART is 0 and a is unchanged.
    match(a, /[[:alnum:]]/)
    a = toupper(substr(a,1,RSTART)) substr(a,RSTART+1)

    ## Double exception 2: Set 1st word after a colon, question mark or
    ## exclamation point in title case. This kludge handles multiple colons,
    ## question marks, etc. on the line. \a is the BEL or CTRL-G character.
    ## WARNING: we also follow double quotes by a capital.
    done = gensub(/([:?!"][^a-zA-Z]*)([a-zA-Z])/,"\\1\a\\2", "g", a)

    # Replace each BEL marker and the letter after it by that letter in caps.
    while (match(done,/\a/)) {
        beg = substr(done,1,RSTART-1)
        cap = toupper(substr(done,RSTART+1,1))
        end = substr(done,RSTART+2)
        done = beg cap end
    }

    return done
}
|
|
|
|
|
|
|
|
|
2013-01-27 00:48:05 +01:00
|
|
|
# Main rule: print every input record in title case, with whitespace
# compression switched on (second argument is 1).
{
    print titlecase($0, 1)
}
|
2013-01-26 19:08:29 +01:00
|
|
|
|
|
|
|
#---end of awk script---
|