# filename: titlecase.awk
## Original file can be found at
## http://www.pement.org/awk/titlecase.awk.txt
#
# function: titlecase("CHANGE TO TITLE CASE") --> "Change to Title Case"
#
# Other Features:
#   titlecase() will compress whitespace if a second parameter is passed.
#   It is sufficient to use a positive number: titlecase(string,1)
#
# This function tries to implement the "Title Case" constructs specified
# in the APA Style Manual and the Chicago Manual of Style. Instead of
# merely capitalizing the first letter of each word and setting
# everything else in lowercase, this function implements the following
# conditions:
#
#  - Conjunctions, articles, and prepositions are set lowercase, UNLESS they
#    are the first word of the string or the first word after a colon, a
#    question mark, or an exclamation point.
#  - Compass points (NE, SW, etc.) are set in solid caps.
#  - Roman numerals (II, IV, VII, IX, etc.) are set in solid caps.
#  - Certain abbreviations are always capitalized (AIDS, ASCII, NT, USA, etc.)
#  - Names beginning with D' or O' are set as D'Arcy, O'Reilly, etc.
#  - Hyphenated strings receive internal caps (Smith-Williams, Twenty-Two)
#  - Contractions such as I'll, You've, Don't, etc. are handled properly
#  - Degrees such as Ph.D., M.Div., etc. are properly capitalized
#
# NOTE: this copy of the script has no compass-point rule, and NT / USA are
# not in its upper-case list; add them to `other` below if they are needed.
#
# The script relies on GNU awk regex extensions (the \> word-boundary
# operator), so it must be run with gawk.
#
# Set the global variable `debug` (e.g. gawk -v debug=1 ...) to get DIAG
# trace lines on standard output.
#
# Sample Usage with GNU awk (gawk):
#
#   awk -f titlecase.awk infile

BEGIN {
    #-----ABBREVIATIONS TO BE SET IN MIXEDCASE-----
    # "Machine" is listed so that it is not mistaken for a Mac-name
    # (MacHine) by the MacX rule in titlecase().
    mixed = "KlassX Machine "
    split(mixed, keep_mixed, " ")

    #-----ABBREVIATIONS TO BE SET IN LOWERCASE-----
    articles     = "a an the "
    conjunctions = "and but for nor or so "
    verbs        = "am is are "
    abbrevs      = "feat "

    # Prepositions
    # Omitted: over (=finished), under, through, before, after
    preps = "against at between by from in into of on to upon "

    # Build array of words to be set lowercased
    split(articles conjunctions preps verbs abbrevs, keep_lower, " ")

    #-----ABBREVIATIONS TO BE SET IN SOLID CAPS-----
    # Other abbreviations - add to this list as needed
    # NOTE: titlecase() splits words at every character outside [A-Z0-9'],
    # so an entry containing a hyphen (AC-DC) can never match as a unit.
    other = "AIDS ASCII CD DHTML DNA DVD FBI GNU GPL IBM IRS ISBN ISSN "
    other = other "PHP ROM SSN TV FM BYOB MGMT DJ AC-DC JBX RZA DMX "

    # build array of words to keep uppercase
    split(other, keep_upper, " ")
}

# titlecase(string, x)
#
#   string - the text to convert
#   x      - optional; if true, runs of spaces/tabs are squeezed to a single
#            space and leading/trailing space is trimmed before conversion
#
# Returns the converted string. Reads the global arrays keep_mixed,
# keep_lower and keep_upper built in BEGIN, and the global flag `debug`.
#
# Everything after `x` in the parameter list is a local variable (the usual
# awk idiom); callers pass at most two arguments.
function titlecase(string, x,    done, rest, out, word, pos, hit, k, n)
{
    done = ""        # the part of the string ALREADY converted
    rest = string    # the part still to convert, so that (string = done rest)

    # Compress spaces or tabs if 2nd argument passed. Trim prefix and suffix space.
    if (x) {
        gsub(/[ \t]+/, " ", rest)
        gsub(/^ /, "", rest)
        gsub(/ $/, "", rest)
        if (debug) print "DIAG: Compress argument passed to function call"
    }

    rest = toupper(rest)    # Capitalize everything for ease of matching

    do {
        hit = 0    # becomes nonzero once `word` holds its final spelling

        # pos is the position of the NEXT run of punctuation/whitespace
        # (anything but letters, digits and apostrophe) after the current
        # word; 0 if this is the last word. `word` deliberately INCLUDES
        # that trailing run so delimiters are carried over unchanged.
        ## WARNING: we consider digits as part of a word.
        pos = match(rest, /[^A-Z0-9']+/)
        if (pos > 0) {
            word = substr(rest, 1, pos + RLENGTH - 1)
            rest = substr(rest, pos + RLENGTH)
        } else {
            word = rest
            rest = ""
        }

        #----Words to keep mixedcase----
        for (k in keep_mixed) {
            if (match(word, "^" toupper(keep_mixed[k]) "\\>")) {
                if (debug)
                    print "DIAG: Match MC on [" keep_mixed[k] "] in string [" word "]"
                # Swap in the stored spelling but keep the trailing
                # delimiter that follows the word.
                word = keep_mixed[k] substr(word, length(keep_mixed[k]) + 1)
                hit = 1
                break
            }
        }

        #----Words to keep uppercase----
        # Case 1: abbreviations from the keep_upper array.
        if (hit == 0) {
            for (k in keep_upper) {
                if (match(word, "^" keep_upper[k] "\\>")) {
                    if (debug)
                        print "DIAG: Match UC on [" keep_upper[k] "] in string [" word "]"
                    hit = 1
                    break
                }
            }
        }

        # Case 2: Roman numerals
        # Note: this match cannot distinguish between LIV (54 in Roman numerals)
        # and a personal name like "Liv Ullman". The Roman numerals C (100),
        # D (500), and M (1000) are omitted to avoid false matches on words like
        # civil, did, dim, lid, mid-, mild, Vic, etc. Most uses of Roman numerals
        # in titles stay in the lower ranges, such as "Vol. II" or "Pt. XXIV".
        # I'd, I'll, I'm, I've and Ill are excluded by the second test.
        if (hit == 0 && match(word, /^[IVXL]+\>/) && !match(word, /^I'|ILL\>/)) {
            hit = 1
            if (debug) print "DIAG: Match on Roman numerals in [" word "]"
        }

        #----Words to be set in MiXed case----
        # Case 3: Names like D'Arcy or O'Reilly
        if (hit == 0 && match(word, /^[DO]'[A-Z]/)) {
            if (debug) print "DIAG: Match on mixed case: " word
            word = substr(word, 1, 3) tolower(substr(word, 4))
            hit = 1
        }

        # Case 4: Names like MacNeil or McDonald
        if (hit == 0 && match(word, /^MA?C[B-DF-HJ-NP-TV-Z]/)) {
            n = RLENGTH    # index of the consonant that follows Mc/Mac
            if (debug)
                print "DIAG: Match on MacX: " substr(word, 1, 1) "-" \
                      tolower(substr(word, 2, n - 2)) "-" substr(word, n, 1) "-" \
                      tolower(substr(word, n + 1))
            word = substr(word, 1, 1) tolower(substr(word, 2, n - 2)) \
                   substr(word, n, 1) tolower(substr(word, n + 1))
            hit = 1
        }

        #----Words to set in lowercase----
        # Case 5: articles, conjunctions, prepositions from the keep_lower array
        if (hit == 0) {
            for (k in keep_lower) {
                if (sub("^" toupper(keep_lower[k]) "\\>", keep_lower[k], word) > 0) {
                    if (debug)
                        print "DIAG: Match LC on [" keep_lower[k] "] in string [" word "]"
                    hit = 1
                    break
                }
            }
        }

        #----Default: Capitalize everything else normally----
        if (hit > 0)
            done = done word
        else
            done = done toupper(substr(word, 1, 1)) tolower(substr(word, 2))
    } while (pos > 0)

    ## Everything should be converted now.

    ## Double exception 1: Set 1st word of string in capital case. Need to
    ## handle potential internal single/double quotes like "A Day in the Life"
    ## or 'On the Waterfront'. WARNING: here we consider digits as part of a
    ## word (as in 1st, 2nd, etc.)
    if (match(done, /[A-Za-z0-9]/))
        done = toupper(substr(done, 1, RSTART)) substr(done, RSTART + 1)

    ## Double exception 2: Set 1st word after a colon, question mark or
    ## exclamation point in title case. Handles multiple colons, question
    ## marks, etc. on the line by consuming the string match by match.
    ## WARNING: we also follow double quotes by a capital.
    out = ""
    while (match(done, /[:?!"][^a-zA-Z]*[a-zA-Z]/)) {
        n = RSTART + RLENGTH - 1    # index of the letter that ends the match
        out = out substr(done, 1, n - 1) toupper(substr(done, n, 1))
        done = substr(done, n + 1)
    }
    return out done
}

# Main rule: print every input line in title case, with whitespace compressed.
{ print titlecase($0, 1) }

#---end of awk script---