cosmetic changes: word variables, avoiding short circuits, less returns

This commit is contained in:
Brandon Taylor 2017-08-06 14:34:42 -04:00
parent 3f6dfb22d1
commit 673e30ce1f
3 changed files with 217 additions and 174 deletions

View File

@ -1,7 +1,7 @@
# conversion of LaTeX directives to plain text, markdown, etc.
#
# The basic idea is that we search for `\foo{arg}`, `{\foo arg}`,
# or `{\foo{arg}}`, and look up `foo` in a dictionary of substitutions
# The basic idea is that we search for `\foo{argument}`, `{\foo argument}`,
# or `{\foo{argument}}`, and look up `foo` in a dictionary of substitutions
# like `\textit` -> `*#1*` where #1 is where the (first) argument is
# substituted. Then we have separate dictionary entries for text/plain,
# text/markdown, etcetera.
@ -10,158 +10,185 @@
# parsing LaTeX directives:
const BACKSLASH = UInt8('\\')
const BRACEOPEN = UInt8('{')
const BRACECLOSE = UInt8('}')
const BRACE_OPEN = UInt8('{')
const BRACE_CLOSE = UInt8('}')
const SPACE = UInt8(' ')
const DOLLAR = UInt8('$')
const CARET = UInt8('^')
const UNDERSCORE = UInt8('_')
# Byte-level ASCII classifiers used while scanning LaTeX source.
# `isalpha8` is true only for the ASCII letter bytes `a`-`z` and `A`-`Z`.
isalpha8(x::UInt8) = UInt8('a') <= x <= UInt8('z') || UInt8('A') <= x <= UInt8('Z')
# `isalnum8` additionally accepts the ASCII digit bytes `0`-`9`.
isalnum8(x::UInt8) = UInt8('0') <= x <= UInt8('9') || isalpha8(x)
# Byte-level ASCII classifiers used while scanning LaTeX source.
# `is_letter` is true only for the ASCII letter bytes `a`-`z` and `A`-`Z`.
is_letter(x::UInt8) = UInt8('a') <= x <= UInt8('z') || UInt8('A') <= x <= UInt8('Z')
# `is_alphanumeric` additionally accepts the ASCII digit bytes `0`-`9`.
is_alphanumeric(x::UInt8) = UInt8('0') <= x <= UInt8('9') || is_letter(x)
"""
search_latexdirective(string, istart=1, inbrace=false)
search_latex_directive(astring, start_position = 1)
Search for a LaTeX directive \\directive{arg} or similar in `string`, returning
`(ds, de, ae)` such that `string[ds:de]` gives `\\directive` and `string[de+1:ae]`
gives `{arg}`. Use [`striparg`](@ref) to remove surrounding braces and whitespace
from the `arg`.
Search for a LaTeX directive \\directive{argument} or similar in `astring`, starting at
`start_position`, and return `(directive_start, directive_end, argument_end)` such that
`astring[directive_start:directive_end]` gives `\\directive` and
`astring[directive_end+1:argument_end]` gives `{argument}`; `(0, 0, 0)` is returned when
no directive is found. Use [`strip_argument`](@ref) to remove surrounding braces and
whitespace from the `argument`.
"""
function search_latexdirective(s::Union{String,SubString{String}}, istart::Int=1)
e = sizeof(s)
0 < istart e || return 0,0,0
p = Vector{UInt8}(s)
i = istart
allspaces=true
function search_latex_directive(astring, start_position = 1)
string_length = sizeof(astring)
if !(0 < start_position string_length)
0, 0, 0
else
character_vector = Vector{UInt8}(astring)
index = start_position
all_spaces = true
# find \foo directive or {...}:
c = UInt8(0)
while i e
c = p[i]
(c == BACKSLASH || c == BRACEOPEN || c == CARET || c == UNDERSCORE) && break
c != SPACE && (allspaces = false)
i += 1
end
if i e && c != BRACEOPEN
directive_start = i
if c == BACKSLASH
i += 2
i-1 > e && return 0,0,0
if isalpha8(p[i-1])
while i e && isalpha8(p[i])
i += 1
# find \foo directive or {...}:
character = UInt8(0)
while index string_length
character = character_vector[index]
if (character == BACKSLASH || character == BRACE_OPEN || character == CARET || character == UNDERSCORE)
break
end
if character != SPACE
all_spaces = false
end
index += 1
end
if index string_length && character != BRACE_OPEN
directive_start = index
if character == BACKSLASH
index += 2
if index - 1 > string_length
return 0,0,0
end
end
directive_end = i-1
else
directive_end = directive_start # ^ or _
i += 1
end
# look for optional opening brace
while i e && p[i] == SPACE
i += 1
end
i > e && return directive_start, directive_end, e
inbrace = p[i] == BRACEOPEN
if !inbrace
# search backwards from \foo to look for { \foo ...}
j = directive_start - 1
while j istart && p[j] == SPACE
j -= 1
end
if j < istart || p[j] != BRACEOPEN
if p[i] == BACKSLASH
# argument is another latex directive
ds,de,ae = search_latexdirective(s, i)
return directive_start, directive_end, ae
elseif c != BACKSLASH
# in an equation, token is a single char
return directive_start, directive_end, i
elseif allspaces
# if `\directive ...` was preceded only
# by whitespace, then assume arguments
# extend to the end of the string. This
# happens when we recurse on `{\directive ...}`.
return directive_start, directive_end, e
else
# argument is not in braces … get next token
while i e && isalnum8(p[i])
i += 1
if is_letter(character_vector[index - 1])
while index string_length && is_letter(character_vector[index])
index += 1
end
end
directive_end = index - 1
else
directive_end = directive_start # ^ or _
index += 1
end
# look for optional opening brace
while index string_length && character_vector[index] == SPACE
index += 1
end
if index > string_length
return directive_start, directive_end, string_length
end
in_braces = character_vector[index] == BRACE_OPEN
if !in_braces
# search backwards from \foo to look for { \foo ...}
backwards_index = directive_start - 1
while backwards_index start_position && character_vector[backwards_index] == SPACE
backwards_index -= 1
end
if backwards_index < start_position || character_vector[backwards_index] != BRACE_OPEN
if character_vector[index] == BACKSLASH
# argument is another latex directive
inner_start_position, inner_directive_end, inner_argument_end = search_latex_directive(astring, index)
return directive_start, directive_end, inner_argument_end
elseif character != BACKSLASH
# in an equation, token is a single char
return directive_start, directive_end, index
elseif all_spaces
# if `\directive ...` was preceded only
# by whitespace, then assume arguments
# extend to the end of the string. This
# happens when we recurse on `{\directive ...}`.
return directive_start, directive_end, string_length
else
# argument is not in braces … get next token
while index string_length && is_alphanumeric(character_vector[index])
index += 1
end
return directive_start, directive_end, index - 1
end
return directive_start, directive_end, i-1
end
end
index += 1
elseif index > string_length
return 0, 0, 0
else # { ... }
directive_start = index
directive_end = index - 1
in_braces = true
index += 1
end
i += 1
elseif i > e
return 0,0,0
else # { ... }
directive_start = i
directive_end = i - 1
inbrace = true
i += 1
end
# search for end of argument (closing brace)
nbraces = 1
while i e
c = p[i]
if c == BRACEOPEN
nbraces += 1
elseif c == BRACECLOSE
nbraces -= 1
if nbraces == 0
return directive_start, directive_end, inbrace ? i : i-1
# search for end of argument (closing brace)
number_of_braces = 1
while index string_length
character = character_vector[index]
if character == BRACE_OPEN
number_of_braces += 1
elseif character == BRACE_CLOSE
number_of_braces -= 1
if number_of_braces == 0
argument_end = if in_braces
index
else
index - 1
end
return directive_start, directive_end, argument_end
end
end
index += 1
end
i += 1
directive_start, directive_end, string_length
end
return directive_start, directive_end, e
end
"""
striparg(s, argstart=start(s), argend=endof(s))
strip_argument(astring, start_position = start(astring), end_position = endof(astring))
Return the substring of `s` corresponding to the argument from `argstart:argend`, stripping
Return the substring of `astring` corresponding to the argument from `start_position:end_position`, stripping
leading/trailing whitespace and braces.
"""
function striparg(s::Union{String,SubString{String}}, argstart::Int=start(s), argend::Int=endof(s))
argstart > argend && return SubString(s, 1, 0)
e = endof(s)
(1 argstart e && 1 argend e) || throw(BoundsError())
p = Vector{UInt8}(s)
if p[argend] == BRACECLOSE
argend -= 1 # omit brace
while argstart argend && p[argstart] != BRACEOPEN
argstart += 1
function strip_argument(astring, start_position = start(astring), end_position = endof(astring))
if start_position > end_position
SubString(astring, 1, 0)
else
string_length = endof(astring)
if !(1 start_position string_length && 1 end_position string_length)
throw(BoundsError())
else
character_vector = Vector{UInt8}(astring)
if character_vector[end_position] == BRACE_CLOSE
end_position -= 1 # omit brace
while start_position end_position && character_vector[start_position] != BRACE_OPEN
start_position += 1
end
if start_position > end_position
error("malformed argument")
end
start_position += 1 # omit brace
end
while start_position end_position && character_vector[end_position] == SPACE
end_position -= 1
end
while start_position end_position && character_vector[start_position] == SPACE
start_position += 1
end
SubString(astring, start_position, end_position)
end
argstart > argend && error("malformed argument")
argstart += 1 # omit brace
end
while argstart argend && p[argend] == SPACE
argend -= 1
end
while argstart argend && p[argstart] == SPACE
argstart += 1
end
return SubString(s, argstart, argend)
end
# to make replace work for LaTeX directives with our
# custom search function, all we need to do is to define
# a LaTeXDirectiveSearch type such that search(s, ::LaTeXDirectiveSearch, i)
# a LaTeXDirectiveSearch type such that search(s, ::LaTeXDirectiveSearch, index)
# returns the range of the directive
struct LaTeXDirectiveSearch; end
function Base.search(s::AbstractString, ::LaTeXDirectiveSearch, i::Integer)
ds, de, ae = search_latexdirective(s, i)
return ds < i ? (0:-1) : (ds:ae)
function Base.search(s::AbstractString, ::LaTeXDirectiveSearch, index)
    # Delegate to the directive scanner; only the start of the directive and
    # the end of its argument are needed to build the matched range.
    directive_start, directive_end, argument_end = search_latex_directive(s, index)
    # A start before `index` (the scanner returns 0, 0, 0 when it finds
    # nothing) is reported as the empty range 0:-1.
    return directive_start < index ? (0:-1) : (directive_start:argument_end)
end
###########################################################################
# Unicode substitutions for LaTeX directives
const latex_unicode = Dict{String,String}(
const latex_unicode = Dict(
# accent escapes like `\"u` for `ü`, from the list at
# https://en.wikibooks.org/wiki/LaTeX/Special_Characters
# converted to LaTeX characters (mostly combining marks)
@ -200,7 +227,7 @@ const latex_unicode = Dict{String,String}(
)
# LaTeX directives converted to Markdown
const md_directives = Dict{String,String}(
const markdown_directives = Dict(
"\\emph" => "_#1_",
"\\textit" => "_#1_",
"\\it" => "_#1_",
@ -218,7 +245,7 @@ const md_directives = Dict{String,String}(
# directives that are stripped when converting
# to text/plain
const text_directives = Dict{String,String}(
const text_directives = Dict(
"\\emph" => "#1",
"\\textit" => "#1",
"\\it" => "#1",
@ -232,7 +259,7 @@ const text_directives = Dict{String,String}(
)
# Unicode includes an incomplete set of super/subscript characters:
const superscripts = Dict{Char,Char}(
const superscripts = Dict(
'0'=>'⁰', '1'=>'¹', '2'=>'²', '3'=>'³', '4'=>'⁴', '5'=>'⁵', '6'=>'⁶', '7'=>'⁷', '8'=>'⁸', '9'=>'⁹',
'a'=>'ᵃ', 'b'=>'ᵇ', 'c'=>'ᶜ', 'd'=>'ᵈ', 'e'=>'ᵉ', 'f'=>'ᶠ', 'g'=>'ᵍ', 'h'=>'ʰ',
'i'=>'ⁱ', 'j'=>'ʲ', 'k'=>'ᵏ', 'l'=>'ˡ', 'm'=>'ᵐ', 'n'=>'ⁿ', 'o'=>'ᵒ', 'p'=>'ᵖ',
@ -242,83 +269,93 @@ const superscripts = Dict{Char,Char}(
'U'=>'ᵁ', 'V'=>'ⱽ', 'W'=>'ᵂ', 'β'=>'ᵝ', 'γ'=>'ᵞ', 'δ'=>'ᵟ', 'ψ'=>'ᵠ', 'χ'=>'ᵡ', 'Θ'=>'ᶿ',
'+'=>'⁺', '-'=>'⁻', '='=>'⁼', '('=>'⁽', ')'=>'⁾', ' '=>' ', '∘'=>'°',
)
const subscripts = Dict{Char,Char}(
const subscripts = Dict(
'0'=>'₀', '1'=>'₁', '2'=>'₂', '3'=>'₃', '4'=>'₄', '5'=>'₅', '6'=>'₆', '7'=>'₇', '8'=>'₈', '9'=>'₉',
'a'=>'ₐ', 'e'=>'ₑ', 'h'=>'ₕ', 'i'=>'ᵢ', 'j'=>'ⱼ', 'k'=>'ₖ', 'l'=>'ₗ', 'm'=>'ₘ',
'n'=>'ₙ', 'o'=>'ₒ', 'p'=>'ₚ', 'r'=>'ᵣ', 's'=>'ₛ', 't'=>'ₜ', 'u'=>'ᵤ', 'v'=>'ᵥ', 'x'=>'ₓ',
'β'=>'ᵦ', 'γ'=>'ᵧ', 'ρ'=>'ᵨ', 'ψ'=>'ᵩ', 'χ'=>'ᵪ',
'-'=>'₋', '+'=>'₊', '='=>'₌', '('=>'₍', ')'=>'₎', ' '=>' ',
)
function replacechars(s::AbstractString, charmap::Associative{Char,Char})
buf = IOBuffer()
for c in s
cm = get(charmap, c, '\0')
cm == '\0' && return ""
print(buf, cm)
function replace_characters(astring, character_map)
buffer = IOBuffer()
for character in astring
mapped_character = get(character_map, character, '\0')
if mapped_character == '\0'
return ""
end
print(buffer, mapped_character)
end
return String(take!(buf))
String(take!(buffer))
end
# Given a (sub)string `astring` that represents a LaTeX directive matched
# by search_latexdirective, performs our Unicode substitutions and
# by search_latex_directive, performs our Unicode substitutions and
# also any additional substitutions given by extra_directives.
function directive_substitution(s::AbstractString, extra_directives::Associative{String,String})
ds, de = search_latexdirective(s)
ae = endof(s)
directive = SubString(s, ds, de)
function directive_substitution(astring, extra_directives)
start_position, directive_end, argument_end = search_latex_directive(astring)
string_length = endof(astring)
directive = SubString(astring, start_position, directive_end)
for dict in (extra_directives, latex_unicode, Base.REPLCompletions.latex_symbols)
if haskey(dict, directive)
sub = dict[directive]
if contains(sub, "#1")
arg = striparg(replace_directives(striparg(s, de+1, ae), extra_directives))
return replace(sub, "#1", arg)
substitution = dict[directive]
if contains(substitution, "#1")
argument = strip_argument(replace_directives(strip_argument(astring, directive_end + 1, string_length), extra_directives))
return replace(substitution, "#1", argument)
else
arg = replace_directives(SubString(s, de+1, ae), extra_directives)
if strwidth(sub) == 0 # \hat{...} etc: combining chars go after argument
return string(striparg(arg), sub)
argument = replace_directives(SubString(astring, directive_end+1, string_length), extra_directives)
if strwidth(substitution) == 0 # \hat{...} etc: combining chars go after argument
return string(strip_argument(argument), substitution)
else
return string(sub, arg) # don't strip for 0-arg macros
return string(substitution, argument) # don't strip for 0-arg macros
end
end
end
end
if directive == "^" || directive == "_" # super/subscripts
arg = striparg(replace_directives(striparg(s, de+1, ae), extra_directives))
sarg = replacechars(arg, directive == "^" ? superscripts : subscripts)
!isempty(sarg) && return sarg
argument = strip_argument(replace_directives(strip_argument(astring, directive_end + 1, string_length), extra_directives))
dict = if directive == "^"
superscripts
else
subscripts
end
substitution = replace_characters(argument, dict)
if !isempty(substitution)
return substitution
end
end
return s # ignore unrecognized directives
astring # ignore unrecognized directives
end
# replace all LaTeX directives in the given string via `directive_substitution`
replace_directives(s::AbstractString, extra_directives::Associative{String,String}) =
replace(s, LaTeXDirectiveSearch(), sub -> directive_substitution(sub, extra_directives))
function replace_directives(astring, extra_directives)
    # Each range matched by `LaTeXDirectiveSearch` is handed to
    # `directive_substitution`, which returns its replacement text.
    substitute = matched -> directive_substitution(matched, extra_directives)
    return replace(astring, LaTeXDirectiveSearch(), substitute)
end
# strip unescaped $ signs from the given string (an escaped `\$` becomes a literal `$`)
function strip_dollars(s::Union{String,SubString{String}})
buf = IOBuffer()
p = Vector{UInt8}(s)
for i = 1:sizeof(s)
c = p[i]
if c == BACKSLASH && i < sizeof(s) && p[i+1] == DOLLAR
write(buf, DOLLAR) # \$ -> $
elseif c != DOLLAR
write(buf, c)
function strip_dollars(astring)
buffer = IOBuffer()
character_vector = Vector{UInt8}(astring)
for index = 1:sizeof(astring)
character = character_vector[index]
if character == BACKSLASH && index < sizeof(astring) && character_vector[index + 1] == DOLLAR
write(buffer, DOLLAR) # \$ -> $
elseif character != DOLLAR
write(buffer, character)
end
end
return String(take!(buf))
return String(take!(buffer))
end
"""
simplify_latex(s::AbstractString, extra_directives=BibTeX.text_directives)
simplify_latex(astring, extra_directives = BibTeX.text_directives)
Simplify a LaTeX string `s` into "plain text" if possible, stripping/converting
Simplify a LaTeX string `astring` into "plain text" if possible, stripping/converting
known LaTeX directives in favor of e.g. Unicode.
`extra_directives` is a dictionary (`String=>String`) that maps LaTeX directives
to replacements. It defaults to `BibTeX.text_directives`, which simply strips
out things like bold and italics. Alternatively, you can pass `BibTeX.md_directives`,
out things like bold and italics. Alternatively, you can pass `BibTeX.markdown_directives`,
which uses Markdown syntax for such directives.
"""
simplify_latex(s::AbstractString, extra_directives::Associative{String,String}=text_directives) =
strip_dollars(replace_directives(s, extra_directives))
function simplify_latex(astring, extra_directives = text_directives)
    # First substitute the LaTeX directives, then drop the unescaped `$`
    # signs from the result.
    without_directives = replace_directives(astring, extra_directives)
    return strip_dollars(without_directives)
end

View File

@ -18,6 +18,8 @@ parse_text(text) = matchall(r"[^\s\"#{}@,=\\]+|\s+|\"|#|{|}|@|,|=|\\", text) |>
# Human-readable position ("on line N") built from the parser's `line` field.
location(parser) = string("on line ", parser.line)
# A parser counts as empty when its `tokens` collection is empty.
function Base.isempty(p::Parser)
    return isempty(p.tokens)
end
next_token_default!(parser) =
if isempty(parser.tokens)
one(parser)
@ -49,14 +51,15 @@ next_token!(parser, eol = "additional tokens") = begin
end
end
expect(parser, result, expectation) =
if result != expectation
error("Expected $expectation $(location(parser))")
# Raise an error naming the expected token `eol` and the parser's current
# location when `result` differs from it; otherwise do nothing.
function expect(parser, result, eol)
    if result != eol
        error("Expected $eol $(location(parser))")
    end
end
expect!(parser, expectation) = expect(parser, next_token!(parser, expectation), expectation)
# Pull the next token from `parser` (passing `eol` through to `next_token!`)
# and check it against `eol` with `expect`.
function expect!(parser, eol)
    token = next_token!(parser, eol)
    return expect(parser, token, eol)
end
token_and_counter!(parser, eol) = begin
token_and_counter!(parser, eol = "}") = begin
token = next_token_with_space!(parser, eol)
if token == "{"
parser.bracket_counter += 1
@ -80,10 +83,10 @@ value!(parser, values = eltype(parser)[]) = begin
end
elseif token == "{"
parser.bracket_counter += 1
token = token_and_counter!(parser, "}")
token = token_and_counter!(parser)
while parser.bracket_counter > 0
push!(values, token)
token = token_and_counter!(parser, "}")
token = token_and_counter!(parser)
end
else
push!(values, getkey(parser.substitutions, token, String(token) ) )

View File

@ -1,11 +1,13 @@
using BibTeX, Base.Test
base_file = dirname(dirname(@__FILE__))
import Documenter
Documenter.makedocs(
modules = [BibTeX],
format = :html,
sitename = "BibTeX.jl",
root = joinpath(dirname(dirname(@__FILE__)), "docs"),
root = joinpath(base_file, "docs"),
pages = Any["Home" => "index.md"],
strict = true,
linkcheck = true,
@ -14,7 +16,8 @@ Documenter.makedocs(
)
@testset "examples.bib" begin
b = open(Bibliography, joinpath("..", "example", "examples.bib"), "r")
# note: ".." does not work on windows
b = open(Bibliography, joinpath(base_file, "example", "examples.bib"), "r")
@test length(b) == 92
@test (b["angenendt"]::Citation{:article})["date"] == "2002"
end
@ -48,8 +51,8 @@ end
@test isempty(sizehint!(empty!(x2),10))
end
import BibTeX: simplify_latex, md_directives
import BibTeX: simplify_latex, markdown_directives
@testset "latex" begin
@test simplify_latex(raw"foo \$$x_1x_2^\mathrm{3}$ \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", md_directives) ==
@test simplify_latex(raw"foo \$$x_1x_2^\mathrm{3}$ \dot{\alpha} {quote} \% \{unquote\} \emph{bar \textbf{bold}} {\bf baz 2.0} {\^{u}}", markdown_directives) ==
"foo \$x₁x₂³ α̇ quote % {unquote} _bar **bold**_ **baz 2.0** û"
end