refactor readers.jl:

- avoid unnecessary overloading on `parse_doc`
- split readers.jl into multiple scripts
pull/318/head
Shuhei Kadowaki 2020-05-09 19:25:47 +09:00
parent 8b910c0d70
commit 750b7624e0
11 changed files with 347 additions and 348 deletions

View File

@ -300,7 +300,7 @@ include("chunks.jl")
include("config.jl")
include("WeaveMarkdown/markdown.jl")
include("display_methods.jl")
include("readers.jl")
include("reader/reader.jl")
include("run.jl")
include("cache.jl")
include("formatters.jl")

View File

@ -1,7 +1,6 @@
import Mustache
abstract type WeaveChunk end
abstract type Inline end
mutable struct WeaveDoc
source::AbstractString
@ -75,19 +74,12 @@ mutable struct CodeChunk <: WeaveChunk
end
end
abstract type Inline end
mutable struct DocChunk <: WeaveChunk
content::Array{Inline}
content::Vector{Inline}
number::Int
start_line::Int
function DocChunk(
text::AbstractString,
number::Int,
start_line::Int,
inline_regex = nothing,
)
chunks = parse_inline(text, inline_regex)
new(chunks, number, start_line)
end
end
mutable struct InlineText <: Inline

91
src/reader/markdown.jl Normal file
View File

@ -0,0 +1,91 @@
"""
parse_markdown(document_body, is_pandoc = false)::Vector{WeaveChunk}
parse_markdown(document_body, code_start, code_end)::Vector{WeaveChunk}
Parses Weave markdown and returns [`WeaveChunk`](@ref)s.
"""
function parse_markdown end

function parse_markdown(document_body, is_pandoc = false)::Vector{WeaveChunk}
    # Only the chunk delimiters differ between the two flavours; the actual parsing is
    # done by the three-argument method.
    if is_pandoc
        # `<<options>>=` opens a code chunk, a lone `@` closes it
        code_start = r"^<<(?<options>.*?)>>=\s*$"
        code_end = r"^@\s*$"
    else
        # a three-character backtick/tilde fence tagged `julia` opens a code chunk,
        # a bare three-character fence closes it
        code_start = r"^[`~]{3}(?:\{?)julia(?:;?)\s*(?<options>.*?)(\}|\s*)$"
        code_end = r"^[`~]{3}\s*$"
    end
    return parse_markdown(document_body, code_start, code_end)
end
function parse_markdown(document_body, code_start, code_end)::Vector{WeaveChunk}
    # Line-oriented scanner: a line matching `code_start` (outside code) closes the
    # pending doc text and opens a code chunk; a line matching `code_end` (inside code)
    # emits the code chunk. Every other line is appended to the pending content.
    chunks = WeaveChunk[]
    in_code = false
    docno = 1
    codeno = 1
    start_line = 0
    content = ""
    optionString = ""
    options = Dict{Symbol,Any}()

    for (lineno, line) in enumerate(split(document_body, '\n'))
        m = match(code_start, line)
        if m !== nothing && !in_code
            in_code = true

            # the first capture group holds the raw chunk options
            raw = m.captures[1]
            optionString = raw === nothing ? "" : strip(raw)

            options = Dict{Symbol,Any}()
            if !isempty(optionString)
                expr = Meta.parse(optionString)
                if Base.Meta.isexpr(expr, :(=))
                    # single `key = value`
                    options[expr.args[1]] = expr.args[2]
                elseif Base.Meta.isexpr(expr, :toplevel)
                    # several `;`-separated assignments
                    foreach(arg -> pushopt(options, arg), expr.args)
                end
            end
            # `label` is an alias for `name`; `name` is always present
            haskey(options, :label) && (options[:name] = options[:label])
            haskey(options, :name) || (options[:name] = nothing)

            # flush the doc text collected so far, unless it is only whitespace
            if !isempty(strip(content))
                push!(chunks, DocChunk(content, docno, start_line))
                docno += 1
            end
            content = ""
            start_line = lineno
            continue
        end

        if in_code && occursin(code_end, line)
            push!(chunks, CodeChunk(content, codeno, start_line, optionString, options))
            codeno += 1
            start_line = lineno
            content = ""
            in_code = false
            continue
        end

        # only the very first line of the document is added without a leading newline
        content *= lineno == 1 ? line : "\n" * line
    end

    # Remember the last chunk
    # NOTE(review): this also runs when the document ends inside an unterminated code
    # chunk, whose text is then emitted as a DocChunk — confirm that is intended.
    if !isempty(strip(content))
        push!(chunks, DocChunk(content, docno, start_line))
    end

    return chunks
end

29
src/reader/notebook.jl Normal file
View File

@ -0,0 +1,29 @@
"""
parse_notebook(document_body)::Vector{WeaveChunk}
Parses Jupyter notebook and returns [`WeaveChunk`](@ref)s.
"""
function parse_notebook(document_body)::Vector{WeaveChunk}
    # `document_body` is the JSON text of the notebook; each entry of its "cells" array
    # becomes one chunk, numbered separately for code and for doc cells.
    nb = JSON.parse(document_body)
    chunks = WeaveChunk[]
    opt_string = ""
    docno = 1
    codeno = 1

    for cell in nb["cells"]
        srctext = "\n" * join(cell["source"], "")

        if cell["cell_type"] == "code"
            # Every code chunk gets its own options Dict. Sharing a single mutable Dict
            # between all chunks would let a later change to one chunk's options leak
            # into every other chunk.
            options = Dict{Symbol,Any}()
            push!(chunks, CodeChunk(rstrip(srctext), codeno, 0, opt_string, options))
            codeno += 1
        else
            # `notebook = true`: doc cells are kept as plain text, inline code is not parsed
            push!(chunks, DocChunk(srctext * "\n", docno, 0; notebook = true))
            docno += 1
        end
    end

    return chunks
end

115
src/reader/reader.jl Normal file
View File

@ -0,0 +1,115 @@
using JSON, YAML
"""
read_doc(source::AbstractString, format = :auto) -> WeaveDoc
Read the input document from `source` and parse it into [`WeaveDoc`](@ref).
"""
function read_doc(source::AbstractString, format = :auto)
    document = replace(read(source, String), "\r\n" => "\n") # fix line ending
    format === :auto && (format = detect_informat(source))
    chunks = parse_doc(document, format)

    # A parser may return no chunks at all (e.g. an empty document); `first` would throw
    # on that, so fall back to an empty header instead.
    header = isempty(chunks) ? Dict() : parse_header(first(chunks))

    doc = WeaveDoc(source, chunks, header)
    haskey(header, "options") && header_chunk_defaults!(doc) # TODO: rename `options` => `weave_options`
    return doc
end
"""
detect_informat(source::AbstractString)
Detect Weave input format based on file extension of `source`.
"""
function detect_informat(source::AbstractString)
    # compare the extension case-insensitively; anything unknown is treated as noweb
    _, ext = splitext(source)
    ext = lowercase(ext)
    if ext == ".jl"
        return "script"
    elseif ext == ".jmd"
        return "markdown"
    elseif ext == ".ipynb"
        return "notebook"
    else
        return "noweb"
    end
end
# Dispatch `document` to the parser registered for the format name `format`
# ("markdown", "noweb", "script" or "notebook"); any other value raises an error.
function parse_doc(document, format)
    format == "markdown" && return parse_markdown(document)
    format == "noweb" && return parse_markdown(document, true)
    format == "script" && return parse_script(document)
    format == "notebook" && return parse_notebook(document)
    error("unsupported format given: $(format)")
end
# Store `key = value` from an assignment expression in `options`; any other kind of
# expression is ignored. Returns the stored value, or `nothing` when ignored.
function pushopt(options::Dict, expr::Expr)
    Base.Meta.isexpr(expr, :(=)) || return nothing
    key, value = expr.args[1], expr.args[2]
    return options[key] = value
end
# inline
# ------
# Build a DocChunk from raw text. With `notebook = true` the text is wrapped as a single
# plain-text element; otherwise it is split into text and inline-code elements.
function DocChunk(text::AbstractString, number, start_line; notebook = false)
    # don't parse inline code in notebook
    parser = notebook ? parse_inline : parse_inlines
    return DocChunk(parser(text), number, start_line)
end
const INLINE_REGEX = r"`j\s+(.*?)`|^!\s(.*)$"m
# Split `text` into alternating `InlineText` / `InlineCode` elements, one `InlineCode`
# per match of `INLINE_REGEX`. Text without any match is returned as a single element.
#
# All positions are byte indices into `text`. They are computed with `prevind` and
# `ncodeunits` rather than plain `± 1` / `lastindex` arithmetic so that slicing stays on
# character boundaries when non-ASCII characters sit next to, or at the end of, a match
# (the plain arithmetic raised `StringIndexError` there). For ASCII text the resulting
# values are unchanged.
function parse_inlines(text)::Vector{Inline}
    occursin(INLINE_REGEX, text) || return parse_inline(text)

    res = Inline[]
    textno = 1
    codeno = 1
    e = 1 # index of the first byte not yet consumed

    for ic in eachmatch(INLINE_REGEX, text)
        s = ic.offset # start of the match

        # text between the previous match (or the start) and this match
        push!(res, InlineText(text[e:prevind(text, s)], e, s - 1, textno))
        textno += 1

        # first byte after the match
        e = s + ncodeunits(ic.match)

        # the first capture group belongs to the `j ...` form, the second to the
        # `! ...` line form; exactly one of them takes part in a match
        ctype = ic.captures[2] !== nothing ? :line : :inline
        cap = something(ic.captures...)
        push!(res, InlineCode(cap, s, e, codeno, ctype))
        codeno += 1
    end

    # trailing text after the last match
    # NOTE(review): the end position is `length(text)` (a character count) while the
    # other positions are byte indices — kept as is, confirm whether callers rely on it.
    push!(res, InlineText(text[e:end], e, length(text), textno))
    return res
end
# Wrap `text` unchanged in a single plain-text element (no inline code is detected).
function parse_inline(text)
    return Inline[InlineText(text, 1, length(text), 1)]
end
# headers
# -------
# A document that starts with a code chunk has no header.
function parse_header(chunk::CodeChunk)
    return Dict()
end

# Text enclosed between two `---` lines; the enclosed text is captured as `header`.
const HEADER_REGEX = r"^---$(?<header>((?!---).)+)^---$"ms

# Look for a `---` delimited block in the first element of the doc chunk and load it as
# YAML; without such a block an empty Dict is returned.
function parse_header(chunk::DocChunk)
    m = match(HEADER_REGEX, chunk.content[1].content)
    m === nothing && return Dict()
    return YAML.load(string(m[:header]))
end
include("markdown.jl")
include("script.jl")
include("notebook.jl")

102
src/reader/script.jl Normal file
View File

@ -0,0 +1,102 @@
"""
parse_script(document_body)::Vector{WeaveChunk}
Parse Julia script and returns [`WeaveChunk`](@ref)s.
"""
function parse_script(document_body)::Vector{WeaveChunk}
    # Line classification:
    #   doc line    – starts with `#'`, `#%%` or `# %%`
    #   option line – starts with `#+`, `#%%+` or `# %%+` (takes precedence over doc)
    #   anything else is code
    # `*_line` patterns classify a whole line, `*_start` patterns match only the marker
    # that is removed from it.
    doc_line = r"(^#'.*)|(^#%%.*)|(^# %%.*)"
    doc_start = r"(^#')|(^#%%)|(^# %%)"
    opt_line = r"(^#\+.*$)|(^#%%\+.*$)|(^# %%\+.*$)"
    opt_start = r"(^#\+)|(^#%%\+)|(^# %%\+)"

    chunks = WeaveChunk[]
    buffer = "" # text collected for the chunk currently being read
    docno = 1
    codeno = 1
    start_line = 1
    options = Dict{Symbol,Any}()
    optionString = ""
    state = "code"

    for (lineno, line) in enumerate(split(document_body, "\n"))
        is_opt = occursin(opt_line, line)

        if !is_opt && occursin(doc_line, line)
            # doc line: drop the marker and at most one following space
            line = replace(line, doc_start => "", count = 1)
            if startswith(line, " ")
                line = replace(line, " " => "", count = 1)
            end
            # switching from code to doc: emit the pending code, if any
            if state == "code" && !isempty(strip(buffer))
                chunk = CodeChunk(
                    "\n" * strip(buffer), codeno, start_line, optionString, options,
                )
                push!(chunks, chunk)
                codeno += 1
                buffer = ""
                start_line = lineno
            end
            state = "doc"
        elseif is_opt
            # option line: emit whatever is pending, then start a new code chunk.
            # The pending chunk is flushed with the line where *it* started;
            # `start_line` is moved to the option line only afterwards.
            if state == "code" && !isempty(strip(buffer))
                chunk = CodeChunk(
                    "\n" * strip(buffer), codeno, start_line, optionString, options,
                )
                push!(chunks, chunk)
                buffer = ""
                codeno += 1
            end
            if state == "doc" && !isempty(strip(buffer))
                (docno > 1) && (buffer = "\n" * buffer) # Add whitespace to doc chunk. Needed for markdown output
                push!(chunks, DocChunk(buffer, docno, start_line))
                buffer = ""
                docno += 1
            end
            start_line = lineno

            optionString = replace(line, opt_start => "", count = 1)
            # Get options
            options = Dict{Symbol,Any}()
            if length(optionString) > 0
                expr = Meta.parse(optionString)
                Base.Meta.isexpr(expr, :(=)) && (options[expr.args[1]] = expr.args[2])
                Base.Meta.isexpr(expr, :toplevel) &&
                    map(pushopt, fill(options, length(expr.args)), expr.args)
            end
            # `label` is an alias for `name`; `name` is always present
            haskey(options, :label) && (options[:name] = options[:label])
            haskey(options, :name) || (options[:name] = nothing)
            state = "code"
            continue # the option line itself is not part of any chunk
        elseif state == "doc"
            # first code line after doc lines: emit the doc chunk
            state = "code"
            (docno > 1) && (buffer = "\n" * buffer) # Add whitespace to doc chunk. Needed for markdown output
            push!(chunks, DocChunk(buffer, docno, start_line))
            # NOTE(review): `options` is reset here but `optionString` is not, so the next
            # code chunk keeps the previous option text — confirm this is intended.
            options = Dict{Symbol,Any}()
            start_line = lineno
            buffer = ""
            docno += 1
        end

        buffer *= line * "\n"
    end

    # Handle the last chunk
    if state == "code"
        chunk = CodeChunk("\n" * strip(buffer), codeno, start_line, optionString, options)
        push!(chunks, chunk)
    else
        push!(chunks, DocChunk(buffer, docno, start_line))
    end

    return chunks
end

View File

@ -1,318 +0,0 @@
import JSON, YAML
# Store `key = value` from an assignment expression in `options`. Returns the stored
# value, or `false` when `expr` is not an assignment.
function pushopt(options::Dict, expr::Expr)
    if Base.Meta.isexpr(expr, :(=))
        return options[expr.args[1]] = expr.args[2]
    end
    return false
end
# Line patterns describing a markup-style input format.
# `codestart` and `codeend` are matched against each line to open and close a code
# chunk; `inline` is passed to `DocChunk` when the doc chunks are built.
mutable struct MarkupInput
codestart::Regex
codeend::Regex
inline::Regex
end
# Line patterns describing the script input format.
# `doc_line` and `opt_line` are matched against a whole line to classify it;
# `doc_start` and `opt_start` match only the leading marker, which is removed from the
# line; `inline` is passed to `DocChunk` when the doc chunks are built.
mutable struct ScriptInput
doc_line::Regex
doc_start::Regex
opt_line::Regex
opt_start::Regex
inline::Regex
end
# Marker type selecting the notebook parser.
# `inline` is untyped; the only instance visible here is constructed with `nothing`.
mutable struct NotebookInput
inline::Any
end
# Registry of the supported input formats, keyed by format name. The keys are the same
# strings that `detect_informat` returns; the values carry the patterns used by the
# corresponding `parse_doc` method. The markup and script entries share the same inline
# pattern (`j ...` in backticks, or a line starting with `! `).
const input_formats = Dict{AbstractString,Any}(
"noweb" => MarkupInput(r"^<<(.*?)>>=\s*$", r"^@\s*$", r"`j\s+(.*?)`|^!\s(.*)$"m),
"markdown" => MarkupInput(
r"^[`~]{3,}(?:\{|\{\.|)julia(?:;|)\s*(.*?)(\}|\s*)$",
r"^[`~]{3,}\s*$",
r"`j\s+(.*?)`|^!\s(.*)$"m,
),
"script" => ScriptInput(
r"(^#'.*)|(^#%%.*)|(^# %%.*)",
r"(^#')|(^#%%)|(^# %%)",
r"(^#\+.*$)|(^#%%\+.*$)|(^# %%\+.*$)",
r"(^#\+)|(^#%%\+)|(^# %%\+)",
r"`j\s+(.*?)`|^!\s(.*)$"m,
),
"notebook" => NotebookInput(nothing), # Don't parse inline code from notebooks
)
"""Detect the input format based on file extension"""
function detect_informat(source::AbstractString)
    # compare the extension case-insensitively; anything unknown is treated as noweb
    _, ext = splitext(source)
    ext = lowercase(ext)
    if ext == ".jl"
        return "script"
    elseif ext == ".jmd"
        return "markdown"
    elseif ext == ".ipynb"
        return "notebook"
    else
        return "noweb"
    end
end
"""Read and parse input document"""
function read_doc(source::AbstractString, format = :auto)
    format === :auto && (format = detect_informat(source))
    # normalize Windows line endings before parsing
    document = replace(read(source, String), "\r\n" => "\n")
    parsed = parse_doc(document, format)

    # A parser may return no chunks at all (e.g. an empty document); indexing would
    # throw on that, so fall back to an empty header instead.
    header = isempty(parsed) ? Dict() : parse_header(parsed[1])

    doc = WeaveDoc(source, parsed, header)
    haskey(header, "options") && header_chunk_defaults!(doc)
    return doc
end
# A document that starts with a code chunk has no header.
parse_header(chunk::CodeChunk) = Dict()

# Text enclosed between two `---` lines; the enclosed text is captured as `header`.
const HEADER_REGEX = r"^---$(?<header>((?!---).)+)^---$"ms

# Look for a `---` delimited block in the first element of the doc chunk and load it as
# YAML; without such a block an empty Dict is returned.
function parse_header(chunk::DocChunk)
    m = match(HEADER_REGEX, chunk.content[1].content)
    m === nothing && return Dict()
    return YAML.load(string(m[:header]))
end
# Look up the format description registered under the name `format` and forward to the
# `parse_doc` method for that description. An unknown name fails in the Dict lookup.
function parse_doc(document::AbstractString, format::AbstractString = "noweb")
    return parse_doc(document, input_formats[format])
end
"""Parse documents with Weave.jl markup"""
function parse_doc(document::AbstractString, format::MarkupInput)
    # Line-oriented scanner: a line matching `format.codestart` (outside code) closes
    # the pending doc text and opens a code chunk; a line matching `format.codeend`
    # (inside code) emits the code chunk. Every other line is appended to the pending
    # content.
    lines = split(replace(document, "\r\n" => "\n"), "\n")
    parsed = Any[]
    in_code = false
    docno = 1
    codeno = 1
    start_line = 0
    content = ""
    optionString = ""
    options = Dict()

    for (lineno, line) in enumerate(lines)
        m = match(format.codestart, line)
        if m !== nothing && !in_code
            in_code = true

            # the first capture group holds the raw chunk options
            raw = m.captures[1]
            optionString = raw === nothing ? "" : strip(raw)

            options = Dict{Symbol,Any}()
            if !isempty(optionString)
                expr = Meta.parse(optionString)
                if Base.Meta.isexpr(expr, :(=))
                    # single `key = value`
                    options[expr.args[1]] = expr.args[2]
                elseif Base.Meta.isexpr(expr, :toplevel)
                    # several `;`-separated assignments
                    foreach(arg -> pushopt(options, arg), expr.args)
                end
            end
            # `label` is an alias for `name`; `name` is always present
            haskey(options, :label) && (options[:name] = options[:label])
            haskey(options, :name) || (options[:name] = nothing)

            # flush the doc text collected so far, unless it is only whitespace
            if !isempty(strip(content))
                push!(parsed, DocChunk(content, docno, start_line, format.inline))
                docno += 1
            end
            content = ""
            start_line = lineno
            continue
        end

        if in_code && occursin(format.codeend, line)
            push!(parsed, CodeChunk(content, codeno, start_line, optionString, options))
            codeno += 1
            start_line = lineno
            content = ""
            in_code = false
            continue
        end

        # only the very first line of the document is added without a leading newline
        content *= lineno == 1 ? line : "\n" * line
    end

    # Remember the last chunk
    if !isempty(strip(content))
        push!(parsed, DocChunk(content, docno, start_line, format.inline))
    end

    return parsed
end
"""Parse .jl scripts with Weave.jl markup"""
function parse_doc(document::AbstractString, format::ScriptInput)
    # Each line is classified with the patterns in `format`:
    #   doc line    – matches `doc_line` but not `opt_line`
    #   option line – matches `opt_line`
    #   anything else is code
    # `doc_start` / `opt_start` match only the marker that is removed from the line.
    document = replace(document, "\r\n" => "\n")
    doc_line = format.doc_line
    doc_start = format.doc_start
    opt_line = format.opt_line
    opt_start = format.opt_start

    parsed = Any[]
    buffer = "" # text collected for the chunk currently being read
    docno = 1
    codeno = 1
    start_line = 1
    options = Dict{Symbol,Any}()
    optionString = ""
    state = "code"

    for (lineno, line) in enumerate(split(document, "\n"))
        is_opt = occursin(opt_line, line)

        if !is_opt && occursin(doc_line, line)
            # doc line: drop the marker and at most one following space
            line = replace(line, doc_start => "", count = 1)
            if startswith(line, " ")
                line = replace(line, " " => "", count = 1)
            end
            # switching from code to doc: emit the pending code, if any
            if state == "code" && !isempty(strip(buffer))
                chunk = CodeChunk(
                    "\n" * strip(buffer), codeno, start_line, optionString, options,
                )
                push!(parsed, chunk)
                codeno += 1
                buffer = ""
                start_line = lineno
            end
            state = "doc"
        elseif is_opt
            # option line: emit whatever is pending, then start a new code chunk.
            # The pending chunk is flushed with the line where *it* started;
            # `start_line` is moved to the option line only afterwards.
            if state == "code" && !isempty(strip(buffer))
                chunk = CodeChunk(
                    "\n" * strip(buffer), codeno, start_line, optionString, options,
                )
                push!(parsed, chunk)
                buffer = ""
                codeno += 1
            end
            if state == "doc" && !isempty(strip(buffer))
                (docno > 1) && (buffer = "\n" * buffer) # Add whitespace to doc chunk. Needed for markdown output
                # pass the inline pattern here as well, like the other doc chunks of
                # this parser, so inline code in this chunk is not left unparsed
                push!(parsed, DocChunk(buffer, docno, start_line, format.inline))
                buffer = ""
                docno += 1
            end
            start_line = lineno

            optionString = replace(line, opt_start => "", count = 1)
            # Get options
            options = Dict{Symbol,Any}()
            if length(optionString) > 0
                expr = Meta.parse(optionString)
                Base.Meta.isexpr(expr, :(=)) && (options[expr.args[1]] = expr.args[2])
                Base.Meta.isexpr(expr, :toplevel) &&
                    map(pushopt, fill(options, length(expr.args)), expr.args)
            end
            # `label` is an alias for `name`; `name` is always present
            haskey(options, :label) && (options[:name] = options[:label])
            haskey(options, :name) || (options[:name] = nothing)
            state = "code"
            continue # the option line itself is not part of any chunk
        elseif state == "doc"
            # first code line after doc lines: emit the doc chunk
            state = "code"
            (docno > 1) && (buffer = "\n" * buffer) # Add whitespace to doc chunk. Needed for markdown output
            push!(parsed, DocChunk(buffer, docno, start_line, format.inline))
            options = Dict{Symbol,Any}()
            start_line = lineno
            buffer = ""
            docno += 1
        end

        buffer *= line * "\n"
    end

    # Handle the last chunk
    if state == "code"
        chunk = CodeChunk("\n" * strip(buffer), codeno, start_line, optionString, options)
        push!(parsed, chunk)
    else
        push!(parsed, DocChunk(buffer, docno, start_line, format.inline))
    end

    return parsed
end
"""Parse IJUlia notebook"""
# `document` is accepted as any `AbstractString`, matching the other `parse_doc` methods
# and the name-based dispatcher that forwards to this one.
function parse_doc(document::AbstractString, format::NotebookInput)
    document = replace(document, "\r\n" => "\n")
    nb = JSON.parse(document)
    parsed = Any[]
    opt_string = ""
    docno = 1
    codeno = 1

    # each entry of the "cells" array becomes one chunk, numbered separately for code
    # and for doc cells
    for cell in nb["cells"]
        srctext = "\n" * join(cell["source"], "")

        if cell["cell_type"] == "code"
            # Every code chunk gets its own options Dict. Sharing a single mutable Dict
            # between all chunks would let a later change to one chunk's options leak
            # into every other chunk.
            options = Dict{Symbol,Any}()
            push!(parsed, CodeChunk(rstrip(srctext), codeno, 0, opt_string, options))
            codeno += 1
        else
            # no inline pattern is passed: doc cells are kept as plain text
            push!(parsed, DocChunk(srctext * "\n", docno, 0))
            docno += 1
        end
    end

    return parsed
end
# Fallback used when the second argument is not a Regex: the text is wrapped unchanged
# in a single plain-text element and `noex` is ignored.
parse_inline(text, noex) = Inline[InlineText(text, 1, length(text), 1)]
# Split `text` into alternating `InlineText` / `InlineCode` elements, one `InlineCode`
# per match of `inline_ex`. Text without any match is returned as a single element.
#
# All positions are byte indices into `text`. They are computed with `prevind` and
# `ncodeunits` rather than plain `± 1` / `lastindex` arithmetic so that slicing stays on
# character boundaries when non-ASCII characters sit next to, or at the end of, a match
# (the plain arithmetic raised `StringIndexError` there). For ASCII text the resulting
# values are unchanged.
function parse_inline(text::AbstractString, inline_ex::Regex)
    occursin(inline_ex, text) || return Inline[InlineText(text, 1, length(text), 1)]

    res = Inline[]
    textno = 1
    codeno = 1
    e = 1 # index of the first byte not yet consumed

    for ic in eachmatch(inline_ex, text)
        s = ic.offset # start of the match

        # text between the previous match (or the start) and this match
        push!(res, InlineText(text[e:prevind(text, s)], e, s - 1, textno))
        textno += 1

        # first byte after the match
        e = s + ncodeunits(ic.match)

        # as in the original: the first capture group marks inline code, the second
        # marks a whole code line
        ic.captures[1] !== nothing && (ctype = :inline)
        ic.captures[2] !== nothing && (ctype = :line)
        cap = filter(x -> x !== nothing, ic.captures)[1]
        push!(res, InlineCode(cap, s, e, codeno, ctype))
        codeno += 1
    end

    # trailing text after the last match
    push!(res, InlineText(text[e:end], e, length(text), textno))
    return res
end

View File

@ -1,7 +1,3 @@
using Weave
using Weave: run_doc
using Test
s1= """
```julia
@ -21,12 +17,10 @@ print(y
"""
p1 = Weave.parse_doc(s1, "markdown")
p1 = Weave.parse_markdown(s1)
doc = Weave.WeaveDoc("dummy1.jmd", p1, Dict())
doc1 = run_doc(doc, doctype = "pandoc")
doc1.chunks[1].output
@test doc1.chunks[1].output == "Error: ArgumentError: Package NonExisting not found in current path:\n- Run `import Pkg; Pkg.add(\"NonExisting\")` to install the NonExisting package.\n\n"
@test doc1.chunks[2].output == "Error: syntax: incomplete: premature end of input\n"
@test doc1.chunks[3].output == "\njulia> plot(x)\nError: UndefVarError: plot not defined\n\njulia> y = 10\n10\n\njulia> print(y\nError: syntax: incomplete: premature end of input\n"

View File

@ -1,7 +1,3 @@
using Weave
using Weave: run_doc
using Test
# Test rendering of doc chunks
content = """
# Test chunk
@ -132,7 +128,7 @@ f = Weave.format_chunk(dchunk, pformat.formatdict, pformat)
@test f == "\\section{Test chunk}\nα\n\n"
function doc_from_string(str)
parsed = Weave.parse_doc(str,"markdown")
parsed = Weave.parse_markdown(str)
header = Weave.parse_header(parsed[1])
Weave.WeaveDoc("",parsed,header)
end

View File

@ -1,4 +1,3 @@
using Weave, Test
using Mustache
# Test parsing
@ -13,19 +12,18 @@ Some markdown with inline stuff and `j code`
"""
pat = Weave.input_formats["markdown"].inline
ms = collect(eachmatch(pat, doc))
ms = collect(eachmatch(Weave.INLINE_REGEX, doc))
@test ms[1][2] == "println(\"Something\")"
@test ms[2][1] == "code"
@test ms[3][1] == "show(\"is\")"
chunk = Weave.parse_doc(doc, Weave.input_formats["markdown"])[1]
chunk = Weave.parse_markdown(doc)[1]
@test length(chunk.content) == 7
@test chunk.content[2].content == ms[1][2]
@test chunk.content[4].content == ms[2][1]
@test chunk.content[6].content == ms[3][1]
chunknw = Weave.parse_doc(doc, Weave.input_formats["noweb"])[1]
# second argument `true` selects the noweb delimiters, so this really compares the noweb
# parse against the markdown parse above instead of parsing as markdown twice
chunknw = Weave.parse_markdown(doc, true)[1]
@test all([chunknw.content[i].content == chunk.content[i].content for i in 1:7])
# Test with document

View File

@ -1,6 +1,6 @@
@testset "evaluation module" begin
function mock_output(document, mod = nothing)
parsed = Weave.parse_doc(document, "markdown")
parsed = Weave.parse_markdown(document)
doc = Weave.WeaveDoc("dummy.jmd", parsed, Dict())
result_doc = run_doc(doc, mod = mod)
@test isdefined(result_doc.chunks[1], :output)