/* GNU Guix --- Functional package management for GNU Copyright (C) 2018 Ludovic Courtès <ludo@gnu.org> This file is part of GNU Guix. GNU Guix is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. GNU Guix is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GNU Guix. If not, see <http://www.gnu.org/licenses/>. */ /* Make the given @WRAPPED_PROGRAM@ relocatable by executing it in a separate mount namespace where the store is mounted in its right place. We would happily do that in Scheme using 'call-with-container'. However, this very program needs to be relocatable, so it needs to be statically linked, which complicates things (Guile's modules can hardly be "linked" into a single executable.) */ #define _GNU_SOURCE #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <sched.h> #include <sys/mount.h> #include <errno.h> #include <libgen.h> #include <limits.h> #include <string.h> #include <assert.h> #include <sys/stat.h> #include <sys/types.h> #include <sys/wait.h> #include <fcntl.h> #include <dirent.h> #include <sys/syscall.h> /* Concatenate DIRECTORY, a slash, and FILE. Return the result, which the caller must eventually free. */ static char * concat (const char *directory, const char *file) { char *result = malloc (strlen (directory) + 2 + strlen (file)); assert (result != NULL); strcpy (result, directory); strcat (result, "/"); strcat (result, file); return result; } static void mkdir_p (const char *directory) { if (strcmp (directory, "/") != 0) { char *parent = dirname (strdupa (directory)); mkdir_p (parent); int err = mkdir (directory, 0700); if (err < 0 && errno != EEXIST) assert_perror (errno); } } static void rm_rf (const char *directory) { DIR *stream = opendir (directory); for (struct dirent *entry = readdir (stream); entry != NULL; entry = readdir (stream)) { if (strcmp (entry->d_name, ".") == 0 || strcmp (entry->d_name, "..") == 0) continue; char *full = concat (directory, entry->d_name); int err = unlink (full); if (err < 0) { if (errno == EISDIR) /* Recurse (we expect a shallow directory structure so there's little risk of stack overflow.) */ rm_rf (full); else assert_perror (errno); } free (full); } closedir (stream); int err = rmdir (directory); if (err < 0 && errno != ENOENT) assert_perror (errno); } /* Bind mount all the top-level entries in SOURCE to TARGET. */ static void bind_mount (const char *source, const char *target) { DIR *stream = opendir (source); for (struct dirent *entry = readdir (stream); entry != NULL; entry = readdir (stream)) { /* XXX: Some file systems may not report a useful 'd_type'. Ignore them for now. */ assert (entry->d_type != DT_UNKNOWN); if (strcmp (entry->d_name, ".") == 0 || strcmp (entry->d_name, "..") == 0) continue; char *abs_source = concat (source, entry->d_name); char *new_entry = concat (target, entry->d_name); if (entry->d_type == DT_LNK) { char target[PATH_MAX]; ssize_t result = readlink (abs_source, target, sizeof target - 1); if (result > 0) { target[result] = '\0'; int err = symlink (target, new_entry); if (err < 0) assert_perror (errno); } } else { /* Create the mount point. */ if (entry->d_type == DT_DIR) { int err = mkdir (new_entry, 0700); if (err != 0) assert_perror (errno); } else close (open (new_entry, O_WRONLY | O_CREAT)); int err = mount (abs_source, new_entry, "none", MS_BIND | MS_REC | MS_RDONLY, NULL); /* It used to be that only directories could be bind-mounted. Thus, keep going if we fail to bind-mount a non-directory entry. That's OK because regular files in the root file system are usually uninteresting. */ if (err != 0 && entry->d_type != DT_DIR) assert_perror (errno); free (new_entry); free (abs_source); } } closedir (stream); } /* Write the user/group ID map for PID to FILE, mapping ID to itself. See user_namespaces(7). */ static void write_id_map (pid_t pid, const char *file, int id) { char id_map_file[100]; snprintf (id_map_file, sizeof id_map_file, "/proc/%d/%s", pid, file); char id_map[100]; /* Map root and the current user. */ int len = snprintf (id_map, sizeof id_map, "%d %d 1\n", id, id); int fd = open (id_map_file, O_WRONLY); if (fd < 0) assert_perror (errno); int n = write (fd, id_map, len); if (n < 0) assert_perror (errno); close (fd); } /* Disallow setgroups(2) for PID. */ static void disallow_setgroups (pid_t pid) { char file[100]; snprintf (file, sizeof file, "/proc/%d/setgroups", pid); int fd = open (file, O_WRONLY); if (fd < 0) assert_perror (errno); int err = write (fd, "deny", 5); if (err < 0) assert_perror (errno); close (fd); } int main (int argc, char *argv[]) { ssize_t size; char self[PATH_MAX]; size = readlink ("/proc/self/exe", self, sizeof self - 1); assert (size > 0); /* SELF is something like "/home/ludo/.local/gnu/store/…-foo/bin/ls" and we want to extract "/home/ludo/.local/gnu/store". */ size_t index = strlen (self) - strlen ("@WRAPPED_PROGRAM@") + strlen ("@STORE_DIRECTORY@"); char *store = strdup (self); store[index] = '\0'; struct stat statbuf; /* If STORE is already at the "right" place, we can execute @WRAPPED_PROGRAM@ right away. This is not just an optimization: it's needed when running one of these wrappers from within an unshare'd namespace, because 'unshare' fails with EPERM in that context. */ if (strcmp (store, "@STORE_DIRECTORY@") != 0 && lstat ("@WRAPPED_PROGRAM@", &statbuf) != 0) { /* Spawn @WRAPPED_PROGRAM@ in a separate namespace where STORE is bind-mounted in the right place. */ int err; char *new_root = mkdtemp (strdup ("/tmp/guix-exec-XXXXXX")); char *new_store = concat (new_root, "@STORE_DIRECTORY@"); char *cwd = get_current_dir_name (); /* Create a child with separate namespaces and set up bind-mounts from there. That way, bind-mounts automatically disappear when the child exits, which simplifies cleanup for the parent. Note: clone is more convenient than fork + unshare since the parent can directly write the child uid_map/gid_map files. */ pid_t child = syscall (SYS_clone, SIGCHLD | CLONE_NEWNS | CLONE_NEWUSER, NULL, NULL, NULL); switch (child) { case 0: /* Note: Due to <https://bugzilla.kernel.org/show_bug.cgi?id=183461> we cannot make NEW_ROOT a tmpfs (which would have saved the need for 'rm_rf'.) */ bind_mount ("/", new_root); mkdir_p (new_store); err = mount (store, new_store, "none", MS_BIND | MS_REC | MS_RDONLY, NULL); if (err < 0) assert_perror (errno); chdir (new_root); err = chroot (new_root); if (err < 0) assert_perror (errno); /* Change back to where we were before chroot'ing. */ chdir (cwd); break; case -1: fprintf (stderr, "%s: error: 'clone' failed: %m\n", argv[0]); fprintf (stderr, "\ This may be because \"user namespaces\" are not supported on this system.\n\ Consequently, we cannot run '@WRAPPED_PROGRAM@',\n\ unless you move it to the '@STORE_DIRECTORY@' directory.\n\ \n\ Please refer to the 'guix pack' documentation for more information.\n"); return EXIT_FAILURE; default: { /* Map the current user/group ID in the child's namespace (the default is to get the "overflow UID", i.e., the UID of "nobody"). We must first disallow 'setgroups' for that process. */ disallow_setgroups (child); write_id_map (child, "uid_map", getuid ()); write_id_map (child, "gid_map", getgid ()); int status; waitpid (child, &status, 0); chdir ("/"); /* avoid EBUSY */ rm_rf (new_root); free (new_root); exit (status); } } } /* The executable is available under @STORE_DIRECTORY@, so we can now execute it. */ int err = execv ("@WRAPPED_PROGRAM@", argv); if (err < 0) assert_perror (errno); return EXIT_FAILURE; }