Home | History | Annotate | Download | only in minijail
      1 /* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
      2  * Use of this source code is governed by a BSD-style license that can be
      3  * found in the LICENSE file.
      4  */
      5 
      6 #define _BSD_SOURCE
      7 #define _DEFAULT_SOURCE
      8 #define _GNU_SOURCE
      9 
     10 #include <asm/unistd.h>
     11 #include <dirent.h>
     12 #include <errno.h>
     13 #include <fcntl.h>
     14 #include <grp.h>
     15 #include <linux/capability.h>
     16 #include <sched.h>
     17 #include <signal.h>
     18 #include <stdbool.h>
     19 #include <stddef.h>
     20 #include <stdio.h>
     21 #include <stdlib.h>
     22 #include <string.h>
     23 #include <sys/capability.h>
     24 #include <sys/mount.h>
     25 #include <sys/param.h>
     26 #include <sys/prctl.h>
     27 #include <sys/resource.h>
     28 #include <sys/stat.h>
     29 #include <sys/sysmacros.h>
     30 #include <sys/types.h>
     31 #include <sys/user.h>
     32 #include <sys/wait.h>
     33 #include <syscall.h>
     34 #include <unistd.h>
     35 
     36 #include "libminijail.h"
     37 #include "libminijail-private.h"
     38 
     39 #include "signal_handler.h"
     40 #include "syscall_filter.h"
     41 #include "syscall_wrapper.h"
     42 #include "system.h"
     43 #include "util.h"
     44 
/*
 * Until these are reliably available in linux/prctl.h.
 * The fallback value is the ASCII bytes "CrOS" (0x43 0x72 0x4f 0x53).
 */
#ifndef PR_ALT_SYSCALL
# define PR_ALT_SYSCALL 0x43724f53
#endif

/*
 * Seccomp filter related flags.
 * Each one is only defined here when the system headers did not provide it.
 */
#ifndef PR_SET_NO_NEW_PRIVS
# define PR_SET_NO_NEW_PRIVS 38
#endif

#ifndef SECCOMP_MODE_FILTER
#define SECCOMP_MODE_FILTER 2 /* Uses user-supplied filter. */
#endif

/* Operation codes passed as the first argument of sys_seccomp(). */
#ifndef SECCOMP_SET_MODE_STRICT
# define SECCOMP_SET_MODE_STRICT 0
#endif
#ifndef SECCOMP_SET_MODE_FILTER
# define SECCOMP_SET_MODE_FILTER 1
#endif

/* Flag passed to sys_seccomp() when thread sync (TSYNC) is requested. */
#ifndef SECCOMP_FILTER_FLAG_TSYNC
# define SECCOMP_FILTER_FLAG_TSYNC 1
#endif
/* End seccomp filter related flags. */

/* New cgroup namespace might not be in linux-headers yet. */
#ifndef CLONE_NEWCGROUP
# define CLONE_NEWCGROUP 0x02000000
#endif
     75 
/* Capacity of the |cgroups| array in struct minijail. */
#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */

/* Capacity of the |rlimits| array in struct minijail. */
#define MAX_RLIMITS 32 /* Currently there are 15 supported by Linux. */

/* Capacity of the |preserved_fds| array in struct minijail. */
#define MAX_PRESERVED_FDS 32U

/* Keyctl commands. */
#define KEYCTL_JOIN_SESSION_KEYRING 1
     84 
/* One resource-limit request, recorded by minijail_rlimit(). */
struct minijail_rlimit {
	int type;	/* Resource identifier; unique within a jail. */
	rlim_t cur;	/* Requested current limit value. */
	rlim_t max;	/* Requested maximum limit value. */
};
     90 
/*
 * One requested mount, kept in a singly linked list hanging off
 * struct minijail (|mounts_head| / |mounts_tail|). All strings are
 * heap-allocated copies owned by the node; see free_mounts_list().
 */
struct mountpoint {
	char *src;		/* Mount source. */
	char *dest;		/* Mount destination; always starts with '/'. */
	char *type;		/* Filesystem type string. */
	char *data;		/* Optional mount data; NULL when absent. */
	int has_data;		/* Set to 1 when |data| was supplied. */
	unsigned long flags;	/* Mount flags given by the caller. */
	struct mountpoint *next;
};
    100 
/*
 * One registered callback, kept in a singly linked list hanging off
 * struct minijail (|hooks_head| / |hooks_tail|); see minijail_add_hook().
 */
struct hook {
	minijail_hook_t hook;		/* Callback; never NULL. */
	void *payload;			/* Opaque caller data stored with it. */
	minijail_hook_event_t event;	/* Event this callback is tied to. */
	struct hook *next;
};
    107 
/* A pair of descriptor numbers recorded by minijail_preserve_fd(). */
struct preserved_fd {
	int parent_fd;	/* Descriptor number on the parent side. */
	int child_fd;	/* Descriptor number requested for the child side. */
};
    112 
/*
 * Complete description of a jail.
 *
 * The structure is serialized by copying its raw bytes (see
 * minijail_marshal_helper()), so after unmarshalling every pointer member
 * is stale until minijail_unmarshal() rebuilds or clears it.
 */
struct minijail {
	/*
	 * WARNING: if you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 */
	struct {
		int uid : 1;
		int gid : 1;
		int inherit_suppl_gids : 1;
		int set_suppl_gids : 1;
		int keep_suppl_gids : 1;
		int use_caps : 1;
		int capbset_drop : 1;
		int set_ambient_caps : 1;
		int vfs : 1;
		int enter_vfs : 1;
		int pids : 1;
		int ipc : 1;
		int uts : 1;
		int net : 1;
		int enter_net : 1;
		int ns_cgroups : 1;
		int userns : 1;
		int disable_setgroups : 1;
		int seccomp : 1;
		int remount_proc_ro : 1;
		int no_new_privs : 1;
		int seccomp_filter : 1;
		int seccomp_filter_tsync : 1;
		int seccomp_filter_logging : 1;
		int chroot : 1;
		int pivot_root : 1;
		int mount_dev : 1;
		int mount_tmp : 1;
		int do_init : 1;
		int run_as_init : 1;
		int pid_file : 1;
		int cgroups : 1;
		int alt_syscall : 1;
		int reset_signal_mask : 1;
		int close_open_fds : 1;
		int new_session_keyring : 1;
		int forward_signals : 1;
	} flags;
	uid_t uid;			/* Set by minijail_change_uid(). */
	gid_t gid;			/* Set by minijail_change_gid(). */
	gid_t usergid;			/* Group found by minijail_change_user(). */
	char *user;			/* Copy of the name given to minijail_change_user(). */
	size_t suppl_gid_count;		/* Number of entries in |suppl_gid_list|. */
	gid_t *suppl_gid_list;		/* Heap copy of the supplementary groups. */
	uint64_t caps;			/* Mask given to minijail_use_caps(). */
	uint64_t cap_bset;		/* Mask given to minijail_capbset_drop(). */
	pid_t initpid;
	int mountns_fd;			/* Opened by minijail_namespace_enter_vfs(). */
	int netns_fd;			/* Opened by minijail_namespace_enter_net(). */
	char *chrootdir;		/* Shared by the chroot and pivot_root setters. */
	char *pid_file_path;
	char *uidmap;			/* Map text with ',' turned into '\n'. */
	char *gidmap;			/* Map text with ',' turned into '\n'. */
	char *hostname;
	size_t filter_len;		/* Copy of |filter_prog->len|. */
	struct sock_fprog *filter_prog;	/* Compiled seccomp filter, if any. */
	char *alt_syscall_table;
	struct mountpoint *mounts_head;
	struct mountpoint *mounts_tail;
	size_t mounts_count;		/* Number of nodes in the mounts list. */
	unsigned long remount_mode;	/* MS_PRIVATE by default; 0 once cleared. */
	size_t tmpfs_size;		/* Size given to minijail_mount_tmp_size(). */
	char *cgroups[MAX_CGROUPS];
	size_t cgroup_count;		/* Used slots in |cgroups|. */
	struct minijail_rlimit rlimits[MAX_RLIMITS];
	size_t rlimit_count;		/* Used slots in |rlimits|. */
	uint64_t securebits_skip_mask;
	struct hook *hooks_head;
	struct hook *hooks_tail;
	struct preserved_fd preserved_fds[MAX_PRESERVED_FDS];
	size_t preserved_fd_count;	/* Used slots in |preserved_fds|. */
};
    191 
/* Forward declaration; the definition is not in this part of the file. */
static void run_hooks_or_die(const struct minijail *j,
			     minijail_hook_event_t event);
    194 
    195 static void free_mounts_list(struct minijail *j)
    196 {
    197 	while (j->mounts_head) {
    198 		struct mountpoint *m = j->mounts_head;
    199 		j->mounts_head = j->mounts_head->next;
    200 		free(m->data);
    201 		free(m->type);
    202 		free(m->dest);
    203 		free(m->src);
    204 		free(m);
    205 	}
    206 	// No need to clear mounts_head as we know it's NULL after the loop.
    207 	j->mounts_tail = NULL;
    208 }
    209 
    210 /*
    211  * Strip out flags meant for the parent.
    212  * We keep things that are not inherited across execve(2) (e.g. capabilities),
    213  * or are easier to set after execve(2) (e.g. seccomp filters).
    214  */
    215 void minijail_preenter(struct minijail *j)
    216 {
    217 	j->flags.vfs = 0;
    218 	j->flags.enter_vfs = 0;
    219 	j->flags.remount_proc_ro = 0;
    220 	j->flags.pids = 0;
    221 	j->flags.do_init = 0;
    222 	j->flags.run_as_init = 0;
    223 	j->flags.pid_file = 0;
    224 	j->flags.cgroups = 0;
    225 	j->flags.forward_signals = 0;
    226 	j->remount_mode = 0;
    227 }
    228 
    229 /*
    230  * Strip out flags meant for the child.
    231  * We keep things that are inherited across execve(2).
    232  */
    233 void minijail_preexec(struct minijail *j)
    234 {
    235 	int vfs = j->flags.vfs;
    236 	int enter_vfs = j->flags.enter_vfs;
    237 	int remount_proc_ro = j->flags.remount_proc_ro;
    238 	int userns = j->flags.userns;
    239 	if (j->user)
    240 		free(j->user);
    241 	j->user = NULL;
    242 	if (j->suppl_gid_list)
    243 		free(j->suppl_gid_list);
    244 	j->suppl_gid_list = NULL;
    245 	free_mounts_list(j);
    246 	memset(&j->flags, 0, sizeof(j->flags));
    247 	/* Now restore anything we meant to keep. */
    248 	j->flags.vfs = vfs;
    249 	j->flags.enter_vfs = enter_vfs;
    250 	j->flags.remount_proc_ro = remount_proc_ro;
    251 	j->flags.userns = userns;
    252 	/* Note, |pids| will already have been used before this call. */
    253 }
    254 
    255 /* Minijail API. */
    256 
    257 struct minijail API *minijail_new(void)
    258 {
    259 	struct minijail *j = calloc(1, sizeof(struct minijail));
    260 	j->remount_mode = MS_PRIVATE;
    261 	return j;
    262 }
    263 
    264 void API minijail_change_uid(struct minijail *j, uid_t uid)
    265 {
    266 	if (uid == 0)
    267 		die("useless change to uid 0");
    268 	j->uid = uid;
    269 	j->flags.uid = 1;
    270 }
    271 
    272 void API minijail_change_gid(struct minijail *j, gid_t gid)
    273 {
    274 	if (gid == 0)
    275 		die("useless change to gid 0");
    276 	j->gid = gid;
    277 	j->flags.gid = 1;
    278 }
    279 
    280 void API minijail_set_supplementary_gids(struct minijail *j, size_t size,
    281 					 const gid_t *list)
    282 {
    283 	size_t i;
    284 
    285 	if (j->flags.inherit_suppl_gids)
    286 		die("cannot inherit *and* set supplementary groups");
    287 	if (j->flags.keep_suppl_gids)
    288 		die("cannot keep *and* set supplementary groups");
    289 
    290 	if (size == 0) {
    291 		/* Clear supplementary groups. */
    292 		j->suppl_gid_list = NULL;
    293 		j->suppl_gid_count = 0;
    294 		j->flags.set_suppl_gids = 1;
    295 		return;
    296 	}
    297 
    298 	/* Copy the gid_t array. */
    299 	j->suppl_gid_list = calloc(size, sizeof(gid_t));
    300 	if (!j->suppl_gid_list) {
    301 		die("failed to allocate internal supplementary group array");
    302 	}
    303 	for (i = 0; i < size; i++) {
    304 		j->suppl_gid_list[i] = list[i];
    305 	}
    306 	j->suppl_gid_count = size;
    307 	j->flags.set_suppl_gids = 1;
    308 }
    309 
    310 void API minijail_keep_supplementary_gids(struct minijail *j) {
    311 	j->flags.keep_suppl_gids = 1;
    312 }
    313 
    314 int API minijail_change_user(struct minijail *j, const char *user)
    315 {
    316 	uid_t uid;
    317 	gid_t gid;
    318 	int rc = lookup_user(user, &uid, &gid);
    319 	if (rc)
    320 		return rc;
    321 	minijail_change_uid(j, uid);
    322 	j->user = strdup(user);
    323 	if (!j->user)
    324 		return -ENOMEM;
    325 	j->usergid = gid;
    326 	return 0;
    327 }
    328 
    329 int API minijail_change_group(struct minijail *j, const char *group)
    330 {
    331 	gid_t gid;
    332 	int rc = lookup_group(group, &gid);
    333 	if (rc)
    334 		return rc;
    335 	minijail_change_gid(j, gid);
    336 	return 0;
    337 }
    338 
    339 void API minijail_use_seccomp(struct minijail *j)
    340 {
    341 	j->flags.seccomp = 1;
    342 }
    343 
    344 void API minijail_no_new_privs(struct minijail *j)
    345 {
    346 	j->flags.no_new_privs = 1;
    347 }
    348 
    349 void API minijail_use_seccomp_filter(struct minijail *j)
    350 {
    351 	j->flags.seccomp_filter = 1;
    352 }
    353 
    354 void API minijail_set_seccomp_filter_tsync(struct minijail *j)
    355 {
    356 	if (j->filter_len > 0 && j->filter_prog != NULL) {
    357 		die("minijail_set_seccomp_filter_tsync() must be called "
    358 		    "before minijail_parse_seccomp_filters()");
    359 	}
    360 	j->flags.seccomp_filter_tsync = 1;
    361 }
    362 
    363 void API minijail_log_seccomp_filter_failures(struct minijail *j)
    364 {
    365 	if (j->filter_len > 0 && j->filter_prog != NULL) {
    366 		die("minijail_log_seccomp_filter_failures() must be called "
    367 		    "before minijail_parse_seccomp_filters()");
    368 	}
    369 	j->flags.seccomp_filter_logging = 1;
    370 }
    371 
    372 void API minijail_use_caps(struct minijail *j, uint64_t capmask)
    373 {
    374 	/*
    375 	 * 'minijail_use_caps' configures a runtime-capabilities-only
    376 	 * environment, including a bounding set matching the thread's runtime
    377 	 * (permitted|inheritable|effective) sets.
    378 	 * Therefore, it will override any existing bounding set configurations
    379 	 * since the latter would allow gaining extra runtime capabilities from
    380 	 * file capabilities.
    381 	 */
    382 	if (j->flags.capbset_drop) {
    383 		warn("overriding bounding set configuration");
    384 		j->cap_bset = 0;
    385 		j->flags.capbset_drop = 0;
    386 	}
    387 	j->caps = capmask;
    388 	j->flags.use_caps = 1;
    389 }
    390 
    391 void API minijail_capbset_drop(struct minijail *j, uint64_t capmask)
    392 {
    393 	if (j->flags.use_caps) {
    394 		/*
    395 		 * 'minijail_use_caps' will have already configured a capability
    396 		 * bounding set matching the (permitted|inheritable|effective)
    397 		 * sets. Abort if the user tries to configure a separate
    398 		 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps'
    399 		 * are mutually exclusive.
    400 		 */
    401 		die("runtime capabilities already configured, can't drop "
    402 		    "bounding set separately");
    403 	}
    404 	j->cap_bset = capmask;
    405 	j->flags.capbset_drop = 1;
    406 }
    407 
    408 void API minijail_set_ambient_caps(struct minijail *j)
    409 {
    410 	j->flags.set_ambient_caps = 1;
    411 }
    412 
    413 void API minijail_reset_signal_mask(struct minijail *j)
    414 {
    415 	j->flags.reset_signal_mask = 1;
    416 }
    417 
    418 void API minijail_namespace_vfs(struct minijail *j)
    419 {
    420 	j->flags.vfs = 1;
    421 }
    422 
    423 void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
    424 {
    425 	int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
    426 	if (ns_fd < 0) {
    427 		pdie("failed to open namespace '%s'", ns_path);
    428 	}
    429 	j->mountns_fd = ns_fd;
    430 	j->flags.enter_vfs = 1;
    431 }
    432 
    433 void API minijail_new_session_keyring(struct minijail *j)
    434 {
    435 	j->flags.new_session_keyring = 1;
    436 }
    437 
    438 void API minijail_skip_setting_securebits(struct minijail *j,
    439 					  uint64_t securebits_skip_mask)
    440 {
    441 	j->securebits_skip_mask = securebits_skip_mask;
    442 }
    443 
    444 void API minijail_remount_mode(struct minijail *j, unsigned long mode)
    445 {
    446 	j->remount_mode = mode;
    447 }
    448 
    449 void API minijail_skip_remount_private(struct minijail *j)
    450 {
    451 	j->remount_mode = 0;
    452 }
    453 
    454 void API minijail_namespace_pids(struct minijail *j)
    455 {
    456 	j->flags.vfs = 1;
    457 	j->flags.remount_proc_ro = 1;
    458 	j->flags.pids = 1;
    459 	j->flags.do_init = 1;
    460 }
    461 
    462 void API minijail_namespace_ipc(struct minijail *j)
    463 {
    464 	j->flags.ipc = 1;
    465 }
    466 
    467 void API minijail_namespace_uts(struct minijail *j)
    468 {
    469 	j->flags.uts = 1;
    470 }
    471 
    472 int API minijail_namespace_set_hostname(struct minijail *j, const char *name)
    473 {
    474 	if (j->hostname)
    475 		return -EINVAL;
    476 	minijail_namespace_uts(j);
    477 	j->hostname = strdup(name);
    478 	if (!j->hostname)
    479 		return -ENOMEM;
    480 	return 0;
    481 }
    482 
    483 void API minijail_namespace_net(struct minijail *j)
    484 {
    485 	j->flags.net = 1;
    486 }
    487 
    488 void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
    489 {
    490 	int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
    491 	if (ns_fd < 0) {
    492 		pdie("failed to open namespace '%s'", ns_path);
    493 	}
    494 	j->netns_fd = ns_fd;
    495 	j->flags.enter_net = 1;
    496 }
    497 
    498 void API minijail_namespace_cgroups(struct minijail *j)
    499 {
    500 	j->flags.ns_cgroups = 1;
    501 }
    502 
    503 void API minijail_close_open_fds(struct minijail *j)
    504 {
    505 	j->flags.close_open_fds = 1;
    506 }
    507 
    508 void API minijail_remount_proc_readonly(struct minijail *j)
    509 {
    510 	j->flags.vfs = 1;
    511 	j->flags.remount_proc_ro = 1;
    512 }
    513 
    514 void API minijail_namespace_user(struct minijail *j)
    515 {
    516 	j->flags.userns = 1;
    517 }
    518 
    519 void API minijail_namespace_user_disable_setgroups(struct minijail *j)
    520 {
    521 	j->flags.disable_setgroups = 1;
    522 }
    523 
    524 int API minijail_uidmap(struct minijail *j, const char *uidmap)
    525 {
    526 	j->uidmap = strdup(uidmap);
    527 	if (!j->uidmap)
    528 		return -ENOMEM;
    529 	char *ch;
    530 	for (ch = j->uidmap; *ch; ch++) {
    531 		if (*ch == ',')
    532 			*ch = '\n';
    533 	}
    534 	return 0;
    535 }
    536 
    537 int API minijail_gidmap(struct minijail *j, const char *gidmap)
    538 {
    539 	j->gidmap = strdup(gidmap);
    540 	if (!j->gidmap)
    541 		return -ENOMEM;
    542 	char *ch;
    543 	for (ch = j->gidmap; *ch; ch++) {
    544 		if (*ch == ',')
    545 			*ch = '\n';
    546 	}
    547 	return 0;
    548 }
    549 
    550 void API minijail_inherit_usergroups(struct minijail *j)
    551 {
    552 	j->flags.inherit_suppl_gids = 1;
    553 }
    554 
    555 void API minijail_run_as_init(struct minijail *j)
    556 {
    557 	/*
    558 	 * Since the jailed program will become 'init' in the new PID namespace,
    559 	 * Minijail does not need to fork an 'init' process.
    560 	 */
    561 	j->flags.run_as_init = 1;
    562 }
    563 
    564 int API minijail_enter_chroot(struct minijail *j, const char *dir)
    565 {
    566 	if (j->chrootdir)
    567 		return -EINVAL;
    568 	j->chrootdir = strdup(dir);
    569 	if (!j->chrootdir)
    570 		return -ENOMEM;
    571 	j->flags.chroot = 1;
    572 	return 0;
    573 }
    574 
    575 int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
    576 {
    577 	if (j->chrootdir)
    578 		return -EINVAL;
    579 	j->chrootdir = strdup(dir);
    580 	if (!j->chrootdir)
    581 		return -ENOMEM;
    582 	j->flags.pivot_root = 1;
    583 	return 0;
    584 }
    585 
    586 char API *minijail_get_original_path(struct minijail *j,
    587 				     const char *path_inside_chroot)
    588 {
    589 	struct mountpoint *b;
    590 
    591 	b = j->mounts_head;
    592 	while (b) {
    593 		/*
    594 		 * If |path_inside_chroot| is the exact destination of a
    595 		 * mount, then the original path is exactly the source of
    596 		 * the mount.
    597 		 *  for example: "-b /some/path/exe,/chroot/path/exe"
    598 		 *    mount source = /some/path/exe, mount dest =
    599 		 *    /chroot/path/exe Then when getting the original path of
    600 		 *    "/chroot/path/exe", the source of that mount,
    601 		 *    "/some/path/exe" is what should be returned.
    602 		 */
    603 		if (!strcmp(b->dest, path_inside_chroot))
    604 			return strdup(b->src);
    605 
    606 		/*
    607 		 * If |path_inside_chroot| is within the destination path of a
    608 		 * mount, take the suffix of the chroot path relative to the
    609 		 * mount destination path, and append it to the mount source
    610 		 * path.
    611 		 */
    612 		if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
    613 			const char *relative_path =
    614 				path_inside_chroot + strlen(b->dest);
    615 			return path_join(b->src, relative_path);
    616 		}
    617 		b = b->next;
    618 	}
    619 
    620 	/* If there is a chroot path, append |path_inside_chroot| to that. */
    621 	if (j->chrootdir)
    622 		return path_join(j->chrootdir, path_inside_chroot);
    623 
    624 	/* No chroot, so the path outside is the same as it is inside. */
    625 	return strdup(path_inside_chroot);
    626 }
    627 
    628 size_t minijail_get_tmpfs_size(const struct minijail *j)
    629 {
    630 	return j->tmpfs_size;
    631 }
    632 
    633 void API minijail_mount_dev(struct minijail *j)
    634 {
    635 	j->flags.mount_dev = 1;
    636 }
    637 
    638 void API minijail_mount_tmp(struct minijail *j)
    639 {
    640 	minijail_mount_tmp_size(j, 64 * 1024 * 1024);
    641 }
    642 
    643 void API minijail_mount_tmp_size(struct minijail *j, size_t size)
    644 {
    645 	j->tmpfs_size = size;
    646 	j->flags.mount_tmp = 1;
    647 }
    648 
    649 int API minijail_write_pid_file(struct minijail *j, const char *path)
    650 {
    651 	j->pid_file_path = strdup(path);
    652 	if (!j->pid_file_path)
    653 		return -ENOMEM;
    654 	j->flags.pid_file = 1;
    655 	return 0;
    656 }
    657 
    658 int API minijail_add_to_cgroup(struct minijail *j, const char *path)
    659 {
    660 	if (j->cgroup_count >= MAX_CGROUPS)
    661 		return -ENOMEM;
    662 	j->cgroups[j->cgroup_count] = strdup(path);
    663 	if (!j->cgroups[j->cgroup_count])
    664 		return -ENOMEM;
    665 	j->cgroup_count++;
    666 	j->flags.cgroups = 1;
    667 	return 0;
    668 }
    669 
    670 int API minijail_rlimit(struct minijail *j, int type, rlim_t cur, rlim_t max)
    671 {
    672 	size_t i;
    673 
    674 	if (j->rlimit_count >= MAX_RLIMITS)
    675 		return -ENOMEM;
    676 	/* It's an error if the caller sets the same rlimit multiple times. */
    677 	for (i = 0; i < j->rlimit_count; i++) {
    678 		if (j->rlimits[i].type == type)
    679 			return -EEXIST;
    680 	}
    681 
    682 	j->rlimits[j->rlimit_count].type = type;
    683 	j->rlimits[j->rlimit_count].cur = cur;
    684 	j->rlimits[j->rlimit_count].max = max;
    685 	j->rlimit_count++;
    686 	return 0;
    687 }
    688 
    689 int API minijail_forward_signals(struct minijail *j)
    690 {
    691 	j->flags.forward_signals = 1;
    692 	return 0;
    693 }
    694 
    695 int API minijail_mount_with_data(struct minijail *j, const char *src,
    696 				 const char *dest, const char *type,
    697 				 unsigned long flags, const char *data)
    698 {
    699 	struct mountpoint *m;
    700 
    701 	if (*dest != '/')
    702 		return -EINVAL;
    703 	m = calloc(1, sizeof(*m));
    704 	if (!m)
    705 		return -ENOMEM;
    706 	m->dest = strdup(dest);
    707 	if (!m->dest)
    708 		goto error;
    709 	m->src = strdup(src);
    710 	if (!m->src)
    711 		goto error;
    712 	m->type = strdup(type);
    713 	if (!m->type)
    714 		goto error;
    715 	if (data) {
    716 		m->data = strdup(data);
    717 		if (!m->data)
    718 			goto error;
    719 		m->has_data = 1;
    720 	}
    721 	m->flags = flags;
    722 
    723 	info("mount %s -> %s type '%s'", src, dest, type);
    724 
    725 	/*
    726 	 * Force vfs namespacing so the mounts don't leak out into the
    727 	 * containing vfs namespace.
    728 	 */
    729 	minijail_namespace_vfs(j);
    730 
    731 	if (j->mounts_tail)
    732 		j->mounts_tail->next = m;
    733 	else
    734 		j->mounts_head = m;
    735 	j->mounts_tail = m;
    736 	j->mounts_count++;
    737 
    738 	return 0;
    739 
    740 error:
    741 	free(m->type);
    742 	free(m->src);
    743 	free(m->dest);
    744 	free(m);
    745 	return -ENOMEM;
    746 }
    747 
    748 int API minijail_mount(struct minijail *j, const char *src, const char *dest,
    749 		       const char *type, unsigned long flags)
    750 {
    751 	return minijail_mount_with_data(j, src, dest, type, flags, NULL);
    752 }
    753 
    754 int API minijail_bind(struct minijail *j, const char *src, const char *dest,
    755 		      int writeable)
    756 {
    757 	unsigned long flags = MS_BIND;
    758 
    759 	if (!writeable)
    760 		flags |= MS_RDONLY;
    761 
    762 	return minijail_mount(j, src, dest, "", flags);
    763 }
    764 
    765 int API minijail_add_hook(struct minijail *j, minijail_hook_t hook,
    766 			  void *payload, minijail_hook_event_t event)
    767 {
    768 	struct hook *c;
    769 
    770 	if (hook == NULL)
    771 		return -EINVAL;
    772 	if (event >= MINIJAIL_HOOK_EVENT_MAX)
    773 		return -EINVAL;
    774 	c = calloc(1, sizeof(*c));
    775 	if (!c)
    776 		return -ENOMEM;
    777 
    778 	c->hook = hook;
    779 	c->payload = payload;
    780 	c->event = event;
    781 
    782 	if (j->hooks_tail)
    783 		j->hooks_tail->next = c;
    784 	else
    785 		j->hooks_head = c;
    786 	j->hooks_tail = c;
    787 
    788 	return 0;
    789 }
    790 
    791 int API minijail_preserve_fd(struct minijail *j, int parent_fd, int child_fd)
    792 {
    793 	if (parent_fd < 0 || child_fd < 0)
    794 		return -EINVAL;
    795 	if (j->preserved_fd_count >= MAX_PRESERVED_FDS)
    796 		return -ENOMEM;
    797 	j->preserved_fds[j->preserved_fd_count].parent_fd = parent_fd;
    798 	j->preserved_fds[j->preserved_fd_count].child_fd = child_fd;
    799 	j->preserved_fd_count++;
    800 	return 0;
    801 }
    802 
    803 static void clear_seccomp_options(struct minijail *j)
    804 {
    805 	j->flags.seccomp_filter = 0;
    806 	j->flags.seccomp_filter_tsync = 0;
    807 	j->flags.seccomp_filter_logging = 0;
    808 	j->filter_len = 0;
    809 	j->filter_prog = NULL;
    810 	j->flags.no_new_privs = 0;
    811 }
    812 
    813 static int seccomp_should_parse_filters(struct minijail *j)
    814 {
    815 	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL) == -1) {
    816 		/*
    817 		 * |errno| will be set to EINVAL when seccomp has not been
    818 		 * compiled into the kernel. On certain platforms and kernel
    819 		 * versions this is not a fatal failure. In that case, and only
    820 		 * in that case, disable seccomp and skip loading the filters.
    821 		 */
    822 		if ((errno == EINVAL) && seccomp_can_softfail()) {
    823 			warn("not loading seccomp filters, seccomp filter not "
    824 			     "supported");
    825 			clear_seccomp_options(j);
    826 			return 0;
    827 		}
    828 		/*
    829 		 * If |errno| != EINVAL or seccomp_can_softfail() is false,
    830 		 * we can proceed. Worst case scenario minijail_enter() will
    831 		 * abort() if seccomp fails.
    832 		 */
    833 	}
    834 	if (j->flags.seccomp_filter_tsync) {
    835 		/* Are the seccomp(2) syscall and the TSYNC option supported? */
    836 		if (sys_seccomp(SECCOMP_SET_MODE_FILTER,
    837 				SECCOMP_FILTER_FLAG_TSYNC, NULL) == -1) {
    838 			int saved_errno = errno;
    839 			if (saved_errno == ENOSYS && seccomp_can_softfail()) {
    840 				warn("seccomp(2) syscall not supported");
    841 				clear_seccomp_options(j);
    842 				return 0;
    843 			} else if (saved_errno == EINVAL &&
    844 				   seccomp_can_softfail()) {
    845 				warn(
    846 				    "seccomp filter thread sync not supported");
    847 				clear_seccomp_options(j);
    848 				return 0;
    849 			}
    850 			/*
    851 			 * Similar logic here. If seccomp_can_softfail() is
    852 			 * false, or |errno| != ENOSYS, or |errno| != EINVAL,
    853 			 * we can proceed. Worst case scenario minijail_enter()
    854 			 * will abort() if seccomp or TSYNC fail.
    855 			 */
    856 		}
    857 	}
    858 	return 1;
    859 }
    860 
    861 static int parse_seccomp_filters(struct minijail *j, const char *filename,
    862 				 FILE *policy_file)
    863 {
    864 	struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
    865 	int use_ret_trap =
    866 	    j->flags.seccomp_filter_tsync || j->flags.seccomp_filter_logging;
    867 	int allow_logging = j->flags.seccomp_filter_logging;
    868 
    869 	if (compile_filter(filename, policy_file, fprog, use_ret_trap,
    870 			   allow_logging)) {
    871 		free(fprog);
    872 		return -1;
    873 	}
    874 
    875 	j->filter_len = fprog->len;
    876 	j->filter_prog = fprog;
    877 	return 0;
    878 }
    879 
    880 void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
    881 {
    882 	if (!seccomp_should_parse_filters(j))
    883 		return;
    884 
    885 	FILE *file = fopen(path, "r");
    886 	if (!file) {
    887 		pdie("failed to open seccomp filter file '%s'", path);
    888 	}
    889 
    890 	if (parse_seccomp_filters(j, path, file) != 0) {
    891 		die("failed to compile seccomp filter BPF program in '%s'",
    892 		    path);
    893 	}
    894 	fclose(file);
    895 }
    896 
    897 void API minijail_parse_seccomp_filters_from_fd(struct minijail *j, int fd)
    898 {
    899 	char *fd_path, *path;
    900 	FILE *file;
    901 
    902 	if (!seccomp_should_parse_filters(j))
    903 		return;
    904 
    905 	file = fdopen(fd, "r");
    906 	if (!file) {
    907 		pdie("failed to associate stream with fd %d", fd);
    908 	}
    909 
    910 	if (asprintf(&fd_path, "/proc/self/fd/%d", fd) == -1)
    911 		pdie("failed to create path for fd %d", fd);
    912 	path = realpath(fd_path, NULL);
    913 	if (path == NULL)
    914 		pwarn("failed to get path of fd %d", fd);
    915 	free(fd_path);
    916 
    917 	if (parse_seccomp_filters(j, path ? path : "<fd>", file) != 0) {
    918 		die("failed to compile seccomp filter BPF program from fd %d",
    919 		    fd);
    920 	}
    921 	free(path);
    922 	fclose(file);
    923 }
    924 
    925 int API minijail_use_alt_syscall(struct minijail *j, const char *table)
    926 {
    927 	j->alt_syscall_table = strdup(table);
    928 	if (!j->alt_syscall_table)
    929 		return -ENOMEM;
    930 	j->flags.alt_syscall = 1;
    931 	return 0;
    932 }
    933 
/* Cursor used while serializing a jail into a caller-provided buffer. */
struct marshal_state {
	size_t available;	/* Bytes still free at |buf|. */
	size_t total;		/* Bytes requested so far, written or not. */
	char *buf;		/* Next write position. */
};
    939 
    940 void marshal_state_init(struct marshal_state *state, char *buf,
    941 			size_t available)
    942 {
    943 	state->available = available;
    944 	state->buf = buf;
    945 	state->total = 0;
    946 }
    947 
    948 void marshal_append(struct marshal_state *state, void *src, size_t length)
    949 {
    950 	size_t copy_len = MIN(state->available, length);
    951 
    952 	/* Up to |available| will be written. */
    953 	if (copy_len) {
    954 		memcpy(state->buf, src, copy_len);
    955 		state->buf += copy_len;
    956 		state->available -= copy_len;
    957 	}
    958 	/* |total| will contain the expected length. */
    959 	state->total += length;
    960 }
    961 
    962 void marshal_mount(struct marshal_state *state, const struct mountpoint *m)
    963 {
    964 	marshal_append(state, m->src, strlen(m->src) + 1);
    965 	marshal_append(state, m->dest, strlen(m->dest) + 1);
    966 	marshal_append(state, m->type, strlen(m->type) + 1);
    967 	marshal_append(state, (char *)&m->has_data, sizeof(m->has_data));
    968 	if (m->has_data)
    969 		marshal_append(state, m->data, strlen(m->data) + 1);
    970 	marshal_append(state, (char *)&m->flags, sizeof(m->flags));
    971 }
    972 
    973 void minijail_marshal_helper(struct marshal_state *state,
    974 			     const struct minijail *j)
    975 {
    976 	struct mountpoint *m = NULL;
    977 	size_t i;
    978 
    979 	marshal_append(state, (char *)j, sizeof(*j));
    980 	if (j->user)
    981 		marshal_append(state, j->user, strlen(j->user) + 1);
    982 	if (j->suppl_gid_list) {
    983 		marshal_append(state, j->suppl_gid_list,
    984 			       j->suppl_gid_count * sizeof(gid_t));
    985 	}
    986 	if (j->chrootdir)
    987 		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
    988 	if (j->hostname)
    989 		marshal_append(state, j->hostname, strlen(j->hostname) + 1);
    990 	if (j->alt_syscall_table) {
    991 		marshal_append(state, j->alt_syscall_table,
    992 			       strlen(j->alt_syscall_table) + 1);
    993 	}
    994 	if (j->flags.seccomp_filter && j->filter_prog) {
    995 		struct sock_fprog *fp = j->filter_prog;
    996 		marshal_append(state, (char *)fp->filter,
    997 			       fp->len * sizeof(struct sock_filter));
    998 	}
    999 	for (m = j->mounts_head; m; m = m->next) {
   1000 		marshal_mount(state, m);
   1001 	}
   1002 	for (i = 0; i < j->cgroup_count; ++i)
   1003 		marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
   1004 }
   1005 
   1006 size_t API minijail_size(const struct minijail *j)
   1007 {
   1008 	struct marshal_state state;
   1009 	marshal_state_init(&state, NULL, 0);
   1010 	minijail_marshal_helper(&state, j);
   1011 	return state.total;
   1012 }
   1013 
   1014 int minijail_marshal(const struct minijail *j, char *buf, size_t available)
   1015 {
   1016 	struct marshal_state state;
   1017 	marshal_state_init(&state, buf, available);
   1018 	minijail_marshal_helper(&state, j);
   1019 	return (state.total > available);
   1020 }
   1021 
   1022 int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
   1023 {
   1024 	size_t i;
   1025 	size_t count;
   1026 	int ret = -EINVAL;
   1027 
   1028 	if (length < sizeof(*j))
   1029 		goto out;
   1030 	memcpy((void *)j, serialized, sizeof(*j));
   1031 	serialized += sizeof(*j);
   1032 	length -= sizeof(*j);
   1033 
   1034 	/* Potentially stale pointers not used as signals. */
   1035 	j->pid_file_path = NULL;
   1036 	j->uidmap = NULL;
   1037 	j->gidmap = NULL;
   1038 	j->mounts_head = NULL;
   1039 	j->mounts_tail = NULL;
   1040 	j->filter_prog = NULL;
   1041 	j->hooks_head = NULL;
   1042 	j->hooks_tail = NULL;
   1043 
   1044 	if (j->user) {		/* stale pointer */
   1045 		char *user = consumestr(&serialized, &length);
   1046 		if (!user)
   1047 			goto clear_pointers;
   1048 		j->user = strdup(user);
   1049 		if (!j->user)
   1050 			goto clear_pointers;
   1051 	}
   1052 
   1053 	if (j->suppl_gid_list) {	/* stale pointer */
   1054 		if (j->suppl_gid_count > NGROUPS_MAX) {
   1055 			goto bad_gid_list;
   1056 		}
   1057 		size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
   1058 		void *gid_list_bytes =
   1059 		    consumebytes(gid_list_size, &serialized, &length);
   1060 		if (!gid_list_bytes)
   1061 			goto bad_gid_list;
   1062 
   1063 		j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
   1064 		if (!j->suppl_gid_list)
   1065 			goto bad_gid_list;
   1066 
   1067 		memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
   1068 	}
   1069 
   1070 	if (j->chrootdir) {	/* stale pointer */
   1071 		char *chrootdir = consumestr(&serialized, &length);
   1072 		if (!chrootdir)
   1073 			goto bad_chrootdir;
   1074 		j->chrootdir = strdup(chrootdir);
   1075 		if (!j->chrootdir)
   1076 			goto bad_chrootdir;
   1077 	}
   1078 
   1079 	if (j->hostname) {	/* stale pointer */
   1080 		char *hostname = consumestr(&serialized, &length);
   1081 		if (!hostname)
   1082 			goto bad_hostname;
   1083 		j->hostname = strdup(hostname);
   1084 		if (!j->hostname)
   1085 			goto bad_hostname;
   1086 	}
   1087 
   1088 	if (j->alt_syscall_table) {	/* stale pointer */
   1089 		char *alt_syscall_table = consumestr(&serialized, &length);
   1090 		if (!alt_syscall_table)
   1091 			goto bad_syscall_table;
   1092 		j->alt_syscall_table = strdup(alt_syscall_table);
   1093 		if (!j->alt_syscall_table)
   1094 			goto bad_syscall_table;
   1095 	}
   1096 
   1097 	if (j->flags.seccomp_filter && j->filter_len > 0) {
   1098 		size_t ninstrs = j->filter_len;
   1099 		if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
   1100 		    ninstrs > USHRT_MAX)
   1101 			goto bad_filters;
   1102 
   1103 		size_t program_len = ninstrs * sizeof(struct sock_filter);
   1104 		void *program = consumebytes(program_len, &serialized, &length);
   1105 		if (!program)
   1106 			goto bad_filters;
   1107 
   1108 		j->filter_prog = malloc(sizeof(struct sock_fprog));
   1109 		if (!j->filter_prog)
   1110 			goto bad_filters;
   1111 
   1112 		j->filter_prog->len = ninstrs;
   1113 		j->filter_prog->filter = malloc(program_len);
   1114 		if (!j->filter_prog->filter)
   1115 			goto bad_filter_prog_instrs;
   1116 
   1117 		memcpy(j->filter_prog->filter, program, program_len);
   1118 	}
   1119 
   1120 	count = j->mounts_count;
   1121 	j->mounts_count = 0;
   1122 	for (i = 0; i < count; ++i) {
   1123 		unsigned long *flags;
   1124 		int *has_data;
   1125 		const char *dest;
   1126 		const char *type;
   1127 		const char *data = NULL;
   1128 		const char *src = consumestr(&serialized, &length);
   1129 		if (!src)
   1130 			goto bad_mounts;
   1131 		dest = consumestr(&serialized, &length);
   1132 		if (!dest)
   1133 			goto bad_mounts;
   1134 		type = consumestr(&serialized, &length);
   1135 		if (!type)
   1136 			goto bad_mounts;
   1137 		has_data = consumebytes(sizeof(*has_data), &serialized,
   1138 					&length);
   1139 		if (!has_data)
   1140 			goto bad_mounts;
   1141 		if (*has_data) {
   1142 			data = consumestr(&serialized, &length);
   1143 			if (!data)
   1144 				goto bad_mounts;
   1145 		}
   1146 		flags = consumebytes(sizeof(*flags), &serialized, &length);
   1147 		if (!flags)
   1148 			goto bad_mounts;
   1149 		if (minijail_mount_with_data(j, src, dest, type, *flags, data))
   1150 			goto bad_mounts;
   1151 	}
   1152 
   1153 	count = j->cgroup_count;
   1154 	j->cgroup_count = 0;
   1155 	for (i = 0; i < count; ++i) {
   1156 		char *cgroup = consumestr(&serialized, &length);
   1157 		if (!cgroup)
   1158 			goto bad_cgroups;
   1159 		j->cgroups[i] = strdup(cgroup);
   1160 		if (!j->cgroups[i])
   1161 			goto bad_cgroups;
   1162 		++j->cgroup_count;
   1163 	}
   1164 
   1165 	return 0;
   1166 
   1167 bad_cgroups:
   1168 	free_mounts_list(j);
   1169 	for (i = 0; i < j->cgroup_count; ++i)
   1170 		free(j->cgroups[i]);
   1171 bad_mounts:
   1172 	if (j->flags.seccomp_filter && j->filter_len > 0) {
   1173 		free(j->filter_prog->filter);
   1174 		free(j->filter_prog);
   1175 	}
   1176 bad_filter_prog_instrs:
   1177 	if (j->filter_prog)
   1178 		free(j->filter_prog);
   1179 bad_filters:
   1180 	if (j->alt_syscall_table)
   1181 		free(j->alt_syscall_table);
   1182 bad_syscall_table:
   1183 	if (j->chrootdir)
   1184 		free(j->chrootdir);
   1185 bad_chrootdir:
   1186 	if (j->hostname)
   1187 		free(j->hostname);
   1188 bad_hostname:
   1189 	if (j->suppl_gid_list)
   1190 		free(j->suppl_gid_list);
   1191 bad_gid_list:
   1192 	if (j->user)
   1193 		free(j->user);
   1194 clear_pointers:
   1195 	j->user = NULL;
   1196 	j->suppl_gid_list = NULL;
   1197 	j->chrootdir = NULL;
   1198 	j->hostname = NULL;
   1199 	j->alt_syscall_table = NULL;
   1200 	j->cgroup_count = 0;
   1201 out:
   1202 	return ret;
   1203 }
   1204 
   1205 struct dev_spec {
   1206 	const char *name;
   1207 	mode_t mode;
   1208 	dev_t major, minor;
   1209 };
   1210 
   1211 static const struct dev_spec device_nodes[] = {
   1212 	{
   1213 		"null",
   1214 		S_IFCHR | 0666, 1, 3,
   1215 	},
   1216 	{
   1217 		"zero",
   1218 		S_IFCHR | 0666, 1, 5,
   1219 	},
   1220 	{
   1221 		"full",
   1222 		S_IFCHR | 0666, 1, 7,
   1223 	},
   1224 	{
   1225 		"urandom",
   1226 		S_IFCHR | 0444, 1, 9,
   1227 	},
   1228 	{
   1229 		"tty",
   1230 		S_IFCHR | 0666, 5, 0,
   1231 	},
   1232 };
   1233 
   1234 struct dev_sym_spec {
   1235 	const char *source, *dest;
   1236 };
   1237 
   1238 static const struct dev_sym_spec device_symlinks[] = {
   1239 	{ "ptmx", "pts/ptmx", },
   1240 	{ "fd", "/proc/self/fd", },
   1241 	{ "stdin", "fd/0", },
   1242 	{ "stdout", "fd/1", },
   1243 	{ "stderr", "fd/2", },
   1244 };
   1245 
   1246 /*
   1247  * Clean up the temporary dev path we had setup previously.  In case of errors,
   1248  * we don't want to go leaking empty tempdirs.
   1249  */
   1250 static void mount_dev_cleanup(char *dev_path)
   1251 {
   1252 	umount2(dev_path, MNT_DETACH);
   1253 	rmdir(dev_path);
   1254 	free(dev_path);
   1255 }
   1256 
   1257 /*
   1258  * Set up the pseudo /dev path at the temporary location.
   1259  * See mount_dev_finalize for more details.
   1260  */
   1261 static int mount_dev(char **dev_path_ret)
   1262 {
   1263 	int ret;
   1264 	int dev_fd;
   1265 	size_t i;
   1266 	mode_t mask;
   1267 	char *dev_path;
   1268 
   1269 	/*
   1270 	 * Create a temp path for the /dev init.  We'll relocate this to the
   1271 	 * final location later on in the startup process.
   1272 	 */
   1273 	dev_path = *dev_path_ret = strdup("/tmp/minijail.dev.XXXXXX");
   1274 	if (dev_path == NULL || mkdtemp(dev_path) == NULL)
   1275 		pdie("could not create temp path for /dev");
   1276 
   1277 	/* Set up the empty /dev mount point first. */
   1278 	ret = mount("minijail-devfs", dev_path, "tmpfs",
   1279 	            MS_NOEXEC | MS_NOSUID, "size=5M,mode=755");
   1280 	if (ret) {
   1281 		rmdir(dev_path);
   1282 		return ret;
   1283 	}
   1284 
   1285 	/* We want to set the mode directly from the spec. */
   1286 	mask = umask(0);
   1287 
   1288 	/* Get a handle to the temp dev path for *at funcs below. */
   1289 	dev_fd = open(dev_path, O_DIRECTORY|O_PATH|O_CLOEXEC);
   1290 	if (dev_fd < 0) {
   1291 		ret = 1;
   1292 		goto done;
   1293 	}
   1294 
   1295 	/* Create all the nodes in /dev. */
   1296 	for (i = 0; i < ARRAY_SIZE(device_nodes); ++i) {
   1297 		const struct dev_spec *ds = &device_nodes[i];
   1298 		ret = mknodat(dev_fd, ds->name, ds->mode,
   1299 		              makedev(ds->major, ds->minor));
   1300 		if (ret)
   1301 			goto done;
   1302 	}
   1303 
   1304 	/* Create all the symlinks in /dev. */
   1305 	for (i = 0; i < ARRAY_SIZE(device_symlinks); ++i) {
   1306 		const struct dev_sym_spec *ds = &device_symlinks[i];
   1307 		ret = symlinkat(ds->dest, dev_fd, ds->source);
   1308 		if (ret)
   1309 			goto done;
   1310 	}
   1311 
   1312 	/* Restore old mask. */
   1313  done:
   1314 	close(dev_fd);
   1315 	umask(mask);
   1316 
   1317 	if (ret)
   1318 		mount_dev_cleanup(dev_path);
   1319 
   1320 	return ret;
   1321 }
   1322 
   1323 /*
   1324  * Relocate the temporary /dev mount to its final /dev place.
   1325  * We have to do this two step process so people can bind mount extra
   1326  * /dev paths like /dev/log.
   1327  */
   1328 static int mount_dev_finalize(const struct minijail *j, char *dev_path)
   1329 {
   1330 	int ret = -1;
   1331 	char *dest = NULL;
   1332 
   1333 	/* Unmount the /dev mount if possible. */
   1334 	if (umount2("/dev", MNT_DETACH))
   1335 		goto done;
   1336 
   1337 	if (asprintf(&dest, "%s/dev", j->chrootdir ? : "") < 0)
   1338 		goto done;
   1339 
   1340 	if (mount(dev_path, dest, NULL, MS_MOVE, NULL))
   1341 		goto done;
   1342 
   1343 	ret = 0;
   1344  done:
   1345 	free(dest);
   1346 	mount_dev_cleanup(dev_path);
   1347 
   1348 	return ret;
   1349 }
   1350 
   1351 /*
   1352  * mount_one: Applies mounts from @m for @j, recursing as needed.
   1353  * @j Minijail these mounts are for
   1354  * @m Head of list of mounts
   1355  *
   1356  * Returns 0 for success.
   1357  */
   1358 static int mount_one(const struct minijail *j, struct mountpoint *m,
   1359 		     const char *dev_path)
   1360 {
   1361 	int ret;
   1362 	char *dest;
   1363 	int remount_ro = 0;
   1364 
   1365 	/* We assume |dest| has a leading "/". */
   1366 	if (dev_path && strncmp("/dev/", m->dest, 5) == 0) {
   1367 		/* Since the temp path is rooted at /dev, skip that dest part. */
   1368 		if (asprintf(&dest, "%s%s", dev_path, m->dest + 4) < 0)
   1369 			return -ENOMEM;
   1370 	} else {
   1371 		if (asprintf(&dest, "%s%s", j->chrootdir ?: "", m->dest) < 0)
   1372 			return -ENOMEM;
   1373 	}
   1374 
   1375 	ret = setup_mount_destination(m->src, dest, j->uid, j->gid,
   1376 				      (m->flags & MS_BIND));
   1377 	if (ret) {
   1378 		pwarn("creating mount target '%s' failed", dest);
   1379 		goto error;
   1380 	}
   1381 
   1382 	/*
   1383 	 * R/O bind mounts have to be remounted since 'bind' and 'ro'
   1384 	 * can't both be specified in the original bind mount.
   1385 	 * Remount R/O after the initial mount.
   1386 	 */
   1387 	if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) {
   1388 		remount_ro = 1;
   1389 		m->flags &= ~MS_RDONLY;
   1390 	}
   1391 
   1392 	ret = mount(m->src, dest, m->type, m->flags, m->data);
   1393 	if (ret) {
   1394 		pwarn("mount: %s -> %s", m->src, dest);
   1395 		goto error;
   1396 	}
   1397 
   1398 	if (remount_ro) {
   1399 		m->flags |= MS_RDONLY;
   1400 		ret = mount(m->src, dest, NULL,
   1401 			    m->flags | MS_REMOUNT, m->data);
   1402 		if (ret) {
   1403 			pwarn("bind ro: %s -> %s", m->src, dest);
   1404 			goto error;
   1405 		}
   1406 	}
   1407 
   1408 	free(dest);
   1409 	if (m->next)
   1410 		return mount_one(j, m->next, dev_path);
   1411 	return 0;
   1412 
   1413 error:
   1414 	free(dest);
   1415 	return ret;
   1416 }
   1417 
   1418 static void process_mounts_or_die(const struct minijail *j)
   1419 {
   1420 	/*
   1421 	 * We have to mount /dev first in case there are bind mounts from
   1422 	 * the original /dev into the new unique tmpfs one.
   1423 	 */
   1424 	char *dev_path = NULL;
   1425 	if (j->flags.mount_dev && mount_dev(&dev_path))
   1426 		pdie("mount_dev failed");
   1427 
   1428 	if (j->mounts_head && mount_one(j, j->mounts_head, dev_path)) {
   1429 		if (dev_path) {
   1430 			int saved_errno = errno;
   1431 			mount_dev_cleanup(dev_path);
   1432 			errno = saved_errno;
   1433 		}
   1434 		pdie("mount_one failed");
   1435 	}
   1436 
   1437 	/*
   1438 	 * Once all bind mounts have been processed, move the temp dev to
   1439 	 * its final /dev home.
   1440 	 */
   1441 	if (j->flags.mount_dev && mount_dev_finalize(j, dev_path))
   1442 		pdie("mount_dev_finalize failed");
   1443 }
   1444 
   1445 static int enter_chroot(const struct minijail *j)
   1446 {
   1447 	run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_CHROOT);
   1448 
   1449 	if (chroot(j->chrootdir))
   1450 		return -errno;
   1451 
   1452 	if (chdir("/"))
   1453 		return -errno;
   1454 
   1455 	return 0;
   1456 }
   1457 
   1458 static int enter_pivot_root(const struct minijail *j)
   1459 {
   1460 	int oldroot, newroot;
   1461 
   1462 	run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_CHROOT);
   1463 
   1464 	/*
   1465 	 * Keep the fd for both old and new root.
   1466 	 * It will be used in fchdir(2) later.
   1467 	 */
   1468 	oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
   1469 	if (oldroot < 0)
   1470 		pdie("failed to open / for fchdir");
   1471 	newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
   1472 	if (newroot < 0)
   1473 		pdie("failed to open %s for fchdir", j->chrootdir);
   1474 
   1475 	/*
   1476 	 * To ensure j->chrootdir is the root of a filesystem,
   1477 	 * do a self bind mount.
   1478 	 */
   1479 	if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
   1480 		pdie("failed to bind mount '%s'", j->chrootdir);
   1481 	if (chdir(j->chrootdir))
   1482 		return -errno;
   1483 	if (syscall(SYS_pivot_root, ".", "."))
   1484 		pdie("pivot_root");
   1485 
   1486 	/*
   1487 	 * Now the old root is mounted on top of the new root. Use fchdir(2) to
   1488 	 * change to the old root and unmount it.
   1489 	 */
   1490 	if (fchdir(oldroot))
   1491 		pdie("failed to fchdir to old /");
   1492 
   1493 	/*
   1494 	 * If skip_remount_private was enabled for minijail_enter(),
   1495 	 * there could be a shared mount point under |oldroot|. In that case,
   1496 	 * mounts under this shared mount point will be unmounted below, and
   1497 	 * this unmounting will propagate to the original mount namespace
   1498 	 * (because the mount point is shared). To prevent this unexpected
   1499 	 * unmounting, remove these mounts from their peer groups by recursively
   1500 	 * remounting them as MS_PRIVATE.
   1501 	 */
   1502 	if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL))
   1503 		pdie("failed to mount(/, private) before umount(/)");
   1504 	/* The old root might be busy, so use lazy unmount. */
   1505 	if (umount2(".", MNT_DETACH))
   1506 		pdie("umount(/)");
   1507 	/* Change back to the new root. */
   1508 	if (fchdir(newroot))
   1509 		return -errno;
   1510 	if (close(oldroot))
   1511 		return -errno;
   1512 	if (close(newroot))
   1513 		return -errno;
   1514 	if (chroot("/"))
   1515 		return -errno;
   1516 	/* Set correct CWD for getcwd(3). */
   1517 	if (chdir("/"))
   1518 		return -errno;
   1519 
   1520 	return 0;
   1521 }
   1522 
   1523 static int mount_tmp(const struct minijail *j)
   1524 {
   1525 	const char fmt[] = "size=%zu,mode=1777";
   1526 	/* Count for the user storing ULLONG_MAX literally + extra space. */
   1527 	char data[sizeof(fmt) + sizeof("18446744073709551615ULL")];
   1528 	int ret;
   1529 
   1530 	ret = snprintf(data, sizeof(data), fmt, j->tmpfs_size);
   1531 
   1532 	if (ret <= 0)
   1533 		pdie("tmpfs size spec error");
   1534 	else if ((size_t)ret >= sizeof(data))
   1535 		pdie("tmpfs size spec too large");
   1536 	return mount("none", "/tmp", "tmpfs", MS_NODEV | MS_NOEXEC | MS_NOSUID,
   1537 		     data);
   1538 }
   1539 
   1540 static int remount_proc_readonly(const struct minijail *j)
   1541 {
   1542 	const char *kProcPath = "/proc";
   1543 	const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
   1544 	/*
   1545 	 * Right now, we're holding a reference to our parent's old mount of
   1546 	 * /proc in our namespace, which means using MS_REMOUNT here would
   1547 	 * mutate our parent's mount as well, even though we're in a VFS
   1548 	 * namespace (!). Instead, remove their mount from our namespace lazily
   1549 	 * (MNT_DETACH) and make our own.
   1550 	 */
   1551 	if (umount2(kProcPath, MNT_DETACH)) {
   1552 		/*
   1553 		 * If we are in a new user namespace, umount(2) will fail.
   1554 		 * See http://man7.org/linux/man-pages/man7/user_namespaces.7.html
   1555 		 */
   1556 		if (j->flags.userns) {
   1557 			info("umount(/proc, MNT_DETACH) failed, "
   1558 			     "this is expected when using user namespaces");
   1559 		} else {
   1560 			return -errno;
   1561 		}
   1562 	}
   1563 	if (mount("proc", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
   1564 		return -errno;
   1565 	return 0;
   1566 }
   1567 
   1568 static void kill_child_and_die(const struct minijail *j, const char *msg)
   1569 {
   1570 	kill(j->initpid, SIGKILL);
   1571 	die("%s", msg);
   1572 }
   1573 
   1574 static void write_pid_file_or_die(const struct minijail *j)
   1575 {
   1576 	if (write_pid_to_path(j->initpid, j->pid_file_path))
   1577 		kill_child_and_die(j, "failed to write pid file");
   1578 }
   1579 
   1580 static void add_to_cgroups_or_die(const struct minijail *j)
   1581 {
   1582 	size_t i;
   1583 
   1584 	for (i = 0; i < j->cgroup_count; ++i) {
   1585 		if (write_pid_to_path(j->initpid, j->cgroups[i]))
   1586 			kill_child_and_die(j, "failed to add to cgroups");
   1587 	}
   1588 }
   1589 
   1590 static void set_rlimits_or_die(const struct minijail *j)
   1591 {
   1592 	size_t i;
   1593 
   1594 	for (i = 0; i < j->rlimit_count; ++i) {
   1595 		struct rlimit limit;
   1596 		limit.rlim_cur = j->rlimits[i].cur;
   1597 		limit.rlim_max = j->rlimits[i].max;
   1598 		if (prlimit(j->initpid, j->rlimits[i].type, &limit, NULL))
   1599 			kill_child_and_die(j, "failed to set rlimit");
   1600 	}
   1601 }
   1602 
   1603 static void write_ugid_maps_or_die(const struct minijail *j)
   1604 {
   1605 	if (j->uidmap && write_proc_file(j->initpid, j->uidmap, "uid_map") != 0)
   1606 		kill_child_and_die(j, "failed to write uid_map");
   1607 	if (j->gidmap && j->flags.disable_setgroups) {
   1608 		/* Older kernels might not have the /proc/<pid>/setgroups files. */
   1609 		int ret = write_proc_file(j->initpid, "deny", "setgroups");
   1610 		if (ret != 0) {
   1611 			if (ret == -ENOENT) {
   1612 				/* See http://man7.org/linux/man-pages/man7/user_namespaces.7.html. */
   1613 				warn("could not disable setgroups(2)");
   1614 			} else
   1615 				kill_child_and_die(j, "failed to disable setgroups(2)");
   1616 		}
   1617 	}
   1618 	if (j->gidmap && write_proc_file(j->initpid, j->gidmap, "gid_map") != 0)
   1619 		kill_child_and_die(j, "failed to write gid_map");
   1620 }
   1621 
   1622 static void enter_user_namespace(const struct minijail *j)
   1623 {
   1624 	int uid = j->flags.uid ? j->uid : 0;
   1625 	int gid = j->flags.gid ? j->gid : 0;
   1626 	if (j->gidmap && setresgid(gid, gid, gid)) {
   1627 		pdie("user_namespaces: setresgid(%d, %d, %d) failed", gid, gid,
   1628 		     gid);
   1629 	}
   1630 	if (j->uidmap && setresuid(uid, uid, uid)) {
   1631 		pdie("user_namespaces: setresuid(%d, %d, %d) failed", uid, uid,
   1632 		     uid);
   1633 	}
   1634 }
   1635 
   1636 static void parent_setup_complete(int *pipe_fds)
   1637 {
   1638 	close(pipe_fds[0]);
   1639 	close(pipe_fds[1]);
   1640 }
   1641 
   1642 /*
   1643  * wait_for_parent_setup: Called by the child process to wait for any
   1644  * further parent-side setup to complete before continuing.
   1645  */
   1646 static void wait_for_parent_setup(int *pipe_fds)
   1647 {
   1648 	char buf;
   1649 
   1650 	close(pipe_fds[1]);
   1651 
   1652 	/* Wait for parent to complete setup and close the pipe. */
   1653 	if (read(pipe_fds[0], &buf, 1) != 0)
   1654 		die("failed to sync with parent");
   1655 	close(pipe_fds[0]);
   1656 }
   1657 
   1658 static void drop_ugid(const struct minijail *j)
   1659 {
   1660 	if (j->flags.inherit_suppl_gids + j->flags.keep_suppl_gids +
   1661 	    j->flags.set_suppl_gids > 1) {
   1662 		die("can only do one of inherit, keep, or set supplementary "
   1663 		    "groups");
   1664 	}
   1665 
   1666 	if (j->flags.inherit_suppl_gids) {
   1667 		if (initgroups(j->user, j->usergid))
   1668 			pdie("initgroups(%s, %d) failed", j->user, j->usergid);
   1669 	} else if (j->flags.set_suppl_gids) {
   1670 		if (setgroups(j->suppl_gid_count, j->suppl_gid_list))
   1671 			pdie("setgroups(suppl_gids) failed");
   1672 	} else if (!j->flags.keep_suppl_gids && !j->flags.disable_setgroups) {
   1673 		/*
   1674 		 * Only attempt to clear supplementary groups if we are changing
   1675 		 * users or groups, and if the caller did not request to disable
   1676 		 * setgroups (used when entering a user namespace as a
   1677 		 * non-privileged user).
   1678 		 */
   1679 		if ((j->flags.uid || j->flags.gid) && setgroups(0, NULL))
   1680 			pdie("setgroups(0, NULL) failed");
   1681 	}
   1682 
   1683 	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
   1684 		pdie("setresgid(%d, %d, %d) failed", j->gid, j->gid, j->gid);
   1685 
   1686 	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
   1687 		pdie("setresuid(%d, %d, %d) failed", j->uid, j->uid, j->uid);
   1688 }
   1689 
   1690 static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap)
   1691 {
   1692 	const uint64_t one = 1;
   1693 	unsigned int i;
   1694 	for (i = 0; i < sizeof(keep_mask) * 8 && i <= last_valid_cap; ++i) {
   1695 		if (keep_mask & (one << i))
   1696 			continue;
   1697 		if (prctl(PR_CAPBSET_DROP, i))
   1698 			pdie("could not drop capability from bounding set");
   1699 	}
   1700 }
   1701 
   1702 static void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
   1703 {
   1704 	if (!j->flags.use_caps)
   1705 		return;
   1706 
   1707 	cap_t caps = cap_get_proc();
   1708 	cap_value_t flag[1];
   1709 	const size_t ncaps = sizeof(j->caps) * 8;
   1710 	const uint64_t one = 1;
   1711 	unsigned int i;
   1712 	if (!caps)
   1713 		die("can't get process caps");
   1714 	if (cap_clear(caps))
   1715 		die("can't clear caps");
   1716 
   1717 	for (i = 0; i < ncaps && i <= last_valid_cap; ++i) {
   1718 		/* Keep CAP_SETPCAP for dropping bounding set bits. */
   1719 		if (i != CAP_SETPCAP && !(j->caps & (one << i)))
   1720 			continue;
   1721 		flag[0] = i;
   1722 		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
   1723 			die("can't add effective cap");
   1724 		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
   1725 			die("can't add permitted cap");
   1726 		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
   1727 			die("can't add inheritable cap");
   1728 	}
   1729 	if (cap_set_proc(caps))
   1730 		die("can't apply initial cleaned capset");
   1731 
   1732 	/*
   1733 	 * Instead of dropping bounding set first, do it here in case
   1734 	 * the caller had a more permissive bounding set which could
   1735 	 * have been used above to raise a capability that wasn't already
   1736 	 * present. This requires CAP_SETPCAP, so we raised/kept it above.
   1737 	 */
   1738 	drop_capbset(j->caps, last_valid_cap);
   1739 
   1740 	/* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
   1741 	if ((j->caps & (one << CAP_SETPCAP)) == 0) {
   1742 		flag[0] = CAP_SETPCAP;
   1743 		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
   1744 			die("can't clear effective cap");
   1745 		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
   1746 			die("can't clear permitted cap");
   1747 		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
   1748 			die("can't clear inheritable cap");
   1749 	}
   1750 
   1751 	if (cap_set_proc(caps))
   1752 		die("can't apply final cleaned capset");
   1753 
   1754 	/*
   1755 	 * If ambient capabilities are supported, clear all capabilities first,
   1756 	 * then raise the requested ones.
   1757 	 */
   1758 	if (j->flags.set_ambient_caps) {
   1759 		if (!cap_ambient_supported()) {
   1760 			pdie("ambient capabilities not supported");
   1761 		}
   1762 		if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) !=
   1763 		    0) {
   1764 			pdie("can't clear ambient capabilities");
   1765 		}
   1766 
   1767 		for (i = 0; i < ncaps && i <= last_valid_cap; ++i) {
   1768 			if (!(j->caps & (one << i)))
   1769 				continue;
   1770 
   1771 			if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i, 0,
   1772 				  0) != 0) {
   1773 				pdie("prctl(PR_CAP_AMBIENT, "
   1774 				     "PR_CAP_AMBIENT_RAISE, %u) failed",
   1775 				     i);
   1776 			}
   1777 		}
   1778 	}
   1779 
   1780 	cap_free(caps);
   1781 }
   1782 
   1783 static void set_seccomp_filter(const struct minijail *j)
   1784 {
   1785 	/*
   1786 	 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
   1787 	 * in the kernel source tree for an explanation of the parameters.
   1788 	 */
   1789 	if (j->flags.no_new_privs) {
   1790 		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
   1791 			pdie("prctl(PR_SET_NO_NEW_PRIVS)");
   1792 	}
   1793 
   1794 	/*
   1795 	 * Code running with ASan
   1796 	 * (https://github.com/google/sanitizers/wiki/AddressSanitizer)
   1797 	 * will make system calls not included in the syscall filter policy,
   1798 	 * which will likely crash the program. Skip setting seccomp filter in
   1799 	 * that case.
   1800 	 * 'running_with_asan()' has no inputs and is completely defined at
   1801 	 * build time, so this cannot be used by an attacker to skip setting
   1802 	 * seccomp filter.
   1803 	 */
   1804 	if (j->flags.seccomp_filter && running_with_asan()) {
   1805 		warn("running with ASan, not setting seccomp filter");
   1806 		return;
   1807 	}
   1808 
   1809 	if (j->flags.seccomp_filter) {
   1810 		if (j->flags.seccomp_filter_logging) {
   1811 			/*
   1812 			 * If logging seccomp filter failures,
   1813 			 * install the SIGSYS handler first.
   1814 			 */
   1815 			if (install_sigsys_handler())
   1816 				pdie("failed to install SIGSYS handler");
   1817 			warn("logging seccomp filter failures");
   1818 		} else if (j->flags.seccomp_filter_tsync) {
   1819 			/*
   1820 			 * If setting thread sync,
   1821 			 * reset the SIGSYS signal handler so that
   1822 			 * the entire thread group is killed.
   1823 			 */
   1824 			if (signal(SIGSYS, SIG_DFL) == SIG_ERR)
   1825 				pdie("failed to reset SIGSYS disposition");
   1826 			info("reset SIGSYS disposition");
   1827 		}
   1828 	}
   1829 
   1830 	/*
   1831 	 * Install the syscall filter.
   1832 	 */
   1833 	if (j->flags.seccomp_filter) {
   1834 		if (j->flags.seccomp_filter_tsync) {
   1835 			if (sys_seccomp(SECCOMP_SET_MODE_FILTER,
   1836 					SECCOMP_FILTER_FLAG_TSYNC,
   1837 					j->filter_prog)) {
   1838 				pdie("seccomp(tsync) failed");
   1839 			}
   1840 		} else {
   1841 			if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
   1842 				  j->filter_prog)) {
   1843 				pdie("prctl(seccomp_filter) failed");
   1844 			}
   1845 		}
   1846 	}
   1847 }
   1848 
   1849 static pid_t forward_pid = -1;
   1850 
   1851 static void forward_signal(__attribute__((unused)) int nr,
   1852 			   __attribute__((unused)) siginfo_t *siginfo,
   1853 			   __attribute__((unused)) void *void_context)
   1854 {
   1855 	if (forward_pid != -1) {
   1856 		kill(forward_pid, nr);
   1857 	}
   1858 }
   1859 
   1860 static void install_signal_handlers(void)
   1861 {
   1862 	struct sigaction act;
   1863 
   1864 	memset(&act, 0, sizeof(act));
   1865 	act.sa_sigaction = &forward_signal;
   1866 	act.sa_flags = SA_SIGINFO | SA_RESTART;
   1867 
   1868 	/* Handle all signals, except SIGCHLD. */
   1869 	for (int nr = 1; nr < NSIG; nr++) {
   1870 		/*
   1871 		 * We don't care if we get EINVAL: that just means that we
   1872 		 * can't handle this signal, so let's skip it and continue.
   1873 		 */
   1874 		sigaction(nr, &act, NULL);
   1875 	}
   1876 	/* Reset SIGCHLD's handler. */
   1877 	signal(SIGCHLD, SIG_DFL);
   1878 
   1879 	/* Handle real-time signals. */
   1880 	for (int nr = SIGRTMIN; nr <= SIGRTMAX; nr++) {
   1881 		sigaction(nr, &act, NULL);
   1882 	}
   1883 }
   1884 
   1885 static const char *lookup_hook_name(minijail_hook_event_t event)
   1886 {
   1887 	switch (event) {
   1888 	case MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS:
   1889 		return "pre-drop-caps";
   1890 	case MINIJAIL_HOOK_EVENT_PRE_EXECVE:
   1891 		return "pre-execve";
   1892 	case MINIJAIL_HOOK_EVENT_PRE_CHROOT:
   1893 		return "pre-chroot";
   1894 	case MINIJAIL_HOOK_EVENT_MAX:
   1895 		/*
   1896 		 * Adding this in favor of a default case to force the
   1897 		 * compiler to error out if a new enum value is added.
   1898 		 */
   1899 		break;
   1900 	}
   1901 	return "unknown";
   1902 }
   1903 
   1904 static void run_hooks_or_die(const struct minijail *j,
   1905 			     minijail_hook_event_t event)
   1906 {
   1907 	int rc;
   1908 	int hook_index = 0;
   1909 	for (struct hook *c = j->hooks_head; c; c = c->next) {
   1910 		if (c->event != event)
   1911 			continue;
   1912 		rc = c->hook(c->payload);
   1913 		if (rc != 0) {
   1914 			errno = -rc;
   1915 			pdie("%s hook (index %d) failed",
   1916 			     lookup_hook_name(event), hook_index);
   1917 		}
   1918 		/* Only increase the index within the same hook event type. */
   1919 		++hook_index;
   1920 	}
   1921 }
   1922 
   1923 void API minijail_enter(const struct minijail *j)
   1924 {
   1925 	/*
   1926 	 * If we're dropping caps, get the last valid cap from /proc now,
   1927 	 * since /proc can be unmounted before drop_caps() is called.
   1928 	 */
   1929 	unsigned int last_valid_cap = 0;
   1930 	if (j->flags.capbset_drop || j->flags.use_caps)
   1931 		last_valid_cap = get_last_valid_cap();
   1932 
   1933 	if (j->flags.pids)
   1934 		die("tried to enter a pid-namespaced jail;"
   1935 		    " try minijail_run()?");
   1936 
   1937 	if (j->flags.inherit_suppl_gids && !j->user)
   1938 		die("cannot inherit supplementary groups without setting a "
   1939 		    "username");
   1940 
   1941 	/*
   1942 	 * We can't recover from failures if we've dropped privileges partially,
   1943 	 * so we don't even try. If any of our operations fail, we abort() the
   1944 	 * entire process.
   1945 	 */
   1946 	if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
   1947 		pdie("setns(CLONE_NEWNS) failed");
   1948 
   1949 	if (j->flags.vfs) {
   1950 		if (unshare(CLONE_NEWNS))
   1951 			pdie("unshare(CLONE_NEWNS) failed");
   1952 		/*
   1953 		 * By default, remount all filesystems as private, unless
   1954 		 * - Passed a specific remount mode, in which case remount with that,
   1955 		 * - Asked not to remount at all, in which case skip the mount(2) call.
   1956 		 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
   1957 		 */
   1958 		if (j->remount_mode) {
   1959 			if (mount(NULL, "/", NULL, MS_REC | j->remount_mode, NULL))
   1960 				pdie("mount(NULL, /, NULL, MS_REC | MS_PRIVATE,"
   1961 				     " NULL) failed");
   1962 		}
   1963 	}
   1964 
   1965 	if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
   1966 		pdie("unshare(CLONE_NEWIPC) failed");
   1967 	}
   1968 
   1969 	if (j->flags.uts) {
   1970 		if (unshare(CLONE_NEWUTS))
   1971 			pdie("unshare(CLONE_NEWUTS) failed");
   1972 
   1973 		if (j->hostname && sethostname(j->hostname, strlen(j->hostname)))
   1974 			pdie("sethostname(%s) failed", j->hostname);
   1975 	}
   1976 
   1977 	if (j->flags.enter_net) {
   1978 		if (setns(j->netns_fd, CLONE_NEWNET))
   1979 			pdie("setns(CLONE_NEWNET) failed");
   1980 	} else if (j->flags.net) {
   1981 		if (unshare(CLONE_NEWNET))
   1982 			pdie("unshare(CLONE_NEWNET) failed");
   1983 		config_net_loopback();
   1984 	}
   1985 
   1986 	if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP))
   1987 		pdie("unshare(CLONE_NEWCGROUP) failed");
   1988 
   1989 	if (j->flags.new_session_keyring) {
   1990 		if (syscall(SYS_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL) < 0)
   1991 			pdie("keyctl(KEYCTL_JOIN_SESSION_KEYRING) failed");
   1992 	}
   1993 
   1994 	/* We have to process all the mounts before we chroot/pivot_root. */
   1995 	process_mounts_or_die(j);
   1996 
   1997 	if (j->flags.chroot && enter_chroot(j))
   1998 		pdie("chroot");
   1999 
   2000 	if (j->flags.pivot_root && enter_pivot_root(j))
   2001 		pdie("pivot_root");
   2002 
   2003 	if (j->flags.mount_tmp && mount_tmp(j))
   2004 		pdie("mount_tmp");
   2005 
   2006 	if (j->flags.remount_proc_ro && remount_proc_readonly(j))
   2007 		pdie("remount");
   2008 
   2009 	run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS);
   2010 
   2011 	/*
   2012 	 * If we're only dropping capabilities from the bounding set, but not
   2013 	 * from the thread's (permitted|inheritable|effective) sets, do it now.
   2014 	 */
   2015 	if (j->flags.capbset_drop) {
   2016 		drop_capbset(j->cap_bset, last_valid_cap);
   2017 	}
   2018 
   2019 	if (j->flags.use_caps) {
   2020 		/*
   2021 		 * POSIX capabilities are a bit tricky. If we drop our
   2022 		 * capability to change uids, our attempt to use setuid()
   2023 		 * below will fail. Hang on to root caps across setuid(), then
   2024 		 * lock securebits.
   2025 		 */
   2026 		if (prctl(PR_SET_KEEPCAPS, 1))
   2027 			pdie("prctl(PR_SET_KEEPCAPS) failed");
   2028 
   2029 		if (lock_securebits(j->securebits_skip_mask) < 0) {
   2030 			pdie("locking securebits failed");
   2031 		}
   2032 	}
   2033 
   2034 	if (j->flags.no_new_privs) {
   2035 		/*
   2036 		 * If we're setting no_new_privs, we can drop privileges
   2037 		 * before setting seccomp filter. This way filter policies
   2038 		 * don't need to allow privilege-dropping syscalls.
   2039 		 */
   2040 		drop_ugid(j);
   2041 		drop_caps(j, last_valid_cap);
   2042 		set_seccomp_filter(j);
   2043 	} else {
   2044 		/*
   2045 		 * If we're not setting no_new_privs,
   2046 		 * we need to set seccomp filter *before* dropping privileges.
   2047 		 * WARNING: this means that filter policies *must* allow
   2048 		 * setgroups()/setresgid()/setresuid() for dropping root and
   2049 		 * capget()/capset()/prctl() for dropping caps.
   2050 		 */
   2051 		set_seccomp_filter(j);
   2052 		drop_ugid(j);
   2053 		drop_caps(j, last_valid_cap);
   2054 	}
   2055 
   2056 	/*
   2057 	 * Select the specified alternate syscall table.  The table must not
   2058 	 * block prctl(2) if we're using seccomp as well.
   2059 	 */
   2060 	if (j->flags.alt_syscall) {
   2061 		if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
   2062 			pdie("prctl(PR_ALT_SYSCALL) failed");
   2063 	}
   2064 
   2065 	/*
   2066 	 * seccomp has to come last since it cuts off all the other
   2067 	 * privilege-dropping syscalls :)
   2068 	 */
   2069 	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
   2070 		if ((errno == EINVAL) && seccomp_can_softfail()) {
   2071 			warn("seccomp not supported");
   2072 			return;
   2073 		}
   2074 		pdie("prctl(PR_SET_SECCOMP) failed");
   2075 	}
   2076 }
   2077 
   2078 /* TODO(wad): will visibility affect this variable? */
   2079 static int init_exitstatus = 0;
   2080 
   2081 void init_term(int __attribute__ ((unused)) sig)
   2082 {
   2083 	_exit(init_exitstatus);
   2084 }
   2085 
   2086 void init(pid_t rootpid)
   2087 {
   2088 	pid_t pid;
   2089 	int status;
   2090 	/* So that we exit with the right status. */
   2091 	signal(SIGTERM, init_term);
   2092 	/* TODO(wad): self jail with seccomp filters here. */
   2093 	while ((pid = wait(&status)) > 0) {
   2094 		/*
   2095 		 * This loop will only end when either there are no processes
   2096 		 * left inside our pid namespace or we get a signal.
   2097 		 */
   2098 		if (pid == rootpid)
   2099 			init_exitstatus = status;
   2100 	}
   2101 	if (!WIFEXITED(init_exitstatus))
   2102 		_exit(MINIJAIL_ERR_INIT);
   2103 	_exit(WEXITSTATUS(init_exitstatus));
   2104 }
   2105 
   2106 int API minijail_from_fd(int fd, struct minijail *j)
   2107 {
   2108 	size_t sz = 0;
   2109 	size_t bytes = read(fd, &sz, sizeof(sz));
   2110 	char *buf;
   2111 	int r;
   2112 	if (sizeof(sz) != bytes)
   2113 		return -EINVAL;
   2114 	if (sz > USHRT_MAX)	/* arbitrary sanity check */
   2115 		return -E2BIG;
   2116 	buf = malloc(sz);
   2117 	if (!buf)
   2118 		return -ENOMEM;
   2119 	bytes = read(fd, buf, sz);
   2120 	if (bytes != sz) {
   2121 		free(buf);
   2122 		return -EINVAL;
   2123 	}
   2124 	r = minijail_unmarshal(j, buf, sz);
   2125 	free(buf);
   2126 	return r;
   2127 }
   2128 
   2129 int API minijail_to_fd(struct minijail *j, int fd)
   2130 {
   2131 	char *buf;
   2132 	size_t sz = minijail_size(j);
   2133 	ssize_t written;
   2134 	int r;
   2135 
   2136 	if (!sz)
   2137 		return -EINVAL;
   2138 	buf = malloc(sz);
   2139 	r = minijail_marshal(j, buf, sz);
   2140 	if (r) {
   2141 		free(buf);
   2142 		return r;
   2143 	}
   2144 	/* Sends [size][minijail]. */
   2145 	written = write(fd, &sz, sizeof(sz));
   2146 	if (written != sizeof(sz)) {
   2147 		free(buf);
   2148 		return -EFAULT;
   2149 	}
   2150 	written = write(fd, buf, sz);
   2151 	if (written < 0 || (size_t) written != sz) {
   2152 		free(buf);
   2153 		return -EFAULT;
   2154 	}
   2155 	free(buf);
   2156 	return 0;
   2157 }
   2158 
   2159 int setup_preload(void)
   2160 {
   2161 #if defined(__ANDROID__)
   2162 	/* Don't use LDPRELOAD on Android. */
   2163 	return 0;
   2164 #else
   2165 	char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
   2166 	char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
   2167 	if (!newenv)
   2168 		return -ENOMEM;
   2169 
   2170 	/* Only insert a separating space if we have something to separate... */
   2171 	sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
   2172 		PRELOADPATH);
   2173 
   2174 	/* setenv() makes a copy of the string we give it. */
   2175 	setenv(kLdPreloadEnvVar, newenv, 1);
   2176 	free(newenv);
   2177 	return 0;
   2178 #endif
   2179 }
   2180 
   2181 static int setup_pipe(int fds[2])
   2182 {
   2183 	int r = pipe(fds);
   2184 	char fd_buf[11];
   2185 	if (r)
   2186 		return r;
   2187 	r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
   2188 	if (r <= 0)
   2189 		return -EINVAL;
   2190 	setenv(kFdEnvVar, fd_buf, 1);
   2191 	return 0;
   2192 }
   2193 
   2194 static int close_open_fds(int *inheritable_fds, size_t size)
   2195 {
   2196 	const char *kFdPath = "/proc/self/fd";
   2197 
   2198 	DIR *d = opendir(kFdPath);
   2199 	struct dirent *dir_entry;
   2200 
   2201 	if (d == NULL)
   2202 		return -1;
   2203 	int dir_fd = dirfd(d);
   2204 	while ((dir_entry = readdir(d)) != NULL) {
   2205 		size_t i;
   2206 		char *end;
   2207 		bool should_close = true;
   2208 		const int fd = strtol(dir_entry->d_name, &end, 10);
   2209 
   2210 		if ((*end) != '\0') {
   2211 			continue;
   2212 		}
   2213 		/*
   2214 		 * We might have set up some pipes that we want to share with
   2215 		 * the parent process, and should not be closed.
   2216 		 */
   2217 		for (i = 0; i < size; ++i) {
   2218 			if (fd == inheritable_fds[i]) {
   2219 				should_close = false;
   2220 				break;
   2221 			}
   2222 		}
   2223 		/* Also avoid closing the directory fd. */
   2224 		if (should_close && fd != dir_fd)
   2225 			close(fd);
   2226 	}
   2227 	closedir(d);
   2228 	return 0;
   2229 }
   2230 
   2231 static int redirect_fds(struct minijail *j)
   2232 {
   2233 	size_t i, i2;
   2234 	int closeable;
   2235 	for (i = 0; i < j->preserved_fd_count; i++) {
   2236 		if (dup2(j->preserved_fds[i].parent_fd,
   2237 			 j->preserved_fds[i].child_fd) == -1) {
   2238 			return -1;
   2239 		}
   2240 	}
   2241 	/*
   2242 	 * After all fds have been duped, we are now free to close all parent
   2243 	 * fds that are *not* child fds.
   2244 	 */
   2245 	for (i = 0; i < j->preserved_fd_count; i++) {
   2246 		closeable = true;
   2247 		for (i2 = 0; i2 < j->preserved_fd_count; i2++) {
   2248 			closeable &= j->preserved_fds[i].parent_fd !=
   2249 				     j->preserved_fds[i2].child_fd;
   2250 		}
   2251 		if (closeable)
   2252 			close(j->preserved_fds[i].parent_fd);
   2253 	}
   2254 	return 0;
   2255 }
   2256 
/*
 * Structure that specifies how to start a minijail.
 *
 * It is the input half of minijail_run_internal(); the public
 * minijail_run*() wrappers and minijail_fork() each fill one in.
 *
 * filename - The program to exec in the child. Required if `exec_in_child` = 1.
 * argv - Arguments for the child program. Required if `exec_in_child` = 1.
 * use_preload - If true use LD_PRELOAD.
 * exec_in_child - If true, run `filename`. Otherwise, the child will return to
 *     the caller.
 */
struct minijail_run_config {
	const char *filename;	/* Passed to execve(2) as the path. */
	char *const *argv;	/* Passed to execve(2) as the argument vector. */
	int use_preload;	/* Boolean; not supported together with hooks. */
	int exec_in_child;	/* Boolean; must be true when use_preload is. */
};
   2272 
/*
 * Set of pointers to fill with values from minijail_run.
 * All arguments are allowed to be NULL if unused.
 *
 * A non-NULL pipe pointer also requests the matching pipe: the pipe is only
 * created when its pointer is set.
 *
 * pstdin_fd - Filled with stdin pipe if non-NULL.
 * pstdout_fd - Filled with stdout pipe if non-NULL.
 * pstderr_fd - Filled with stderr pipe if non-NULL.
 * pchild_pid - Filled with the pid of the child process if non-NULL.
 */
struct minijail_run_status {
	int *pstdin_fd;		/* Receives the write end of the child's stdin. */
	int *pstdout_fd;	/* Receives the read end of the child's stdout. */
	int *pstderr_fd;	/* Receives the read end of the child's stderr. */
	pid_t *pchild_pid;	/* Receives the pid of the forked/cloned child. */
};
   2288 
/*
 * Forward declaration: the public minijail_run*() wrappers and
 * minijail_fork() below all delegate to this function, which is defined
 * after them.
 */
static int minijail_run_internal(struct minijail *j,
				 const struct minijail_run_config *config,
				 struct minijail_run_status *status_out);
   2292 
   2293 int API minijail_run(struct minijail *j, const char *filename,
   2294 		     char *const argv[])
   2295 {
   2296 	struct minijail_run_config config = {
   2297 		.filename = filename,
   2298 		.argv = argv,
   2299 		.use_preload = true,
   2300 		.exec_in_child = true,
   2301 	};
   2302 	struct minijail_run_status status = {};
   2303 	return minijail_run_internal(j, &config, &status);
   2304 }
   2305 
   2306 int API minijail_run_pid(struct minijail *j, const char *filename,
   2307 			 char *const argv[], pid_t *pchild_pid)
   2308 {
   2309 	struct minijail_run_config config = {
   2310 		.filename = filename,
   2311 		.argv = argv,
   2312 		.use_preload = true,
   2313 		.exec_in_child = true,
   2314 	};
   2315 	struct minijail_run_status status = {
   2316 		.pchild_pid = pchild_pid,
   2317 	};
   2318 	return minijail_run_internal(j, &config, &status);
   2319 }
   2320 
   2321 int API minijail_run_pipe(struct minijail *j, const char *filename,
   2322 			  char *const argv[], int *pstdin_fd)
   2323 {
   2324 	struct minijail_run_config config = {
   2325 		.filename = filename,
   2326 		.argv = argv,
   2327 		.use_preload = true,
   2328 		.exec_in_child = true,
   2329 	};
   2330 	struct minijail_run_status status = {
   2331 		.pstdin_fd = pstdin_fd,
   2332 	};
   2333 	return minijail_run_internal(j, &config, &status);
   2334 }
   2335 
   2336 int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
   2337 			       char *const argv[], pid_t *pchild_pid,
   2338 			       int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
   2339 {
   2340 	struct minijail_run_config config = {
   2341 		.filename = filename,
   2342 		.argv = argv,
   2343 		.use_preload = true,
   2344 		.exec_in_child = true,
   2345 	};
   2346 	struct minijail_run_status status = {
   2347 		.pstdin_fd = pstdin_fd,
   2348 		.pstdout_fd = pstdout_fd,
   2349 		.pstderr_fd = pstderr_fd,
   2350 		.pchild_pid = pchild_pid,
   2351 	};
   2352 	return minijail_run_internal(j, &config, &status);
   2353 }
   2354 
   2355 int API minijail_run_no_preload(struct minijail *j, const char *filename,
   2356 				char *const argv[])
   2357 {
   2358 	struct minijail_run_config config = {
   2359 		.filename = filename,
   2360 		.argv = argv,
   2361 		.use_preload = false,
   2362 		.exec_in_child = true,
   2363 	};
   2364 	struct minijail_run_status status = {};
   2365 	return minijail_run_internal(j, &config, &status);
   2366 }
   2367 
   2368 int API minijail_run_pid_pipes_no_preload(struct minijail *j,
   2369 					  const char *filename,
   2370 					  char *const argv[],
   2371 					  pid_t *pchild_pid,
   2372 					  int *pstdin_fd,
   2373 					  int *pstdout_fd,
   2374 					  int *pstderr_fd)
   2375 {
   2376 	struct minijail_run_config config = {
   2377 		.filename = filename,
   2378 		.argv = argv,
   2379 		.use_preload = false,
   2380 		.exec_in_child = true,
   2381 	};
   2382 	struct minijail_run_status status = {
   2383 		.pstdin_fd = pstdin_fd,
   2384 		.pstdout_fd = pstdout_fd,
   2385 		.pstderr_fd = pstderr_fd,
   2386 		.pchild_pid = pchild_pid,
   2387 	};
   2388 	return minijail_run_internal(j, &config, &status);
   2389 }
   2390 
   2391 pid_t API minijail_fork(struct minijail *j)
   2392 {
   2393 	struct minijail_run_config config = {};
   2394 	struct minijail_run_status status = {};
   2395 	return minijail_run_internal(j, &config, &status);
   2396 }
   2397 
   2398 static int minijail_run_internal(struct minijail *j,
   2399 				 const struct minijail_run_config *config,
   2400 				 struct minijail_run_status *status_out)
   2401 {
   2402 	char *oldenv, *oldenv_copy = NULL;
   2403 	pid_t child_pid;
   2404 	int pipe_fds[2];
   2405 	int stdin_fds[2];
   2406 	int stdout_fds[2];
   2407 	int stderr_fds[2];
   2408 	int child_sync_pipe_fds[2];
   2409 	int sync_child = 0;
   2410 	int ret;
   2411 	/* We need to remember this across the minijail_preexec() call. */
   2412 	int pid_namespace = j->flags.pids;
   2413 	/*
   2414 	 * Create an init process if we are entering a pid namespace, unless the
   2415 	 * user has explicitly opted out by calling minijail_run_as_init().
   2416 	 */
   2417 	int do_init = j->flags.do_init && !j->flags.run_as_init;
   2418 	int use_preload = config->use_preload;
   2419 
   2420 	if (use_preload) {
   2421 		if (j->hooks_head != NULL)
   2422 			die("Minijail hooks are not supported with LD_PRELOAD");
   2423 		if (!config->exec_in_child)
   2424 			die("minijail_fork is not supported with LD_PRELOAD");
   2425 
   2426 		oldenv = getenv(kLdPreloadEnvVar);
   2427 		if (oldenv) {
   2428 			oldenv_copy = strdup(oldenv);
   2429 			if (!oldenv_copy)
   2430 				return -ENOMEM;
   2431 		}
   2432 
   2433 		if (setup_preload())
   2434 			return -EFAULT;
   2435 	}
   2436 
   2437 	if (!use_preload) {
   2438 		if (j->flags.use_caps && j->caps != 0 &&
   2439 		    !j->flags.set_ambient_caps) {
   2440 			die("non-empty, non-ambient capabilities are not "
   2441 			    "supported without LD_PRELOAD");
   2442 		}
   2443 	}
   2444 
   2445 	if (use_preload) {
   2446 		/*
   2447 		 * Before we fork(2) and execve(2) the child process, we need
   2448 		 * to open a pipe(2) to send the minijail configuration over.
   2449 		 */
   2450 		if (setup_pipe(pipe_fds))
   2451 			return -EFAULT;
   2452 	}
   2453 
   2454 	/*
   2455 	 * If we want to write to the child process' standard input,
   2456 	 * create the pipe(2) now.
   2457 	 */
   2458 	if (status_out->pstdin_fd) {
   2459 		if (pipe(stdin_fds))
   2460 			return -EFAULT;
   2461 	}
   2462 
   2463 	/*
   2464 	 * If we want to read from the child process' standard output,
   2465 	 * create the pipe(2) now.
   2466 	 */
   2467 	if (status_out->pstdout_fd) {
   2468 		if (pipe(stdout_fds))
   2469 			return -EFAULT;
   2470 	}
   2471 
   2472 	/*
   2473 	 * If we want to read from the child process' standard error,
   2474 	 * create the pipe(2) now.
   2475 	 */
   2476 	if (status_out->pstderr_fd) {
   2477 		if (pipe(stderr_fds))
   2478 			return -EFAULT;
   2479 	}
   2480 
   2481 	/*
   2482 	 * If we want to set up a new uid/gid map in the user namespace,
   2483 	 * or if we need to add the child process to cgroups, create the pipe(2)
   2484 	 * to sync between parent and child.
   2485 	 */
   2486 	if (j->flags.userns || j->flags.cgroups) {
   2487 		sync_child = 1;
   2488 		if (pipe(child_sync_pipe_fds))
   2489 			return -EFAULT;
   2490 	}
   2491 
   2492 	/*
   2493 	 * Use sys_clone() if and only if we're creating a pid namespace.
   2494 	 *
   2495 	 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
   2496 	 *
   2497 	 * In multithreaded programs, there are a bunch of locks inside libc,
   2498 	 * some of which may be held by other threads at the time that we call
   2499 	 * minijail_run_pid(). If we call fork(), glibc does its level best to
   2500 	 * ensure that we hold all of these locks before it calls clone()
   2501 	 * internally and drop them after clone() returns, but when we call
   2502 	 * sys_clone(2) directly, all that gets bypassed and we end up with a
   2503 	 * child address space where some of libc's important locks are held by
   2504 	 * other threads (which did not get cloned, and hence will never release
   2505 	 * those locks). This is okay so long as we call exec() immediately
   2506 	 * after, but a bunch of seemingly-innocent libc functions like setenv()
   2507 	 * take locks.
   2508 	 *
   2509 	 * Hence, only call sys_clone() if we need to, in order to get at pid
   2510 	 * namespacing. If we follow this path, the child's address space might
   2511 	 * have broken locks; you may only call functions that do not acquire
   2512 	 * any locks.
   2513 	 *
   2514 	 * Unfortunately, fork() acquires every lock it can get its hands on, as
   2515 	 * previously detailed, so this function is highly likely to deadlock
   2516 	 * later on (see "deadlock here") if we're multithreaded.
   2517 	 *
   2518 	 * We might hack around this by having the clone()d child (init of the
   2519 	 * pid namespace) return directly, rather than leaving the clone()d
   2520 	 * process hanging around to be init for the new namespace (and having
   2521 	 * its fork()ed child return in turn), but that process would be
   2522 	 * crippled with its libc locks potentially broken. We might try
   2523 	 * fork()ing in the parent before we clone() to ensure that we own all
   2524 	 * the locks, but then we have to have the forked child hanging around
   2525 	 * consuming resources (and possibly having file descriptors / shared
   2526 	 * memory regions / etc attached). We'd need to keep the child around to
   2527 	 * avoid having its children get reparented to init.
   2528 	 *
   2529 	 * TODO(ellyjones): figure out if the "forked child hanging around"
   2530 	 * problem is fixable or not. It would be nice if we worked in this
   2531 	 * case.
   2532 	 */
   2533 	if (pid_namespace) {
   2534 		int clone_flags = CLONE_NEWPID | SIGCHLD;
   2535 		if (j->flags.userns)
   2536 			clone_flags |= CLONE_NEWUSER;
   2537 		child_pid = syscall(SYS_clone, clone_flags, NULL);
   2538 	} else {
   2539 		child_pid = fork();
   2540 	}
   2541 
   2542 	if (child_pid < 0) {
   2543 		if (use_preload) {
   2544 			free(oldenv_copy);
   2545 		}
   2546 		die("failed to fork child");
   2547 	}
   2548 
   2549 	if (child_pid) {
   2550 		if (use_preload) {
   2551 			/* Restore parent's LD_PRELOAD. */
   2552 			if (oldenv_copy) {
   2553 				setenv(kLdPreloadEnvVar, oldenv_copy, 1);
   2554 				free(oldenv_copy);
   2555 			} else {
   2556 				unsetenv(kLdPreloadEnvVar);
   2557 			}
   2558 			unsetenv(kFdEnvVar);
   2559 		}
   2560 
   2561 		j->initpid = child_pid;
   2562 
   2563 		if (j->flags.forward_signals) {
   2564 			forward_pid = child_pid;
   2565 			install_signal_handlers();
   2566 		}
   2567 
   2568 		if (j->flags.pid_file)
   2569 			write_pid_file_or_die(j);
   2570 
   2571 		if (j->flags.cgroups)
   2572 			add_to_cgroups_or_die(j);
   2573 
   2574 		if (j->rlimit_count)
   2575 			set_rlimits_or_die(j);
   2576 
   2577 		if (j->flags.userns)
   2578 			write_ugid_maps_or_die(j);
   2579 
   2580 		if (sync_child)
   2581 			parent_setup_complete(child_sync_pipe_fds);
   2582 
   2583 		if (use_preload) {
   2584 			/* Send marshalled minijail. */
   2585 			close(pipe_fds[0]);	/* read endpoint */
   2586 			ret = minijail_to_fd(j, pipe_fds[1]);
   2587 			close(pipe_fds[1]);	/* write endpoint */
   2588 			if (ret) {
   2589 				kill(j->initpid, SIGKILL);
   2590 				die("failed to send marshalled minijail");
   2591 			}
   2592 		}
   2593 
   2594 		if (status_out->pchild_pid)
   2595 			*status_out->pchild_pid = child_pid;
   2596 
   2597 		/*
   2598 		 * If we want to write to the child process' standard input,
   2599 		 * set up the write end of the pipe.
   2600 		 */
   2601 		if (status_out->pstdin_fd)
   2602 			*status_out->pstdin_fd =
   2603 				setup_pipe_end(stdin_fds, 1 /* write end */);
   2604 
   2605 		/*
   2606 		 * If we want to read from the child process' standard output,
   2607 		 * set up the read end of the pipe.
   2608 		 */
   2609 		if (status_out->pstdout_fd)
   2610 			*status_out->pstdout_fd =
   2611 				setup_pipe_end(stdout_fds, 0 /* read end */);
   2612 
   2613 		/*
   2614 		 * If we want to read from the child process' standard error,
   2615 		 * set up the read end of the pipe.
   2616 		 */
   2617 		if (status_out->pstderr_fd)
   2618 			*status_out->pstderr_fd =
   2619 				setup_pipe_end(stderr_fds, 0 /* read end */);
   2620 
   2621 		/*
   2622 		 * If forking return the child pid, in the normal exec case
   2623 		 * return 0 for success.
   2624 		 */
   2625 		if (!config->exec_in_child)
   2626 			return child_pid;
   2627 		return 0;
   2628 	}
   2629 	/* Child process. */
   2630 	free(oldenv_copy);
   2631 
   2632 	if (j->flags.reset_signal_mask) {
   2633 		sigset_t signal_mask;
   2634 		if (sigemptyset(&signal_mask) != 0)
   2635 			pdie("sigemptyset failed");
   2636 		if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
   2637 			pdie("sigprocmask failed");
   2638 	}
   2639 
   2640 	if (j->flags.close_open_fds) {
   2641 		const size_t kMaxInheritableFdsSize = 10 + MAX_PRESERVED_FDS;
   2642 		int inheritable_fds[kMaxInheritableFdsSize];
   2643 		size_t size = 0;
   2644 		size_t i;
   2645 		if (use_preload) {
   2646 			inheritable_fds[size++] = pipe_fds[0];
   2647 			inheritable_fds[size++] = pipe_fds[1];
   2648 		}
   2649 		if (sync_child) {
   2650 			inheritable_fds[size++] = child_sync_pipe_fds[0];
   2651 			inheritable_fds[size++] = child_sync_pipe_fds[1];
   2652 		}
   2653 		if (status_out->pstdin_fd) {
   2654 			inheritable_fds[size++] = stdin_fds[0];
   2655 			inheritable_fds[size++] = stdin_fds[1];
   2656 		}
   2657 		if (status_out->pstdout_fd) {
   2658 			inheritable_fds[size++] = stdout_fds[0];
   2659 			inheritable_fds[size++] = stdout_fds[1];
   2660 		}
   2661 		if (status_out->pstderr_fd) {
   2662 			inheritable_fds[size++] = stderr_fds[0];
   2663 			inheritable_fds[size++] = stderr_fds[1];
   2664 		}
   2665 		for (i = 0; i < j->preserved_fd_count; i++) {
   2666 			/*
   2667 			 * Preserve all parent_fds. They will be dup2(2)-ed in
   2668 			 * the child later.
   2669 			 */
   2670 			inheritable_fds[size++] = j->preserved_fds[i].parent_fd;
   2671 		}
   2672 
   2673 		if (close_open_fds(inheritable_fds, size) < 0)
   2674 			die("failed to close open file descriptors");
   2675 	}
   2676 
   2677 	if (redirect_fds(j))
   2678 		die("failed to set up fd redirections");
   2679 
   2680 	if (sync_child)
   2681 		wait_for_parent_setup(child_sync_pipe_fds);
   2682 
   2683 	if (j->flags.userns)
   2684 		enter_user_namespace(j);
   2685 
   2686 	/*
   2687 	 * If we want to write to the jailed process' standard input,
   2688 	 * set up the read end of the pipe.
   2689 	 */
   2690 	if (status_out->pstdin_fd) {
   2691 		if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
   2692 					    STDIN_FILENO) < 0)
   2693 			die("failed to set up stdin pipe");
   2694 	}
   2695 
   2696 	/*
   2697 	 * If we want to read from the jailed process' standard output,
   2698 	 * set up the write end of the pipe.
   2699 	 */
   2700 	if (status_out->pstdout_fd) {
   2701 		if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
   2702 					    STDOUT_FILENO) < 0)
   2703 			die("failed to set up stdout pipe");
   2704 	}
   2705 
   2706 	/*
   2707 	 * If we want to read from the jailed process' standard error,
   2708 	 * set up the write end of the pipe.
   2709 	 */
   2710 	if (status_out->pstderr_fd) {
   2711 		if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
   2712 					    STDERR_FILENO) < 0)
   2713 			die("failed to set up stderr pipe");
   2714 	}
   2715 
   2716 	/*
   2717 	 * If any of stdin, stdout, or stderr are TTYs, create a new session.
   2718 	 * This prevents the jailed process from using the TIOCSTI ioctl
   2719 	 * to push characters into the parent process terminal's input buffer,
   2720 	 * therefore escaping the jail.
   2721 	 *
   2722 	 * Since it has just forked, the child will not be a process group
   2723 	 * leader, and this call to setsid() should always succeed.
   2724 	 */
   2725 	if (isatty(STDIN_FILENO) || isatty(STDOUT_FILENO) ||
   2726 	    isatty(STDERR_FILENO)) {
   2727 		if (setsid() < 0) {
   2728 			pdie("setsid() failed");
   2729 		}
   2730 	}
   2731 
   2732 	/* If running an init program, let it decide when/how to mount /proc. */
   2733 	if (pid_namespace && !do_init)
   2734 		j->flags.remount_proc_ro = 0;
   2735 
   2736 	if (use_preload) {
   2737 		/* Strip out flags that cannot be inherited across execve(2). */
   2738 		minijail_preexec(j);
   2739 	} else {
   2740 		/*
   2741 		 * If not using LD_PRELOAD, do all jailing before execve(2).
   2742 		 * Note that PID namespaces can only be entered on fork(2),
   2743 		 * so that flag is still cleared.
   2744 		 */
   2745 		j->flags.pids = 0;
   2746 	}
   2747 
   2748 	/*
   2749 	 * Jail this process.
   2750 	 * If forking, return.
   2751 	 * If not, execve(2) the target.
   2752 	 */
   2753 	minijail_enter(j);
   2754 
   2755 	if (config->exec_in_child && pid_namespace && do_init) {
   2756 		/*
   2757 		 * pid namespace: this process will become init inside the new
   2758 		 * namespace. We don't want all programs we might exec to have
   2759 		 * to know how to be init. Normally (do_init == 1) we fork off
   2760 		 * a child to actually run the program. If |do_init == 0|, we
   2761 		 * let the program keep pid 1 and be init.
   2762 		 *
   2763 		 * If we're multithreaded, we'll probably deadlock here. See
   2764 		 * WARNING above.
   2765 		 */
   2766 		child_pid = fork();
   2767 		if (child_pid < 0) {
   2768 			_exit(child_pid);
   2769 		} else if (child_pid > 0) {
   2770 			/*
   2771 			 * Best effort. Don't bother checking the return value.
   2772 			 */
   2773 			prctl(PR_SET_NAME, "minijail-init");
   2774 			init(child_pid);	/* Never returns. */
   2775 		}
   2776 	}
   2777 
   2778 	run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_EXECVE);
   2779 
   2780 	if (!config->exec_in_child)
   2781 		return 0;
   2782 
   2783 	/*
   2784 	 * If we aren't pid-namespaced, or the jailed program asked to be init:
   2785 	 *   calling process
   2786 	 *   -> execve()-ing process
   2787 	 * If we are:
   2788 	 *   calling process
   2789 	 *   -> init()-ing process
   2790 	 *      -> execve()-ing process
   2791 	 */
   2792 	ret = execve(config->filename, config->argv, environ);
   2793 	if (ret == -1) {
   2794 		pwarn("execve(%s) failed", config->filename);
   2795 	}
   2796 	_exit(ret);
   2797 }
   2798 
   2799 int API minijail_kill(struct minijail *j)
   2800 {
   2801 	int st;
   2802 	if (kill(j->initpid, SIGTERM))
   2803 		return -errno;
   2804 	if (waitpid(j->initpid, &st, 0) < 0)
   2805 		return -errno;
   2806 	return st;
   2807 }
   2808 
   2809 int API minijail_wait(struct minijail *j)
   2810 {
   2811 	int st;
   2812 	if (waitpid(j->initpid, &st, 0) < 0)
   2813 		return -errno;
   2814 
   2815 	if (!WIFEXITED(st)) {
   2816 		int error_status = st;
   2817 		if (WIFSIGNALED(st)) {
   2818 			int signum = WTERMSIG(st);
   2819 			warn("child process %d received signal %d",
   2820 			     j->initpid, signum);
   2821 			/*
   2822 			 * We return MINIJAIL_ERR_JAIL if the process received
   2823 			 * SIGSYS, which happens when a syscall is blocked by
   2824 			 * seccomp filters.
   2825 			 * If not, we do what bash(1) does:
   2826 			 * $? = 128 + signum
   2827 			 */
   2828 			if (signum == SIGSYS) {
   2829 				error_status = MINIJAIL_ERR_JAIL;
   2830 			} else {
   2831 				error_status = 128 + signum;
   2832 			}
   2833 		}
   2834 		return error_status;
   2835 	}
   2836 
   2837 	int exit_status = WEXITSTATUS(st);
   2838 	if (exit_status != 0)
   2839 		info("child process %d exited with status %d",
   2840 		     j->initpid, exit_status);
   2841 
   2842 	return exit_status;
   2843 }
   2844 
   2845 void API minijail_destroy(struct minijail *j)
   2846 {
   2847 	size_t i;
   2848 
   2849 	if (j->flags.seccomp_filter && j->filter_prog) {
   2850 		free(j->filter_prog->filter);
   2851 		free(j->filter_prog);
   2852 	}
   2853 	free_mounts_list(j);
   2854 	while (j->hooks_head) {
   2855 		struct hook *c = j->hooks_head;
   2856 		j->hooks_head = c->next;
   2857 		free(c);
   2858 	}
   2859 	j->hooks_tail = NULL;
   2860 	if (j->user)
   2861 		free(j->user);
   2862 	if (j->suppl_gid_list)
   2863 		free(j->suppl_gid_list);
   2864 	if (j->chrootdir)
   2865 		free(j->chrootdir);
   2866 	if (j->pid_file_path)
   2867 		free(j->pid_file_path);
   2868 	if (j->uidmap)
   2869 		free(j->uidmap);
   2870 	if (j->gidmap)
   2871 		free(j->gidmap);
   2872 	if (j->hostname)
   2873 		free(j->hostname);
   2874 	if (j->alt_syscall_table)
   2875 		free(j->alt_syscall_table);
   2876 	for (i = 0; i < j->cgroup_count; ++i)
   2877 		free(j->cgroups[i]);
   2878 	free(j);
   2879 }
   2880 
/*
 * minijail_log_to_fd: configures logging by calling init_logging() with
 * LOG_TO_FD, |fd| and |min_priority|.
 *
 * NOTE(review): |min_priority| is presumably a syslog-style priority
 * threshold -- confirm against init_logging().
 */
void API minijail_log_to_fd(int fd, int min_priority)
{
	init_logging(LOG_TO_FD, fd, min_priority);
}
   2885