1 /* Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6 #define _BSD_SOURCE 7 #define _DEFAULT_SOURCE 8 #define _GNU_SOURCE 9 10 #include <asm/unistd.h> 11 #include <dirent.h> 12 #include <errno.h> 13 #include <fcntl.h> 14 #include <grp.h> 15 #include <linux/capability.h> 16 #include <linux/filter.h> 17 #include <sched.h> 18 #include <signal.h> 19 #include <stdbool.h> 20 #include <stddef.h> 21 #include <stdio.h> 22 #include <stdlib.h> 23 #include <string.h> 24 #include <sys/capability.h> 25 #include <sys/mount.h> 26 #include <sys/param.h> 27 #include <sys/prctl.h> 28 #include <sys/resource.h> 29 #include <sys/stat.h> 30 #include <sys/sysmacros.h> 31 #include <sys/types.h> 32 #include <sys/user.h> 33 #include <sys/wait.h> 34 #include <syscall.h> 35 #include <unistd.h> 36 37 #include "libminijail.h" 38 #include "libminijail-private.h" 39 40 #include "signal_handler.h" 41 #include "syscall_filter.h" 42 #include "syscall_wrapper.h" 43 #include "system.h" 44 #include "util.h" 45 46 /* Until these are reliably available in linux/prctl.h. */ 47 #ifndef PR_ALT_SYSCALL 48 # define PR_ALT_SYSCALL 0x43724f53 49 #endif 50 51 /* Seccomp filter related flags. */ 52 #ifndef PR_SET_NO_NEW_PRIVS 53 # define PR_SET_NO_NEW_PRIVS 38 54 #endif 55 56 #ifndef SECCOMP_MODE_FILTER 57 #define SECCOMP_MODE_FILTER 2 /* Uses user-supplied filter. */ 58 #endif 59 60 #ifndef SECCOMP_SET_MODE_STRICT 61 # define SECCOMP_SET_MODE_STRICT 0 62 #endif 63 #ifndef SECCOMP_SET_MODE_FILTER 64 # define SECCOMP_SET_MODE_FILTER 1 65 #endif 66 67 #ifndef SECCOMP_FILTER_FLAG_TSYNC 68 # define SECCOMP_FILTER_FLAG_TSYNC 1 69 #endif 70 /* End seccomp filter related flags. */ 71 72 /* New cgroup namespace might not be in linux-headers yet. */ 73 #ifndef CLONE_NEWCGROUP 74 # define CLONE_NEWCGROUP 0x02000000 75 #endif 76 77 #define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */ 78 79 #define MAX_RLIMITS 32 /* Currently there are 15 supported by Linux. */ 80 81 #define MAX_PRESERVED_FDS 32U 82 83 /* Keyctl commands. */ 84 #define KEYCTL_JOIN_SESSION_KEYRING 1 85 86 /* 87 * The userspace equivalent of MNT_USER_SETTABLE_MASK, which is the mask of all 88 * flags that can be modified by MS_REMOUNT. 89 */ 90 #define MS_USER_SETTABLE_MASK \ 91 (MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_NOATIME | MS_NODIRATIME | \ 92 MS_RELATIME | MS_RDONLY) 93 94 struct minijail_rlimit { 95 int type; 96 rlim_t cur; 97 rlim_t max; 98 }; 99 100 struct mountpoint { 101 char *src; 102 char *dest; 103 char *type; 104 char *data; 105 int has_data; 106 unsigned long flags; 107 struct mountpoint *next; 108 }; 109 110 struct hook { 111 minijail_hook_t hook; 112 void *payload; 113 minijail_hook_event_t event; 114 struct hook *next; 115 }; 116 117 struct preserved_fd { 118 int parent_fd; 119 int child_fd; 120 }; 121 122 struct minijail { 123 /* 124 * WARNING: if you add a flag here you need to make sure it's 125 * accounted for in minijail_pre{enter|exec}() below. 126 */ 127 struct { 128 int uid : 1; 129 int gid : 1; 130 int inherit_suppl_gids : 1; 131 int set_suppl_gids : 1; 132 int keep_suppl_gids : 1; 133 int use_caps : 1; 134 int capbset_drop : 1; 135 int set_ambient_caps : 1; 136 int vfs : 1; 137 int enter_vfs : 1; 138 int pids : 1; 139 int ipc : 1; 140 int uts : 1; 141 int net : 1; 142 int enter_net : 1; 143 int ns_cgroups : 1; 144 int userns : 1; 145 int disable_setgroups : 1; 146 int seccomp : 1; 147 int remount_proc_ro : 1; 148 int no_new_privs : 1; 149 int seccomp_filter : 1; 150 int seccomp_filter_tsync : 1; 151 int seccomp_filter_logging : 1; 152 int chroot : 1; 153 int pivot_root : 1; 154 int mount_dev : 1; 155 int mount_tmp : 1; 156 int do_init : 1; 157 int run_as_init : 1; 158 int pid_file : 1; 159 int cgroups : 1; 160 int alt_syscall : 1; 161 int reset_signal_mask : 1; 162 int reset_signal_handlers : 1; 163 int close_open_fds : 1; 164 int new_session_keyring : 1; 165 int forward_signals : 1; 166 } flags; 167 uid_t uid; 168 gid_t gid; 169 gid_t usergid; 170 char *user; 171 size_t suppl_gid_count; 172 gid_t *suppl_gid_list; 173 uint64_t caps; 174 uint64_t cap_bset; 175 pid_t initpid; 176 int mountns_fd; 177 int netns_fd; 178 char *chrootdir; 179 char *pid_file_path; 180 char *uidmap; 181 char *gidmap; 182 char *hostname; 183 char *preload_path; 184 size_t filter_len; 185 struct sock_fprog *filter_prog; 186 char *alt_syscall_table; 187 struct mountpoint *mounts_head; 188 struct mountpoint *mounts_tail; 189 size_t mounts_count; 190 unsigned long remount_mode; 191 size_t tmpfs_size; 192 char *cgroups[MAX_CGROUPS]; 193 size_t cgroup_count; 194 struct minijail_rlimit rlimits[MAX_RLIMITS]; 195 size_t rlimit_count; 196 uint64_t securebits_skip_mask; 197 struct hook *hooks_head; 198 struct hook *hooks_tail; 199 struct preserved_fd preserved_fds[MAX_PRESERVED_FDS]; 200 size_t preserved_fd_count; 201 }; 202 203 static void run_hooks_or_die(const struct minijail *j, 204 minijail_hook_event_t event); 205 206 static void free_mounts_list(struct minijail *j) 207 { 208 while (j->mounts_head) { 209 struct mountpoint *m = j->mounts_head; 210 j->mounts_head = j->mounts_head->next; 211 free(m->data); 212 free(m->type); 213 free(m->dest); 214 free(m->src); 215 free(m); 216 } 217 // No need to clear mounts_head as we know it's NULL after the loop. 218 j->mounts_tail = NULL; 219 } 220 221 /* 222 * Strip out flags meant for the parent. 223 * We keep things that are not inherited across execve(2) (e.g. capabilities), 224 * or are easier to set after execve(2) (e.g. seccomp filters). 225 */ 226 void minijail_preenter(struct minijail *j) 227 { 228 j->flags.vfs = 0; 229 j->flags.enter_vfs = 0; 230 j->flags.ns_cgroups = 0; 231 j->flags.net = 0; 232 j->flags.uts = 0; 233 j->flags.remount_proc_ro = 0; 234 j->flags.pids = 0; 235 j->flags.do_init = 0; 236 j->flags.run_as_init = 0; 237 j->flags.pid_file = 0; 238 j->flags.cgroups = 0; 239 j->flags.forward_signals = 0; 240 j->remount_mode = 0; 241 } 242 243 /* 244 * Strip out flags meant for the child. 245 * We keep things that are inherited across execve(2). 246 */ 247 void minijail_preexec(struct minijail *j) 248 { 249 int vfs = j->flags.vfs; 250 int enter_vfs = j->flags.enter_vfs; 251 int ns_cgroups = j->flags.ns_cgroups; 252 int net = j->flags.net; 253 int uts = j->flags.uts; 254 int remount_proc_ro = j->flags.remount_proc_ro; 255 int userns = j->flags.userns; 256 if (j->user) 257 free(j->user); 258 j->user = NULL; 259 if (j->suppl_gid_list) 260 free(j->suppl_gid_list); 261 j->suppl_gid_list = NULL; 262 if (j->preload_path) 263 free(j->preload_path); 264 j->preload_path = NULL; 265 free_mounts_list(j); 266 memset(&j->flags, 0, sizeof(j->flags)); 267 /* Now restore anything we meant to keep. */ 268 j->flags.vfs = vfs; 269 j->flags.enter_vfs = enter_vfs; 270 j->flags.ns_cgroups = ns_cgroups; 271 j->flags.net = net; 272 j->flags.uts = uts; 273 j->flags.remount_proc_ro = remount_proc_ro; 274 j->flags.userns = userns; 275 /* Note, |pids| will already have been used before this call. */ 276 } 277 278 /* Minijail API. */ 279 280 struct minijail API *minijail_new(void) 281 { 282 struct minijail *j = calloc(1, sizeof(struct minijail)); 283 j->remount_mode = MS_PRIVATE; 284 return j; 285 } 286 287 void API minijail_change_uid(struct minijail *j, uid_t uid) 288 { 289 if (uid == 0) 290 die("useless change to uid 0"); 291 j->uid = uid; 292 j->flags.uid = 1; 293 } 294 295 void API minijail_change_gid(struct minijail *j, gid_t gid) 296 { 297 if (gid == 0) 298 die("useless change to gid 0"); 299 j->gid = gid; 300 j->flags.gid = 1; 301 } 302 303 void API minijail_set_supplementary_gids(struct minijail *j, size_t size, 304 const gid_t *list) 305 { 306 size_t i; 307 308 if (j->flags.inherit_suppl_gids) 309 die("cannot inherit *and* set supplementary groups"); 310 if (j->flags.keep_suppl_gids) 311 die("cannot keep *and* set supplementary groups"); 312 313 if (size == 0) { 314 /* Clear supplementary groups. */ 315 j->suppl_gid_list = NULL; 316 j->suppl_gid_count = 0; 317 j->flags.set_suppl_gids = 1; 318 return; 319 } 320 321 /* Copy the gid_t array. */ 322 j->suppl_gid_list = calloc(size, sizeof(gid_t)); 323 if (!j->suppl_gid_list) { 324 die("failed to allocate internal supplementary group array"); 325 } 326 for (i = 0; i < size; i++) { 327 j->suppl_gid_list[i] = list[i]; 328 } 329 j->suppl_gid_count = size; 330 j->flags.set_suppl_gids = 1; 331 } 332 333 void API minijail_keep_supplementary_gids(struct minijail *j) { 334 j->flags.keep_suppl_gids = 1; 335 } 336 337 int API minijail_change_user(struct minijail *j, const char *user) 338 { 339 uid_t uid; 340 gid_t gid; 341 int rc = lookup_user(user, &uid, &gid); 342 if (rc) 343 return rc; 344 minijail_change_uid(j, uid); 345 j->user = strdup(user); 346 if (!j->user) 347 return -ENOMEM; 348 j->usergid = gid; 349 return 0; 350 } 351 352 int API minijail_change_group(struct minijail *j, const char *group) 353 { 354 gid_t gid; 355 int rc = lookup_group(group, &gid); 356 if (rc) 357 return rc; 358 minijail_change_gid(j, gid); 359 return 0; 360 } 361 362 void API minijail_use_seccomp(struct minijail *j) 363 { 364 j->flags.seccomp = 1; 365 } 366 367 void API minijail_no_new_privs(struct minijail *j) 368 { 369 j->flags.no_new_privs = 1; 370 } 371 372 void API minijail_use_seccomp_filter(struct minijail *j) 373 { 374 j->flags.seccomp_filter = 1; 375 } 376 377 void API minijail_set_seccomp_filter_tsync(struct minijail *j) 378 { 379 if (j->filter_len > 0 && j->filter_prog != NULL) { 380 die("minijail_set_seccomp_filter_tsync() must be called " 381 "before minijail_parse_seccomp_filters()"); 382 } 383 j->flags.seccomp_filter_tsync = 1; 384 } 385 386 void API minijail_log_seccomp_filter_failures(struct minijail *j) 387 { 388 if (j->filter_len > 0 && j->filter_prog != NULL) { 389 die("minijail_log_seccomp_filter_failures() must be called " 390 "before minijail_parse_seccomp_filters()"); 391 } 392 #ifdef ALLOW_DEBUG_LOGGING 393 j->flags.seccomp_filter_logging = 1; 394 #else 395 warn("non-debug build: ignoring request to enable seccomp logging"); 396 #endif 397 } 398 399 void API minijail_use_caps(struct minijail *j, uint64_t capmask) 400 { 401 /* 402 * 'minijail_use_caps' configures a runtime-capabilities-only 403 * environment, including a bounding set matching the thread's runtime 404 * (permitted|inheritable|effective) sets. 405 * Therefore, it will override any existing bounding set configurations 406 * since the latter would allow gaining extra runtime capabilities from 407 * file capabilities. 408 */ 409 if (j->flags.capbset_drop) { 410 warn("overriding bounding set configuration"); 411 j->cap_bset = 0; 412 j->flags.capbset_drop = 0; 413 } 414 j->caps = capmask; 415 j->flags.use_caps = 1; 416 } 417 418 void API minijail_capbset_drop(struct minijail *j, uint64_t capmask) 419 { 420 if (j->flags.use_caps) { 421 /* 422 * 'minijail_use_caps' will have already configured a capability 423 * bounding set matching the (permitted|inheritable|effective) 424 * sets. Abort if the user tries to configure a separate 425 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps' 426 * are mutually exclusive. 427 */ 428 die("runtime capabilities already configured, can't drop " 429 "bounding set separately"); 430 } 431 j->cap_bset = capmask; 432 j->flags.capbset_drop = 1; 433 } 434 435 void API minijail_set_ambient_caps(struct minijail *j) 436 { 437 j->flags.set_ambient_caps = 1; 438 } 439 440 void API minijail_reset_signal_mask(struct minijail *j) 441 { 442 j->flags.reset_signal_mask = 1; 443 } 444 445 void API minijail_reset_signal_handlers(struct minijail *j) 446 { 447 j->flags.reset_signal_handlers = 1; 448 } 449 450 void API minijail_namespace_vfs(struct minijail *j) 451 { 452 j->flags.vfs = 1; 453 } 454 455 void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path) 456 { 457 /* Note: Do not use O_CLOEXEC here. We'll close it after we use it. */ 458 int ns_fd = open(ns_path, O_RDONLY); 459 if (ns_fd < 0) { 460 pdie("failed to open namespace '%s'", ns_path); 461 } 462 j->mountns_fd = ns_fd; 463 j->flags.enter_vfs = 1; 464 } 465 466 void API minijail_new_session_keyring(struct minijail *j) 467 { 468 j->flags.new_session_keyring = 1; 469 } 470 471 void API minijail_skip_setting_securebits(struct minijail *j, 472 uint64_t securebits_skip_mask) 473 { 474 j->securebits_skip_mask = securebits_skip_mask; 475 } 476 477 void API minijail_remount_mode(struct minijail *j, unsigned long mode) 478 { 479 j->remount_mode = mode; 480 } 481 482 void API minijail_skip_remount_private(struct minijail *j) 483 { 484 j->remount_mode = 0; 485 } 486 487 void API minijail_namespace_pids(struct minijail *j) 488 { 489 j->flags.vfs = 1; 490 j->flags.remount_proc_ro = 1; 491 j->flags.pids = 1; 492 j->flags.do_init = 1; 493 } 494 495 void API minijail_namespace_pids_rw_proc(struct minijail *j) 496 { 497 j->flags.vfs = 1; 498 j->flags.pids = 1; 499 j->flags.do_init = 1; 500 } 501 502 void API minijail_namespace_ipc(struct minijail *j) 503 { 504 j->flags.ipc = 1; 505 } 506 507 void API minijail_namespace_uts(struct minijail *j) 508 { 509 j->flags.uts = 1; 510 } 511 512 int API minijail_namespace_set_hostname(struct minijail *j, const char *name) 513 { 514 if (j->hostname) 515 return -EINVAL; 516 minijail_namespace_uts(j); 517 j->hostname = strdup(name); 518 if (!j->hostname) 519 return -ENOMEM; 520 return 0; 521 } 522 523 void API minijail_namespace_net(struct minijail *j) 524 { 525 j->flags.net = 1; 526 } 527 528 void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path) 529 { 530 /* Note: Do not use O_CLOEXEC here. We'll close it after we use it. */ 531 int ns_fd = open(ns_path, O_RDONLY); 532 if (ns_fd < 0) { 533 pdie("failed to open namespace '%s'", ns_path); 534 } 535 j->netns_fd = ns_fd; 536 j->flags.enter_net = 1; 537 } 538 539 void API minijail_namespace_cgroups(struct minijail *j) 540 { 541 j->flags.ns_cgroups = 1; 542 } 543 544 void API minijail_close_open_fds(struct minijail *j) 545 { 546 j->flags.close_open_fds = 1; 547 } 548 549 void API minijail_remount_proc_readonly(struct minijail *j) 550 { 551 j->flags.vfs = 1; 552 j->flags.remount_proc_ro = 1; 553 } 554 555 void API minijail_namespace_user(struct minijail *j) 556 { 557 j->flags.userns = 1; 558 } 559 560 void API minijail_namespace_user_disable_setgroups(struct minijail *j) 561 { 562 j->flags.disable_setgroups = 1; 563 } 564 565 int API minijail_uidmap(struct minijail *j, const char *uidmap) 566 { 567 j->uidmap = strdup(uidmap); 568 if (!j->uidmap) 569 return -ENOMEM; 570 char *ch; 571 for (ch = j->uidmap; *ch; ch++) { 572 if (*ch == ',') 573 *ch = '\n'; 574 } 575 return 0; 576 } 577 578 int API minijail_gidmap(struct minijail *j, const char *gidmap) 579 { 580 j->gidmap = strdup(gidmap); 581 if (!j->gidmap) 582 return -ENOMEM; 583 char *ch; 584 for (ch = j->gidmap; *ch; ch++) { 585 if (*ch == ',') 586 *ch = '\n'; 587 } 588 return 0; 589 } 590 591 void API minijail_inherit_usergroups(struct minijail *j) 592 { 593 j->flags.inherit_suppl_gids = 1; 594 } 595 596 void API minijail_run_as_init(struct minijail *j) 597 { 598 /* 599 * Since the jailed program will become 'init' in the new PID namespace, 600 * Minijail does not need to fork an 'init' process. 601 */ 602 j->flags.run_as_init = 1; 603 } 604 605 int API minijail_enter_chroot(struct minijail *j, const char *dir) 606 { 607 if (j->chrootdir) 608 return -EINVAL; 609 j->chrootdir = strdup(dir); 610 if (!j->chrootdir) 611 return -ENOMEM; 612 j->flags.chroot = 1; 613 return 0; 614 } 615 616 int API minijail_enter_pivot_root(struct minijail *j, const char *dir) 617 { 618 if (j->chrootdir) 619 return -EINVAL; 620 j->chrootdir = strdup(dir); 621 if (!j->chrootdir) 622 return -ENOMEM; 623 j->flags.pivot_root = 1; 624 return 0; 625 } 626 627 char API *minijail_get_original_path(struct minijail *j, 628 const char *path_inside_chroot) 629 { 630 struct mountpoint *b; 631 632 b = j->mounts_head; 633 while (b) { 634 /* 635 * If |path_inside_chroot| is the exact destination of a 636 * mount, then the original path is exactly the source of 637 * the mount. 638 * for example: "-b /some/path/exe,/chroot/path/exe" 639 * mount source = /some/path/exe, mount dest = 640 * /chroot/path/exe Then when getting the original path of 641 * "/chroot/path/exe", the source of that mount, 642 * "/some/path/exe" is what should be returned. 643 */ 644 if (!strcmp(b->dest, path_inside_chroot)) 645 return strdup(b->src); 646 647 /* 648 * If |path_inside_chroot| is within the destination path of a 649 * mount, take the suffix of the chroot path relative to the 650 * mount destination path, and append it to the mount source 651 * path. 652 */ 653 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) { 654 const char *relative_path = 655 path_inside_chroot + strlen(b->dest); 656 return path_join(b->src, relative_path); 657 } 658 b = b->next; 659 } 660 661 /* If there is a chroot path, append |path_inside_chroot| to that. */ 662 if (j->chrootdir) 663 return path_join(j->chrootdir, path_inside_chroot); 664 665 /* No chroot, so the path outside is the same as it is inside. */ 666 return strdup(path_inside_chroot); 667 } 668 669 size_t minijail_get_tmpfs_size(const struct minijail *j) 670 { 671 return j->tmpfs_size; 672 } 673 674 void API minijail_mount_dev(struct minijail *j) 675 { 676 j->flags.mount_dev = 1; 677 } 678 679 void API minijail_mount_tmp(struct minijail *j) 680 { 681 minijail_mount_tmp_size(j, 64 * 1024 * 1024); 682 } 683 684 void API minijail_mount_tmp_size(struct minijail *j, size_t size) 685 { 686 j->tmpfs_size = size; 687 j->flags.mount_tmp = 1; 688 } 689 690 int API minijail_write_pid_file(struct minijail *j, const char *path) 691 { 692 j->pid_file_path = strdup(path); 693 if (!j->pid_file_path) 694 return -ENOMEM; 695 j->flags.pid_file = 1; 696 return 0; 697 } 698 699 int API minijail_add_to_cgroup(struct minijail *j, const char *path) 700 { 701 if (j->cgroup_count >= MAX_CGROUPS) 702 return -ENOMEM; 703 j->cgroups[j->cgroup_count] = strdup(path); 704 if (!j->cgroups[j->cgroup_count]) 705 return -ENOMEM; 706 j->cgroup_count++; 707 j->flags.cgroups = 1; 708 return 0; 709 } 710 711 int API minijail_rlimit(struct minijail *j, int type, rlim_t cur, rlim_t max) 712 { 713 size_t i; 714 715 if (j->rlimit_count >= MAX_RLIMITS) 716 return -ENOMEM; 717 /* It's an error if the caller sets the same rlimit multiple times. */ 718 for (i = 0; i < j->rlimit_count; i++) { 719 if (j->rlimits[i].type == type) 720 return -EEXIST; 721 } 722 723 j->rlimits[j->rlimit_count].type = type; 724 j->rlimits[j->rlimit_count].cur = cur; 725 j->rlimits[j->rlimit_count].max = max; 726 j->rlimit_count++; 727 return 0; 728 } 729 730 int API minijail_forward_signals(struct minijail *j) 731 { 732 j->flags.forward_signals = 1; 733 return 0; 734 } 735 736 int API minijail_mount_with_data(struct minijail *j, const char *src, 737 const char *dest, const char *type, 738 unsigned long flags, const char *data) 739 { 740 struct mountpoint *m; 741 742 if (*dest != '/') 743 return -EINVAL; 744 m = calloc(1, sizeof(*m)); 745 if (!m) 746 return -ENOMEM; 747 m->dest = strdup(dest); 748 if (!m->dest) 749 goto error; 750 m->src = strdup(src); 751 if (!m->src) 752 goto error; 753 m->type = strdup(type); 754 if (!m->type) 755 goto error; 756 757 if (!data || !data[0]) { 758 /* 759 * Set up secure defaults for certain filesystems. Adding this 760 * fs-specific logic here kind of sucks, but considering how 761 * people use these in practice, it's probably OK. If they want 762 * the kernel defaults, they can pass data="" instead of NULL. 763 */ 764 if (!strcmp(type, "tmpfs")) { 765 /* tmpfs defaults to mode=1777 and size=50%. */ 766 data = "mode=0755,size=10M"; 767 } 768 } 769 if (data) { 770 m->data = strdup(data); 771 if (!m->data) 772 goto error; 773 m->has_data = 1; 774 } 775 776 /* If they don't specify any flags, default to secure ones. */ 777 if (flags == 0) 778 flags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 779 m->flags = flags; 780 781 /* 782 * Force vfs namespacing so the mounts don't leak out into the 783 * containing vfs namespace. 784 */ 785 minijail_namespace_vfs(j); 786 787 if (j->mounts_tail) 788 j->mounts_tail->next = m; 789 else 790 j->mounts_head = m; 791 j->mounts_tail = m; 792 j->mounts_count++; 793 794 return 0; 795 796 error: 797 free(m->type); 798 free(m->src); 799 free(m->dest); 800 free(m); 801 return -ENOMEM; 802 } 803 804 int API minijail_mount(struct minijail *j, const char *src, const char *dest, 805 const char *type, unsigned long flags) 806 { 807 return minijail_mount_with_data(j, src, dest, type, flags, NULL); 808 } 809 810 int API minijail_bind(struct minijail *j, const char *src, const char *dest, 811 int writeable) 812 { 813 unsigned long flags = MS_BIND; 814 815 if (!writeable) 816 flags |= MS_RDONLY; 817 818 return minijail_mount(j, src, dest, "", flags); 819 } 820 821 int API minijail_add_hook(struct minijail *j, minijail_hook_t hook, 822 void *payload, minijail_hook_event_t event) 823 { 824 struct hook *c; 825 826 if (hook == NULL) 827 return -EINVAL; 828 if (event >= MINIJAIL_HOOK_EVENT_MAX) 829 return -EINVAL; 830 c = calloc(1, sizeof(*c)); 831 if (!c) 832 return -ENOMEM; 833 834 c->hook = hook; 835 c->payload = payload; 836 c->event = event; 837 838 if (j->hooks_tail) 839 j->hooks_tail->next = c; 840 else 841 j->hooks_head = c; 842 j->hooks_tail = c; 843 844 return 0; 845 } 846 847 int API minijail_preserve_fd(struct minijail *j, int parent_fd, int child_fd) 848 { 849 if (parent_fd < 0 || child_fd < 0) 850 return -EINVAL; 851 if (j->preserved_fd_count >= MAX_PRESERVED_FDS) 852 return -ENOMEM; 853 j->preserved_fds[j->preserved_fd_count].parent_fd = parent_fd; 854 j->preserved_fds[j->preserved_fd_count].child_fd = child_fd; 855 j->preserved_fd_count++; 856 return 0; 857 } 858 859 int API minijail_set_preload_path(struct minijail *j, const char *preload_path) 860 { 861 if (j->preload_path) 862 return -EINVAL; 863 j->preload_path = strdup(preload_path); 864 if (!j->preload_path) 865 return -ENOMEM; 866 return 0; 867 } 868 869 static void clear_seccomp_options(struct minijail *j) 870 { 871 j->flags.seccomp_filter = 0; 872 j->flags.seccomp_filter_tsync = 0; 873 j->flags.seccomp_filter_logging = 0; 874 j->filter_len = 0; 875 j->filter_prog = NULL; 876 j->flags.no_new_privs = 0; 877 } 878 879 static int seccomp_should_use_filters(struct minijail *j) 880 { 881 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL) == -1) { 882 /* 883 * |errno| will be set to EINVAL when seccomp has not been 884 * compiled into the kernel. On certain platforms and kernel 885 * versions this is not a fatal failure. In that case, and only 886 * in that case, disable seccomp and skip loading the filters. 887 */ 888 if ((errno == EINVAL) && seccomp_can_softfail()) { 889 warn("not loading seccomp filters, seccomp filter not " 890 "supported"); 891 clear_seccomp_options(j); 892 return 0; 893 } 894 /* 895 * If |errno| != EINVAL or seccomp_can_softfail() is false, 896 * we can proceed. Worst case scenario minijail_enter() will 897 * abort() if seccomp fails. 898 */ 899 } 900 if (j->flags.seccomp_filter_tsync) { 901 /* Are the seccomp(2) syscall and the TSYNC option supported? */ 902 if (sys_seccomp(SECCOMP_SET_MODE_FILTER, 903 SECCOMP_FILTER_FLAG_TSYNC, NULL) == -1) { 904 int saved_errno = errno; 905 if (saved_errno == ENOSYS && seccomp_can_softfail()) { 906 warn("seccomp(2) syscall not supported"); 907 clear_seccomp_options(j); 908 return 0; 909 } else if (saved_errno == EINVAL && 910 seccomp_can_softfail()) { 911 warn( 912 "seccomp filter thread sync not supported"); 913 clear_seccomp_options(j); 914 return 0; 915 } 916 /* 917 * Similar logic here. If seccomp_can_softfail() is 918 * false, or |errno| != ENOSYS, or |errno| != EINVAL, 919 * we can proceed. Worst case scenario minijail_enter() 920 * will abort() if seccomp or TSYNC fail. 921 */ 922 } 923 } 924 return 1; 925 } 926 927 static int set_seccomp_filters_internal(struct minijail *j, 928 struct sock_fprog *filter, bool owned) 929 { 930 struct sock_fprog *fprog; 931 932 if (owned) { 933 fprog = filter; 934 } else { 935 fprog = malloc(sizeof(struct sock_fprog)); 936 if (!fprog) 937 return -ENOMEM; 938 fprog->len = filter->len; 939 fprog->filter = malloc(sizeof(struct sock_filter) * fprog->len); 940 if (!fprog->filter) { 941 free(fprog); 942 return -ENOMEM; 943 } 944 memcpy(fprog->filter, filter->filter, 945 sizeof(struct sock_filter) * fprog->len); 946 } 947 948 if (j->filter_prog) { 949 free(j->filter_prog->filter); 950 free(j->filter_prog); 951 } 952 953 j->filter_len = fprog->len; 954 j->filter_prog = fprog; 955 return 0; 956 } 957 958 void API minijail_set_seccomp_filters(struct minijail *j, 959 const struct sock_fprog *filter) 960 { 961 if (!seccomp_should_use_filters(j)) 962 return; 963 964 if (j->flags.seccomp_filter_logging) { 965 die("minijail_log_seccomp_filter_failures() is incompatible " 966 "with minijail_set_seccomp_filters()"); 967 } 968 969 /* 970 * set_seccomp_filters_internal() can only fail with ENOMEM. 971 * Furthermore, since we won't own the incoming filter, it will not be 972 * modified. 973 */ 974 if (set_seccomp_filters_internal(j, (struct sock_fprog *)filter, 975 false) < 0) { 976 die("failed to copy seccomp filter"); 977 } 978 } 979 980 static int parse_seccomp_filters(struct minijail *j, const char *filename, 981 FILE *policy_file) 982 { 983 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog)); 984 if (!fprog) 985 return -ENOMEM; 986 int use_ret_trap = 987 j->flags.seccomp_filter_tsync || j->flags.seccomp_filter_logging; 988 int allow_logging = j->flags.seccomp_filter_logging; 989 990 if (compile_filter(filename, policy_file, fprog, use_ret_trap, 991 allow_logging)) { 992 free(fprog); 993 return -1; 994 } 995 996 return set_seccomp_filters_internal(j, fprog, true); 997 } 998 999 void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) 1000 { 1001 if (!seccomp_should_use_filters(j)) 1002 return; 1003 1004 FILE *file = fopen(path, "re"); 1005 if (!file) { 1006 pdie("failed to open seccomp filter file '%s'", path); 1007 } 1008 1009 if (parse_seccomp_filters(j, path, file) != 0) { 1010 die("failed to compile seccomp filter BPF program in '%s'", 1011 path); 1012 } 1013 fclose(file); 1014 } 1015 1016 void API minijail_parse_seccomp_filters_from_fd(struct minijail *j, int fd) 1017 { 1018 char *fd_path, *path; 1019 FILE *file; 1020 1021 if (!seccomp_should_use_filters(j)) 1022 return; 1023 1024 file = fdopen(fd, "r"); 1025 if (!file) { 1026 pdie("failed to associate stream with fd %d", fd); 1027 } 1028 1029 if (asprintf(&fd_path, "/proc/self/fd/%d", fd) == -1) 1030 pdie("failed to create path for fd %d", fd); 1031 path = realpath(fd_path, NULL); 1032 if (path == NULL) 1033 pwarn("failed to get path of fd %d", fd); 1034 free(fd_path); 1035 1036 if (parse_seccomp_filters(j, path ? path : "<fd>", file) != 0) { 1037 die("failed to compile seccomp filter BPF program from fd %d", 1038 fd); 1039 } 1040 free(path); 1041 fclose(file); 1042 } 1043 1044 int API minijail_use_alt_syscall(struct minijail *j, const char *table) 1045 { 1046 j->alt_syscall_table = strdup(table); 1047 if (!j->alt_syscall_table) 1048 return -ENOMEM; 1049 j->flags.alt_syscall = 1; 1050 return 0; 1051 } 1052 1053 struct marshal_state { 1054 size_t available; 1055 size_t total; 1056 char *buf; 1057 }; 1058 1059 void marshal_state_init(struct marshal_state *state, char *buf, 1060 size_t available) 1061 { 1062 state->available = available; 1063 state->buf = buf; 1064 state->total = 0; 1065 } 1066 1067 void marshal_append(struct marshal_state *state, void *src, size_t length) 1068 { 1069 size_t copy_len = MIN(state->available, length); 1070 1071 /* Up to |available| will be written. */ 1072 if (copy_len) { 1073 memcpy(state->buf, src, copy_len); 1074 state->buf += copy_len; 1075 state->available -= copy_len; 1076 } 1077 /* |total| will contain the expected length. */ 1078 state->total += length; 1079 } 1080 1081 void marshal_mount(struct marshal_state *state, const struct mountpoint *m) 1082 { 1083 marshal_append(state, m->src, strlen(m->src) + 1); 1084 marshal_append(state, m->dest, strlen(m->dest) + 1); 1085 marshal_append(state, m->type, strlen(m->type) + 1); 1086 marshal_append(state, (char *)&m->has_data, sizeof(m->has_data)); 1087 if (m->has_data) 1088 marshal_append(state, m->data, strlen(m->data) + 1); 1089 marshal_append(state, (char *)&m->flags, sizeof(m->flags)); 1090 } 1091 1092 void minijail_marshal_helper(struct marshal_state *state, 1093 const struct minijail *j) 1094 { 1095 struct mountpoint *m = NULL; 1096 size_t i; 1097 1098 marshal_append(state, (char *)j, sizeof(*j)); 1099 if (j->user) 1100 marshal_append(state, j->user, strlen(j->user) + 1); 1101 if (j->suppl_gid_list) { 1102 marshal_append(state, j->suppl_gid_list, 1103 j->suppl_gid_count * sizeof(gid_t)); 1104 } 1105 if (j->chrootdir) 1106 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1); 1107 if (j->hostname) 1108 marshal_append(state, j->hostname, strlen(j->hostname) + 1); 1109 if (j->alt_syscall_table) { 1110 marshal_append(state, j->alt_syscall_table, 1111 strlen(j->alt_syscall_table) + 1); 1112 } 1113 if (j->flags.seccomp_filter && j->filter_prog) { 1114 struct sock_fprog *fp = j->filter_prog; 1115 marshal_append(state, (char *)fp->filter, 1116 fp->len * sizeof(struct sock_filter)); 1117 } 1118 for (m = j->mounts_head; m; m = m->next) { 1119 marshal_mount(state, m); 1120 } 1121 for (i = 0; i < j->cgroup_count; ++i) 1122 marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1); 1123 } 1124 1125 size_t API minijail_size(const struct minijail *j) 1126 { 1127 struct marshal_state state; 1128 marshal_state_init(&state, NULL, 0); 1129 minijail_marshal_helper(&state, j); 1130 return state.total; 1131 } 1132 1133 int minijail_marshal(const struct minijail *j, char *buf, size_t available) 1134 { 1135 struct marshal_state state; 1136 marshal_state_init(&state, buf, available); 1137 minijail_marshal_helper(&state, j); 1138 return (state.total > available); 1139 } 1140 1141 int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) 1142 { 1143 size_t i; 1144 size_t count; 1145 int ret = -EINVAL; 1146 1147 if (length < sizeof(*j)) 1148 goto out; 1149 memcpy((void *)j, serialized, sizeof(*j)); 1150 serialized += sizeof(*j); 1151 length -= sizeof(*j); 1152 1153 /* Potentially stale pointers not used as signals. */ 1154 j->preload_path = NULL; 1155 j->pid_file_path = NULL; 1156 j->uidmap = NULL; 1157 j->gidmap = NULL; 1158 j->mounts_head = NULL; 1159 j->mounts_tail = NULL; 1160 j->filter_prog = NULL; 1161 j->hooks_head = NULL; 1162 j->hooks_tail = NULL; 1163 1164 if (j->user) { /* stale pointer */ 1165 char *user = consumestr(&serialized, &length); 1166 if (!user) 1167 goto clear_pointers; 1168 j->user = strdup(user); 1169 if (!j->user) 1170 goto clear_pointers; 1171 } 1172 1173 if (j->suppl_gid_list) { /* stale pointer */ 1174 if (j->suppl_gid_count > NGROUPS_MAX) { 1175 goto bad_gid_list; 1176 } 1177 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t); 1178 void *gid_list_bytes = 1179 consumebytes(gid_list_size, &serialized, &length); 1180 if (!gid_list_bytes) 1181 goto bad_gid_list; 1182 1183 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t)); 1184 if (!j->suppl_gid_list) 1185 goto bad_gid_list; 1186 1187 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size); 1188 } 1189 1190 if (j->chrootdir) { /* stale pointer */ 1191 char *chrootdir = consumestr(&serialized, &length); 1192 if (!chrootdir) 1193 goto bad_chrootdir; 1194 j->chrootdir = strdup(chrootdir); 1195 if (!j->chrootdir) 1196 goto bad_chrootdir; 1197 } 1198 1199 if (j->hostname) { /* stale pointer */ 1200 char *hostname = consumestr(&serialized, &length); 1201 if (!hostname) 1202 goto bad_hostname; 1203 j->hostname = strdup(hostname); 1204 if (!j->hostname) 1205 goto bad_hostname; 1206 } 1207 1208 if (j->alt_syscall_table) { /* stale pointer */ 1209 char *alt_syscall_table = consumestr(&serialized, &length); 1210 if (!alt_syscall_table) 1211 goto bad_syscall_table; 1212 j->alt_syscall_table = strdup(alt_syscall_table); 1213 if (!j->alt_syscall_table) 1214 goto bad_syscall_table; 1215 } 1216 1217 if (j->flags.seccomp_filter && j->filter_len > 0) { 1218 size_t ninstrs = j->filter_len; 1219 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) || 1220 ninstrs > USHRT_MAX) 1221 goto bad_filters; 1222 1223 size_t program_len = ninstrs * sizeof(struct sock_filter); 1224 void *program = consumebytes(program_len, &serialized, &length); 1225 if (!program) 1226 goto bad_filters; 1227 1228 j->filter_prog = malloc(sizeof(struct sock_fprog)); 1229 if (!j->filter_prog) 1230 goto bad_filters; 1231 1232 j->filter_prog->len = ninstrs; 1233 j->filter_prog->filter = malloc(program_len); 1234 if (!j->filter_prog->filter) 1235 goto bad_filter_prog_instrs; 1236 1237 memcpy(j->filter_prog->filter, program, program_len); 1238 } 1239 1240 count = j->mounts_count; 1241 j->mounts_count = 0; 1242 for (i = 0; i < count; ++i) { 1243 unsigned long *flags; 1244 int *has_data; 1245 const char *dest; 1246 const char *type; 1247 const char *data = NULL; 1248 const char *src = consumestr(&serialized, &length); 1249 if (!src) 1250 goto bad_mounts; 1251 dest = consumestr(&serialized, &length); 1252 if (!dest) 1253 goto bad_mounts; 1254 type = consumestr(&serialized, &length); 1255 if (!type) 1256 goto bad_mounts; 1257 has_data = consumebytes(sizeof(*has_data), &serialized, 1258 &length); 1259 if (!has_data) 1260 goto bad_mounts; 1261 if (*has_data) { 1262 data = consumestr(&serialized, &length); 1263 if (!data) 1264 goto bad_mounts; 1265 } 1266 flags = consumebytes(sizeof(*flags), &serialized, &length); 1267 if (!flags) 1268 goto bad_mounts; 1269 if (minijail_mount_with_data(j, src, dest, type, *flags, data)) 1270 goto bad_mounts; 1271 } 1272 1273 count = j->cgroup_count; 1274 j->cgroup_count = 0; 1275 for (i = 0; i < count; ++i) { 1276 char *cgroup = consumestr(&serialized, &length); 1277 if (!cgroup) 1278 goto bad_cgroups; 1279 j->cgroups[i] = strdup(cgroup); 1280 if (!j->cgroups[i]) 1281 goto bad_cgroups; 1282 ++j->cgroup_count; 1283 } 1284 1285 return 0; 1286 1287 bad_cgroups: 1288 free_mounts_list(j); 1289 for (i = 0; i < j->cgroup_count; ++i) 1290 free(j->cgroups[i]); 1291 bad_mounts: 1292 if (j->filter_prog && j->filter_prog->filter) 1293 free(j->filter_prog->filter); 1294 bad_filter_prog_instrs: 1295 if (j->filter_prog) 1296 free(j->filter_prog); 1297 bad_filters: 1298 if (j->alt_syscall_table) 1299 free(j->alt_syscall_table); 1300 bad_syscall_table: 1301 if (j->chrootdir) 1302 free(j->chrootdir); 1303 bad_chrootdir: 1304 if (j->hostname) 1305 free(j->hostname); 1306 bad_hostname: 1307 if (j->suppl_gid_list) 1308 free(j->suppl_gid_list); 1309 bad_gid_list: 1310 if (j->user) 1311 free(j->user); 1312 clear_pointers: 1313 j->user = NULL; 1314 j->suppl_gid_list = NULL; 1315 j->chrootdir = NULL; 1316 j->hostname = NULL; 1317 j->alt_syscall_table = NULL; 1318 j->cgroup_count = 0; 1319 out: 1320 return ret; 1321 } 1322 1323 struct dev_spec { 1324 const char *name; 1325 mode_t mode; 1326 dev_t major, minor; 1327 }; 1328 1329 static const struct dev_spec device_nodes[] = { 1330 { 1331 "null", 1332 S_IFCHR | 0666, 1, 3, 1333 }, 1334 { 1335 "zero", 1336 S_IFCHR | 0666, 1, 5, 1337 }, 1338 { 1339 "full", 1340 S_IFCHR | 0666, 1, 7, 1341 }, 1342 { 1343 "urandom", 1344 S_IFCHR | 0444, 1, 9, 1345 }, 1346 { 1347 "tty", 1348 S_IFCHR | 0666, 5, 0, 1349 }, 1350 }; 1351 1352 struct dev_sym_spec { 1353 const char *source, *dest; 1354 }; 1355 1356 static const struct dev_sym_spec device_symlinks[] = { 1357 { "ptmx", "pts/ptmx", }, 1358 { "fd", "/proc/self/fd", }, 1359 { "stdin", "fd/0", }, 1360 { "stdout", "fd/1", }, 1361 { "stderr", "fd/2", }, 1362 }; 1363 1364 /* 1365 * Clean up the temporary dev path we had setup previously. In case of errors, 1366 * we don't want to go leaking empty tempdirs. 1367 */ 1368 static void mount_dev_cleanup(char *dev_path) 1369 { 1370 umount2(dev_path, MNT_DETACH); 1371 rmdir(dev_path); 1372 free(dev_path); 1373 } 1374 1375 /* 1376 * Set up the pseudo /dev path at the temporary location. 1377 * See mount_dev_finalize for more details. 1378 */ 1379 static int mount_dev(char **dev_path_ret) 1380 { 1381 int ret; 1382 int dev_fd; 1383 size_t i; 1384 mode_t mask; 1385 char *dev_path; 1386 1387 /* 1388 * Create a temp path for the /dev init. We'll relocate this to the 1389 * final location later on in the startup process. 1390 */ 1391 dev_path = *dev_path_ret = strdup("/tmp/minijail.dev.XXXXXX"); 1392 if (dev_path == NULL || mkdtemp(dev_path) == NULL) 1393 pdie("could not create temp path for /dev"); 1394 1395 /* Set up the empty /dev mount point first. */ 1396 ret = mount("minijail-devfs", dev_path, "tmpfs", 1397 MS_NOEXEC | MS_NOSUID, "size=5M,mode=755"); 1398 if (ret) { 1399 rmdir(dev_path); 1400 return ret; 1401 } 1402 1403 /* We want to set the mode directly from the spec. */ 1404 mask = umask(0); 1405 1406 /* Get a handle to the temp dev path for *at funcs below. */ 1407 dev_fd = open(dev_path, O_DIRECTORY|O_PATH|O_CLOEXEC); 1408 if (dev_fd < 0) { 1409 ret = 1; 1410 goto done; 1411 } 1412 1413 /* Create all the nodes in /dev. */ 1414 for (i = 0; i < ARRAY_SIZE(device_nodes); ++i) { 1415 const struct dev_spec *ds = &device_nodes[i]; 1416 ret = mknodat(dev_fd, ds->name, ds->mode, 1417 makedev(ds->major, ds->minor)); 1418 if (ret) 1419 goto done; 1420 } 1421 1422 /* Create all the symlinks in /dev. */ 1423 for (i = 0; i < ARRAY_SIZE(device_symlinks); ++i) { 1424 const struct dev_sym_spec *ds = &device_symlinks[i]; 1425 ret = symlinkat(ds->dest, dev_fd, ds->source); 1426 if (ret) 1427 goto done; 1428 } 1429 1430 /* Restore old mask. */ 1431 done: 1432 close(dev_fd); 1433 umask(mask); 1434 1435 if (ret) 1436 mount_dev_cleanup(dev_path); 1437 1438 return ret; 1439 } 1440 1441 /* 1442 * Relocate the temporary /dev mount to its final /dev place. 1443 * We have to do this two step process so people can bind mount extra 1444 * /dev paths like /dev/log. 1445 */ 1446 static int mount_dev_finalize(const struct minijail *j, char *dev_path) 1447 { 1448 int ret = -1; 1449 char *dest = NULL; 1450 1451 /* Unmount the /dev mount if possible. */ 1452 if (umount2("/dev", MNT_DETACH)) 1453 goto done; 1454 1455 if (asprintf(&dest, "%s/dev", j->chrootdir ? : "") < 0) 1456 goto done; 1457 1458 if (mount(dev_path, dest, NULL, MS_MOVE, NULL)) 1459 goto done; 1460 1461 ret = 0; 1462 done: 1463 free(dest); 1464 mount_dev_cleanup(dev_path); 1465 1466 return ret; 1467 } 1468 1469 /* 1470 * mount_one: Applies mounts from @m for @j, recursing as needed. 1471 * @j Minijail these mounts are for 1472 * @m Head of list of mounts 1473 * 1474 * Returns 0 for success. 1475 */ 1476 static int mount_one(const struct minijail *j, struct mountpoint *m, 1477 const char *dev_path) 1478 { 1479 int ret; 1480 char *dest; 1481 int remount = 0; 1482 unsigned long original_mnt_flags = 0; 1483 1484 /* We assume |dest| has a leading "/". */ 1485 if (dev_path && strncmp("/dev/", m->dest, 5) == 0) { 1486 /* Since the temp path is rooted at /dev, skip that dest part. */ 1487 if (asprintf(&dest, "%s%s", dev_path, m->dest + 4) < 0) 1488 return -ENOMEM; 1489 } else { 1490 if (asprintf(&dest, "%s%s", j->chrootdir ?: "", m->dest) < 0) 1491 return -ENOMEM; 1492 } 1493 1494 ret = 1495 setup_mount_destination(m->src, dest, j->uid, j->gid, 1496 (m->flags & MS_BIND), &original_mnt_flags); 1497 if (ret) { 1498 warn("creating mount target '%s' failed", dest); 1499 goto error; 1500 } 1501 1502 /* 1503 * Bind mounts that change the 'ro' flag have to be remounted since 1504 * 'bind' and other flags can't both be specified in the same command. 1505 * Remount after the initial mount. 1506 */ 1507 if ((m->flags & MS_BIND) && 1508 ((m->flags & MS_RDONLY) != (original_mnt_flags & MS_RDONLY))) { 1509 remount = 1; 1510 /* 1511 * Restrict the mount flags to those that are user-settable in a 1512 * MS_REMOUNT request, but excluding MS_RDONLY. The 1513 * user-requested mount flags will dictate whether the remount 1514 * will have that flag or not. 1515 */ 1516 original_mnt_flags &= (MS_USER_SETTABLE_MASK & ~MS_RDONLY); 1517 } 1518 1519 ret = mount(m->src, dest, m->type, m->flags, m->data); 1520 if (ret) { 1521 pwarn("bind: %s -> %s flags=%#lx", m->src, dest, m->flags); 1522 goto error; 1523 } 1524 1525 if (remount) { 1526 ret = 1527 mount(m->src, dest, NULL, 1528 m->flags | original_mnt_flags | MS_REMOUNT, m->data); 1529 if (ret) { 1530 pwarn("bind remount: %s -> %s flags=%#lx", m->src, dest, 1531 m->flags | original_mnt_flags | MS_REMOUNT); 1532 goto error; 1533 } 1534 } 1535 1536 free(dest); 1537 if (m->next) 1538 return mount_one(j, m->next, dev_path); 1539 return 0; 1540 1541 error: 1542 free(dest); 1543 return ret; 1544 } 1545 1546 static void process_mounts_or_die(const struct minijail *j) 1547 { 1548 /* 1549 * We have to mount /dev first in case there are bind mounts from 1550 * the original /dev into the new unique tmpfs one. 1551 */ 1552 char *dev_path = NULL; 1553 if (j->flags.mount_dev && mount_dev(&dev_path)) 1554 pdie("mount_dev failed"); 1555 1556 if (j->mounts_head && mount_one(j, j->mounts_head, dev_path)) { 1557 if (dev_path) { 1558 int saved_errno = errno; 1559 mount_dev_cleanup(dev_path); 1560 errno = saved_errno; 1561 } 1562 pdie("mount_one failed"); 1563 } 1564 1565 /* 1566 * Once all bind mounts have been processed, move the temp dev to 1567 * its final /dev home. 1568 */ 1569 if (j->flags.mount_dev && mount_dev_finalize(j, dev_path)) 1570 pdie("mount_dev_finalize failed"); 1571 } 1572 1573 static int enter_chroot(const struct minijail *j) 1574 { 1575 run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_CHROOT); 1576 1577 if (chroot(j->chrootdir)) 1578 return -errno; 1579 1580 if (chdir("/")) 1581 return -errno; 1582 1583 return 0; 1584 } 1585 1586 static int enter_pivot_root(const struct minijail *j) 1587 { 1588 int oldroot, newroot; 1589 1590 run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_CHROOT); 1591 1592 /* 1593 * Keep the fd for both old and new root. 1594 * It will be used in fchdir(2) later. 1595 */ 1596 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); 1597 if (oldroot < 0) 1598 pdie("failed to open / for fchdir"); 1599 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC); 1600 if (newroot < 0) 1601 pdie("failed to open %s for fchdir", j->chrootdir); 1602 1603 /* 1604 * To ensure j->chrootdir is the root of a filesystem, 1605 * do a self bind mount. 1606 */ 1607 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, "")) 1608 pdie("failed to bind mount '%s'", j->chrootdir); 1609 if (chdir(j->chrootdir)) 1610 return -errno; 1611 if (syscall(SYS_pivot_root, ".", ".")) 1612 pdie("pivot_root"); 1613 1614 /* 1615 * Now the old root is mounted on top of the new root. Use fchdir(2) to 1616 * change to the old root and unmount it. 1617 */ 1618 if (fchdir(oldroot)) 1619 pdie("failed to fchdir to old /"); 1620 1621 /* 1622 * If skip_remount_private was enabled for minijail_enter(), 1623 * there could be a shared mount point under |oldroot|. In that case, 1624 * mounts under this shared mount point will be unmounted below, and 1625 * this unmounting will propagate to the original mount namespace 1626 * (because the mount point is shared). To prevent this unexpected 1627 * unmounting, remove these mounts from their peer groups by recursively 1628 * remounting them as MS_PRIVATE. 1629 */ 1630 if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL)) 1631 pdie("failed to mount(/, private) before umount(/)"); 1632 /* The old root might be busy, so use lazy unmount. */ 1633 if (umount2(".", MNT_DETACH)) 1634 pdie("umount(/)"); 1635 /* Change back to the new root. */ 1636 if (fchdir(newroot)) 1637 return -errno; 1638 if (close(oldroot)) 1639 return -errno; 1640 if (close(newroot)) 1641 return -errno; 1642 if (chroot("/")) 1643 return -errno; 1644 /* Set correct CWD for getcwd(3). */ 1645 if (chdir("/")) 1646 return -errno; 1647 1648 return 0; 1649 } 1650 1651 static int mount_tmp(const struct minijail *j) 1652 { 1653 const char fmt[] = "size=%zu,mode=1777"; 1654 /* Count for the user storing ULLONG_MAX literally + extra space. */ 1655 char data[sizeof(fmt) + sizeof("18446744073709551615ULL")]; 1656 int ret; 1657 1658 ret = snprintf(data, sizeof(data), fmt, j->tmpfs_size); 1659 1660 if (ret <= 0) 1661 pdie("tmpfs size spec error"); 1662 else if ((size_t)ret >= sizeof(data)) 1663 pdie("tmpfs size spec too large"); 1664 return mount("none", "/tmp", "tmpfs", MS_NODEV | MS_NOEXEC | MS_NOSUID, 1665 data); 1666 } 1667 1668 static int remount_proc_readonly(const struct minijail *j) 1669 { 1670 const char *kProcPath = "/proc"; 1671 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 1672 /* 1673 * Right now, we're holding a reference to our parent's old mount of 1674 * /proc in our namespace, which means using MS_REMOUNT here would 1675 * mutate our parent's mount as well, even though we're in a VFS 1676 * namespace (!). Instead, remove their mount from our namespace lazily 1677 * (MNT_DETACH) and make our own. 1678 */ 1679 if (umount2(kProcPath, MNT_DETACH)) { 1680 /* 1681 * If we are in a new user namespace, umount(2) will fail. 1682 * See http://man7.org/linux/man-pages/man7/user_namespaces.7.html 1683 */ 1684 if (j->flags.userns) { 1685 info("umount(/proc, MNT_DETACH) failed, " 1686 "this is expected when using user namespaces"); 1687 } else { 1688 return -errno; 1689 } 1690 } 1691 if (mount("proc", kProcPath, "proc", kSafeFlags | MS_RDONLY, "")) 1692 return -errno; 1693 return 0; 1694 } 1695 1696 static void kill_child_and_die(const struct minijail *j, const char *msg) 1697 { 1698 kill(j->initpid, SIGKILL); 1699 die("%s", msg); 1700 } 1701 1702 static void write_pid_file_or_die(const struct minijail *j) 1703 { 1704 if (write_pid_to_path(j->initpid, j->pid_file_path)) 1705 kill_child_and_die(j, "failed to write pid file"); 1706 } 1707 1708 static void add_to_cgroups_or_die(const struct minijail *j) 1709 { 1710 size_t i; 1711 1712 for (i = 0; i < j->cgroup_count; ++i) { 1713 if (write_pid_to_path(j->initpid, j->cgroups[i])) 1714 kill_child_and_die(j, "failed to add to cgroups"); 1715 } 1716 } 1717 1718 static void set_rlimits_or_die(const struct minijail *j) 1719 { 1720 size_t i; 1721 1722 for (i = 0; i < j->rlimit_count; ++i) { 1723 struct rlimit limit; 1724 limit.rlim_cur = j->rlimits[i].cur; 1725 limit.rlim_max = j->rlimits[i].max; 1726 if (prlimit(j->initpid, j->rlimits[i].type, &limit, NULL)) 1727 kill_child_and_die(j, "failed to set rlimit"); 1728 } 1729 } 1730 1731 static void write_ugid_maps_or_die(const struct minijail *j) 1732 { 1733 if (j->uidmap && write_proc_file(j->initpid, j->uidmap, "uid_map") != 0) 1734 kill_child_and_die(j, "failed to write uid_map"); 1735 if (j->gidmap && j->flags.disable_setgroups) { 1736 /* Older kernels might not have the /proc/<pid>/setgroups files. */ 1737 int ret = write_proc_file(j->initpid, "deny", "setgroups"); 1738 if (ret != 0) { 1739 if (ret == -ENOENT) { 1740 /* See http://man7.org/linux/man-pages/man7/user_namespaces.7.html. */ 1741 warn("could not disable setgroups(2)"); 1742 } else 1743 kill_child_and_die(j, "failed to disable setgroups(2)"); 1744 } 1745 } 1746 if (j->gidmap && write_proc_file(j->initpid, j->gidmap, "gid_map") != 0) 1747 kill_child_and_die(j, "failed to write gid_map"); 1748 } 1749 1750 static void enter_user_namespace(const struct minijail *j) 1751 { 1752 int uid = j->flags.uid ? j->uid : 0; 1753 int gid = j->flags.gid ? j->gid : 0; 1754 if (j->gidmap && setresgid(gid, gid, gid)) { 1755 pdie("user_namespaces: setresgid(%d, %d, %d) failed", gid, gid, 1756 gid); 1757 } 1758 if (j->uidmap && setresuid(uid, uid, uid)) { 1759 pdie("user_namespaces: setresuid(%d, %d, %d) failed", uid, uid, 1760 uid); 1761 } 1762 } 1763 1764 static void parent_setup_complete(int *pipe_fds) 1765 { 1766 close(pipe_fds[0]); 1767 close(pipe_fds[1]); 1768 } 1769 1770 /* 1771 * wait_for_parent_setup: Called by the child process to wait for any 1772 * further parent-side setup to complete before continuing. 1773 */ 1774 static void wait_for_parent_setup(int *pipe_fds) 1775 { 1776 char buf; 1777 1778 close(pipe_fds[1]); 1779 1780 /* Wait for parent to complete setup and close the pipe. */ 1781 if (read(pipe_fds[0], &buf, 1) != 0) 1782 die("failed to sync with parent"); 1783 close(pipe_fds[0]); 1784 } 1785 1786 static void drop_ugid(const struct minijail *j) 1787 { 1788 if (j->flags.inherit_suppl_gids + j->flags.keep_suppl_gids + 1789 j->flags.set_suppl_gids > 1) { 1790 die("can only do one of inherit, keep, or set supplementary " 1791 "groups"); 1792 } 1793 1794 if (j->flags.inherit_suppl_gids) { 1795 if (initgroups(j->user, j->usergid)) 1796 pdie("initgroups(%s, %d) failed", j->user, j->usergid); 1797 } else if (j->flags.set_suppl_gids) { 1798 if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) 1799 pdie("setgroups(suppl_gids) failed"); 1800 } else if (!j->flags.keep_suppl_gids && !j->flags.disable_setgroups) { 1801 /* 1802 * Only attempt to clear supplementary groups if we are changing 1803 * users or groups, and if the caller did not request to disable 1804 * setgroups (used when entering a user namespace as a 1805 * non-privileged user). 1806 */ 1807 if ((j->flags.uid || j->flags.gid) && setgroups(0, NULL)) 1808 pdie("setgroups(0, NULL) failed"); 1809 } 1810 1811 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid)) 1812 pdie("setresgid(%d, %d, %d) failed", j->gid, j->gid, j->gid); 1813 1814 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid)) 1815 pdie("setresuid(%d, %d, %d) failed", j->uid, j->uid, j->uid); 1816 } 1817 1818 static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap) 1819 { 1820 const uint64_t one = 1; 1821 unsigned int i; 1822 for (i = 0; i < sizeof(keep_mask) * 8 && i <= last_valid_cap; ++i) { 1823 if (keep_mask & (one << i)) 1824 continue; 1825 if (prctl(PR_CAPBSET_DROP, i)) 1826 pdie("could not drop capability from bounding set"); 1827 } 1828 } 1829 1830 static void drop_caps(const struct minijail *j, unsigned int last_valid_cap) 1831 { 1832 if (!j->flags.use_caps) 1833 return; 1834 1835 cap_t caps = cap_get_proc(); 1836 cap_value_t flag[1]; 1837 const size_t ncaps = sizeof(j->caps) * 8; 1838 const uint64_t one = 1; 1839 unsigned int i; 1840 if (!caps) 1841 die("can't get process caps"); 1842 if (cap_clear(caps)) 1843 die("can't clear caps"); 1844 1845 for (i = 0; i < ncaps && i <= last_valid_cap; ++i) { 1846 /* Keep CAP_SETPCAP for dropping bounding set bits. */ 1847 if (i != CAP_SETPCAP && !(j->caps & (one << i))) 1848 continue; 1849 flag[0] = i; 1850 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET)) 1851 die("can't add effective cap"); 1852 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET)) 1853 die("can't add permitted cap"); 1854 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET)) 1855 die("can't add inheritable cap"); 1856 } 1857 if (cap_set_proc(caps)) 1858 die("can't apply initial cleaned capset"); 1859 1860 /* 1861 * Instead of dropping the bounding set first, do it here in case 1862 * the caller had a more permissive bounding set which could 1863 * have been used above to raise a capability that wasn't already 1864 * present. This requires CAP_SETPCAP, so we raised/kept it above. 1865 * 1866 * However, if we're asked to skip setting *and* locking the 1867 * SECURE_NOROOT securebit, also skip dropping the bounding set. 1868 * If the caller wants to regain all capabilities when executing a 1869 * set-user-ID-root program, allow them to do so. The default behavior 1870 * (i.e. the behavior without |securebits_skip_mask| set) will still put 1871 * the jailed process tree in a capabilities-only environment. 1872 * 1873 * We check the negated skip mask for SECURE_NOROOT and 1874 * SECURE_NOROOT_LOCKED. If the bits are set in the negated mask they 1875 * will *not* be skipped in lock_securebits(), and therefore we should 1876 * drop the bounding set. 1877 */ 1878 if (secure_noroot_set_and_locked(~j->securebits_skip_mask)) { 1879 drop_capbset(j->caps, last_valid_cap); 1880 } else { 1881 warn("SECURE_NOROOT not set, not dropping bounding set"); 1882 } 1883 1884 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */ 1885 if ((j->caps & (one << CAP_SETPCAP)) == 0) { 1886 flag[0] = CAP_SETPCAP; 1887 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR)) 1888 die("can't clear effective cap"); 1889 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR)) 1890 die("can't clear permitted cap"); 1891 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR)) 1892 die("can't clear inheritable cap"); 1893 } 1894 1895 if (cap_set_proc(caps)) 1896 die("can't apply final cleaned capset"); 1897 1898 /* 1899 * If ambient capabilities are supported, clear all capabilities first, 1900 * then raise the requested ones. 1901 */ 1902 if (j->flags.set_ambient_caps) { 1903 if (!cap_ambient_supported()) { 1904 pdie("ambient capabilities not supported"); 1905 } 1906 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) != 1907 0) { 1908 pdie("can't clear ambient capabilities"); 1909 } 1910 1911 for (i = 0; i < ncaps && i <= last_valid_cap; ++i) { 1912 if (!(j->caps & (one << i))) 1913 continue; 1914 1915 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i, 0, 1916 0) != 0) { 1917 pdie("prctl(PR_CAP_AMBIENT, " 1918 "PR_CAP_AMBIENT_RAISE, %u) failed", 1919 i); 1920 } 1921 } 1922 } 1923 1924 cap_free(caps); 1925 } 1926 1927 static void set_seccomp_filter(const struct minijail *j) 1928 { 1929 /* 1930 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c> 1931 * in the kernel source tree for an explanation of the parameters. 1932 */ 1933 if (j->flags.no_new_privs) { 1934 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) 1935 pdie("prctl(PR_SET_NO_NEW_PRIVS)"); 1936 } 1937 1938 /* 1939 * Code running with ASan 1940 * (https://github.com/google/sanitizers/wiki/AddressSanitizer) 1941 * will make system calls not included in the syscall filter policy, 1942 * which will likely crash the program. Skip setting seccomp filter in 1943 * that case. 1944 * 'running_with_asan()' has no inputs and is completely defined at 1945 * build time, so this cannot be used by an attacker to skip setting 1946 * seccomp filter. 1947 */ 1948 if (j->flags.seccomp_filter && running_with_asan()) { 1949 warn("running with (HW)ASan, not setting seccomp filter"); 1950 return; 1951 } 1952 1953 if (j->flags.seccomp_filter) { 1954 if (j->flags.seccomp_filter_logging) { 1955 /* 1956 * If logging seccomp filter failures, 1957 * install the SIGSYS handler first. 1958 */ 1959 if (install_sigsys_handler()) 1960 pdie("failed to install SIGSYS handler"); 1961 warn("logging seccomp filter failures"); 1962 } else if (j->flags.seccomp_filter_tsync) { 1963 /* 1964 * If setting thread sync, 1965 * reset the SIGSYS signal handler so that 1966 * the entire thread group is killed. 1967 */ 1968 if (signal(SIGSYS, SIG_DFL) == SIG_ERR) 1969 pdie("failed to reset SIGSYS disposition"); 1970 } 1971 } 1972 1973 /* 1974 * Install the syscall filter. 1975 */ 1976 if (j->flags.seccomp_filter) { 1977 if (j->flags.seccomp_filter_tsync) { 1978 if (sys_seccomp(SECCOMP_SET_MODE_FILTER, 1979 SECCOMP_FILTER_FLAG_TSYNC, 1980 j->filter_prog)) { 1981 pdie("seccomp(tsync) failed"); 1982 } 1983 } else { 1984 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 1985 j->filter_prog)) { 1986 pdie("prctl(seccomp_filter) failed"); 1987 } 1988 } 1989 } 1990 } 1991 1992 static pid_t forward_pid = -1; 1993 1994 static void forward_signal(int sig, 1995 siginfo_t *siginfo attribute_unused, 1996 void *void_context attribute_unused) 1997 { 1998 if (forward_pid != -1) { 1999 kill(forward_pid, sig); 2000 } 2001 } 2002 2003 static void install_signal_handlers(void) 2004 { 2005 struct sigaction act; 2006 2007 memset(&act, 0, sizeof(act)); 2008 act.sa_sigaction = &forward_signal; 2009 act.sa_flags = SA_SIGINFO | SA_RESTART; 2010 2011 /* Handle all signals, except SIGCHLD. */ 2012 for (int sig = 1; sig < NSIG; sig++) { 2013 /* 2014 * We don't care if we get EINVAL: that just means that we 2015 * can't handle this signal, so let's skip it and continue. 2016 */ 2017 sigaction(sig, &act, NULL); 2018 } 2019 /* Reset SIGCHLD's handler. */ 2020 signal(SIGCHLD, SIG_DFL); 2021 2022 /* Handle real-time signals. */ 2023 for (int sig = SIGRTMIN; sig <= SIGRTMAX; sig++) { 2024 sigaction(sig, &act, NULL); 2025 } 2026 } 2027 2028 static const char *lookup_hook_name(minijail_hook_event_t event) 2029 { 2030 switch (event) { 2031 case MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS: 2032 return "pre-drop-caps"; 2033 case MINIJAIL_HOOK_EVENT_PRE_EXECVE: 2034 return "pre-execve"; 2035 case MINIJAIL_HOOK_EVENT_PRE_CHROOT: 2036 return "pre-chroot"; 2037 case MINIJAIL_HOOK_EVENT_MAX: 2038 /* 2039 * Adding this in favor of a default case to force the 2040 * compiler to error out if a new enum value is added. 2041 */ 2042 break; 2043 } 2044 return "unknown"; 2045 } 2046 2047 static void run_hooks_or_die(const struct minijail *j, 2048 minijail_hook_event_t event) 2049 { 2050 int rc; 2051 int hook_index = 0; 2052 for (struct hook *c = j->hooks_head; c; c = c->next) { 2053 if (c->event != event) 2054 continue; 2055 rc = c->hook(c->payload); 2056 if (rc != 0) { 2057 errno = -rc; 2058 pdie("%s hook (index %d) failed", 2059 lookup_hook_name(event), hook_index); 2060 } 2061 /* Only increase the index within the same hook event type. */ 2062 ++hook_index; 2063 } 2064 } 2065 2066 void API minijail_enter(const struct minijail *j) 2067 { 2068 /* 2069 * If we're dropping caps, get the last valid cap from /proc now, 2070 * since /proc can be unmounted before drop_caps() is called. 2071 */ 2072 unsigned int last_valid_cap = 0; 2073 if (j->flags.capbset_drop || j->flags.use_caps) 2074 last_valid_cap = get_last_valid_cap(); 2075 2076 if (j->flags.pids) 2077 die("tried to enter a pid-namespaced jail;" 2078 " try minijail_run()?"); 2079 2080 if (j->flags.inherit_suppl_gids && !j->user) 2081 die("cannot inherit supplementary groups without setting a " 2082 "username"); 2083 2084 /* 2085 * We can't recover from failures if we've dropped privileges partially, 2086 * so we don't even try. If any of our operations fail, we abort() the 2087 * entire process. 2088 */ 2089 if (j->flags.enter_vfs) { 2090 if (setns(j->mountns_fd, CLONE_NEWNS)) 2091 pdie("setns(CLONE_NEWNS) failed"); 2092 close(j->mountns_fd); 2093 } 2094 2095 if (j->flags.vfs) { 2096 if (unshare(CLONE_NEWNS)) 2097 pdie("unshare(CLONE_NEWNS) failed"); 2098 /* 2099 * By default, remount all filesystems as private, unless 2100 * - Passed a specific remount mode, in which case remount with that, 2101 * - Asked not to remount at all, in which case skip the mount(2) call. 2102 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt 2103 */ 2104 if (j->remount_mode) { 2105 if (mount(NULL, "/", NULL, MS_REC | j->remount_mode, NULL)) 2106 pdie("mount(NULL, /, NULL, MS_REC | MS_PRIVATE," 2107 " NULL) failed"); 2108 } 2109 } 2110 2111 if (j->flags.ipc && unshare(CLONE_NEWIPC)) { 2112 pdie("unshare(CLONE_NEWIPC) failed"); 2113 } 2114 2115 if (j->flags.uts) { 2116 if (unshare(CLONE_NEWUTS)) 2117 pdie("unshare(CLONE_NEWUTS) failed"); 2118 2119 if (j->hostname && sethostname(j->hostname, strlen(j->hostname))) 2120 pdie("sethostname(%s) failed", j->hostname); 2121 } 2122 2123 if (j->flags.enter_net) { 2124 if (setns(j->netns_fd, CLONE_NEWNET)) 2125 pdie("setns(CLONE_NEWNET) failed"); 2126 close(j->netns_fd); 2127 } else if (j->flags.net) { 2128 if (unshare(CLONE_NEWNET)) 2129 pdie("unshare(CLONE_NEWNET) failed"); 2130 config_net_loopback(); 2131 } 2132 2133 if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP)) 2134 pdie("unshare(CLONE_NEWCGROUP) failed"); 2135 2136 if (j->flags.new_session_keyring) { 2137 if (syscall(SYS_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL) < 0) 2138 pdie("keyctl(KEYCTL_JOIN_SESSION_KEYRING) failed"); 2139 } 2140 2141 /* We have to process all the mounts before we chroot/pivot_root. */ 2142 process_mounts_or_die(j); 2143 2144 if (j->flags.chroot && enter_chroot(j)) 2145 pdie("chroot"); 2146 2147 if (j->flags.pivot_root && enter_pivot_root(j)) 2148 pdie("pivot_root"); 2149 2150 if (j->flags.mount_tmp && mount_tmp(j)) 2151 pdie("mount_tmp"); 2152 2153 if (j->flags.remount_proc_ro && remount_proc_readonly(j)) 2154 pdie("remount"); 2155 2156 run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS); 2157 2158 /* 2159 * If we're only dropping capabilities from the bounding set, but not 2160 * from the thread's (permitted|inheritable|effective) sets, do it now. 2161 */ 2162 if (j->flags.capbset_drop) { 2163 drop_capbset(j->cap_bset, last_valid_cap); 2164 } 2165 2166 /* 2167 * POSIX capabilities are a bit tricky. We must set SECBIT_KEEP_CAPS 2168 * before drop_ugid() below as the latter would otherwise drop all 2169 * capabilities. 2170 */ 2171 if (j->flags.use_caps) { 2172 /* 2173 * When using ambient capabilities, CAP_SET{GID,UID} can be 2174 * inherited across execve(2), so SECBIT_KEEP_CAPS is not 2175 * strictly needed. 2176 */ 2177 bool require_keep_caps = !j->flags.set_ambient_caps; 2178 if (lock_securebits(j->securebits_skip_mask, 2179 require_keep_caps) < 0) { 2180 pdie("locking securebits failed"); 2181 } 2182 } 2183 2184 if (j->flags.no_new_privs) { 2185 /* 2186 * If we're setting no_new_privs, we can drop privileges 2187 * before setting seccomp filter. This way filter policies 2188 * don't need to allow privilege-dropping syscalls. 2189 */ 2190 drop_ugid(j); 2191 drop_caps(j, last_valid_cap); 2192 set_seccomp_filter(j); 2193 } else { 2194 /* 2195 * If we're not setting no_new_privs, 2196 * we need to set seccomp filter *before* dropping privileges. 2197 * WARNING: this means that filter policies *must* allow 2198 * setgroups()/setresgid()/setresuid() for dropping root and 2199 * capget()/capset()/prctl() for dropping caps. 2200 */ 2201 set_seccomp_filter(j); 2202 drop_ugid(j); 2203 drop_caps(j, last_valid_cap); 2204 } 2205 2206 /* 2207 * Select the specified alternate syscall table. The table must not 2208 * block prctl(2) if we're using seccomp as well. 2209 */ 2210 if (j->flags.alt_syscall) { 2211 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table)) 2212 pdie("prctl(PR_ALT_SYSCALL) failed"); 2213 } 2214 2215 /* 2216 * seccomp has to come last since it cuts off all the other 2217 * privilege-dropping syscalls :) 2218 */ 2219 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) { 2220 if ((errno == EINVAL) && seccomp_can_softfail()) { 2221 warn("seccomp not supported"); 2222 return; 2223 } 2224 pdie("prctl(PR_SET_SECCOMP) failed"); 2225 } 2226 } 2227 2228 /* TODO(wad): will visibility affect this variable? */ 2229 static int init_exitstatus = 0; 2230 2231 void init_term(int sig attribute_unused) 2232 { 2233 _exit(init_exitstatus); 2234 } 2235 2236 void init(pid_t rootpid) 2237 { 2238 pid_t pid; 2239 int status; 2240 /* So that we exit with the right status. */ 2241 signal(SIGTERM, init_term); 2242 /* TODO(wad): self jail with seccomp filters here. */ 2243 while ((pid = wait(&status)) > 0) { 2244 /* 2245 * This loop will only end when either there are no processes 2246 * left inside our pid namespace or we get a signal. 2247 */ 2248 if (pid == rootpid) 2249 init_exitstatus = status; 2250 } 2251 if (!WIFEXITED(init_exitstatus)) 2252 _exit(MINIJAIL_ERR_INIT); 2253 _exit(WEXITSTATUS(init_exitstatus)); 2254 } 2255 2256 int API minijail_from_fd(int fd, struct minijail *j) 2257 { 2258 size_t sz = 0; 2259 size_t bytes = read(fd, &sz, sizeof(sz)); 2260 char *buf; 2261 int r; 2262 if (sizeof(sz) != bytes) 2263 return -EINVAL; 2264 if (sz > USHRT_MAX) /* arbitrary sanity check */ 2265 return -E2BIG; 2266 buf = malloc(sz); 2267 if (!buf) 2268 return -ENOMEM; 2269 bytes = read(fd, buf, sz); 2270 if (bytes != sz) { 2271 free(buf); 2272 return -EINVAL; 2273 } 2274 r = minijail_unmarshal(j, buf, sz); 2275 free(buf); 2276 return r; 2277 } 2278 2279 int API minijail_to_fd(struct minijail *j, int fd) 2280 { 2281 char *buf; 2282 size_t sz = minijail_size(j); 2283 ssize_t written; 2284 int r; 2285 2286 if (!sz) 2287 return -EINVAL; 2288 buf = malloc(sz); 2289 r = minijail_marshal(j, buf, sz); 2290 if (r) { 2291 free(buf); 2292 return r; 2293 } 2294 /* Sends [size][minijail]. */ 2295 written = write(fd, &sz, sizeof(sz)); 2296 if (written != sizeof(sz)) { 2297 free(buf); 2298 return -EFAULT; 2299 } 2300 written = write(fd, buf, sz); 2301 if (written < 0 || (size_t) written != sz) { 2302 free(buf); 2303 return -EFAULT; 2304 } 2305 free(buf); 2306 return 0; 2307 } 2308 2309 static int setup_preload(const struct minijail *j attribute_unused, 2310 const char *oldenv attribute_unused) 2311 { 2312 #if defined(__ANDROID__) 2313 /* Don't use LDPRELOAD on Android. */ 2314 return 0; 2315 #else 2316 const char *preload_path = j->preload_path ?: PRELOADPATH; 2317 char *newenv = NULL; 2318 int ret = 0; 2319 2320 if (!oldenv) 2321 oldenv = ""; 2322 2323 /* Only insert a separating space if we have something to separate... */ 2324 if (asprintf(&newenv, "%s%s%s", oldenv, oldenv[0] != '\0' ? " " : "", 2325 preload_path) < 0) { 2326 return -1; 2327 } 2328 2329 /* 2330 * Avoid using putenv(3), since that requires us to hold onto a 2331 * reference to that string until the environment is no longer used to 2332 * prevent a memory leak. 2333 * See https://crbug.com/930189 for more details. 2334 */ 2335 ret = setenv(kLdPreloadEnvVar, newenv, 1); 2336 free(newenv); 2337 return ret; 2338 #endif 2339 } 2340 2341 static int setup_pipe(int fds[2]) 2342 { 2343 int r = pipe(fds); 2344 char fd_buf[11]; 2345 if (r) 2346 return r; 2347 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); 2348 if (r <= 0) 2349 return -EINVAL; 2350 setenv(kFdEnvVar, fd_buf, 1); 2351 return 0; 2352 } 2353 2354 static int close_open_fds(int *inheritable_fds, size_t size) 2355 { 2356 const char *kFdPath = "/proc/self/fd"; 2357 2358 DIR *d = opendir(kFdPath); 2359 struct dirent *dir_entry; 2360 2361 if (d == NULL) 2362 return -1; 2363 int dir_fd = dirfd(d); 2364 while ((dir_entry = readdir(d)) != NULL) { 2365 size_t i; 2366 char *end; 2367 bool should_close = true; 2368 const int fd = strtol(dir_entry->d_name, &end, 10); 2369 2370 if ((*end) != '\0') { 2371 continue; 2372 } 2373 /* 2374 * We might have set up some pipes that we want to share with 2375 * the parent process, and should not be closed. 2376 */ 2377 for (i = 0; i < size; ++i) { 2378 if (fd == inheritable_fds[i]) { 2379 should_close = false; 2380 break; 2381 } 2382 } 2383 /* Also avoid closing the directory fd. */ 2384 if (should_close && fd != dir_fd) 2385 close(fd); 2386 } 2387 closedir(d); 2388 return 0; 2389 } 2390 2391 static int redirect_fds(struct minijail *j) 2392 { 2393 size_t i, i2; 2394 int closeable; 2395 for (i = 0; i < j->preserved_fd_count; i++) { 2396 if (dup2(j->preserved_fds[i].parent_fd, 2397 j->preserved_fds[i].child_fd) == -1) { 2398 return -1; 2399 } 2400 } 2401 /* 2402 * After all fds have been duped, we are now free to close all parent 2403 * fds that are *not* child fds. 2404 */ 2405 for (i = 0; i < j->preserved_fd_count; i++) { 2406 closeable = true; 2407 for (i2 = 0; i2 < j->preserved_fd_count; i2++) { 2408 closeable &= j->preserved_fds[i].parent_fd != 2409 j->preserved_fds[i2].child_fd; 2410 } 2411 if (closeable) 2412 close(j->preserved_fds[i].parent_fd); 2413 } 2414 return 0; 2415 } 2416 2417 /* 2418 * Structure that specifies how to start a minijail. 2419 * 2420 * filename - The program to exec in the child. Required if |exec_in_child| = 1. 2421 * argv - Arguments for the child program. Required if |exec_in_child| = 1. 2422 * envp - Environment for the child program. Available if |exec_in_child| = 1. 2423 Currently only honored if |use_preload| = 0 and non-NULL. 2424 * use_preload - If true use LD_PRELOAD. 2425 * exec_in_child - If true, run |filename|. Otherwise, the child will return to 2426 * the caller. 2427 */ 2428 struct minijail_run_config { 2429 const char *filename; 2430 char *const *argv; 2431 char *const *envp; 2432 int use_preload; 2433 int exec_in_child; 2434 }; 2435 2436 /* 2437 * Set of pointers to fill with values from minijail_run. 2438 * All arguments are allowed to be NULL if unused. 2439 * 2440 * pstdin_fd - Filled with stdin pipe if non-NULL. 2441 * pstdout_fd - Filled with stdout pipe if non-NULL. 2442 * pstderr_fd - Filled with stderr pipe if non-NULL. 2443 * pchild_pid - Filled with the pid of the child process if non-NULL. 2444 */ 2445 struct minijail_run_status { 2446 int *pstdin_fd; 2447 int *pstdout_fd; 2448 int *pstderr_fd; 2449 pid_t *pchild_pid; 2450 }; 2451 2452 static int minijail_run_internal(struct minijail *j, 2453 const struct minijail_run_config *config, 2454 struct minijail_run_status *status_out); 2455 2456 int API minijail_run(struct minijail *j, const char *filename, 2457 char *const argv[]) 2458 { 2459 struct minijail_run_config config = { 2460 .filename = filename, 2461 .argv = argv, 2462 .envp = NULL, 2463 .use_preload = true, 2464 .exec_in_child = true, 2465 }; 2466 struct minijail_run_status status = {}; 2467 return minijail_run_internal(j, &config, &status); 2468 } 2469 2470 int API minijail_run_pid(struct minijail *j, const char *filename, 2471 char *const argv[], pid_t *pchild_pid) 2472 { 2473 struct minijail_run_config config = { 2474 .filename = filename, 2475 .argv = argv, 2476 .envp = NULL, 2477 .use_preload = true, 2478 .exec_in_child = true, 2479 }; 2480 struct minijail_run_status status = { 2481 .pchild_pid = pchild_pid, 2482 }; 2483 return minijail_run_internal(j, &config, &status); 2484 } 2485 2486 int API minijail_run_pipe(struct minijail *j, const char *filename, 2487 char *const argv[], int *pstdin_fd) 2488 { 2489 struct minijail_run_config config = { 2490 .filename = filename, 2491 .argv = argv, 2492 .envp = NULL, 2493 .use_preload = true, 2494 .exec_in_child = true, 2495 }; 2496 struct minijail_run_status status = { 2497 .pstdin_fd = pstdin_fd, 2498 }; 2499 return minijail_run_internal(j, &config, &status); 2500 } 2501 2502 int API minijail_run_pid_pipes(struct minijail *j, const char *filename, 2503 char *const argv[], pid_t *pchild_pid, 2504 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) 2505 { 2506 struct minijail_run_config config = { 2507 .filename = filename, 2508 .argv = argv, 2509 .envp = NULL, 2510 .use_preload = true, 2511 .exec_in_child = true, 2512 }; 2513 struct minijail_run_status status = { 2514 .pstdin_fd = pstdin_fd, 2515 .pstdout_fd = pstdout_fd, 2516 .pstderr_fd = pstderr_fd, 2517 .pchild_pid = pchild_pid, 2518 }; 2519 return minijail_run_internal(j, &config, &status); 2520 } 2521 2522 int API minijail_run_no_preload(struct minijail *j, const char *filename, 2523 char *const argv[]) 2524 { 2525 struct minijail_run_config config = { 2526 .filename = filename, 2527 .argv = argv, 2528 .envp = NULL, 2529 .use_preload = false, 2530 .exec_in_child = true, 2531 }; 2532 struct minijail_run_status status = {}; 2533 return minijail_run_internal(j, &config, &status); 2534 } 2535 2536 int API minijail_run_pid_pipes_no_preload(struct minijail *j, 2537 const char *filename, 2538 char *const argv[], 2539 pid_t *pchild_pid, 2540 int *pstdin_fd, 2541 int *pstdout_fd, 2542 int *pstderr_fd) 2543 { 2544 struct minijail_run_config config = { 2545 .filename = filename, 2546 .argv = argv, 2547 .envp = NULL, 2548 .use_preload = false, 2549 .exec_in_child = true, 2550 }; 2551 struct minijail_run_status status = { 2552 .pstdin_fd = pstdin_fd, 2553 .pstdout_fd = pstdout_fd, 2554 .pstderr_fd = pstderr_fd, 2555 .pchild_pid = pchild_pid, 2556 }; 2557 return minijail_run_internal(j, &config, &status); 2558 } 2559 2560 int API minijail_run_env_pid_pipes_no_preload(struct minijail *j, 2561 const char *filename, 2562 char *const argv[], 2563 char *const envp[], 2564 pid_t *pchild_pid, int *pstdin_fd, 2565 int *pstdout_fd, int *pstderr_fd) 2566 { 2567 struct minijail_run_config config = { 2568 .filename = filename, 2569 .argv = argv, 2570 .envp = envp, 2571 .use_preload = false, 2572 .exec_in_child = true, 2573 }; 2574 struct minijail_run_status status = { 2575 .pstdin_fd = pstdin_fd, 2576 .pstdout_fd = pstdout_fd, 2577 .pstderr_fd = pstderr_fd, 2578 .pchild_pid = pchild_pid, 2579 }; 2580 return minijail_run_internal(j, &config, &status); 2581 } 2582 2583 pid_t API minijail_fork(struct minijail *j) 2584 { 2585 struct minijail_run_config config = {}; 2586 struct minijail_run_status status = {}; 2587 return minijail_run_internal(j, &config, &status); 2588 } 2589 2590 static int minijail_run_internal(struct minijail *j, 2591 const struct minijail_run_config *config, 2592 struct minijail_run_status *status_out) 2593 { 2594 char *oldenv, *oldenv_copy = NULL; 2595 pid_t child_pid; 2596 int pipe_fds[2]; 2597 int stdin_fds[2]; 2598 int stdout_fds[2]; 2599 int stderr_fds[2]; 2600 int child_sync_pipe_fds[2]; 2601 int sync_child = 0; 2602 int ret; 2603 /* We need to remember this across the minijail_preexec() call. */ 2604 int pid_namespace = j->flags.pids; 2605 /* 2606 * Create an init process if we are entering a pid namespace, unless the 2607 * user has explicitly opted out by calling minijail_run_as_init(). 2608 */ 2609 int do_init = j->flags.do_init && !j->flags.run_as_init; 2610 int use_preload = config->use_preload; 2611 2612 if (use_preload) { 2613 if (j->hooks_head != NULL) 2614 die("Minijail hooks are not supported with LD_PRELOAD"); 2615 if (!config->exec_in_child) 2616 die("minijail_fork is not supported with LD_PRELOAD"); 2617 if (config->envp != NULL) 2618 die("cannot pass a new environment with LD_PRELOAD"); 2619 2620 oldenv = getenv(kLdPreloadEnvVar); 2621 if (oldenv) { 2622 oldenv_copy = strdup(oldenv); 2623 if (!oldenv_copy) 2624 return -ENOMEM; 2625 } 2626 2627 if (setup_preload(j, oldenv)) 2628 return -EFAULT; 2629 } 2630 2631 if (!use_preload) { 2632 if (j->flags.use_caps && j->caps != 0 && 2633 !j->flags.set_ambient_caps) { 2634 die("non-empty, non-ambient capabilities are not " 2635 "supported without LD_PRELOAD"); 2636 } 2637 } 2638 2639 if (use_preload) { 2640 /* 2641 * Before we fork(2) and execve(2) the child process, we need 2642 * to open a pipe(2) to send the minijail configuration over. 2643 */ 2644 if (setup_pipe(pipe_fds)) 2645 return -EFAULT; 2646 } 2647 2648 /* 2649 * If we want to write to the child process' standard input, 2650 * create the pipe(2) now. 2651 */ 2652 if (status_out->pstdin_fd) { 2653 if (pipe(stdin_fds)) 2654 return -EFAULT; 2655 } 2656 2657 /* 2658 * If we want to read from the child process' standard output, 2659 * create the pipe(2) now. 2660 */ 2661 if (status_out->pstdout_fd) { 2662 if (pipe(stdout_fds)) 2663 return -EFAULT; 2664 } 2665 2666 /* 2667 * If we want to read from the child process' standard error, 2668 * create the pipe(2) now. 2669 */ 2670 if (status_out->pstderr_fd) { 2671 if (pipe(stderr_fds)) 2672 return -EFAULT; 2673 } 2674 2675 /* 2676 * If we want to set up a new uid/gid map in the user namespace, 2677 * or if we need to add the child process to cgroups, create the pipe(2) 2678 * to sync between parent and child. 2679 */ 2680 if (j->flags.userns || j->flags.cgroups) { 2681 sync_child = 1; 2682 if (pipe(child_sync_pipe_fds)) 2683 return -EFAULT; 2684 } 2685 2686 /* 2687 * Use sys_clone() if and only if we're creating a pid namespace. 2688 * 2689 * tl;dr: WARNING: do not mix pid namespaces and multithreading. 2690 * 2691 * In multithreaded programs, there are a bunch of locks inside libc, 2692 * some of which may be held by other threads at the time that we call 2693 * minijail_run_pid(). If we call fork(), glibc does its level best to 2694 * ensure that we hold all of these locks before it calls clone() 2695 * internally and drop them after clone() returns, but when we call 2696 * sys_clone(2) directly, all that gets bypassed and we end up with a 2697 * child address space where some of libc's important locks are held by 2698 * other threads (which did not get cloned, and hence will never release 2699 * those locks). This is okay so long as we call exec() immediately 2700 * after, but a bunch of seemingly-innocent libc functions like setenv() 2701 * take locks. 2702 * 2703 * Hence, only call sys_clone() if we need to, in order to get at pid 2704 * namespacing. If we follow this path, the child's address space might 2705 * have broken locks; you may only call functions that do not acquire 2706 * any locks. 2707 * 2708 * Unfortunately, fork() acquires every lock it can get its hands on, as 2709 * previously detailed, so this function is highly likely to deadlock 2710 * later on (see "deadlock here") if we're multithreaded. 2711 * 2712 * We might hack around this by having the clone()d child (init of the 2713 * pid namespace) return directly, rather than leaving the clone()d 2714 * process hanging around to be init for the new namespace (and having 2715 * its fork()ed child return in turn), but that process would be 2716 * crippled with its libc locks potentially broken. We might try 2717 * fork()ing in the parent before we clone() to ensure that we own all 2718 * the locks, but then we have to have the forked child hanging around 2719 * consuming resources (and possibly having file descriptors / shared 2720 * memory regions / etc attached). We'd need to keep the child around to 2721 * avoid having its children get reparented to init. 2722 * 2723 * TODO(ellyjones): figure out if the "forked child hanging around" 2724 * problem is fixable or not. It would be nice if we worked in this 2725 * case. 2726 */ 2727 if (pid_namespace) { 2728 int clone_flags = CLONE_NEWPID | SIGCHLD; 2729 if (j->flags.userns) 2730 clone_flags |= CLONE_NEWUSER; 2731 child_pid = syscall(SYS_clone, clone_flags, NULL); 2732 } else { 2733 child_pid = fork(); 2734 } 2735 2736 if (child_pid < 0) { 2737 if (use_preload) { 2738 free(oldenv_copy); 2739 } 2740 die("failed to fork child"); 2741 } 2742 2743 if (child_pid) { 2744 if (use_preload) { 2745 /* Restore parent's LD_PRELOAD. */ 2746 if (oldenv_copy) { 2747 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 2748 free(oldenv_copy); 2749 } else { 2750 unsetenv(kLdPreloadEnvVar); 2751 } 2752 unsetenv(kFdEnvVar); 2753 } 2754 2755 j->initpid = child_pid; 2756 2757 if (j->flags.forward_signals) { 2758 forward_pid = child_pid; 2759 install_signal_handlers(); 2760 } 2761 2762 if (j->flags.pid_file) 2763 write_pid_file_or_die(j); 2764 2765 if (j->flags.cgroups) 2766 add_to_cgroups_or_die(j); 2767 2768 if (j->rlimit_count) 2769 set_rlimits_or_die(j); 2770 2771 if (j->flags.userns) 2772 write_ugid_maps_or_die(j); 2773 2774 if (j->flags.enter_vfs) 2775 close(j->mountns_fd); 2776 2777 if (j->flags.enter_net) 2778 close(j->netns_fd); 2779 2780 if (sync_child) 2781 parent_setup_complete(child_sync_pipe_fds); 2782 2783 if (use_preload) { 2784 /* Send marshalled minijail. */ 2785 close(pipe_fds[0]); /* read endpoint */ 2786 ret = minijail_to_fd(j, pipe_fds[1]); 2787 close(pipe_fds[1]); /* write endpoint */ 2788 if (ret) { 2789 kill(j->initpid, SIGKILL); 2790 die("failed to send marshalled minijail"); 2791 } 2792 } 2793 2794 if (status_out->pchild_pid) 2795 *status_out->pchild_pid = child_pid; 2796 2797 /* 2798 * If we want to write to the child process' standard input, 2799 * set up the write end of the pipe. 2800 */ 2801 if (status_out->pstdin_fd) 2802 *status_out->pstdin_fd = 2803 setup_pipe_end(stdin_fds, 1 /* write end */); 2804 2805 /* 2806 * If we want to read from the child process' standard output, 2807 * set up the read end of the pipe. 2808 */ 2809 if (status_out->pstdout_fd) 2810 *status_out->pstdout_fd = 2811 setup_pipe_end(stdout_fds, 0 /* read end */); 2812 2813 /* 2814 * If we want to read from the child process' standard error, 2815 * set up the read end of the pipe. 2816 */ 2817 if (status_out->pstderr_fd) 2818 *status_out->pstderr_fd = 2819 setup_pipe_end(stderr_fds, 0 /* read end */); 2820 2821 /* 2822 * If forking return the child pid, in the normal exec case 2823 * return 0 for success. 2824 */ 2825 if (!config->exec_in_child) 2826 return child_pid; 2827 return 0; 2828 } 2829 /* Child process. */ 2830 free(oldenv_copy); 2831 2832 if (j->flags.reset_signal_mask) { 2833 sigset_t signal_mask; 2834 if (sigemptyset(&signal_mask) != 0) 2835 pdie("sigemptyset failed"); 2836 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0) 2837 pdie("sigprocmask failed"); 2838 } 2839 2840 if (j->flags.reset_signal_handlers) { 2841 int signum; 2842 for (signum = 0; signum <= SIGRTMAX; signum++) { 2843 /* 2844 * Ignore EINVAL since some signal numbers in the range 2845 * might not be valid. 2846 */ 2847 if (signal(signum, SIG_DFL) == SIG_ERR && 2848 errno != EINVAL) { 2849 pdie("failed to reset signal %d disposition", 2850 signum); 2851 } 2852 } 2853 } 2854 2855 if (j->flags.close_open_fds) { 2856 const size_t kMaxInheritableFdsSize = 10 + MAX_PRESERVED_FDS; 2857 int inheritable_fds[kMaxInheritableFdsSize]; 2858 size_t size = 0; 2859 size_t i; 2860 if (use_preload) { 2861 inheritable_fds[size++] = pipe_fds[0]; 2862 inheritable_fds[size++] = pipe_fds[1]; 2863 } 2864 if (sync_child) { 2865 inheritable_fds[size++] = child_sync_pipe_fds[0]; 2866 inheritable_fds[size++] = child_sync_pipe_fds[1]; 2867 } 2868 if (status_out->pstdin_fd) { 2869 inheritable_fds[size++] = stdin_fds[0]; 2870 inheritable_fds[size++] = stdin_fds[1]; 2871 } 2872 if (status_out->pstdout_fd) { 2873 inheritable_fds[size++] = stdout_fds[0]; 2874 inheritable_fds[size++] = stdout_fds[1]; 2875 } 2876 if (status_out->pstderr_fd) { 2877 inheritable_fds[size++] = stderr_fds[0]; 2878 inheritable_fds[size++] = stderr_fds[1]; 2879 } 2880 for (i = 0; i < j->preserved_fd_count; i++) { 2881 /* 2882 * Preserve all parent_fds. They will be dup2(2)-ed in 2883 * the child later. 2884 */ 2885 inheritable_fds[size++] = j->preserved_fds[i].parent_fd; 2886 } 2887 2888 if (close_open_fds(inheritable_fds, size) < 0) 2889 die("failed to close open file descriptors"); 2890 } 2891 2892 if (redirect_fds(j)) 2893 die("failed to set up fd redirections"); 2894 2895 if (sync_child) 2896 wait_for_parent_setup(child_sync_pipe_fds); 2897 2898 if (j->flags.userns) 2899 enter_user_namespace(j); 2900 2901 /* 2902 * If we want to write to the jailed process' standard input, 2903 * set up the read end of the pipe. 2904 */ 2905 if (status_out->pstdin_fd) { 2906 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */, 2907 STDIN_FILENO) < 0) 2908 die("failed to set up stdin pipe"); 2909 } 2910 2911 /* 2912 * If we want to read from the jailed process' standard output, 2913 * set up the write end of the pipe. 2914 */ 2915 if (status_out->pstdout_fd) { 2916 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */, 2917 STDOUT_FILENO) < 0) 2918 die("failed to set up stdout pipe"); 2919 } 2920 2921 /* 2922 * If we want to read from the jailed process' standard error, 2923 * set up the write end of the pipe. 2924 */ 2925 if (status_out->pstderr_fd) { 2926 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */, 2927 STDERR_FILENO) < 0) 2928 die("failed to set up stderr pipe"); 2929 } 2930 2931 /* 2932 * If any of stdin, stdout, or stderr are TTYs, create a new session. 2933 * This prevents the jailed process from using the TIOCSTI ioctl 2934 * to push characters into the parent process terminal's input buffer, 2935 * therefore escaping the jail. 2936 * 2937 * Since it has just forked, the child will not be a process group 2938 * leader, and this call to setsid() should always succeed. 2939 */ 2940 if (isatty(STDIN_FILENO) || isatty(STDOUT_FILENO) || 2941 isatty(STDERR_FILENO)) { 2942 if (setsid() < 0) { 2943 pdie("setsid() failed"); 2944 } 2945 } 2946 2947 /* If running an init program, let it decide when/how to mount /proc. */ 2948 if (pid_namespace && !do_init) 2949 j->flags.remount_proc_ro = 0; 2950 2951 if (use_preload) { 2952 /* Strip out flags that cannot be inherited across execve(2). */ 2953 minijail_preexec(j); 2954 } else { 2955 /* 2956 * If not using LD_PRELOAD, do all jailing before execve(2). 2957 * Note that PID namespaces can only be entered on fork(2), 2958 * so that flag is still cleared. 2959 */ 2960 j->flags.pids = 0; 2961 } 2962 2963 /* 2964 * Jail this process. 2965 * If forking, return. 2966 * If not, execve(2) the target. 2967 */ 2968 minijail_enter(j); 2969 2970 if (config->exec_in_child && pid_namespace && do_init) { 2971 /* 2972 * pid namespace: this process will become init inside the new 2973 * namespace. We don't want all programs we might exec to have 2974 * to know how to be init. Normally (do_init == 1) we fork off 2975 * a child to actually run the program. If |do_init == 0|, we 2976 * let the program keep pid 1 and be init. 2977 * 2978 * If we're multithreaded, we'll probably deadlock here. See 2979 * WARNING above. 2980 */ 2981 child_pid = fork(); 2982 if (child_pid < 0) { 2983 _exit(child_pid); 2984 } else if (child_pid > 0) { 2985 /* 2986 * Best effort. Don't bother checking the return value. 2987 */ 2988 prctl(PR_SET_NAME, "minijail-init"); 2989 init(child_pid); /* Never returns. */ 2990 } 2991 } 2992 2993 run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_EXECVE); 2994 2995 if (!config->exec_in_child) 2996 return 0; 2997 2998 /* 2999 * If not using LD_PRELOAD, support passing a new environment instead of 3000 * inheriting the parent's. 3001 * When not using LD_PRELOAD there is no need to modify the environment 3002 * to add Minijail-related variables, so passing a new environment is 3003 * fine. 3004 */ 3005 char *const *child_env = environ; 3006 if (!use_preload && config->envp != NULL) { 3007 child_env = config->envp; 3008 } 3009 3010 /* 3011 * If we aren't pid-namespaced, or the jailed program asked to be init: 3012 * calling process 3013 * -> execve()-ing process 3014 * If we are: 3015 * calling process 3016 * -> init()-ing process 3017 * -> execve()-ing process 3018 */ 3019 ret = execve(config->filename, config->argv, child_env); 3020 if (ret == -1) { 3021 pwarn("execve(%s) failed", config->filename); 3022 } 3023 _exit(ret); 3024 } 3025 3026 int API minijail_kill(struct minijail *j) 3027 { 3028 int st; 3029 if (kill(j->initpid, SIGTERM)) 3030 return -errno; 3031 if (waitpid(j->initpid, &st, 0) < 0) 3032 return -errno; 3033 return st; 3034 } 3035 3036 int API minijail_wait(struct minijail *j) 3037 { 3038 int st; 3039 if (waitpid(j->initpid, &st, 0) < 0) 3040 return -errno; 3041 3042 if (!WIFEXITED(st)) { 3043 int error_status = st; 3044 if (WIFSIGNALED(st)) { 3045 int signum = WTERMSIG(st); 3046 warn("child process %d received signal %d", 3047 j->initpid, signum); 3048 /* 3049 * We return MINIJAIL_ERR_JAIL if the process received 3050 * SIGSYS, which happens when a syscall is blocked by 3051 * seccomp filters. 3052 * If not, we do what bash(1) does: 3053 * $? = 128 + signum 3054 */ 3055 if (signum == SIGSYS) { 3056 error_status = MINIJAIL_ERR_JAIL; 3057 } else { 3058 error_status = 128 + signum; 3059 } 3060 } 3061 return error_status; 3062 } 3063 3064 int exit_status = WEXITSTATUS(st); 3065 if (exit_status != 0) 3066 info("child process %d exited with status %d", 3067 j->initpid, exit_status); 3068 3069 return exit_status; 3070 } 3071 3072 void API minijail_destroy(struct minijail *j) 3073 { 3074 size_t i; 3075 3076 if (j->filter_prog) { 3077 free(j->filter_prog->filter); 3078 free(j->filter_prog); 3079 } 3080 free_mounts_list(j); 3081 while (j->hooks_head) { 3082 struct hook *c = j->hooks_head; 3083 j->hooks_head = c->next; 3084 free(c); 3085 } 3086 j->hooks_tail = NULL; 3087 if (j->user) 3088 free(j->user); 3089 if (j->suppl_gid_list) 3090 free(j->suppl_gid_list); 3091 if (j->chrootdir) 3092 free(j->chrootdir); 3093 if (j->pid_file_path) 3094 free(j->pid_file_path); 3095 if (j->uidmap) 3096 free(j->uidmap); 3097 if (j->gidmap) 3098 free(j->gidmap); 3099 if (j->hostname) 3100 free(j->hostname); 3101 if (j->preload_path) 3102 free(j->preload_path); 3103 if (j->alt_syscall_table) 3104 free(j->alt_syscall_table); 3105 for (i = 0; i < j->cgroup_count; ++i) 3106 free(j->cgroups[i]); 3107 free(j); 3108 } 3109 3110 void API minijail_log_to_fd(int fd, int min_priority) 3111 { 3112 init_logging(LOG_TO_FD, fd, min_priority); 3113 } 3114