1 /* Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 2 * Use of this source code is governed by a BSD-style license that can be 3 * found in the LICENSE file. 4 */ 5 6 #define _BSD_SOURCE 7 #define _GNU_SOURCE 8 9 #include <asm/unistd.h> 10 #include <ctype.h> 11 #include <errno.h> 12 #include <fcntl.h> 13 #include <grp.h> 14 #include <inttypes.h> 15 #include <limits.h> 16 #include <linux/capability.h> 17 #include <pwd.h> 18 #include <sched.h> 19 #include <signal.h> 20 #include <stdarg.h> 21 #include <stdbool.h> 22 #include <stddef.h> 23 #include <stdio.h> 24 #include <stdlib.h> 25 #include <string.h> 26 #include <syscall.h> 27 #include <sys/capability.h> 28 #include <sys/mount.h> 29 #include <sys/param.h> 30 #include <sys/prctl.h> 31 #include <sys/stat.h> 32 #include <sys/types.h> 33 #include <sys/user.h> 34 #include <sys/utsname.h> 35 #include <sys/wait.h> 36 #include <unistd.h> 37 38 #include "libminijail.h" 39 #include "libminijail-private.h" 40 41 #include "signal_handler.h" 42 #include "syscall_filter.h" 43 #include "util.h" 44 45 #ifdef HAVE_SECUREBITS_H 46 #include <linux/securebits.h> 47 #else 48 #define SECURE_ALL_BITS 0x15 49 #define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) 50 #endif 51 52 /* Until these are reliably available in linux/prctl.h */ 53 #ifndef PR_SET_SECCOMP 54 # define PR_SET_SECCOMP 22 55 #endif 56 57 #ifndef PR_ALT_SYSCALL 58 # define PR_ALT_SYSCALL 0x43724f53 59 #endif 60 61 /* For seccomp_filter using BPF. */ 62 #ifndef PR_SET_NO_NEW_PRIVS 63 # define PR_SET_NO_NEW_PRIVS 38 64 #endif 65 #ifndef SECCOMP_MODE_FILTER 66 # define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ 67 #endif 68 69 #ifdef USE_SECCOMP_SOFTFAIL 70 # define SECCOMP_SOFTFAIL 1 71 #else 72 # define SECCOMP_SOFTFAIL 0 73 #endif 74 75 #define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. 
*/

/*
 * One queued mount request. Nodes are chained through |next| and appended by
 * minijail_mount(); |src|, |dest|, |type| and |flags| are later handed to
 * mount(2) by mount_one().
 */
struct mountpoint {
	char *src;
	char *dest;
	char *type;
	unsigned long flags;
	struct mountpoint *next;
};

/*
 * Full description of a jail. The struct is copied byte-for-byte by
 * minijail_marshal_helper()/minijail_unmarshal(), so every pointer member is
 * only meaningful in the process that allocated it.
 */
struct minijail {
	/*
	 * WARNING: if you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 */
	struct {
		int uid:1;
		int gid:1;
		int usergroups:1;
		int suppl_gids:1;
		int caps:1;
		int vfs:1;
		int enter_vfs:1;
		int pids:1;
		int ipc:1;
		int net:1;
		int enter_net:1;
		int userns:1;
		int seccomp:1;
		int remount_proc_ro:1;
		int no_new_privs:1;
		int seccomp_filter:1;
		int log_seccomp_filter:1;
		int chroot:1;
		int pivot_root:1;
		int mount_tmp:1;
		int do_init:1;
		int pid_file:1;
		int cgroups:1;
		int alt_syscall:1;
		int reset_signal_mask:1;
	} flags;
	uid_t uid;
	gid_t gid;
	gid_t usergid;			/* primary gid of |user| */
	char *user;
	size_t suppl_gid_count;		/* number of entries in |suppl_gid_list| */
	gid_t *suppl_gid_list;
	uint64_t caps;			/* bitmask, bit N is capability N */
	pid_t initpid;
	int mountns_fd;
	int netns_fd;
	char *chrootdir;		/* shared by chroot and pivot_root */
	char *pid_file_path;
	char *uidmap;
	char *gidmap;
	size_t filter_len;		/* instruction count of |filter_prog| */
	struct sock_fprog *filter_prog;
	char *alt_syscall_table;
	struct mountpoint *mounts_head;
	struct mountpoint *mounts_tail;
	size_t mounts_count;
	char *cgroups[MAX_CGROUPS];
	size_t cgroup_count;		/* used slots in |cgroups| */
};

/*
 * Strip out flags meant for the parent.
 * We keep things that are not inherited across execve(2) (e.g. capabilities),
 * or are easier to set after execve(2) (e.g. seccomp filters).
 */
void minijail_preenter(struct minijail *j)
{
	j->flags.vfs = 0;
	j->flags.enter_vfs = 0;
	j->flags.remount_proc_ro = 0;
	j->flags.pids = 0;
	j->flags.do_init = 0;
	j->flags.pid_file = 0;
	j->flags.cgroups = 0;
}

/*
 * Strip out flags meant for the child.
 * We keep things that are inherited across execve(2).
160 */ 161 void minijail_preexec(struct minijail *j) 162 { 163 int vfs = j->flags.vfs; 164 int enter_vfs = j->flags.enter_vfs; 165 int remount_proc_ro = j->flags.remount_proc_ro; 166 int userns = j->flags.userns; 167 if (j->user) 168 free(j->user); 169 j->user = NULL; 170 if (j->suppl_gid_list) 171 free(j->suppl_gid_list); 172 j->suppl_gid_list = NULL; 173 memset(&j->flags, 0, sizeof(j->flags)); 174 /* Now restore anything we meant to keep. */ 175 j->flags.vfs = vfs; 176 j->flags.enter_vfs = enter_vfs; 177 j->flags.remount_proc_ro = remount_proc_ro; 178 j->flags.userns = userns; 179 /* Note, |pids| will already have been used before this call. */ 180 } 181 182 /* Returns true if the kernel version is less than 3.8. */ 183 int seccomp_kernel_support_not_required() 184 { 185 int major, minor; 186 struct utsname uts; 187 return (uname(&uts) != -1 && 188 sscanf(uts.release, "%d.%d", &major, &minor) == 2 && 189 ((major < 3) || ((major == 3) && (minor < 8)))); 190 } 191 192 /* Allow seccomp soft-fail on Android devices with kernel version < 3.8. */ 193 int can_softfail() 194 { 195 #if SECCOMP_SOFTFAIL 196 if (is_android()) { 197 if (seccomp_kernel_support_not_required()) 198 return 1; 199 else 200 return 0; 201 } else { 202 return 1; 203 } 204 #endif 205 return 0; 206 } 207 208 /* Minijail API. 
*/ 209 210 struct minijail API *minijail_new(void) 211 { 212 return calloc(1, sizeof(struct minijail)); 213 } 214 215 void API minijail_change_uid(struct minijail *j, uid_t uid) 216 { 217 if (uid == 0) 218 die("useless change to uid 0"); 219 j->uid = uid; 220 j->flags.uid = 1; 221 } 222 223 void API minijail_change_gid(struct minijail *j, gid_t gid) 224 { 225 if (gid == 0) 226 die("useless change to gid 0"); 227 j->gid = gid; 228 j->flags.gid = 1; 229 } 230 231 void API minijail_set_supplementary_gids(struct minijail *j, size_t size, 232 const gid_t *list) 233 { 234 size_t i; 235 236 if (j->flags.usergroups) 237 die("cannot inherit *and* set supplementary groups"); 238 239 if (size == 0) { 240 /* Clear supplementary groups. */ 241 j->suppl_gid_list = NULL; 242 j->suppl_gid_count = 0; 243 j->flags.suppl_gids = 1; 244 return; 245 } 246 247 /* Copy the gid_t array. */ 248 j->suppl_gid_list = calloc(size, sizeof(gid_t)); 249 if (!j->suppl_gid_list) { 250 die("failed to allocate internal supplementary group array"); 251 } 252 for (i = 0; i < size; i++) { 253 j->suppl_gid_list[i] = list[i]; 254 } 255 j->suppl_gid_count = size; 256 j->flags.suppl_gids = 1; 257 } 258 259 int API minijail_change_user(struct minijail *j, const char *user) 260 { 261 char *buf = NULL; 262 struct passwd pw; 263 struct passwd *ppw = NULL; 264 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); 265 if (sz == -1) 266 sz = 65536; /* your guess is as good as mine... */ 267 268 /* 269 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return 270 * the maximum needed size of the buffer, so we don't have to search. 271 */ 272 buf = malloc(sz); 273 if (!buf) 274 return -ENOMEM; 275 getpwnam_r(user, &pw, buf, sz, &ppw); 276 /* 277 * We're safe to free the buffer here. The strings inside |pw| point 278 * inside |buf|, but we don't use any of them; this leaves the pointers 279 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3) succeeded. 
*/
	free(buf);
	/* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
	if (!ppw)
		return -1;
	minijail_change_uid(j, ppw->pw_uid);
	/* Keep the name and primary gid; drop_ugid() feeds them to initgroups(). */
	j->user = strdup(user);
	if (!j->user)
		return -ENOMEM;
	j->usergid = ppw->pw_gid;
	return 0;
}

/*
 * Looks up |group| by name and records its gid via minijail_change_gid().
 * Returns 0 on success, -ENOMEM if the lookup buffer cannot be allocated,
 * or -1 if the lookup produced no entry.
 */
int API minijail_change_group(struct minijail *j, const char *group)
{
	char *buf = NULL;
	struct group gr;
	struct group *pgr = NULL;
	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
	if (sz == -1)
		sz = 65536;	/* and mine is as good as yours, really */

	/*
	 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
	 * the maximum needed size of the buffer, so we don't have to search.
	 */
	buf = malloc(sz);
	if (!buf)
		return -ENOMEM;
	getgrnam_r(group, &gr, buf, sz, &pgr);
	/*
	 * We're safe to free the buffer here. The strings inside gr point
	 * inside buf, but we don't use any of them; this leaves the pointers
	 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
	 */
	free(buf);
	/* getgrnam_r(3) does *not* set errno when |pgr| is NULL.
*/
	if (!pgr)
		return -1;
	minijail_change_gid(j, pgr->gr_gid);
	return 0;
}

/* Sets the |seccomp| flag. */
void API minijail_use_seccomp(struct minijail *j)
{
	j->flags.seccomp = 1;
}

/* Requests PR_SET_NO_NEW_PRIVS; applied in set_seccomp_filter(). */
void API minijail_no_new_privs(struct minijail *j)
{
	j->flags.no_new_privs = 1;
}

/* Requests installation of the seccomp BPF filter held in |filter_prog|. */
void API minijail_use_seccomp_filter(struct minijail *j)
{
	j->flags.seccomp_filter = 1;
}

/* Requests that a SIGSYS handler be installed to log filter failures. */
void API minijail_log_seccomp_filter_failures(struct minijail *j)
{
	j->flags.log_seccomp_filter = 1;
}

/* Records |capmask| (bit N is capability N) as the capabilities to keep. */
void API minijail_use_caps(struct minijail *j, uint64_t capmask)
{
	j->caps = capmask;
	j->flags.caps = 1;
}

/* Sets the |reset_signal_mask| flag. */
void API minijail_reset_signal_mask(struct minijail* j) {
	j->flags.reset_signal_mask = 1;
}

/* Sets the |vfs| flag. */
void API minijail_namespace_vfs(struct minijail *j)
{
	j->flags.vfs = 1;
}

/*
 * Opens the namespace file at |ns_path| and stores the descriptor in
 * |mountns_fd|. Dies if the file cannot be opened.
 */
void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
{
	int ns_fd = open(ns_path, O_RDONLY);
	if (ns_fd < 0) {
		pdie("failed to open namespace '%s'", ns_path);
	}
	j->mountns_fd = ns_fd;
	j->flags.enter_vfs = 1;
}

/*
 * Sets the |pids| flag and, with it, |vfs|, |remount_proc_ro| and
 * |do_init|.
 */
void API minijail_namespace_pids(struct minijail *j)
{
	j->flags.vfs = 1;
	j->flags.remount_proc_ro = 1;
	j->flags.pids = 1;
	j->flags.do_init = 1;
}

/* Sets the |ipc| flag. */
void API minijail_namespace_ipc(struct minijail *j)
{
	j->flags.ipc = 1;
}

/* Sets the |net| flag. */
void API minijail_namespace_net(struct minijail *j)
{
	j->flags.net = 1;
}

/*
 * Opens the namespace file at |ns_path| and stores the descriptor in
 * |netns_fd|. Dies if the file cannot be opened.
 */
void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
{
	int ns_fd = open(ns_path, O_RDONLY);
	if (ns_fd < 0) {
		pdie("failed to open namespace '%s'", ns_path);
	}
	j->netns_fd = ns_fd;
	j->flags.enter_net = 1;
}

/* Sets |remount_proc_ro| together with |vfs|. */
void API minijail_remount_proc_readonly(struct minijail *j)
{
	j->flags.vfs = 1;
	j->flags.remount_proc_ro = 1;
}

/* Sets the |userns| flag. */
void API minijail_namespace_user(struct minijail *j)
{
	j->flags.userns = 1;
405 } 406 407 int API minijail_uidmap(struct minijail *j, const char *uidmap) 408 { 409 j->uidmap = strdup(uidmap); 410 if (!j->uidmap) 411 return -ENOMEM; 412 char *ch; 413 for (ch = j->uidmap; *ch; ch++) { 414 if (*ch == ',') 415 *ch = '\n'; 416 } 417 return 0; 418 } 419 420 int API minijail_gidmap(struct minijail *j, const char *gidmap) 421 { 422 j->gidmap = strdup(gidmap); 423 if (!j->gidmap) 424 return -ENOMEM; 425 char *ch; 426 for (ch = j->gidmap; *ch; ch++) { 427 if (*ch == ',') 428 *ch = '\n'; 429 } 430 return 0; 431 } 432 433 void API minijail_inherit_usergroups(struct minijail *j) 434 { 435 j->flags.usergroups = 1; 436 } 437 438 void API minijail_run_as_init(struct minijail *j) 439 { 440 /* 441 * Since the jailed program will become 'init' in the new PID namespace, 442 * Minijail does not need to fork an 'init' process. 443 */ 444 j->flags.do_init = 0; 445 } 446 447 int API minijail_enter_chroot(struct minijail *j, const char *dir) 448 { 449 if (j->chrootdir) 450 return -EINVAL; 451 j->chrootdir = strdup(dir); 452 if (!j->chrootdir) 453 return -ENOMEM; 454 j->flags.chroot = 1; 455 return 0; 456 } 457 458 int API minijail_enter_pivot_root(struct minijail *j, const char *dir) 459 { 460 if (j->chrootdir) 461 return -EINVAL; 462 j->chrootdir = strdup(dir); 463 if (!j->chrootdir) 464 return -ENOMEM; 465 j->flags.pivot_root = 1; 466 return 0; 467 } 468 469 static char *append_external_path(const char *external_path, 470 const char *path_inside_chroot) 471 { 472 char *path; 473 size_t pathlen; 474 475 /* One extra char for '/' and one for '\0', hence + 2. 
*/ 476 pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2; 477 path = malloc(pathlen); 478 snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot); 479 480 return path; 481 } 482 483 char API *minijail_get_original_path(struct minijail *j, 484 const char *path_inside_chroot) 485 { 486 struct mountpoint *b; 487 488 b = j->mounts_head; 489 while (b) { 490 /* 491 * If |path_inside_chroot| is the exact destination of a 492 * mount, then the original path is exactly the source of 493 * the mount. 494 * for example: "-b /some/path/exe,/chroot/path/exe" 495 * mount source = /some/path/exe, mount dest = 496 * /chroot/path/exe Then when getting the original path of 497 * "/chroot/path/exe", the source of that mount, 498 * "/some/path/exe" is what should be returned. 499 */ 500 if (!strcmp(b->dest, path_inside_chroot)) 501 return strdup(b->src); 502 503 /* 504 * If |path_inside_chroot| is within the destination path of a 505 * mount, take the suffix of the chroot path relative to the 506 * mount destination path, and append it to the mount source 507 * path. 508 */ 509 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) { 510 const char *relative_path = 511 path_inside_chroot + strlen(b->dest); 512 return append_external_path(b->src, relative_path); 513 } 514 b = b->next; 515 } 516 517 /* If there is a chroot path, append |path_inside_chroot| to that. */ 518 if (j->chrootdir) 519 return append_external_path(j->chrootdir, path_inside_chroot); 520 521 /* No chroot, so the path outside is the same as it is inside. 
*/ 522 return strdup(path_inside_chroot); 523 } 524 525 void API minijail_mount_tmp(struct minijail *j) 526 { 527 j->flags.mount_tmp = 1; 528 } 529 530 int API minijail_write_pid_file(struct minijail *j, const char *path) 531 { 532 j->pid_file_path = strdup(path); 533 if (!j->pid_file_path) 534 return -ENOMEM; 535 j->flags.pid_file = 1; 536 return 0; 537 } 538 539 int API minijail_add_to_cgroup(struct minijail *j, const char *path) 540 { 541 if (j->cgroup_count >= MAX_CGROUPS) 542 return -ENOMEM; 543 j->cgroups[j->cgroup_count] = strdup(path); 544 if (!j->cgroups[j->cgroup_count]) 545 return -ENOMEM; 546 j->cgroup_count++; 547 j->flags.cgroups = 1; 548 return 0; 549 } 550 551 int API minijail_mount(struct minijail *j, const char *src, const char *dest, 552 const char *type, unsigned long flags) 553 { 554 struct mountpoint *m; 555 556 if (*dest != '/') 557 return -EINVAL; 558 m = calloc(1, sizeof(*m)); 559 if (!m) 560 return -ENOMEM; 561 m->dest = strdup(dest); 562 if (!m->dest) 563 goto error; 564 m->src = strdup(src); 565 if (!m->src) 566 goto error; 567 m->type = strdup(type); 568 if (!m->type) 569 goto error; 570 m->flags = flags; 571 572 info("mount %s -> %s type '%s'", src, dest, type); 573 574 /* 575 * Force vfs namespacing so the mounts don't leak out into the 576 * containing vfs namespace. 
577 */ 578 minijail_namespace_vfs(j); 579 580 if (j->mounts_tail) 581 j->mounts_tail->next = m; 582 else 583 j->mounts_head = m; 584 j->mounts_tail = m; 585 j->mounts_count++; 586 587 return 0; 588 589 error: 590 free(m->src); 591 free(m->dest); 592 free(m); 593 return -ENOMEM; 594 } 595 596 int API minijail_bind(struct minijail *j, const char *src, const char *dest, 597 int writeable) 598 { 599 unsigned long flags = MS_BIND; 600 601 if (!writeable) 602 flags |= MS_RDONLY; 603 604 return minijail_mount(j, src, dest, "", flags); 605 } 606 607 void API minijail_parse_seccomp_filters(struct minijail *j, const char *path) 608 { 609 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) { 610 if ((errno == EINVAL) && can_softfail()) { 611 warn("not loading seccomp filter," 612 " seccomp not supported"); 613 j->flags.seccomp_filter = 0; 614 j->flags.log_seccomp_filter = 0; 615 j->filter_len = 0; 616 j->filter_prog = NULL; 617 j->flags.no_new_privs = 0; 618 } 619 } 620 FILE *file = fopen(path, "r"); 621 if (!file) { 622 pdie("failed to open seccomp filter file '%s'", path); 623 } 624 625 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog)); 626 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) { 627 die("failed to compile seccomp filter BPF program in '%s'", 628 path); 629 } 630 631 j->filter_len = fprog->len; 632 j->filter_prog = fprog; 633 634 fclose(file); 635 } 636 637 int API minijail_use_alt_syscall(struct minijail *j, const char *table) 638 { 639 j->alt_syscall_table = strdup(table); 640 if (!j->alt_syscall_table) 641 return -ENOMEM; 642 j->flags.alt_syscall = 1; 643 return 0; 644 } 645 646 struct marshal_state { 647 size_t available; 648 size_t total; 649 char *buf; 650 }; 651 652 void marshal_state_init(struct marshal_state *state, 653 char *buf, size_t available) 654 { 655 state->available = available; 656 state->buf = buf; 657 state->total = 0; 658 } 659 660 void marshal_append(struct marshal_state *state, 661 void *src, size_t length) 662 { 
size_t copy_len = MIN(state->available, length);

	/* Up to |available| will be written. */
	if (copy_len) {
		memcpy(state->buf, src, copy_len);
		state->buf += copy_len;
		state->available -= copy_len;
	}
	/* |total| will contain the expected length. */
	state->total += length;
}

/*
 * Serialises |j| into |state|: the raw struct first, followed by the data
 * behind each non-NULL pointer member. minijail_unmarshal() consumes the
 * pieces in this same order, so the two must be kept in sync.
 */
void minijail_marshal_helper(struct marshal_state *state,
			     const struct minijail *j)
{
	struct mountpoint *m = NULL;
	size_t i;

	/* The struct's pointer members act as "data follows" markers. */
	marshal_append(state, (char *)j, sizeof(*j));
	if (j->user)
		marshal_append(state, j->user, strlen(j->user) + 1);
	if (j->suppl_gid_list) {
		marshal_append(state, j->suppl_gid_list,
			       j->suppl_gid_count * sizeof(gid_t));
	}
	if (j->chrootdir)
		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
	if (j->alt_syscall_table) {
		marshal_append(state, j->alt_syscall_table,
			       strlen(j->alt_syscall_table) + 1);
	}
	if (j->flags.seccomp_filter && j->filter_prog) {
		struct sock_fprog *fp = j->filter_prog;
		marshal_append(state, (char *)fp->filter,
			       fp->len * sizeof(struct sock_filter));
	}
	/* Each mount is three NUL-terminated strings plus its raw flags. */
	for (m = j->mounts_head; m; m = m->next) {
		marshal_append(state, m->src, strlen(m->src) + 1);
		marshal_append(state, m->dest, strlen(m->dest) + 1);
		marshal_append(state, m->type, strlen(m->type) + 1);
		marshal_append(state, (char *)&m->flags, sizeof(m->flags));
	}
	for (i = 0; i < j->cgroup_count; ++i)
		marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
}

/* Returns the number of bytes needed to serialise |j|. */
size_t API minijail_size(const struct minijail *j)
{
	struct marshal_state state;
	/* With no buffer nothing is copied; only |total| is accumulated. */
	marshal_state_init(&state, NULL, 0);
	minijail_marshal_helper(&state, j);
	return state.total;
}

/*
 * Serialises |j| into |buf|, writing at most |available| bytes.
 * Returns 0 if everything fit and non-zero if the output was truncated.
 */
int minijail_marshal(const struct minijail *j, char *buf, size_t available)
{
	struct marshal_state state;
	marshal_state_init(&state, buf, available);
	minijail_marshal_helper(&state, j);
	return (state.total > available);
}

/*
* consumebytes: consumes @length bytes from a buffer @buf of length @buflength 727 * @length Number of bytes to consume 728 * @buf Buffer to consume from 729 * @buflength Size of @buf 730 * 731 * Returns a pointer to the base of the bytes, or NULL for errors. 732 */ 733 void *consumebytes(size_t length, char **buf, size_t *buflength) 734 { 735 char *p = *buf; 736 if (length > *buflength) 737 return NULL; 738 *buf += length; 739 *buflength -= length; 740 return p; 741 } 742 743 /* 744 * consumestr: consumes a C string from a buffer @buf of length @length 745 * @buf Buffer to consume 746 * @length Length of buffer 747 * 748 * Returns a pointer to the base of the string, or NULL for errors. 749 */ 750 char *consumestr(char **buf, size_t *buflength) 751 { 752 size_t len = strnlen(*buf, *buflength); 753 if (len == *buflength) 754 /* There's no null-terminator. */ 755 return NULL; 756 return consumebytes(len + 1, buf, buflength); 757 } 758 759 int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) 760 { 761 size_t i; 762 size_t count; 763 int ret = -EINVAL; 764 765 if (length < sizeof(*j)) 766 goto out; 767 memcpy((void *)j, serialized, sizeof(*j)); 768 serialized += sizeof(*j); 769 length -= sizeof(*j); 770 771 /* Potentially stale pointers not used as signals. 
*/
	j->mounts_head = NULL;
	j->mounts_tail = NULL;
	j->filter_prog = NULL;

	/*
	 * Each pointer below was copied verbatim from the serialising
	 * process: non-NULL only signals that the matching data follows in
	 * the buffer. It is replaced with a fresh allocation, or the function
	 * bails out to a label that skips freeing the still-stale value.
	 */
	if (j->user) {		/* stale pointer */
		char *user = consumestr(&serialized, &length);
		if (!user)
			goto clear_pointers;
		j->user = strdup(user);
		if (!j->user)
			goto clear_pointers;
	}

	if (j->suppl_gid_list) {	/* stale pointer */
		/* Reject counts larger than the system allows. */
		if (j->suppl_gid_count > NGROUPS_MAX) {
			goto bad_gid_list;
		}
		size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
		void *gid_list_bytes =
		    consumebytes(gid_list_size, &serialized, &length);
		if (!gid_list_bytes)
			goto bad_gid_list;

		j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
		if (!j->suppl_gid_list)
			goto bad_gid_list;

		memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
	}

	if (j->chrootdir) {	/* stale pointer */
		char *chrootdir = consumestr(&serialized, &length);
		if (!chrootdir)
			goto bad_chrootdir;
		j->chrootdir = strdup(chrootdir);
		if (!j->chrootdir)
			goto bad_chrootdir;
	}

	if (j->alt_syscall_table) {	/* stale pointer */
		char *alt_syscall_table = consumestr(&serialized, &length);
		if (!alt_syscall_table)
			goto bad_syscall_table;
		j->alt_syscall_table = strdup(alt_syscall_table);
		if (!j->alt_syscall_table)
			goto bad_syscall_table;
	}

	if (j->flags.seccomp_filter && j->filter_len > 0) {
		size_t ninstrs = j->filter_len;
		/*
		 * Guard the size multiplication below and the assignment to
		 * the filter program's |len| field.
		 */
		if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
		    ninstrs > USHRT_MAX)
			goto bad_filters;

		size_t program_len = ninstrs * sizeof(struct sock_filter);
		void *program = consumebytes(program_len, &serialized, &length);
		if (!program)
			goto bad_filters;

		j->filter_prog = malloc(sizeof(struct sock_fprog));
		if (!j->filter_prog)
			goto bad_filters;

		j->filter_prog->len = ninstrs;
		j->filter_prog->filter = malloc(program_len);
		if (!j->filter_prog->filter)
			goto bad_filter_prog_instrs;

memcpy(j->filter_prog->filter, program, program_len); 841 } 842 843 count = j->mounts_count; 844 j->mounts_count = 0; 845 for (i = 0; i < count; ++i) { 846 unsigned long *flags; 847 const char *dest; 848 const char *type; 849 const char *src = consumestr(&serialized, &length); 850 if (!src) 851 goto bad_mounts; 852 dest = consumestr(&serialized, &length); 853 if (!dest) 854 goto bad_mounts; 855 type = consumestr(&serialized, &length); 856 if (!type) 857 goto bad_mounts; 858 flags = consumebytes(sizeof(*flags), &serialized, &length); 859 if (!flags) 860 goto bad_mounts; 861 if (minijail_mount(j, src, dest, type, *flags)) 862 goto bad_mounts; 863 } 864 865 count = j->cgroup_count; 866 j->cgroup_count = 0; 867 for (i = 0; i < count; ++i) { 868 char *cgroup = consumestr(&serialized, &length); 869 if (!cgroup) 870 goto bad_cgroups; 871 j->cgroups[i] = strdup(cgroup); 872 if (!j->cgroups[i]) 873 goto bad_cgroups; 874 ++j->cgroup_count; 875 } 876 877 return 0; 878 879 bad_cgroups: 880 while (j->mounts_head) { 881 struct mountpoint *m = j->mounts_head; 882 j->mounts_head = j->mounts_head->next; 883 free(m->type); 884 free(m->dest); 885 free(m->src); 886 free(m); 887 } 888 for (i = 0; i < j->cgroup_count; ++i) 889 free(j->cgroups[i]); 890 bad_mounts: 891 if (j->flags.seccomp_filter && j->filter_len > 0) { 892 free(j->filter_prog->filter); 893 free(j->filter_prog); 894 } 895 bad_filter_prog_instrs: 896 if (j->filter_prog) 897 free(j->filter_prog); 898 bad_filters: 899 if (j->alt_syscall_table) 900 free(j->alt_syscall_table); 901 bad_syscall_table: 902 if (j->chrootdir) 903 free(j->chrootdir); 904 bad_chrootdir: 905 if (j->suppl_gid_list) 906 free(j->suppl_gid_list); 907 bad_gid_list: 908 if (j->user) 909 free(j->user); 910 clear_pointers: 911 j->user = NULL; 912 j->suppl_gid_list = NULL; 913 j->chrootdir = NULL; 914 j->alt_syscall_table = NULL; 915 j->cgroup_count = 0; 916 out: 917 return ret; 918 } 919 920 static void write_ugid_mappings(const struct minijail *j) 921 { 922 
int fd, ret, len;
	size_t sz;
	char fname[32];

	/*
	 * For each map that was configured, write it in a single write(2) to
	 * the corresponding /proc/<initpid>/{uid,gid}_map file. Any failure
	 * is fatal.
	 */
	sz = sizeof(fname);
	if (j->uidmap) {
		ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
		/* Negative is an error, >= |sz| means the name was truncated. */
		if (ret < 0 || (size_t)ret >= sz)
			die("failed to write file name of uid_map");
		fd = open(fname, O_WRONLY);
		if (fd < 0)
			pdie("failed to open '%s'", fname);
		len = strlen(j->uidmap);
		if (write(fd, j->uidmap, len) < len)
			die("failed to set uid_map");
		close(fd);
	}
	if (j->gidmap) {
		ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
		if (ret < 0 || (size_t)ret >= sz)
			die("failed to write file name of gid_map");
		fd = open(fname, O_WRONLY);
		if (fd < 0)
			pdie("failed to open '%s'", fname);
		len = strlen(j->gidmap);
		if (write(fd, j->gidmap, len) < len)
			die("failed to set gid_map");
		close(fd);
	}
}

/*
 * parent_setup_complete: Closes both ends of |pipe_fds|. A reader blocked in
 * wait_for_parent_setup() sees end-of-file once every write end is closed.
 */
static void parent_setup_complete(int *pipe_fds)
{
	close(pipe_fds[0]);
	close(pipe_fds[1]);
}

/*
 * wait_for_parent_setup: Called by the child process to wait for any
 * further parent-side setup to complete before continuing.
 */
static void wait_for_parent_setup(int *pipe_fds)
{
	char buf;

	close(pipe_fds[1]);

	/* Wait for parent to complete setup and close the pipe. */
	/* Only end-of-file (0) is accepted; data or an error is fatal. */
	if (read(pipe_fds[0], &buf, 1) != 0)
		die("failed to sync with parent");
	close(pipe_fds[0]);
}

/*
 * Switches all uids to 0 if a uid map was configured, and all gids to 0 if a
 * gid map was configured. Dies on failure.
 */
static void enter_user_namespace(const struct minijail *j)
{
	if (j->uidmap && setresuid(0, 0, 0))
		pdie("setresuid");
	if (j->gidmap && setresgid(0, 0, 0))
		pdie("setresgid");
}

/*
 * mount_one: Applies mounts from @m for @j, recursing as needed.
 * @j Minijail these mounts are for
 * @m Head of list of mounts
 *
 * Returns 0 for success.
 */
static int mount_one(const struct minijail *j, struct mountpoint *m)
{
	int ret;
	char *dest;
	int remount_ro = 0;

	/* |dest| has a leading "/".
*/ 997 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0) 998 return -ENOMEM; 999 1000 /* 1001 * R/O bind mounts have to be remounted since 'bind' and 'ro' 1002 * can't both be specified in the original bind mount. 1003 * Remount R/O after the initial mount. 1004 */ 1005 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) { 1006 remount_ro = 1; 1007 m->flags &= ~MS_RDONLY; 1008 } 1009 1010 ret = mount(m->src, dest, m->type, m->flags, NULL); 1011 if (ret) 1012 pdie("mount: %s -> %s", m->src, dest); 1013 1014 if (remount_ro) { 1015 m->flags |= MS_RDONLY; 1016 ret = mount(m->src, dest, NULL, 1017 m->flags | MS_REMOUNT, NULL); 1018 if (ret) 1019 pdie("bind ro: %s -> %s", m->src, dest); 1020 } 1021 1022 free(dest); 1023 if (m->next) 1024 return mount_one(j, m->next); 1025 return ret; 1026 } 1027 1028 int enter_chroot(const struct minijail *j) 1029 { 1030 int ret; 1031 1032 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 1033 return ret; 1034 1035 if (chroot(j->chrootdir)) 1036 return -errno; 1037 1038 if (chdir("/")) 1039 return -errno; 1040 1041 return 0; 1042 } 1043 1044 int enter_pivot_root(const struct minijail *j) 1045 { 1046 int ret, oldroot, newroot; 1047 1048 if (j->mounts_head && (ret = mount_one(j, j->mounts_head))) 1049 return ret; 1050 1051 /* 1052 * Keep the fd for both old and new root. 1053 * It will be used in fchdir later. 1054 */ 1055 oldroot = open("/", O_DIRECTORY | O_RDONLY); 1056 if (oldroot < 0) 1057 pdie("failed to open / for fchdir"); 1058 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY); 1059 if (newroot < 0) 1060 pdie("failed to open %s for fchdir", j->chrootdir); 1061 1062 /* 1063 * To ensure chrootdir is the root of a file system, 1064 * do a self bind mount. 
1065 */ 1066 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, "")) 1067 pdie("failed to bind mount '%s'", j->chrootdir); 1068 if (chdir(j->chrootdir)) 1069 return -errno; 1070 if (syscall(SYS_pivot_root, ".", ".")) 1071 pdie("pivot_root"); 1072 1073 /* 1074 * Now the old root is mounted on top of the new root. Use fchdir to 1075 * change to the old root and unmount it. 1076 */ 1077 if (fchdir(oldroot)) 1078 pdie("failed to fchdir to old /"); 1079 /* The old root might be busy, so use lazy unmount. */ 1080 if (umount2(".", MNT_DETACH)) 1081 pdie("umount(/)"); 1082 /* Change back to the new root. */ 1083 if (fchdir(newroot)) 1084 return -errno; 1085 if (chroot("/")) 1086 return -errno; 1087 /* Set correct CWD for getcwd(3). */ 1088 if (chdir("/")) 1089 return -errno; 1090 1091 return 0; 1092 } 1093 1094 int mount_tmp(void) 1095 { 1096 return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777"); 1097 } 1098 1099 int remount_proc_readonly(const struct minijail *j) 1100 { 1101 const char *kProcPath = "/proc"; 1102 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID; 1103 /* 1104 * Right now, we're holding a reference to our parent's old mount of 1105 * /proc in our namespace, which means using MS_REMOUNT here would 1106 * mutate our parent's mount as well, even though we're in a VFS 1107 * namespace (!). Instead, remove their mount from our namespace 1108 * and make our own. However, if we are in a new user namespace, /proc 1109 * is not seen as mounted, so don't return error if umount() fails. 
*/
	if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns)
		return -errno;
	if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
		return -errno;
	return 0;
}

/* Writes |pid| in decimal, followed by a newline, to |path|. Dies on error. */
static void write_pid_to_path(pid_t pid, const char *path)
{
	FILE *fp = fopen(path, "w");

	if (!fp)
		pdie("failed to open '%s'", path);
	if (fprintf(fp, "%d\n", (int)pid) < 0)
		pdie("fprintf(%s)", path);
	/* Buffered data is written out at fclose(), so check it as well. */
	if (fclose(fp))
		pdie("fclose(%s)", path);
}

/* Writes the init pid to the configured pid file. */
static void write_pid_file(const struct minijail *j)
{
	write_pid_to_path(j->initpid, j->pid_file_path);
}

/* Writes the init pid to every configured cgroup file. */
static void add_to_cgroups(const struct minijail *j)
{
	size_t i;

	for (i = 0; i < j->cgroup_count; ++i)
		write_pid_to_path(j->initpid, j->cgroups[i]);
}

/*
 * Applies the supplementary group, gid and uid changes requested in |j|.
 * Groups are handled first, then the gid, then the uid. Dies on any failure.
 */
void drop_ugid(const struct minijail *j)
{
	if (j->flags.usergroups && j->flags.suppl_gids) {
		die("tried to inherit *and* set supplementary groups;"
		    " can only do one");
	}

	if (j->flags.usergroups) {
		if (initgroups(j->user, j->usergid))
			pdie("initgroups");
	} else if (j->flags.suppl_gids) {
		if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) {
			pdie("setgroups");
		}
	} else {
		/*
		 * Only attempt to clear supplementary groups if we are changing
		 * users.
		 */
		if ((j->uid || j->gid) && setgroups(0, NULL))
			pdie("setgroups");
	}

	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
		pdie("setresgid");

	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
		pdie("setresuid");
}

/*
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).
1179 * Normally, we suck up the answer via /proc. On Android, not all processes are 1180 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we 1181 * programmatically find the value by calling prctl(PR_CAPBSET_READ). 1182 */ 1183 static unsigned int get_last_valid_cap() 1184 { 1185 unsigned int last_valid_cap = 0; 1186 if (is_android()) { 1187 for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0; 1188 ++last_valid_cap); 1189 1190 /* |last_valid_cap| will be the first failing value. */ 1191 if (last_valid_cap > 0) { 1192 last_valid_cap--; 1193 } 1194 } else { 1195 const char cap_file[] = "/proc/sys/kernel/cap_last_cap"; 1196 FILE *fp = fopen(cap_file, "re"); 1197 if (fscanf(fp, "%u", &last_valid_cap) != 1) 1198 pdie("fscanf(%s)", cap_file); 1199 fclose(fp); 1200 } 1201 return last_valid_cap; 1202 } 1203 1204 void drop_caps(const struct minijail *j, unsigned int last_valid_cap) 1205 { 1206 cap_t caps = cap_get_proc(); 1207 cap_value_t flag[1]; 1208 const uint64_t one = 1; 1209 unsigned int i; 1210 if (!caps) 1211 die("can't get process caps"); 1212 if (cap_clear_flag(caps, CAP_INHERITABLE)) 1213 die("can't clear inheritable caps"); 1214 if (cap_clear_flag(caps, CAP_EFFECTIVE)) 1215 die("can't clear effective caps"); 1216 if (cap_clear_flag(caps, CAP_PERMITTED)) 1217 die("can't clear permitted caps"); 1218 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1219 /* Keep CAP_SETPCAP for dropping bounding set bits. 
*/ 1220 if (i != CAP_SETPCAP && !(j->caps & (one << i))) 1221 continue; 1222 flag[0] = i; 1223 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET)) 1224 die("can't add effective cap"); 1225 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET)) 1226 die("can't add permitted cap"); 1227 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET)) 1228 die("can't add inheritable cap"); 1229 } 1230 if (cap_set_proc(caps)) 1231 die("can't apply initial cleaned capset"); 1232 1233 /* 1234 * Instead of dropping bounding set first, do it here in case 1235 * the caller had a more permissive bounding set which could 1236 * have been used above to raise a capability that wasn't already 1237 * present. This requires CAP_SETPCAP, so we raised/kept it above. 1238 */ 1239 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) { 1240 if (j->caps & (one << i)) 1241 continue; 1242 if (prctl(PR_CAPBSET_DROP, i)) 1243 pdie("prctl(PR_CAPBSET_DROP)"); 1244 } 1245 1246 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */ 1247 if ((j->caps & (one << CAP_SETPCAP)) == 0) { 1248 flag[0] = CAP_SETPCAP; 1249 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR)) 1250 die("can't clear effective cap"); 1251 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR)) 1252 die("can't clear permitted cap"); 1253 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR)) 1254 die("can't clear inheritable cap"); 1255 } 1256 1257 if (cap_set_proc(caps)) 1258 die("can't apply final cleaned capset"); 1259 1260 cap_free(caps); 1261 } 1262 1263 void set_seccomp_filter(const struct minijail *j) 1264 { 1265 /* 1266 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c> 1267 * in the kernel source tree for an explanation of the parameters. 
1268 */ 1269 if (j->flags.no_new_privs) { 1270 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) 1271 pdie("prctl(PR_SET_NO_NEW_PRIVS)"); 1272 } 1273 1274 /* 1275 * Code running with ASan 1276 * (https://github.com/google/sanitizers/wiki/AddressSanitizer) 1277 * will make system calls not included in the syscall filter policy, 1278 * which will likely crash the program. Skip setting seccomp filter in 1279 * that case. 1280 * 'running_with_asan()' has no inputs and is completely defined at 1281 * build time, so this cannot be used by an attacker to skip setting 1282 * seccomp filter. 1283 */ 1284 if (j->flags.seccomp_filter && running_with_asan()) { 1285 warn("running with ASan, not setting seccomp filter"); 1286 return; 1287 } 1288 1289 /* 1290 * If we're logging seccomp filter failures, 1291 * install the SIGSYS handler first. 1292 */ 1293 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) { 1294 if (install_sigsys_handler()) 1295 pdie("install SIGSYS handler"); 1296 warn("logging seccomp filter failures"); 1297 } 1298 1299 /* 1300 * Install the syscall filter. 1301 */ 1302 if (j->flags.seccomp_filter) { 1303 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, 1304 j->filter_prog)) { 1305 if ((errno == EINVAL) && can_softfail()) { 1306 warn("seccomp not supported"); 1307 return; 1308 } 1309 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)"); 1310 } 1311 } 1312 } 1313 1314 void API minijail_enter(const struct minijail *j) 1315 { 1316 /* 1317 * If we're dropping caps, get the last valid cap from /proc now, 1318 * since /proc can be unmounted before drop_caps() is called. 
1319 */ 1320 unsigned int last_valid_cap = 0; 1321 if (j->flags.caps) 1322 last_valid_cap = get_last_valid_cap(); 1323 1324 if (j->flags.pids) 1325 die("tried to enter a pid-namespaced jail;" 1326 " try minijail_run()?"); 1327 1328 if (j->flags.usergroups && !j->user) 1329 die("usergroup inheritance without username"); 1330 1331 /* 1332 * We can't recover from failures if we've dropped privileges partially, 1333 * so we don't even try. If any of our operations fail, we abort() the 1334 * entire process. 1335 */ 1336 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS)) 1337 pdie("setns(CLONE_NEWNS)"); 1338 1339 if (j->flags.vfs) { 1340 if (unshare(CLONE_NEWNS)) 1341 pdie("unshare(vfs)"); 1342 /* 1343 * Remount all filesystems as private. If they are shared 1344 * new bind mounts will creep out of our namespace. 1345 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt 1346 */ 1347 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) 1348 pdie("mount(/, private)"); 1349 } 1350 1351 if (j->flags.ipc && unshare(CLONE_NEWIPC)) { 1352 pdie("unshare(ipc)"); 1353 } 1354 1355 if (j->flags.enter_net) { 1356 if (setns(j->netns_fd, CLONE_NEWNET)) 1357 pdie("setns(CLONE_NEWNET)"); 1358 } else if (j->flags.net && unshare(CLONE_NEWNET)) { 1359 pdie("unshare(net)"); 1360 } 1361 1362 if (j->flags.chroot && enter_chroot(j)) 1363 pdie("chroot"); 1364 1365 if (j->flags.pivot_root && enter_pivot_root(j)) 1366 pdie("pivot_root"); 1367 1368 if (j->flags.mount_tmp && mount_tmp()) 1369 pdie("mount_tmp"); 1370 1371 if (j->flags.remount_proc_ro && remount_proc_readonly(j)) 1372 pdie("remount"); 1373 1374 if (j->flags.caps) { 1375 /* 1376 * POSIX capabilities are a bit tricky. If we drop our 1377 * capability to change uids, our attempt to use setuid() 1378 * below will fail. Hang on to root caps across setuid(), then 1379 * lock securebits. 
1380 */ 1381 if (prctl(PR_SET_KEEPCAPS, 1)) 1382 pdie("prctl(PR_SET_KEEPCAPS)"); 1383 if (prctl 1384 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS)) 1385 pdie("prctl(PR_SET_SECUREBITS)"); 1386 } 1387 1388 /* 1389 * If we're setting no_new_privs, we can drop privileges 1390 * before setting seccomp filter. This way filter policies 1391 * don't need to allow privilege-dropping syscalls. 1392 */ 1393 if (j->flags.no_new_privs) { 1394 drop_ugid(j); 1395 if (j->flags.caps) 1396 drop_caps(j, last_valid_cap); 1397 1398 set_seccomp_filter(j); 1399 } else { 1400 /* 1401 * If we're not setting no_new_privs, 1402 * we need to set seccomp filter *before* dropping privileges. 1403 * WARNING: this means that filter policies *must* allow 1404 * setgroups()/setresgid()/setresuid() for dropping root and 1405 * capget()/capset()/prctl() for dropping caps. 1406 */ 1407 set_seccomp_filter(j); 1408 1409 drop_ugid(j); 1410 if (j->flags.caps) 1411 drop_caps(j, last_valid_cap); 1412 } 1413 1414 /* 1415 * Select the specified alternate syscall table. The table must not 1416 * block prctl(2) if we're using seccomp as well. 1417 */ 1418 if (j->flags.alt_syscall) { 1419 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table)) 1420 pdie("prctl(PR_ALT_SYSCALL)"); 1421 } 1422 1423 /* 1424 * seccomp has to come last since it cuts off all the other 1425 * privilege-dropping syscalls :) 1426 */ 1427 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) { 1428 if ((errno == EINVAL) && can_softfail()) { 1429 warn("seccomp not supported"); 1430 return; 1431 } 1432 pdie("prctl(PR_SET_SECCOMP)"); 1433 } 1434 } 1435 1436 /* TODO(wad) will visibility affect this variable? 
*/ 1437 static int init_exitstatus = 0; 1438 1439 void init_term(int __attribute__ ((unused)) sig) 1440 { 1441 _exit(init_exitstatus); 1442 } 1443 1444 int init(pid_t rootpid) 1445 { 1446 pid_t pid; 1447 int status; 1448 /* so that we exit with the right status */ 1449 signal(SIGTERM, init_term); 1450 /* TODO(wad) self jail with seccomp_filters here. */ 1451 while ((pid = wait(&status)) > 0) { 1452 /* 1453 * This loop will only end when either there are no processes 1454 * left inside our pid namespace or we get a signal. 1455 */ 1456 if (pid == rootpid) 1457 init_exitstatus = status; 1458 } 1459 if (!WIFEXITED(init_exitstatus)) 1460 _exit(MINIJAIL_ERR_INIT); 1461 _exit(WEXITSTATUS(init_exitstatus)); 1462 } 1463 1464 int API minijail_from_fd(int fd, struct minijail *j) 1465 { 1466 size_t sz = 0; 1467 size_t bytes = read(fd, &sz, sizeof(sz)); 1468 char *buf; 1469 int r; 1470 if (sizeof(sz) != bytes) 1471 return -EINVAL; 1472 if (sz > USHRT_MAX) /* arbitrary sanity check */ 1473 return -E2BIG; 1474 buf = malloc(sz); 1475 if (!buf) 1476 return -ENOMEM; 1477 bytes = read(fd, buf, sz); 1478 if (bytes != sz) { 1479 free(buf); 1480 return -EINVAL; 1481 } 1482 r = minijail_unmarshal(j, buf, sz); 1483 free(buf); 1484 return r; 1485 } 1486 1487 int API minijail_to_fd(struct minijail *j, int fd) 1488 { 1489 char *buf; 1490 size_t sz = minijail_size(j); 1491 ssize_t written; 1492 int r; 1493 1494 if (!sz) 1495 return -EINVAL; 1496 buf = malloc(sz); 1497 r = minijail_marshal(j, buf, sz); 1498 if (r) { 1499 free(buf); 1500 return r; 1501 } 1502 /* Sends [size][minijail]. */ 1503 written = write(fd, &sz, sizeof(sz)); 1504 if (written != sizeof(sz)) { 1505 free(buf); 1506 return -EFAULT; 1507 } 1508 written = write(fd, buf, sz); 1509 if (written < 0 || (size_t) written != sz) { 1510 free(buf); 1511 return -EFAULT; 1512 } 1513 free(buf); 1514 return 0; 1515 } 1516 1517 int setup_preload(void) 1518 { 1519 #if defined(__ANDROID__) 1520 /* Don't use LDPRELOAD on Brillo. 
*/ 1521 return 0; 1522 #else 1523 char *oldenv = getenv(kLdPreloadEnvVar) ? : ""; 1524 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH)); 1525 if (!newenv) 1526 return -ENOMEM; 1527 1528 /* Only insert a separating space if we have something to separate... */ 1529 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", 1530 PRELOADPATH); 1531 1532 /* setenv() makes a copy of the string we give it. */ 1533 setenv(kLdPreloadEnvVar, newenv, 1); 1534 free(newenv); 1535 return 0; 1536 #endif 1537 } 1538 1539 int setup_pipe(int fds[2]) 1540 { 1541 int r = pipe(fds); 1542 char fd_buf[11]; 1543 if (r) 1544 return r; 1545 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]); 1546 if (r <= 0) 1547 return -EINVAL; 1548 setenv(kFdEnvVar, fd_buf, 1); 1549 return 0; 1550 } 1551 1552 int setup_pipe_end(int fds[2], size_t index) 1553 { 1554 if (index > 1) 1555 return -1; 1556 1557 close(fds[1 - index]); 1558 return fds[index]; 1559 } 1560 1561 int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd) 1562 { 1563 if (index > 1) 1564 return -1; 1565 1566 close(fds[1 - index]); 1567 /* dup2(2) the corresponding end of the pipe into |fd|. 
*/ 1568 return dup2(fds[index], fd); 1569 } 1570 1571 int minijail_run_internal(struct minijail *j, const char *filename, 1572 char *const argv[], pid_t *pchild_pid, 1573 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1574 int use_preload); 1575 1576 int API minijail_run(struct minijail *j, const char *filename, 1577 char *const argv[]) 1578 { 1579 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1580 true); 1581 } 1582 1583 int API minijail_run_pid(struct minijail *j, const char *filename, 1584 char *const argv[], pid_t *pchild_pid) 1585 { 1586 return minijail_run_internal(j, filename, argv, pchild_pid, 1587 NULL, NULL, NULL, true); 1588 } 1589 1590 int API minijail_run_pipe(struct minijail *j, const char *filename, 1591 char *const argv[], int *pstdin_fd) 1592 { 1593 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd, 1594 NULL, NULL, true); 1595 } 1596 1597 int API minijail_run_pid_pipes(struct minijail *j, const char *filename, 1598 char *const argv[], pid_t *pchild_pid, 1599 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) 1600 { 1601 return minijail_run_internal(j, filename, argv, pchild_pid, 1602 pstdin_fd, pstdout_fd, pstderr_fd, true); 1603 } 1604 1605 int API minijail_run_no_preload(struct minijail *j, const char *filename, 1606 char *const argv[]) 1607 { 1608 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL, 1609 false); 1610 } 1611 1612 int API minijail_run_pid_pipes_no_preload(struct minijail *j, 1613 const char *filename, 1614 char *const argv[], 1615 pid_t *pchild_pid, 1616 int *pstdin_fd, int *pstdout_fd, 1617 int *pstderr_fd) { 1618 return minijail_run_internal(j, filename, argv, pchild_pid, 1619 pstdin_fd, pstdout_fd, pstderr_fd, false); 1620 } 1621 1622 int minijail_run_internal(struct minijail *j, const char *filename, 1623 char *const argv[], pid_t *pchild_pid, 1624 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd, 1625 int use_preload) 1626 { 1627 char *oldenv, *oldenv_copy = NULL; 
1628 pid_t child_pid; 1629 int pipe_fds[2]; 1630 int stdin_fds[2]; 1631 int stdout_fds[2]; 1632 int stderr_fds[2]; 1633 int child_sync_pipe_fds[2]; 1634 int sync_child = 0; 1635 int ret; 1636 /* We need to remember this across the minijail_preexec() call. */ 1637 int pid_namespace = j->flags.pids; 1638 int do_init = j->flags.do_init; 1639 1640 if (use_preload) { 1641 oldenv = getenv(kLdPreloadEnvVar); 1642 if (oldenv) { 1643 oldenv_copy = strdup(oldenv); 1644 if (!oldenv_copy) 1645 return -ENOMEM; 1646 } 1647 1648 if (setup_preload()) 1649 return -EFAULT; 1650 } 1651 1652 if (!use_preload) { 1653 if (j->flags.caps) 1654 die("capabilities are not supported without " 1655 "LD_PRELOAD"); 1656 } 1657 1658 /* 1659 * Make the process group ID of this process equal to its PID, so that 1660 * both the Minijail process and the jailed process can be killed 1661 * together. 1662 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when 1663 * the process is already a process group leader. 1664 */ 1665 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) { 1666 if (errno != EPERM) { 1667 pdie("setpgid(0, 0)"); 1668 } 1669 } 1670 1671 if (use_preload) { 1672 /* 1673 * Before we fork(2) and execve(2) the child process, we need 1674 * to open a pipe(2) to send the minijail configuration over. 1675 */ 1676 if (setup_pipe(pipe_fds)) 1677 return -EFAULT; 1678 } 1679 1680 /* 1681 * If we want to write to the child process' standard input, 1682 * create the pipe(2) now. 1683 */ 1684 if (pstdin_fd) { 1685 if (pipe(stdin_fds)) 1686 return -EFAULT; 1687 } 1688 1689 /* 1690 * If we want to read from the child process' standard output, 1691 * create the pipe(2) now. 1692 */ 1693 if (pstdout_fd) { 1694 if (pipe(stdout_fds)) 1695 return -EFAULT; 1696 } 1697 1698 /* 1699 * If we want to read from the child process' standard error, 1700 * create the pipe(2) now. 
1701 */ 1702 if (pstderr_fd) { 1703 if (pipe(stderr_fds)) 1704 return -EFAULT; 1705 } 1706 1707 /* 1708 * If we want to set up a new uid/gid mapping in the user namespace, 1709 * or if we need to add the child process to cgroups, create the pipe(2) 1710 * to sync between parent and child. 1711 */ 1712 if (j->flags.userns || j->flags.cgroups) { 1713 sync_child = 1; 1714 if (pipe(child_sync_pipe_fds)) 1715 return -EFAULT; 1716 } 1717 1718 /* 1719 * Use sys_clone() if and only if we're creating a pid namespace. 1720 * 1721 * tl;dr: WARNING: do not mix pid namespaces and multithreading. 1722 * 1723 * In multithreaded programs, there are a bunch of locks inside libc, 1724 * some of which may be held by other threads at the time that we call 1725 * minijail_run_pid(). If we call fork(), glibc does its level best to 1726 * ensure that we hold all of these locks before it calls clone() 1727 * internally and drop them after clone() returns, but when we call 1728 * sys_clone(2) directly, all that gets bypassed and we end up with a 1729 * child address space where some of libc's important locks are held by 1730 * other threads (which did not get cloned, and hence will never release 1731 * those locks). This is okay so long as we call exec() immediately 1732 * after, but a bunch of seemingly-innocent libc functions like setenv() 1733 * take locks. 1734 * 1735 * Hence, only call sys_clone() if we need to, in order to get at pid 1736 * namespacing. If we follow this path, the child's address space might 1737 * have broken locks; you may only call functions that do not acquire 1738 * any locks. 1739 * 1740 * Unfortunately, fork() acquires every lock it can get its hands on, as 1741 * previously detailed, so this function is highly likely to deadlock 1742 * later on (see "deadlock here") if we're multithreaded. 
1743 * 1744 * We might hack around this by having the clone()d child (init of the 1745 * pid namespace) return directly, rather than leaving the clone()d 1746 * process hanging around to be init for the new namespace (and having 1747 * its fork()ed child return in turn), but that process would be crippled 1748 * with its libc locks potentially broken. We might try fork()ing in the 1749 * parent before we clone() to ensure that we own all the locks, but 1750 * then we have to have the forked child hanging around consuming 1751 * resources (and possibly having file descriptors / shared memory 1752 * regions / etc attached). We'd need to keep the child around to avoid 1753 * having its children get reparented to init. 1754 * 1755 * TODO(ellyjones): figure out if the "forked child hanging around" 1756 * problem is fixable or not. It would be nice if we worked in this 1757 * case. 1758 */ 1759 if (pid_namespace) { 1760 int clone_flags = CLONE_NEWPID | SIGCHLD; 1761 if (j->flags.userns) 1762 clone_flags |= CLONE_NEWUSER; 1763 child_pid = syscall(SYS_clone, clone_flags, NULL); 1764 } else { 1765 child_pid = fork(); 1766 } 1767 1768 if (child_pid < 0) { 1769 if (use_preload) { 1770 free(oldenv_copy); 1771 } 1772 die("failed to fork child"); 1773 } 1774 1775 if (child_pid) { 1776 if (use_preload) { 1777 /* Restore parent's LD_PRELOAD. */ 1778 if (oldenv_copy) { 1779 setenv(kLdPreloadEnvVar, oldenv_copy, 1); 1780 free(oldenv_copy); 1781 } else { 1782 unsetenv(kLdPreloadEnvVar); 1783 } 1784 unsetenv(kFdEnvVar); 1785 } 1786 1787 j->initpid = child_pid; 1788 1789 if (j->flags.pid_file) 1790 write_pid_file(j); 1791 1792 if (j->flags.cgroups) 1793 add_to_cgroups(j); 1794 1795 if (j->flags.userns) 1796 write_ugid_mappings(j); 1797 1798 if (sync_child) 1799 parent_setup_complete(child_sync_pipe_fds); 1800 1801 if (use_preload) { 1802 /* Send marshalled minijail. 
*/ 1803 close(pipe_fds[0]); /* read endpoint */ 1804 ret = minijail_to_fd(j, pipe_fds[1]); 1805 close(pipe_fds[1]); /* write endpoint */ 1806 if (ret) { 1807 kill(j->initpid, SIGKILL); 1808 die("failed to send marshalled minijail"); 1809 } 1810 } 1811 1812 if (pchild_pid) 1813 *pchild_pid = child_pid; 1814 1815 /* 1816 * If we want to write to the child process' standard input, 1817 * set up the write end of the pipe. 1818 */ 1819 if (pstdin_fd) 1820 *pstdin_fd = setup_pipe_end(stdin_fds, 1821 1 /* write end */); 1822 1823 /* 1824 * If we want to read from the child process' standard output, 1825 * set up the read end of the pipe. 1826 */ 1827 if (pstdout_fd) 1828 *pstdout_fd = setup_pipe_end(stdout_fds, 1829 0 /* read end */); 1830 1831 /* 1832 * If we want to read from the child process' standard error, 1833 * set up the read end of the pipe. 1834 */ 1835 if (pstderr_fd) 1836 *pstderr_fd = setup_pipe_end(stderr_fds, 1837 0 /* read end */); 1838 1839 return 0; 1840 } 1841 free(oldenv_copy); 1842 1843 if (j->flags.reset_signal_mask) { 1844 sigset_t signal_mask; 1845 if (sigemptyset(&signal_mask) != 0) 1846 pdie("sigemptyset failed"); 1847 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0) 1848 pdie("sigprocmask failed"); 1849 } 1850 1851 if (sync_child) 1852 wait_for_parent_setup(child_sync_pipe_fds); 1853 1854 if (j->flags.userns) 1855 enter_user_namespace(j); 1856 1857 /* 1858 * If we want to write to the jailed process' standard input, 1859 * set up the read end of the pipe. 1860 */ 1861 if (pstdin_fd) { 1862 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */, 1863 STDIN_FILENO) < 0) 1864 die("failed to set up stdin pipe"); 1865 } 1866 1867 /* 1868 * If we want to read from the jailed process' standard output, 1869 * set up the write end of the pipe. 
1870 */ 1871 if (pstdout_fd) { 1872 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */, 1873 STDOUT_FILENO) < 0) 1874 die("failed to set up stdout pipe"); 1875 } 1876 1877 /* 1878 * If we want to read from the jailed process' standard error, 1879 * set up the write end of the pipe. 1880 */ 1881 if (pstderr_fd) { 1882 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */, 1883 STDERR_FILENO) < 0) 1884 die("failed to set up stderr pipe"); 1885 } 1886 1887 /* If running an init program, let it decide when/how to mount /proc. */ 1888 if (pid_namespace && !do_init) 1889 j->flags.remount_proc_ro = 0; 1890 1891 if (use_preload) { 1892 /* Strip out flags that cannot be inherited across execve(2). */ 1893 minijail_preexec(j); 1894 } else { 1895 j->flags.pids = 0; 1896 } 1897 /* Jail this process, then execve() the target. */ 1898 minijail_enter(j); 1899 1900 if (pid_namespace && do_init) { 1901 /* 1902 * pid namespace: this process will become init inside the new 1903 * namespace. We don't want all programs we might exec to have 1904 * to know how to be init. Normally (do_init == 1) we fork off 1905 * a child to actually run the program. If |do_init == 0|, we 1906 * let the program keep pid 1 and be init. 1907 * 1908 * If we're multithreaded, we'll probably deadlock here. See 1909 * WARNING above. 
1910 */ 1911 child_pid = fork(); 1912 if (child_pid < 0) 1913 _exit(child_pid); 1914 else if (child_pid > 0) 1915 init(child_pid); /* never returns */ 1916 } 1917 1918 /* 1919 * If we aren't pid-namespaced, or the jailed program asked to be init: 1920 * calling process 1921 * -> execve()-ing process 1922 * If we are: 1923 * calling process 1924 * -> init()-ing process 1925 * -> execve()-ing process 1926 */ 1927 _exit(execve(filename, argv, environ)); 1928 } 1929 1930 int API minijail_kill(struct minijail *j) 1931 { 1932 int st; 1933 if (kill(j->initpid, SIGTERM)) 1934 return -errno; 1935 if (waitpid(j->initpid, &st, 0) < 0) 1936 return -errno; 1937 return st; 1938 } 1939 1940 int API minijail_wait(struct minijail *j) 1941 { 1942 int st; 1943 if (waitpid(j->initpid, &st, 0) < 0) 1944 return -errno; 1945 1946 if (!WIFEXITED(st)) { 1947 int error_status = st; 1948 if (WIFSIGNALED(st)) { 1949 int signum = WTERMSIG(st); 1950 warn("child process %d received signal %d", 1951 j->initpid, signum); 1952 /* 1953 * We return MINIJAIL_ERR_JAIL if the process received 1954 * SIGSYS, which happens when a syscall is blocked by 1955 * seccomp filters. 1956 * If not, we do what bash(1) does: 1957 * $? 
= 128 + signum 1958 */ 1959 if (signum == SIGSYS) { 1960 error_status = MINIJAIL_ERR_JAIL; 1961 } else { 1962 error_status = 128 + signum; 1963 } 1964 } 1965 return error_status; 1966 } 1967 1968 int exit_status = WEXITSTATUS(st); 1969 if (exit_status != 0) 1970 info("child process %d exited with status %d", 1971 j->initpid, exit_status); 1972 1973 return exit_status; 1974 } 1975 1976 void API minijail_destroy(struct minijail *j) 1977 { 1978 size_t i; 1979 1980 if (j->flags.seccomp_filter && j->filter_prog) { 1981 free(j->filter_prog->filter); 1982 free(j->filter_prog); 1983 } 1984 while (j->mounts_head) { 1985 struct mountpoint *m = j->mounts_head; 1986 j->mounts_head = j->mounts_head->next; 1987 free(m->type); 1988 free(m->dest); 1989 free(m->src); 1990 free(m); 1991 } 1992 j->mounts_tail = NULL; 1993 if (j->user) 1994 free(j->user); 1995 if (j->suppl_gid_list) 1996 free(j->suppl_gid_list); 1997 if (j->chrootdir) 1998 free(j->chrootdir); 1999 if (j->alt_syscall_table) 2000 free(j->alt_syscall_table); 2001 for (i = 0; i < j->cgroup_count; ++i) 2002 free(j->cgroups[i]); 2003 free(j); 2004 } 2005