Home | History | Annotate | Download | only in minijail
      1 /* Copyright 2017 The Chromium OS Authors. All rights reserved.
      2  * Use of this source code is governed by a BSD-style license that can be
      3  * found in the LICENSE file.
      4  */
      5 
      6 #include "system.h"
      7 
      8 #include <errno.h>
      9 #include <fcntl.h>
     10 #include <grp.h>
     11 #include <net/if.h>
     12 #include <pwd.h>
     13 #include <stdbool.h>
     14 #include <stdio.h>
     15 #include <string.h>
     16 #include <sys/ioctl.h>
     17 #include <sys/prctl.h>
     18 #include <sys/socket.h>
     19 #include <sys/stat.h>
     20 #include <sys/statvfs.h>
     21 #include <unistd.h>
     22 
     23 #include <linux/securebits.h>
     24 
     25 #include "util.h"
     26 
     27 /*
     28  * SECBIT_NO_CAP_AMBIENT_RAISE was added in kernel 4.3, so fill in the
     29  * definition if the securebits header doesn't provide it.
     30  */
     31 #ifndef SECBIT_NO_CAP_AMBIENT_RAISE
     32 #define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(6))
     33 #endif
     34 
     35 #ifndef SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED
     36 #define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED (issecure_mask(7))
     37 #endif
     38 
     39 /*
     40  * Assert the value of SECURE_ALL_BITS at compile-time.
     41  * Android devices are currently compiled against 4.4 kernel headers. Kernel 4.3
     42  * added a new securebit.
     43  * When a new securebit is added, the new SECURE_ALL_BITS mask will return EPERM
     44  * when used on older kernels. The compile-time assert will catch this situation
     45  * at compile time.
     46  */
     47 #if defined(__ANDROID__)
     48 _Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55.");
     49 #endif
     50 
     51 int secure_noroot_set_and_locked(uint64_t mask)
     52 {
     53 	return (mask & (SECBIT_NOROOT | SECBIT_NOROOT_LOCKED)) ==
     54 	       (SECBIT_NOROOT | SECBIT_NOROOT_LOCKED);
     55 }
     56 
     57 int lock_securebits(uint64_t skip_mask, bool require_keep_caps)
     58 {
     59 	/* The general idea is to set all bits, subject to exceptions below. */
     60 	unsigned long securebits = SECURE_ALL_BITS | SECURE_ALL_LOCKS;
     61 
     62 	/*
     63 	 * SECBIT_KEEP_CAPS is special in that it is automatically cleared on
     64 	 * execve(2). This implies that attempts to set SECBIT_KEEP_CAPS (as is
     65 	 * the default) in processes that have it locked already (such as nested
     66 	 * minijail usage) would fail. Thus, unless the caller requires it,
     67 	 * allow it to remain off if it is already locked.
     68 	 */
     69 	if (!require_keep_caps) {
     70 		int current_securebits = prctl(PR_GET_SECUREBITS);
     71 		if (current_securebits < 0) {
     72 			pwarn("prctl(PR_GET_SECUREBITS) failed");
     73 			return -1;
     74 		}
     75 
     76 		if ((current_securebits & SECBIT_KEEP_CAPS_LOCKED) != 0 &&
     77 		    (current_securebits & SECBIT_KEEP_CAPS) == 0) {
     78 			securebits &= ~SECBIT_KEEP_CAPS;
     79 		}
     80 	}
     81 
     82 	/*
     83 	 * Ambient capabilities can only be raised if they're already present
     84 	 * in the permitted *and* inheritable set. Therefore, we don't really
     85 	 * need to lock the NO_CAP_AMBIENT_RAISE securebit, since we are already
     86 	 * configuring the permitted and inheritable set.
     87 	 */
     88 	securebits &=
     89 	    ~(SECBIT_NO_CAP_AMBIENT_RAISE | SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED);
     90 
     91 	/* Don't set any bits that the user requested not to be touched. */
     92 	securebits &= ~skip_mask;
     93 
     94 	if (!securebits) {
     95 		warn("not locking any securebits");
     96 		return 0;
     97 	}
     98 	int securebits_ret = prctl(PR_SET_SECUREBITS, securebits);
     99 	if (securebits_ret < 0) {
    100 		pwarn("prctl(PR_SET_SECUREBITS) failed");
    101 		return -1;
    102 	}
    103 
    104 	return 0;
    105 }
    106 
    107 int write_proc_file(pid_t pid, const char *content, const char *basename)
    108 {
    109 	int fd, ret;
    110 	size_t sz, len;
    111 	ssize_t written;
    112 	char filename[32];
    113 
    114 	sz = sizeof(filename);
    115 	ret = snprintf(filename, sz, "/proc/%d/%s", pid, basename);
    116 	if (ret < 0 || (size_t)ret >= sz) {
    117 		warn("failed to generate %s filename", basename);
    118 		return -1;
    119 	}
    120 
    121 	fd = open(filename, O_WRONLY | O_CLOEXEC);
    122 	if (fd < 0) {
    123 		pwarn("failed to open '%s'", filename);
    124 		return -errno;
    125 	}
    126 
    127 	len = strlen(content);
    128 	written = write(fd, content, len);
    129 	if (written < 0) {
    130 		pwarn("failed to write '%s'", filename);
    131 		return -errno;
    132 	}
    133 
    134 	if ((size_t)written < len) {
    135 		warn("failed to write %zu bytes to '%s'", len, filename);
    136 		return -1;
    137 	}
    138 	close(fd);
    139 	return 0;
    140 }
    141 
    142 /*
    143  * We specifically do not use cap_valid() as that only tells us the last
    144  * valid cap we were *compiled* against (i.e. what the version of kernel
    145  * headers says). If we run on a different kernel version, then it's not
    146  * uncommon for that to be less (if an older kernel) or more (if a newer
    147  * kernel).
    148  * Normally, we suck up the answer via /proc. On Android, not all processes are
    149  * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
    150  * programmatically find the value by calling prctl(PR_CAPBSET_READ).
    151  */
    152 unsigned int get_last_valid_cap(void)
    153 {
    154 	unsigned int last_valid_cap = 0;
    155 	if (is_android()) {
    156 		for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0;
    157 		     ++last_valid_cap)
    158 			;
    159 
    160 		/* |last_valid_cap| will be the first failing value. */
    161 		if (last_valid_cap > 0) {
    162 			last_valid_cap--;
    163 		}
    164 	} else {
    165 		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
    166 		FILE *fp = fopen(cap_file, "re");
    167 		if (fscanf(fp, "%u", &last_valid_cap) != 1)
    168 			pdie("fscanf(%s)", cap_file);
    169 		fclose(fp);
    170 	}
    171 	return last_valid_cap;
    172 }
    173 
    174 int cap_ambient_supported(void)
    175 {
    176 	return prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN, 0, 0) >=
    177 	       0;
    178 }
    179 
    180 int config_net_loopback(void)
    181 {
    182 	const char ifname[] = "lo";
    183 	int sock;
    184 	struct ifreq ifr;
    185 
    186 	/* Make sure people don't try to add really long names. */
    187 	_Static_assert(sizeof(ifname) <= IFNAMSIZ, "interface name too long");
    188 
    189 	sock = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
    190 	if (sock < 0) {
    191 		pwarn("socket(AF_LOCAL) failed");
    192 		return -1;
    193 	}
    194 
    195 	/*
    196 	 * Do the equiv of `ip link set up lo`.  The kernel will assign
    197 	 * IPv4 (127.0.0.1) & IPv6 (::1) addresses automatically!
    198 	 */
    199 	strcpy(ifr.ifr_name, ifname);
    200 	if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0) {
    201 		pwarn("ioctl(SIOCGIFFLAGS) failed");
    202 		return -1;
    203 	}
    204 
    205 	/* The kernel preserves ifr.ifr_name for use. */
    206 	ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
    207 	if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) {
    208 		pwarn("ioctl(SIOCSIFFLAGS) failed");
    209 		return -1;
    210 	}
    211 
    212 	close(sock);
    213 	return 0;
    214 }
    215 
    216 int setup_pipe_end(int fds[2], size_t index)
    217 {
    218 	if (index > 1)
    219 		return -1;
    220 
    221 	close(fds[1 - index]);
    222 	return fds[index];
    223 }
    224 
    225 int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
    226 {
    227 	if (index > 1)
    228 		return -1;
    229 
    230 	close(fds[1 - index]);
    231 	/* dup2(2) the corresponding end of the pipe into |fd|. */
    232 	return dup2(fds[index], fd);
    233 }
    234 
    235 int write_pid_to_path(pid_t pid, const char *path)
    236 {
    237 	FILE *fp = fopen(path, "we");
    238 
    239 	if (!fp) {
    240 		pwarn("failed to open '%s'", path);
    241 		return -errno;
    242 	}
    243 	if (fprintf(fp, "%d\n", (int)pid) < 0) {
    244 		/* fprintf(3) does not set errno on failure. */
    245 		warn("fprintf(%s) failed", path);
    246 		return -1;
    247 	}
    248 	if (fclose(fp)) {
    249 		pwarn("fclose(%s) failed", path);
    250 		return -errno;
    251 	}
    252 
    253 	return 0;
    254 }
    255 
    256 /*
    257  * Create the |path| directory and its parents (if need be) with |mode|.
    258  * If not |isdir|, then |path| is actually a file, so the last component
    259  * will not be created.
    260  */
    261 int mkdir_p(const char *path, mode_t mode, bool isdir)
    262 {
    263 	int rc;
    264 	char *dir = strdup(path);
    265 	if (!dir) {
    266 		rc = errno;
    267 		pwarn("strdup(%s) failed", path);
    268 		return -rc;
    269 	}
    270 
    271 	/* Starting from the root, work our way out to the end. */
    272 	char *p = strchr(dir + 1, '/');
    273 	while (p) {
    274 		*p = '\0';
    275 		if (mkdir(dir, mode) && errno != EEXIST) {
    276 			rc = errno;
    277 			pwarn("mkdir(%s, 0%o) failed", dir, mode);
    278 			free(dir);
    279 			return -rc;
    280 		}
    281 		*p = '/';
    282 		p = strchr(p + 1, '/');
    283 	}
    284 
    285 	/*
    286 	 * Create the last directory.  We still check EEXIST here in case
    287 	 * of trailing slashes.
    288 	 */
    289 	free(dir);
    290 	if (isdir && mkdir(path, mode) && errno != EEXIST) {
    291 		rc = errno;
    292 		pwarn("mkdir(%s, 0%o) failed", path, mode);
    293 		return -rc;
    294 	}
    295 	return 0;
    296 }
    297 
    298 /*
    299  * setup_mount_destination: Ensures the mount target exists.
    300  * Creates it if needed and possible.
    301  */
    302 int setup_mount_destination(const char *source, const char *dest, uid_t uid,
    303 			    uid_t gid, bool bind, unsigned long *mnt_flags)
    304 {
    305 	int rc;
    306 	struct stat st_buf;
    307 	bool domkdir;
    308 
    309 	rc = stat(dest, &st_buf);
    310 	if (rc == 0) /* destination exists */
    311 		return 0;
    312 
    313 	/*
    314 	 * Try to create the destination.
    315 	 * Either make a directory or touch a file depending on the source type.
    316 	 *
    317 	 * If the source isn't an absolute path, assume it is a filesystem type
    318 	 * such as "tmpfs" and create a directory to mount it on.  The dest will
    319 	 * be something like "none" or "proc" which we shouldn't be checking.
    320 	 */
    321 	if (source[0] == '/') {
    322 		/* The source is an absolute path -- it better exist! */
    323 		rc = stat(source, &st_buf);
    324 		if (rc) {
    325 			rc = errno;
    326 			pwarn("stat(%s) failed", source);
    327 			return -rc;
    328 		}
    329 
    330 		/*
    331 		 * If bind mounting, we only create a directory if the source
    332 		 * is a directory, else we always bind mount it as a file to
    333 		 * support device nodes, sockets, etc...
    334 		 *
    335 		 * For all other mounts, we assume a block/char source is
    336 		 * going to want a directory to mount to.  If the source is
    337 		 * something else (e.g. a fifo or socket), this probably will
    338 		 * not do the right thing, but we'll fail later on when we try
    339 		 * to mount(), so shouldn't be a big deal.
    340 		 */
    341 		domkdir = S_ISDIR(st_buf.st_mode) ||
    342 			  (!bind && (S_ISBLK(st_buf.st_mode) ||
    343 				     S_ISCHR(st_buf.st_mode)));
    344 
    345 		/* If bind mounting, also grab the mount flags of the source. */
    346 		if (bind && mnt_flags) {
    347 			struct statvfs stvfs_buf;
    348 			rc = statvfs(source, &stvfs_buf);
    349 			if (rc) {
    350 				rc = errno;
    351 				pwarn(
    352 				    "failed to look up mount flags: source=%s",
    353 				    source);
    354 				return -rc;
    355 			}
    356 			*mnt_flags = stvfs_buf.f_flag;
    357 		}
    358 	} else {
    359 		/* The source is a relative path -- assume it's a pseudo fs. */
    360 
    361 		/* Disallow relative bind mounts. */
    362 		if (bind) {
    363 			warn("relative bind-mounts are not allowed: source=%s",
    364 			     source);
    365 			return -EINVAL;
    366 		}
    367 
    368 		domkdir = true;
    369 	}
    370 
    371 	/*
    372 	 * Now that we know what we want to do, do it!
    373 	 * We always create the intermediate dirs and the final path with 0755
    374 	 * perms and root/root ownership.  This shouldn't be a problem because
    375 	 * the actual mount will set those perms/ownership on the mount point
    376 	 * which is all people should need to access it.
    377 	 */
    378 	rc = mkdir_p(dest, 0755, domkdir);
    379 	if (rc)
    380 		return rc;
    381 	if (!domkdir) {
    382 		int fd = open(dest, O_RDWR | O_CREAT | O_CLOEXEC, 0700);
    383 		if (fd < 0) {
    384 			rc = errno;
    385 			pwarn("open(%s) failed", dest);
    386 			return -rc;
    387 		}
    388 		close(fd);
    389 	}
    390 	if (chown(dest, uid, gid)) {
    391 		rc = errno;
    392 		pwarn("chown(%s, %u, %u) failed", dest, uid, gid);
    393 		return -rc;
    394 	}
    395 	return 0;
    396 }
    397 
    398 /*
    399  * lookup_user: Gets the uid/gid for the given username.
    400  */
    401 int lookup_user(const char *user, uid_t *uid, gid_t *gid)
    402 {
    403 	char *buf = NULL;
    404 	struct passwd pw;
    405 	struct passwd *ppw = NULL;
    406 	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
    407 	if (sz == -1)
    408 		sz = 65536; /* your guess is as good as mine... */
    409 
    410 	/*
    411 	 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
    412 	 * the maximum needed size of the buffer, so we don't have to search.
    413 	 */
    414 	buf = malloc(sz);
    415 	if (!buf)
    416 		return -ENOMEM;
    417 	getpwnam_r(user, &pw, buf, sz, &ppw);
    418 	/*
    419 	 * We're safe to free the buffer here. The strings inside |pw| point
    420 	 * inside |buf|, but we don't use any of them; this leaves the pointers
    421 	 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3)
    422 	 * succeeded.
    423 	 */
    424 	free(buf);
    425 	/* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
    426 	if (!ppw)
    427 		return -1;
    428 
    429 	*uid = ppw->pw_uid;
    430 	*gid = ppw->pw_gid;
    431 	return 0;
    432 }
    433 
    434 /*
    435  * lookup_group: Gets the gid for the given group name.
    436  */
    437 int lookup_group(const char *group, gid_t *gid)
    438 {
    439 	char *buf = NULL;
    440 	struct group gr;
    441 	struct group *pgr = NULL;
    442 	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
    443 	if (sz == -1)
    444 		sz = 65536; /* and mine is as good as yours, really */
    445 
    446 	/*
    447 	 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
    448 	 * the maximum needed size of the buffer, so we don't have to search.
    449 	 */
    450 	buf = malloc(sz);
    451 	if (!buf)
    452 		return -ENOMEM;
    453 	getgrnam_r(group, &gr, buf, sz, &pgr);
    454 	/*
    455 	 * We're safe to free the buffer here. The strings inside gr point
    456 	 * inside buf, but we don't use any of them; this leaves the pointers
    457 	 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
    458 	 */
    459 	free(buf);
    460 	/* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
    461 	if (!pgr)
    462 		return -1;
    463 
    464 	*gid = pgr->gr_gid;
    465 	return 0;
    466 }
    467