Home | History | Annotate | Download | only in executor
      1 // Copyright 2016 syzkaller project authors. All rights reserved.
      2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
      3 
      4 // This file is shared between executor and csource package.
      5 
      6 #include <stdlib.h>
      7 #include <sys/syscall.h>
      8 #include <sys/types.h>
      9 #include <unistd.h>
     10 
     11 #if SYZ_EXECUTOR
     12 struct cover_t;
     13 static void cover_reset(cover_t* cov);
     14 #endif
     15 
     16 #if SYZ_EXECUTOR || SYZ_THREADED
     17 #include <linux/futex.h>
     18 #include <pthread.h>
     19 
// Simple set/wait event shared between threads.
// state is 0 while the event is unset and 1 after event_set; its address is also
// used as the futex word that waiters sleep on.
typedef struct {
	int state;
} event_t;
     23 
// Initializes the event to the unset state (state == 0) using a plain store.
static void event_init(event_t* ev)
{
	ev->state = 0;
}
     28 
// Returns the event to the unset state (state == 0) so that it can be set again;
// event_set treats setting an already-set event as a fatal error.
static void event_reset(event_t* ev)
{
	ev->state = 0;
}
     33 
     34 static void event_set(event_t* ev)
     35 {
     36 	if (ev->state)
     37 		fail("event already set");
     38 	__atomic_store_n(&ev->state, 1, __ATOMIC_RELEASE);
     39 	syscall(SYS_futex, &ev->state, FUTEX_WAKE);
     40 }
     41 
     42 static void event_wait(event_t* ev)
     43 {
     44 	while (!__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
     45 		syscall(SYS_futex, &ev->state, FUTEX_WAIT, 0, 0);
     46 }
     47 
     48 static int event_isset(event_t* ev)
     49 {
     50 	return __atomic_load_n(&ev->state, __ATOMIC_ACQUIRE);
     51 }
     52 
     53 static int event_timedwait(event_t* ev, uint64 timeout)
     54 {
     55 	uint64 start = current_time_ms();
     56 	uint64 now = start;
     57 	for (;;) {
     58 		uint64 remain = timeout - (now - start);
     59 		struct timespec ts;
     60 		ts.tv_sec = remain / 1000;
     61 		ts.tv_nsec = (remain % 1000) * 1000 * 1000;
     62 		syscall(SYS_futex, &ev->state, FUTEX_WAIT, 0, &ts);
     63 		if (__atomic_load_n(&ev->state, __ATOMIC_RELAXED))
     64 			return 1;
     65 		now = current_time_ms();
     66 		if (now - start > timeout)
     67 			return 0;
     68 	}
     69 }
     70 #endif
     71 
     72 #if SYZ_EXECUTOR || SYZ_TUN_ENABLE || SYZ_ENABLE_NETDEV
     73 #include <stdarg.h>
     74 #include <stdbool.h>
     75 #include <string.h>
     76 
     77 static void vsnprintf_check(char* str, size_t size, const char* format, va_list args)
     78 {
     79 	int rv;
     80 
     81 	rv = vsnprintf(str, size, format, args);
     82 	if (rv < 0)
     83 		fail("tun: snprintf failed");
     84 	if ((size_t)rv >= size)
     85 		fail("tun: string '%s...' doesn't fit into buffer", str);
     86 }
     87 
     88 #define COMMAND_MAX_LEN 128
     89 #define PATH_PREFIX "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin "
     90 #define PATH_PREFIX_LEN (sizeof(PATH_PREFIX) - 1)
     91 
     92 static void execute_command(bool panic, const char* format, ...)
     93 {
     94 	va_list args;
     95 	char command[PATH_PREFIX_LEN + COMMAND_MAX_LEN];
     96 	int rv;
     97 
     98 	va_start(args, format);
     99 	// Executor process does not have any env, including PATH.
    100 	// On some distributions, system/shell adds a minimal PATH, on some it does not.
    101 	// Set own standard PATH to make it work across distributions.
    102 	memcpy(command, PATH_PREFIX, PATH_PREFIX_LEN);
    103 	vsnprintf_check(command + PATH_PREFIX_LEN, COMMAND_MAX_LEN, format, args);
    104 	va_end(args);
    105 	rv = system(command);
    106 	if (rv) {
    107 		if (panic)
    108 			fail("command '%s' failed: %d", &command[0], rv);
    109 		debug("command '%s': %d\n", &command[0], rv);
    110 	}
    111 }
    112 #endif
    113 
    114 #if SYZ_EXECUTOR || SYZ_TUN_ENABLE
    115 #include <arpa/inet.h>
    116 #include <errno.h>
    117 #include <fcntl.h>
    118 #include <linux/if.h>
    119 #include <linux/if_ether.h>
    120 #include <linux/if_tun.h>
    121 #include <linux/ip.h>
    122 #include <linux/tcp.h>
    123 #include <net/if_arp.h>
    124 #include <stdarg.h>
    125 #include <stdbool.h>
    126 #include <sys/ioctl.h>
    127 #include <sys/stat.h>
    128 
    129 static int tunfd = -1;
    130 static int tun_frags_enabled;
    131 
    132 // We just need this to be large enough to hold headers that we parse (ethernet/ip/tcp).
    133 // Rest of the packet (if any) will be silently truncated which is fine.
    134 #define SYZ_TUN_MAX_PACKET_SIZE 1000
    135 
    136 #define TUN_IFACE "syz_tun"
    137 
    138 #define LOCAL_MAC "aa:aa:aa:aa:aa:aa"
    139 #define REMOTE_MAC "aa:aa:aa:aa:aa:bb"
    140 
    141 #define LOCAL_IPV4 "172.20.20.170"
    142 #define REMOTE_IPV4 "172.20.20.187"
    143 
    144 #define LOCAL_IPV6 "fe80::aa"
    145 #define REMOTE_IPV6 "fe80::bb"
    146 
    147 #ifndef IFF_NAPI
    148 #define IFF_NAPI 0x0010
    149 #endif
    150 #ifndef IFF_NAPI_FRAGS
    151 #define IFF_NAPI_FRAGS 0x0020
    152 #endif
    153 
// Opens /dev/net/tun, moves the descriptor to a fixed high fd number and configures
// the TUN_IFACE tap device (MAC, IPv4/IPv6 addresses, permanent neighbour entries)
// by running sysctl/ip commands. On success tunfd holds the remapped descriptor.
// In a non-executor build a missing tun device is only reported and tunfd stays -1;
// in the executor it is fatal.
static void initialize_tun(void)
{
#if SYZ_EXECUTOR
	if (!flag_enable_tun)
		return;
#endif
	tunfd = open("/dev/net/tun", O_RDWR | O_NONBLOCK);
	if (tunfd == -1) {
#if SYZ_EXECUTOR
		fail("tun: can't open /dev/net/tun\n");
#else
		printf("tun: can't open /dev/net/tun: please enable CONFIG_TUN=y\n");
		printf("otherwise fuzzing or reproducing might not work as intended\n");
		return;
#endif
	}
	// Remap tun onto higher fd number to hide it from fuzzer and to keep
	// fd numbers stable regardless of whether tun is opened or not (also see kMaxFd).
	const int kTunFd = 240;
	if (dup2(tunfd, kTunFd) < 0)
		fail("dup2(tunfd, kTunFd) failed");
	close(tunfd);
	tunfd = kTunFd;

	// Attach the descriptor to the tap interface, first asking for NAPI with fragments.
	struct ifreq ifr;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, TUN_IFACE, IFNAMSIZ);
	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_NAPI | IFF_NAPI_FRAGS;
	if (ioctl(tunfd, TUNSETIFF, (void*)&ifr) < 0) {
		// IFF_NAPI_FRAGS requires root, so try without it.
		ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
		if (ioctl(tunfd, TUNSETIFF, (void*)&ifr) < 0)
			fail("tun: ioctl(TUNSETIFF) failed");
	}
	// If IFF_NAPI_FRAGS is not supported it will be silently dropped,
	// so query the effective flags.
	if (ioctl(tunfd, TUNGETIFF, (void*)&ifr) < 0)
		fail("tun: ioctl(TUNGETIFF) failed");
	tun_frags_enabled = (ifr.ifr_flags & IFF_NAPI_FRAGS) != 0;
	debug("tun_frags_enabled=%d\n", tun_frags_enabled);

	// Disable IPv6 DAD, otherwise the address remains unusable until DAD completes.
	// Don't panic because this is an optional config.
	execute_command(0, "sysctl -w net.ipv6.conf.%s.accept_dad=0", TUN_IFACE);

	// Disable IPv6 router solicitation to prevent IPv6 spam.
	// Don't panic because this is an optional config.
	execute_command(0, "sysctl -w net.ipv6.conf.%s.router_solicitations=0", TUN_IFACE);
	// There seems to be no way to disable IPv6 MTD to prevent more IPv6 spam.

	// The first argument of execute_command is the panic flag: 1 makes a failing command fatal.
	execute_command(1, "ip link set dev %s address %s", TUN_IFACE, LOCAL_MAC);
	execute_command(1, "ip addr add %s/24 dev %s", LOCAL_IPV4, TUN_IFACE);
	execute_command(1, "ip neigh add %s lladdr %s dev %s nud permanent",
			REMOTE_IPV4, REMOTE_MAC, TUN_IFACE);
	// Don't panic because ipv6 may be not enabled in kernel.
	execute_command(0, "ip -6 addr add %s/120 dev %s", LOCAL_IPV6, TUN_IFACE);
	execute_command(0, "ip -6 neigh add %s lladdr %s dev %s nud permanent",
			REMOTE_IPV6, REMOTE_MAC, TUN_IFACE);
	execute_command(1, "ip link set dev %s up", TUN_IFACE);
}
    214 #endif
    215 
    216 #if SYZ_EXECUTOR || SYZ_ENABLE_NETDEV
    217 #include <arpa/inet.h>
    218 #include <errno.h>
    219 #include <fcntl.h>
    220 #include <linux/if.h>
    221 #include <linux/if_ether.h>
    222 #include <linux/if_tun.h>
    223 #include <linux/ip.h>
    224 #include <linux/tcp.h>
    225 #include <net/if_arp.h>
    226 #include <stdarg.h>
    227 #include <stdbool.h>
    228 #include <sys/ioctl.h>
    229 #include <sys/stat.h>
    230 #include <sys/uio.h>
    231 
    232 // Addresses are chosen to be in the same subnet as tun addresses.
    233 #define DEV_IPV4 "172.20.20.%d"
    234 #define DEV_IPV6 "fe80::%02hx"
    235 #define DEV_MAC "aa:aa:aa:aa:aa:%02hx"
    236 
    237 static void snprintf_check(char* str, size_t size, const char* format, ...)
    238 {
    239 	va_list args;
    240 
    241 	va_start(args, format);
    242 	vsnprintf_check(str, size, format, args);
    243 	va_end(args);
    244 }
    245 
// We test in a separate namespace, which does not have any network devices initially (even lo).
// Create and bring up as many as we can.
// Every command is run with the panic flag cleared, so individual failures
// (e.g. a device type missing from the kernel) are logged and skipped.
static void initialize_netdevices(void)
{
#if SYZ_EXECUTOR
	if (!flag_enable_net_dev)
		return;
#endif
	unsigned i;
	// Device types created explicitly below as "<type>0".
	const char* devtypes[] = {"ip6gretap", "bridge", "vcan", "bond", "team"};
	// If you extend this array, also update netdev_addr_id in vnet.txt.
	const char* devnames[] = {"lo", "sit0", "bridge0", "vcan0", "tunl0",
				  "gre0", "gretap0", "ip_vti0", "ip6_vti0",
				  "ip6tnl0", "ip6gre0", "ip6gretap0",
				  "erspan0", "bond0", "veth0", "veth1", "team0",
				  "veth0_to_bridge", "veth1_to_bridge",
				  "veth0_to_bond", "veth1_to_bond",
				  "veth0_to_team", "veth1_to_team"};
	// Master device kinds that get two veth-based slave devices each.
	const char* devmasters[] = {"bridge", "bond", "team"};

	for (i = 0; i < sizeof(devtypes) / (sizeof(devtypes[0])); i++)
		execute_command(0, "ip link add dev %s0 type %s", devtypes[i], devtypes[i]);
	// This adds connected veth0 and veth1 devices.
	execute_command(0, "ip link add type veth");

	// This creates connected bridge/bond/team_slave devices of type veth,
	// and makes them slaves of bridge/bond/team devices, respectively.
	// Note: slave devices don't need MAC/IP addresses, only master devices do.
	//       The veth0_to_* peers are not slave devices, so they still need IP addresses.
	for (i = 0; i < sizeof(devmasters) / (sizeof(devmasters[0])); i++) {
		execute_command(0, "ip link add name %s_slave_0 type veth peer name veth0_to_%s", devmasters[i], devmasters[i]);
		execute_command(0, "ip link add name %s_slave_1 type veth peer name veth1_to_%s", devmasters[i], devmasters[i]);
		execute_command(0, "ip link set %s_slave_0 master %s0", devmasters[i], devmasters[i]);
		execute_command(0, "ip link set %s_slave_1 master %s0", devmasters[i], devmasters[i]);
		execute_command(0, "ip link set veth0_to_%s up", devmasters[i]);
		execute_command(0, "ip link set veth1_to_%s up", devmasters[i]);
	}
	// bond/team_slave_* are brought up automatically when their master is set.
	// But bridge_slave_* need to be brought up manually.
	execute_command(0, "ip link set bridge_slave_0 up");
	execute_command(0, "ip link set bridge_slave_1 up");

	for (i = 0; i < sizeof(devnames) / (sizeof(devnames[0])); i++) {
		// Scratch buffer reused for the IPv4, IPv6 and MAC address strings in turn.
		char addr[32];
		// Assign some unique address to devices. Some devices won't come up without this.
		// Devices that don't need these addresses will simply ignore them.
		// Shift addresses by 10 because 0 subnet address can mean special things.
		snprintf_check(addr, sizeof(addr), DEV_IPV4, i + 10);
		execute_command(0, "ip -4 addr add %s/24 dev %s", addr, devnames[i]);
		snprintf_check(addr, sizeof(addr), DEV_IPV6, i + 10);
		execute_command(0, "ip -6 addr add %s/120 dev %s", addr, devnames[i]);
		snprintf_check(addr, sizeof(addr), DEV_MAC, i + 10);
		execute_command(0, "ip link set dev %s address %s", devnames[i], addr);
		execute_command(0, "ip link set dev %s up", devnames[i]);
	}
}
    302 #endif
    303 
    304 #if SYZ_EXECUTOR || SYZ_TUN_ENABLE && (__NR_syz_extract_tcp_res || SYZ_REPEAT)
    305 #include <errno.h>
    306 
    307 static int read_tun(char* data, int size)
    308 {
    309 	if (tunfd < 0)
    310 		return -1;
    311 
    312 	int rv = read(tunfd, data, size);
    313 	if (rv < 0) {
    314 		if (errno == EAGAIN)
    315 			return -1;
    316 		// Tun sometimes returns this, unclear if it's a kernel bug or not.
    317 		if (errno == EBADFD)
    318 			return -1;
    319 		fail("tun: read failed with %d", rv);
    320 	}
    321 	return rv;
    322 }
    323 #endif
    324 
    325 #if SYZ_EXECUTOR || __NR_syz_emit_ethernet && SYZ_TUN_ENABLE
    326 #include <stdbool.h>
    327 #include <sys/uio.h>
    328 
// Maximum number of explicitly sized fragments accepted by syz_emit_ethernet.
#define MAX_FRAGS 4
// Optional description of how syz_emit_ethernet should split a packet.
struct vnet_fragmentation {
	// Non-zero: data left over after the listed fragments is emitted as one more fragment.
	uint32 full;
	// Number of entries of frags to use (clamped to MAX_FRAGS by syz_emit_ethernet).
	uint32 count;
	// Size in bytes of each fragment.
	uint32 frags[MAX_FRAGS];
};
    335 
    336 static long syz_emit_ethernet(long a0, long a1, long a2)
    337 {
    338 	// syz_emit_ethernet(len len[packet], packet ptr[in, eth_packet], frags ptr[in, vnet_fragmentation, opt])
    339 	// vnet_fragmentation {
    340 	// 	full	int32[0:1]
    341 	// 	count	int32[1:4]
    342 	// 	frags	array[int32[0:4096], 4]
    343 	// }
    344 	if (tunfd < 0)
    345 		return (uintptr_t)-1;
    346 
    347 	uint32 length = a0;
    348 	char* data = (char*)a1;
    349 	debug_dump_data(data, length);
    350 
    351 	struct vnet_fragmentation* frags = (struct vnet_fragmentation*)a2;
    352 	struct iovec vecs[MAX_FRAGS + 1];
    353 	uint32 nfrags = 0;
    354 	if (!tun_frags_enabled || frags == NULL) {
    355 		vecs[nfrags].iov_base = data;
    356 		vecs[nfrags].iov_len = length;
    357 		nfrags++;
    358 	} else {
    359 		bool full = true;
    360 		uint32 i, count = 0;
    361 		NONFAILING(full = frags->full);
    362 		NONFAILING(count = frags->count);
    363 		if (count > MAX_FRAGS)
    364 			count = MAX_FRAGS;
    365 		for (i = 0; i < count && length != 0; i++) {
    366 			uint32 size = 0;
    367 			NONFAILING(size = frags->frags[i]);
    368 			if (size > length)
    369 				size = length;
    370 			vecs[nfrags].iov_base = data;
    371 			vecs[nfrags].iov_len = size;
    372 			nfrags++;
    373 			data += size;
    374 			length -= size;
    375 		}
    376 		if (length != 0 && (full || nfrags == 0)) {
    377 			vecs[nfrags].iov_base = data;
    378 			vecs[nfrags].iov_len = length;
    379 			nfrags++;
    380 		}
    381 	}
    382 	return writev(tunfd, vecs, nfrags);
    383 }
    384 #endif
    385 
    386 #if SYZ_EXECUTOR || SYZ_REPEAT && SYZ_TUN_ENABLE
    387 static void flush_tun()
    388 {
    389 #if SYZ_EXECUTOR
    390 	if (!flag_enable_tun)
    391 		return;
    392 #endif
    393 	char data[SYZ_TUN_MAX_PACKET_SIZE];
    394 	while (read_tun(&data[0], sizeof(data)) != -1) {
    395 	}
    396 }
    397 #endif
    398 
    399 #if SYZ_EXECUTOR || __NR_syz_extract_tcp_res && SYZ_TUN_ENABLE
    400 #ifndef __ANDROID__
// Can't include <linux/ipv6.h>, since it causes
// conflicts due to some structs redefinition.
// Local copy of the fixed IPv6 header layout, used by syz_extract_tcp_res
// to locate the TCP header (only nexthdr and the struct size are used there).
struct ipv6hdr {
	__u8 priority : 4,
	    version : 4;
	__u8 flow_lbl[3];

	__be16 payload_len;
	__u8 nexthdr;
	__u8 hop_limit;

	struct in6_addr saddr;
	struct in6_addr daddr;
};
    415 #endif
    416 
// Output of syz_extract_tcp_res: TCP sequence/acknowledgement numbers taken from
// a packet read from tun, adjusted by the requested increments and stored in
// network byte order.
struct tcp_resources {
	uint32 seq;
	uint32 ack;
};
    421 
    422 static long syz_extract_tcp_res(long a0, long a1, long a2)
    423 {
    424 	// syz_extract_tcp_res(res ptr[out, tcp_resources], seq_inc int32, ack_inc int32)
    425 
    426 	if (tunfd < 0)
    427 		return (uintptr_t)-1;
    428 
    429 	char data[SYZ_TUN_MAX_PACKET_SIZE];
    430 	int rv = read_tun(&data[0], sizeof(data));
    431 	if (rv == -1)
    432 		return (uintptr_t)-1;
    433 	size_t length = rv;
    434 	debug_dump_data(data, length);
    435 
    436 	struct tcphdr* tcphdr;
    437 
    438 	if (length < sizeof(struct ethhdr))
    439 		return (uintptr_t)-1;
    440 	struct ethhdr* ethhdr = (struct ethhdr*)&data[0];
    441 
    442 	if (ethhdr->h_proto == htons(ETH_P_IP)) {
    443 		if (length < sizeof(struct ethhdr) + sizeof(struct iphdr))
    444 			return (uintptr_t)-1;
    445 		struct iphdr* iphdr = (struct iphdr*)&data[sizeof(struct ethhdr)];
    446 		if (iphdr->protocol != IPPROTO_TCP)
    447 			return (uintptr_t)-1;
    448 		if (length < sizeof(struct ethhdr) + iphdr->ihl * 4 + sizeof(struct tcphdr))
    449 			return (uintptr_t)-1;
    450 		tcphdr = (struct tcphdr*)&data[sizeof(struct ethhdr) + iphdr->ihl * 4];
    451 	} else {
    452 		if (length < sizeof(struct ethhdr) + sizeof(struct ipv6hdr))
    453 			return (uintptr_t)-1;
    454 		struct ipv6hdr* ipv6hdr = (struct ipv6hdr*)&data[sizeof(struct ethhdr)];
    455 		// TODO: parse and skip extension headers.
    456 		if (ipv6hdr->nexthdr != IPPROTO_TCP)
    457 			return (uintptr_t)-1;
    458 		if (length < sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + sizeof(struct tcphdr))
    459 			return (uintptr_t)-1;
    460 		tcphdr = (struct tcphdr*)&data[sizeof(struct ethhdr) + sizeof(struct ipv6hdr)];
    461 	}
    462 
    463 	struct tcp_resources* res = (struct tcp_resources*)a0;
    464 	NONFAILING(res->seq = htonl((ntohl(tcphdr->seq) + (uint32)a1)));
    465 	NONFAILING(res->ack = htonl((ntohl(tcphdr->ack_seq) + (uint32)a2)));
    466 
    467 	debug("extracted seq: %08x\n", res->seq);
    468 	debug("extracted ack: %08x\n", res->ack);
    469 
    470 	return 0;
    471 }
    472 #endif
    473 
    474 #if SYZ_EXECUTOR || __NR_syz_open_dev
    475 #include <fcntl.h>
    476 #include <string.h>
    477 #include <sys/stat.h>
    478 #include <sys/types.h>
    479 
    480 static long syz_open_dev(long a0, long a1, long a2)
    481 {
    482 	if (a0 == 0xc || a0 == 0xb) {
    483 		// syz_open_dev$char(dev const[0xc], major intptr, minor intptr) fd
    484 		// syz_open_dev$block(dev const[0xb], major intptr, minor intptr) fd
    485 		char buf[128];
    486 		sprintf(buf, "/dev/%s/%d:%d", a0 == 0xc ? "char" : "block", (uint8)a1, (uint8)a2);
    487 		return open(buf, O_RDWR, 0);
    488 	} else {
    489 		// syz_open_dev(dev strconst, id intptr, flags flags[open_flags]) fd
    490 		char buf[1024];
    491 		char* hash;
    492 		NONFAILING(strncpy(buf, (char*)a0, sizeof(buf) - 1));
    493 		buf[sizeof(buf) - 1] = 0;
    494 		while ((hash = strchr(buf, '#'))) {
    495 			*hash = '0' + (char)(a1 % 10); // 10 devices should be enough for everyone.
    496 			a1 /= 10;
    497 		}
    498 		return open(buf, a2, 0);
    499 	}
    500 }
    501 #endif
    502 
    503 #if SYZ_EXECUTOR || __NR_syz_open_procfs
    504 #include <fcntl.h>
    505 #include <string.h>
    506 #include <sys/stat.h>
    507 #include <sys/types.h>
    508 
    509 static long syz_open_procfs(long a0, long a1)
    510 {
    511 	// syz_open_procfs(pid pid, file ptr[in, string[procfs_file]]) fd
    512 
    513 	char buf[128];
    514 	memset(buf, 0, sizeof(buf));
    515 	if (a0 == 0) {
    516 		NONFAILING(snprintf(buf, sizeof(buf), "/proc/self/%s", (char*)a1));
    517 	} else if (a0 == -1) {
    518 		NONFAILING(snprintf(buf, sizeof(buf), "/proc/thread-self/%s", (char*)a1));
    519 	} else {
    520 		NONFAILING(snprintf(buf, sizeof(buf), "/proc/self/task/%d/%s", (int)a0, (char*)a1));
    521 	}
    522 	int fd = open(buf, O_RDWR);
    523 	if (fd == -1)
    524 		fd = open(buf, O_RDONLY);
    525 	return fd;
    526 }
    527 #endif
    528 
    529 #if SYZ_EXECUTOR || __NR_syz_open_pts
    530 #include <fcntl.h>
    531 #include <sys/ioctl.h>
    532 #include <sys/stat.h>
    533 #include <sys/types.h>
    534 
    535 static long syz_open_pts(long a0, long a1)
    536 {
    537 	// syz_openpts(fd fd[tty], flags flags[open_flags]) fd[tty]
    538 	int ptyno = 0;
    539 	if (ioctl(a0, TIOCGPTN, &ptyno))
    540 		return -1;
    541 	char buf[128];
    542 	sprintf(buf, "/dev/pts/%d", ptyno);
    543 	return open(buf, a1, 0);
    544 }
    545 #endif
    546 
    547 #if SYZ_EXECUTOR || __NR_syz_init_net_socket
    548 #if SYZ_EXECUTOR || SYZ_SANDBOX_NONE || SYZ_SANDBOX_SETUID || SYZ_SANDBOX_NAMESPACE
    549 #include <fcntl.h>
    550 #include <sched.h>
    551 #include <sys/stat.h>
    552 #include <sys/types.h>
    553 #include <unistd.h>
    554 
    555 const int kInitNetNsFd = 239; // see kMaxFd
    556 // syz_init_net_socket opens a socket in init net namespace.
    557 // Used for families that can only be created in init net namespace.
    558 static long syz_init_net_socket(long domain, long type, long proto)
    559 {
    560 	int netns = open("/proc/self/ns/net", O_RDONLY);
    561 	if (netns == -1)
    562 		return netns;
    563 	if (setns(kInitNetNsFd, 0))
    564 		return -1;
    565 	int sock = syscall(__NR_socket, domain, type, proto);
    566 	int err = errno;
    567 	if (setns(netns, 0))
    568 		fail("setns(netns) failed");
    569 	close(netns);
    570 	errno = err;
    571 	return sock;
    572 }
    573 #else
    574 static long syz_init_net_socket(long domain, long type, long proto)
    575 {
    576 	return syscall(__NR_socket, domain, type, proto);
    577 }
    578 #endif
    579 #endif
    580 
    581 #if SYZ_EXECUTOR || __NR_syz_genetlink_get_family_id
    582 #include <errno.h>
    583 #include <linux/genetlink.h>
    584 #include <linux/netlink.h>
    585 #include <sys/socket.h>
    586 #include <sys/types.h>
    587 
    588 static long syz_genetlink_get_family_id(long name)
    589 {
    590 	char buf[512] = {0};
    591 	struct nlmsghdr* hdr = (struct nlmsghdr*)buf;
    592 	struct genlmsghdr* genlhdr = (struct genlmsghdr*)NLMSG_DATA(hdr);
    593 	struct nlattr* attr = (struct nlattr*)(genlhdr + 1);
    594 	hdr->nlmsg_len = sizeof(*hdr) + sizeof(*genlhdr) + sizeof(*attr) + GENL_NAMSIZ;
    595 	hdr->nlmsg_type = GENL_ID_CTRL;
    596 	hdr->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
    597 	genlhdr->cmd = CTRL_CMD_GETFAMILY;
    598 	attr->nla_type = CTRL_ATTR_FAMILY_NAME;
    599 	attr->nla_len = sizeof(*attr) + GENL_NAMSIZ;
    600 	NONFAILING(strncpy((char*)(attr + 1), (char*)name, GENL_NAMSIZ));
    601 	struct iovec iov = {hdr, hdr->nlmsg_len};
    602 	struct sockaddr_nl addr = {0};
    603 	addr.nl_family = AF_NETLINK;
    604 	debug("syz_genetlink_get_family_id(%s)\n", (char*)(attr + 1));
    605 	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
    606 	if (fd == -1) {
    607 		debug("syz_genetlink_get_family_id: socket failed: %d\n", errno);
    608 		return -1;
    609 	}
    610 	struct msghdr msg = {&addr, sizeof(addr), &iov, 1, NULL, 0, 0};
    611 	if (sendmsg(fd, &msg, 0) == -1) {
    612 		debug("syz_genetlink_get_family_id: sendmsg failed: %d\n", errno);
    613 		close(fd);
    614 		return -1;
    615 	}
    616 	ssize_t n = recv(fd, buf, sizeof(buf), 0);
    617 	close(fd);
    618 	if (n <= 0) {
    619 		debug("syz_genetlink_get_family_id: recv failed: %d\n", errno);
    620 		return -1;
    621 	}
    622 	if (hdr->nlmsg_type != GENL_ID_CTRL) {
    623 		debug("syz_genetlink_get_family_id: wrong reply type: %d\n", hdr->nlmsg_type);
    624 		return -1;
    625 	}
    626 	for (; (char*)attr < buf + n; attr = (struct nlattr*)((char*)attr + NLMSG_ALIGN(attr->nla_len))) {
    627 		if (attr->nla_type == CTRL_ATTR_FAMILY_ID)
    628 			return *(uint16*)(attr + 1);
    629 	}
    630 	debug("syz_genetlink_get_family_id: no CTRL_ATTR_FAMILY_ID attr\n");
    631 	return -1;
    632 }
    633 #endif
    634 
    635 #if SYZ_EXECUTOR || __NR_syz_mount_image || __NR_syz_read_part_table
    636 #include <errno.h>
    637 #include <fcntl.h>
    638 #include <linux/loop.h>
    639 #include <sys/ioctl.h>
    640 #include <sys/stat.h>
    641 #include <sys/types.h>
    642 
// One contiguous piece of a disk image supplied by the caller of
// syz_mount_image / syz_read_part_table.
struct fs_image_segment {
	// Source bytes of the segment.
	void* data;
	// Number of bytes at data.
	uintptr_t size;
	// Byte offset inside the image at which the segment is written.
	uintptr_t offset;
};
    648 
    649 #define IMAGE_MAX_SEGMENTS 4096
    650 #define IMAGE_MAX_SIZE (129 << 20)
    651 
    652 #if GOARCH_386
    653 #define SYZ_memfd_create 356
    654 #elif GOARCH_amd64
    655 #define SYZ_memfd_create 319
    656 #elif GOARCH_arm
    657 #define SYZ_memfd_create 385
    658 #elif GOARCH_arm64
    659 #define SYZ_memfd_create 279
    660 #elif GOARCH_ppc64le
    661 #define SYZ_memfd_create 360
    662 #endif
    663 #endif
    664 
    665 #if SYZ_EXECUTOR || __NR_syz_read_part_table
    666 // syz_read_part_table(size intptr, nsegs len[segments], segments ptr[in, array[fs_image_segment]])
    667 static long syz_read_part_table(unsigned long size, unsigned long nsegs, long segments)
    668 {
    669 	char loopname[64], linkname[64];
    670 	int loopfd, err = 0, res = -1;
    671 	unsigned long i, j;
    672 	// See the comment in syz_mount_image.
    673 	struct fs_image_segment* segs = (struct fs_image_segment*)segments;
    674 
    675 	if (nsegs > IMAGE_MAX_SEGMENTS)
    676 		nsegs = IMAGE_MAX_SEGMENTS;
    677 	for (i = 0; i < nsegs; i++) {
    678 		if (segs[i].size > IMAGE_MAX_SIZE)
    679 			segs[i].size = IMAGE_MAX_SIZE;
    680 		segs[i].offset %= IMAGE_MAX_SIZE;
    681 		if (segs[i].offset > IMAGE_MAX_SIZE - segs[i].size)
    682 			segs[i].offset = IMAGE_MAX_SIZE - segs[i].size;
    683 		if (size < segs[i].offset + segs[i].offset)
    684 			size = segs[i].offset + segs[i].offset;
    685 	}
    686 	if (size > IMAGE_MAX_SIZE)
    687 		size = IMAGE_MAX_SIZE;
    688 	int memfd = syscall(SYZ_memfd_create, "syz_read_part_table", 0);
    689 	if (memfd == -1) {
    690 		err = errno;
    691 		goto error;
    692 	}
    693 	if (ftruncate(memfd, size)) {
    694 		err = errno;
    695 		goto error_close_memfd;
    696 	}
    697 	for (i = 0; i < nsegs; i++) {
    698 		if (pwrite(memfd, segs[i].data, segs[i].size, segs[i].offset) < 0) {
    699 			debug("syz_read_part_table: pwrite[%u] failed: %d\n", (int)i, errno);
    700 		}
    701 	}
    702 	snprintf(loopname, sizeof(loopname), "/dev/loop%llu", procid);
    703 	loopfd = open(loopname, O_RDWR);
    704 	if (loopfd == -1) {
    705 		err = errno;
    706 		goto error_close_memfd;
    707 	}
    708 	if (ioctl(loopfd, LOOP_SET_FD, memfd)) {
    709 		if (errno != EBUSY) {
    710 			err = errno;
    711 			goto error_close_loop;
    712 		}
    713 		ioctl(loopfd, LOOP_CLR_FD, 0);
    714 		usleep(1000);
    715 		if (ioctl(loopfd, LOOP_SET_FD, memfd)) {
    716 			err = errno;
    717 			goto error_close_loop;
    718 		}
    719 	}
    720 	struct loop_info64 info;
    721 	if (ioctl(loopfd, LOOP_GET_STATUS64, &info)) {
    722 		err = errno;
    723 		goto error_clear_loop;
    724 	}
    725 #if SYZ_EXECUTOR
    726 	cover_reset(0);
    727 #endif
    728 	info.lo_flags |= LO_FLAGS_PARTSCAN;
    729 	if (ioctl(loopfd, LOOP_SET_STATUS64, &info)) {
    730 		err = errno;
    731 		goto error_clear_loop;
    732 	}
    733 	res = 0;
    734 	// If we managed to parse some partitions, symlink them into our work dir.
    735 	for (i = 1, j = 0; i < 8; i++) {
    736 		snprintf(loopname, sizeof(loopname), "/dev/loop%llup%d", procid, (int)i);
    737 		struct stat statbuf;
    738 		if (stat(loopname, &statbuf) == 0) {
    739 			snprintf(linkname, sizeof(linkname), "./file%d", (int)j++);
    740 			if (symlink(loopname, linkname)) {
    741 				debug("syz_read_part_table: symlink(%s, %s) failed: %d\n", loopname, linkname, errno);
    742 			}
    743 		}
    744 	}
    745 error_clear_loop:
    746 	ioctl(loopfd, LOOP_CLR_FD, 0);
    747 error_close_loop:
    748 	close(loopfd);
    749 error_close_memfd:
    750 	close(memfd);
    751 error:
    752 	errno = err;
    753 	return res;
    754 }
    755 #endif
    756 
    757 #if SYZ_EXECUTOR || __NR_syz_mount_image
    758 #include <string.h>
    759 #include <sys/mount.h>
    760 
    761 //syz_mount_image(fs ptr[in, string[disk_filesystems]], dir ptr[in, filename], size intptr, nsegs len[segments], segments ptr[in, array[fs_image_segment]], flags flags[mount_flags], opts ptr[in, fs_options[vfat_options]])
    762 //fs_image_segment {
    763 //	data	ptr[in, array[int8]]
    764 //	size	len[data, intptr]
    765 //	offset	intptr
    766 //}
    767 static long syz_mount_image(long fsarg, long dir, unsigned long size, unsigned long nsegs, long segments, long flags, long optsarg)
    768 {
    769 	char loopname[64], fs[32], opts[256];
    770 	int loopfd, err = 0, res = -1;
    771 	unsigned long i;
    772 	// Strictly saying we ought to do a nonfailing copyout of segments into a local var.
    773 	// But some filesystems have large number of segments (2000+),
    774 	// we can't allocate that much on stack and allocating elsewhere is problematic,
    775 	// so we just use the memory allocated by fuzzer.
    776 	struct fs_image_segment* segs = (struct fs_image_segment*)segments;
    777 
    778 	if (nsegs > IMAGE_MAX_SEGMENTS)
    779 		nsegs = IMAGE_MAX_SEGMENTS;
    780 	for (i = 0; i < nsegs; i++) {
    781 		if (segs[i].size > IMAGE_MAX_SIZE)
    782 			segs[i].size = IMAGE_MAX_SIZE;
    783 		segs[i].offset %= IMAGE_MAX_SIZE;
    784 		if (segs[i].offset > IMAGE_MAX_SIZE - segs[i].size)
    785 			segs[i].offset = IMAGE_MAX_SIZE - segs[i].size;
    786 		if (size < segs[i].offset + segs[i].offset)
    787 			size = segs[i].offset + segs[i].offset;
    788 	}
    789 	if (size > IMAGE_MAX_SIZE)
    790 		size = IMAGE_MAX_SIZE;
    791 	int memfd = syscall(SYZ_memfd_create, "syz_mount_image", 0);
    792 	if (memfd == -1) {
    793 		err = errno;
    794 		goto error;
    795 	}
    796 	if (ftruncate(memfd, size)) {
    797 		err = errno;
    798 		goto error_close_memfd;
    799 	}
    800 	for (i = 0; i < nsegs; i++) {
    801 		if (pwrite(memfd, segs[i].data, segs[i].size, segs[i].offset) < 0) {
    802 			debug("syz_mount_image: pwrite[%u] failed: %d\n", (int)i, errno);
    803 		}
    804 	}
    805 	snprintf(loopname, sizeof(loopname), "/dev/loop%llu", procid);
    806 	loopfd = open(loopname, O_RDWR);
    807 	if (loopfd == -1) {
    808 		err = errno;
    809 		goto error_close_memfd;
    810 	}
    811 	if (ioctl(loopfd, LOOP_SET_FD, memfd)) {
    812 		if (errno != EBUSY) {
    813 			err = errno;
    814 			goto error_close_loop;
    815 		}
    816 		ioctl(loopfd, LOOP_CLR_FD, 0);
    817 		usleep(1000);
    818 		if (ioctl(loopfd, LOOP_SET_FD, memfd)) {
    819 			err = errno;
    820 			goto error_close_loop;
    821 		}
    822 	}
    823 	mkdir((char*)dir, 0777);
    824 	memset(fs, 0, sizeof(fs));
    825 	NONFAILING(strncpy(fs, (char*)fsarg, sizeof(fs) - 1));
    826 	memset(opts, 0, sizeof(opts));
    827 	// Leave some space for the additional options we append below.
    828 	NONFAILING(strncpy(opts, (char*)optsarg, sizeof(opts) - 32));
    829 	if (strcmp(fs, "iso9660") == 0) {
    830 		flags |= MS_RDONLY;
    831 	} else if (strncmp(fs, "ext", 3) == 0) {
    832 		// For ext2/3/4 we have to have errors=continue because the image
    833 		// can contain errors=panic flag and can legally crash kernel.
    834 		if (strstr(opts, "errors=panic") || strstr(opts, "errors=remount-ro") == 0)
    835 			strcat(opts, ",errors=continue");
    836 	} else if (strcmp(fs, "xfs") == 0) {
    837 		// For xfs we need nouuid because xfs has a global uuids table
    838 		// and if two parallel executors mounts fs with the same uuid, second mount fails.
    839 		strcat(opts, ",nouuid");
    840 	}
    841 	debug("syz_mount_image: size=%llu segs=%llu loop='%s' dir='%s' fs='%s' flags=%llu opts='%s'\n", (uint64)size, (uint64)nsegs, loopname, (char*)dir, fs, (uint64)flags, opts);
    842 #if SYZ_EXECUTOR
    843 	cover_reset(0);
    844 #endif
    845 	if (mount(loopname, (char*)dir, fs, flags, opts)) {
    846 		err = errno;
    847 		goto error_clear_loop;
    848 	}
    849 	res = 0;
    850 error_clear_loop:
    851 	ioctl(loopfd, LOOP_CLR_FD, 0);
    852 error_close_loop:
    853 	close(loopfd);
    854 error_close_memfd:
    855 	close(memfd);
    856 error:
    857 	errno = err;
    858 	return res;
    859 }
    860 #endif
    861 
    862 #if SYZ_EXECUTOR || __NR_syz_kvm_setup_cpu
    863 #include <errno.h>
    864 #include <fcntl.h>
    865 #include <linux/kvm.h>
    866 #include <stdarg.h>
    867 #include <stddef.h>
    868 #include <sys/ioctl.h>
    869 #include <sys/stat.h>
    870 
    871 #if defined(__x86_64__)
    872 #include "common_kvm_amd64.h"
    873 #elif defined(__aarch64__)
    874 #include "common_kvm_arm64.h"
    875 #else
    876 static long syz_kvm_setup_cpu(long a0, long a1, long a2, long a3, long a4, long a5, long a6, long a7)
    877 {
    878 	return 0;
    879 }
    880 #endif
    881 #endif
    882 
    883 #if SYZ_EXECUTOR || SYZ_FAULT_INJECTION || SYZ_SANDBOX_NAMESPACE || SYZ_ENABLE_CGROUPS
    884 #include <errno.h>
    885 #include <fcntl.h>
    886 #include <stdarg.h>
    887 #include <stdbool.h>
    888 #include <string.h>
    889 #include <sys/stat.h>
    890 #include <sys/types.h>
    891 
    892 static bool write_file(const char* file, const char* what, ...)
    893 {
    894 	char buf[1024];
    895 	va_list args;
    896 	va_start(args, what);
    897 	vsnprintf(buf, sizeof(buf), what, args);
    898 	va_end(args);
    899 	buf[sizeof(buf) - 1] = 0;
    900 	int len = strlen(buf);
    901 
    902 	int fd = open(file, O_WRONLY | O_CLOEXEC);
    903 	if (fd == -1)
    904 		return false;
    905 	if (write(fd, buf, len) != len) {
    906 		int err = errno;
    907 		close(fd);
    908 		errno = err;
    909 		return false;
    910 	}
    911 	close(fd);
    912 	return true;
    913 }
    914 #endif
    915 
    916 #if SYZ_EXECUTOR || SYZ_RESET_NET_NAMESPACE
    917 #include <errno.h>
    918 #include <linux/net.h>
    919 #include <netinet/in.h>
    920 #include <string.h>
    921 #include <sys/socket.h>
    922 
    923 // checkpoint/reset_net_namespace partially resets net namespace to initial state
    924 // after each test. Currently it resets only netfilter state (ebtables, arptables, ipv4/ipv6 iptables).
    925 // Ideally, we just create a new net namespace for each test,
    926 // however it's too slow (1-1.5 seconds per namespace, not parallelizable).
    927 
    928 // Linux headers do not compile for C++, so we have to define the structs manually.
    929 #define XT_TABLE_SIZE 1536
    930 #define XT_MAX_ENTRIES 10
    931 
    932 struct xt_counters {
    933 	uint64 pcnt, bcnt;
    934 };
    935 
    936 struct ipt_getinfo {
    937 	char name[32];
    938 	unsigned int valid_hooks;
    939 	unsigned int hook_entry[5];
    940 	unsigned int underflow[5];
    941 	unsigned int num_entries;
    942 	unsigned int size;
    943 };
    944 
    945 struct ipt_get_entries {
    946 	char name[32];
    947 	unsigned int size;
    948 	void* entrytable[XT_TABLE_SIZE / sizeof(void*)];
    949 };
    950 
    951 struct ipt_replace {
    952 	char name[32];
    953 	unsigned int valid_hooks;
    954 	unsigned int num_entries;
    955 	unsigned int size;
    956 	unsigned int hook_entry[5];
    957 	unsigned int underflow[5];
    958 	unsigned int num_counters;
    959 	struct xt_counters* counters;
    960 	char entrytable[XT_TABLE_SIZE];
    961 };
    962 
    963 struct ipt_table_desc {
    964 	const char* name;
    965 	struct ipt_getinfo info;
    966 	struct ipt_replace replace;
    967 };
    968 
    969 static struct ipt_table_desc ipv4_tables[] = {
    970     {.name = "filter"},
    971     {.name = "nat"},
    972     {.name = "mangle"},
    973     {.name = "raw"},
    974     {.name = "security"},
    975 };
    976 
    977 static struct ipt_table_desc ipv6_tables[] = {
    978     {.name = "filter"},
    979     {.name = "nat"},
    980     {.name = "mangle"},
    981     {.name = "raw"},
    982     {.name = "security"},
    983 };
    984 
    985 #define IPT_BASE_CTL 64
    986 #define IPT_SO_SET_REPLACE (IPT_BASE_CTL)
    987 #define IPT_SO_GET_INFO (IPT_BASE_CTL)
    988 #define IPT_SO_GET_ENTRIES (IPT_BASE_CTL + 1)
    989 
    990 struct arpt_getinfo {
    991 	char name[32];
    992 	unsigned int valid_hooks;
    993 	unsigned int hook_entry[3];
    994 	unsigned int underflow[3];
    995 	unsigned int num_entries;
    996 	unsigned int size;
    997 };
    998 
    999 struct arpt_get_entries {
   1000 	char name[32];
   1001 	unsigned int size;
   1002 	void* entrytable[XT_TABLE_SIZE / sizeof(void*)];
   1003 };
   1004 
   1005 struct arpt_replace {
   1006 	char name[32];
   1007 	unsigned int valid_hooks;
   1008 	unsigned int num_entries;
   1009 	unsigned int size;
   1010 	unsigned int hook_entry[3];
   1011 	unsigned int underflow[3];
   1012 	unsigned int num_counters;
   1013 	struct xt_counters* counters;
   1014 	char entrytable[XT_TABLE_SIZE];
   1015 };
   1016 
   1017 struct arpt_table_desc {
   1018 	const char* name;
   1019 	struct arpt_getinfo info;
   1020 	struct arpt_replace replace;
   1021 };
   1022 
   1023 static struct arpt_table_desc arpt_tables[] = {
   1024     {.name = "filter"},
   1025 };
   1026 
   1027 #define ARPT_BASE_CTL 96
   1028 #define ARPT_SO_SET_REPLACE (ARPT_BASE_CTL)
   1029 #define ARPT_SO_GET_INFO (ARPT_BASE_CTL)
   1030 #define ARPT_SO_GET_ENTRIES (ARPT_BASE_CTL + 1)
   1031 
   1032 static void checkpoint_iptables(struct ipt_table_desc* tables, int num_tables, int family, int level)
   1033 {
   1034 	struct ipt_get_entries entries;
   1035 	socklen_t optlen;
   1036 	int fd, i;
   1037 
   1038 	fd = socket(family, SOCK_STREAM, IPPROTO_TCP);
   1039 	if (fd == -1) {
   1040 		switch (errno) {
   1041 		case EAFNOSUPPORT:
   1042 		case ENOPROTOOPT:
   1043 			return;
   1044 		}
   1045 		fail("iptable checkpoint %d: socket failed", family);
   1046 	}
   1047 	for (i = 0; i < num_tables; i++) {
   1048 		struct ipt_table_desc* table = &tables[i];
   1049 		strcpy(table->info.name, table->name);
   1050 		strcpy(table->replace.name, table->name);
   1051 		optlen = sizeof(table->info);
   1052 		if (getsockopt(fd, level, IPT_SO_GET_INFO, &table->info, &optlen)) {
   1053 			switch (errno) {
   1054 			case EPERM:
   1055 			case ENOENT:
   1056 			case ENOPROTOOPT:
   1057 				continue;
   1058 			}
   1059 			fail("iptable checkpoint %s/%d: getsockopt(IPT_SO_GET_INFO)", table->name, family);
   1060 		}
   1061 		debug("iptable checkpoint %s/%d: checkpoint entries=%d hooks=%x size=%d\n",
   1062 		      table->name, family, table->info.num_entries,
   1063 		      table->info.valid_hooks, table->info.size);
   1064 		if (table->info.size > sizeof(table->replace.entrytable))
   1065 			fail("iptable checkpoint %s/%d: table size is too large: %u",
   1066 			     table->name, family, table->info.size);
   1067 		if (table->info.num_entries > XT_MAX_ENTRIES)
   1068 			fail("iptable checkpoint %s/%d: too many counters: %u",
   1069 			     table->name, family, table->info.num_entries);
   1070 		memset(&entries, 0, sizeof(entries));
   1071 		strcpy(entries.name, table->name);
   1072 		entries.size = table->info.size;
   1073 		optlen = sizeof(entries) - sizeof(entries.entrytable) + table->info.size;
   1074 		if (getsockopt(fd, level, IPT_SO_GET_ENTRIES, &entries, &optlen))
   1075 			fail("iptable checkpoint %s/%d: getsockopt(IPT_SO_GET_ENTRIES)",
   1076 			     table->name, family);
   1077 		table->replace.valid_hooks = table->info.valid_hooks;
   1078 		table->replace.num_entries = table->info.num_entries;
   1079 		table->replace.size = table->info.size;
   1080 		memcpy(table->replace.hook_entry, table->info.hook_entry, sizeof(table->replace.hook_entry));
   1081 		memcpy(table->replace.underflow, table->info.underflow, sizeof(table->replace.underflow));
   1082 		memcpy(table->replace.entrytable, entries.entrytable, table->info.size);
   1083 	}
   1084 	close(fd);
   1085 }
   1086 
   1087 static void reset_iptables(struct ipt_table_desc* tables, int num_tables, int family, int level)
   1088 {
   1089 	struct xt_counters counters[XT_MAX_ENTRIES];
   1090 	struct ipt_get_entries entries;
   1091 	struct ipt_getinfo info;
   1092 	socklen_t optlen;
   1093 	int fd, i;
   1094 
   1095 	fd = socket(family, SOCK_STREAM, IPPROTO_TCP);
   1096 	if (fd == -1) {
   1097 		switch (errno) {
   1098 		case EAFNOSUPPORT:
   1099 		case ENOPROTOOPT:
   1100 			return;
   1101 		}
   1102 		fail("iptable %d: socket failed", family);
   1103 	}
   1104 	for (i = 0; i < num_tables; i++) {
   1105 		struct ipt_table_desc* table = &tables[i];
   1106 		if (table->info.valid_hooks == 0)
   1107 			continue;
   1108 		memset(&info, 0, sizeof(info));
   1109 		strcpy(info.name, table->name);
   1110 		optlen = sizeof(info);
   1111 		if (getsockopt(fd, level, IPT_SO_GET_INFO, &info, &optlen))
   1112 			fail("iptable %s/%d: getsockopt(IPT_SO_GET_INFO)", table->name, family);
   1113 		if (memcmp(&table->info, &info, sizeof(table->info)) == 0) {
   1114 			memset(&entries, 0, sizeof(entries));
   1115 			strcpy(entries.name, table->name);
   1116 			entries.size = table->info.size;
   1117 			optlen = sizeof(entries) - sizeof(entries.entrytable) + entries.size;
   1118 			if (getsockopt(fd, level, IPT_SO_GET_ENTRIES, &entries, &optlen))
   1119 				fail("iptable %s/%d: getsockopt(IPT_SO_GET_ENTRIES)", table->name, family);
   1120 			if (memcmp(table->replace.entrytable, entries.entrytable, table->info.size) == 0)
   1121 				continue;
   1122 		}
   1123 		debug("iptable %s/%d: resetting\n", table->name, family);
   1124 		table->replace.num_counters = info.num_entries;
   1125 		table->replace.counters = counters;
   1126 		optlen = sizeof(table->replace) - sizeof(table->replace.entrytable) + table->replace.size;
   1127 		if (setsockopt(fd, level, IPT_SO_SET_REPLACE, &table->replace, optlen))
   1128 			fail("iptable %s/%d: setsockopt(IPT_SO_SET_REPLACE)", table->name, family);
   1129 	}
   1130 	close(fd);
   1131 }
   1132 
   1133 static void checkpoint_arptables(void)
   1134 {
   1135 	struct arpt_get_entries entries;
   1136 	socklen_t optlen;
   1137 	unsigned i;
   1138 	int fd;
   1139 
   1140 	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
   1141 	if (fd == -1) {
   1142 		switch (errno) {
   1143 		case EAFNOSUPPORT:
   1144 		case ENOPROTOOPT:
   1145 			return;
   1146 		}
   1147 		fail("arptable checkpoint: socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)");
   1148 	}
   1149 	for (i = 0; i < sizeof(arpt_tables) / sizeof(arpt_tables[0]); i++) {
   1150 		struct arpt_table_desc* table = &arpt_tables[i];
   1151 		strcpy(table->info.name, table->name);
   1152 		strcpy(table->replace.name, table->name);
   1153 		optlen = sizeof(table->info);
   1154 		if (getsockopt(fd, SOL_IP, ARPT_SO_GET_INFO, &table->info, &optlen)) {
   1155 			switch (errno) {
   1156 			case EPERM:
   1157 			case ENOENT:
   1158 			case ENOPROTOOPT:
   1159 				continue;
   1160 			}
   1161 			fail("arptable checkpoint %s: getsockopt(ARPT_SO_GET_INFO)", table->name);
   1162 		}
   1163 		debug("arptable checkpoint %s: entries=%d hooks=%x size=%d\n",
   1164 		      table->name, table->info.num_entries, table->info.valid_hooks, table->info.size);
   1165 		if (table->info.size > sizeof(table->replace.entrytable))
   1166 			fail("arptable checkpoint %s: table size is too large: %u",
   1167 			     table->name, table->info.size);
   1168 		if (table->info.num_entries > XT_MAX_ENTRIES)
   1169 			fail("arptable checkpoint %s: too many counters: %u",
   1170 			     table->name, table->info.num_entries);
   1171 		memset(&entries, 0, sizeof(entries));
   1172 		strcpy(entries.name, table->name);
   1173 		entries.size = table->info.size;
   1174 		optlen = sizeof(entries) - sizeof(entries.entrytable) + table->info.size;
   1175 		if (getsockopt(fd, SOL_IP, ARPT_SO_GET_ENTRIES, &entries, &optlen))
   1176 			fail("arptable checkpoint %s: getsockopt(ARPT_SO_GET_ENTRIES)", table->name);
   1177 		table->replace.valid_hooks = table->info.valid_hooks;
   1178 		table->replace.num_entries = table->info.num_entries;
   1179 		table->replace.size = table->info.size;
   1180 		memcpy(table->replace.hook_entry, table->info.hook_entry, sizeof(table->replace.hook_entry));
   1181 		memcpy(table->replace.underflow, table->info.underflow, sizeof(table->replace.underflow));
   1182 		memcpy(table->replace.entrytable, entries.entrytable, table->info.size);
   1183 	}
   1184 	close(fd);
   1185 }
   1186 
   1187 static void reset_arptables()
   1188 {
   1189 	struct xt_counters counters[XT_MAX_ENTRIES];
   1190 	struct arpt_get_entries entries;
   1191 	struct arpt_getinfo info;
   1192 	socklen_t optlen;
   1193 	unsigned i;
   1194 	int fd;
   1195 
   1196 	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
   1197 	if (fd == -1) {
   1198 		switch (errno) {
   1199 		case EAFNOSUPPORT:
   1200 		case ENOPROTOOPT:
   1201 			return;
   1202 		}
   1203 		fail("arptable: socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)");
   1204 	}
   1205 	for (i = 0; i < sizeof(arpt_tables) / sizeof(arpt_tables[0]); i++) {
   1206 		struct arpt_table_desc* table = &arpt_tables[i];
   1207 		if (table->info.valid_hooks == 0)
   1208 			continue;
   1209 		memset(&info, 0, sizeof(info));
   1210 		strcpy(info.name, table->name);
   1211 		optlen = sizeof(info);
   1212 		if (getsockopt(fd, SOL_IP, ARPT_SO_GET_INFO, &info, &optlen))
   1213 			fail("arptable %s:getsockopt(ARPT_SO_GET_INFO)", table->name);
   1214 		if (memcmp(&table->info, &info, sizeof(table->info)) == 0) {
   1215 			memset(&entries, 0, sizeof(entries));
   1216 			strcpy(entries.name, table->name);
   1217 			entries.size = table->info.size;
   1218 			optlen = sizeof(entries) - sizeof(entries.entrytable) + entries.size;
   1219 			if (getsockopt(fd, SOL_IP, ARPT_SO_GET_ENTRIES, &entries, &optlen))
   1220 				fail("arptable %s: getsockopt(ARPT_SO_GET_ENTRIES)", table->name);
   1221 			if (memcmp(table->replace.entrytable, entries.entrytable, table->info.size) == 0)
   1222 				continue;
   1223 			debug("arptable %s: data changed\n", table->name);
   1224 		} else {
   1225 			debug("arptable %s: header changed\n", table->name);
   1226 		}
   1227 		debug("arptable %s: resetting\n", table->name);
   1228 		table->replace.num_counters = info.num_entries;
   1229 		table->replace.counters = counters;
   1230 		optlen = sizeof(table->replace) - sizeof(table->replace.entrytable) + table->replace.size;
   1231 		if (setsockopt(fd, SOL_IP, ARPT_SO_SET_REPLACE, &table->replace, optlen))
   1232 			fail("arptable %s: setsockopt(ARPT_SO_SET_REPLACE)", table->name);
   1233 	}
   1234 	close(fd);
   1235 }
   1236 
   1237 #include <linux/if.h>
   1238 #include <linux/netfilter_bridge/ebtables.h>
   1239 
   1240 struct ebt_table_desc {
   1241 	const char* name;
   1242 	struct ebt_replace replace;
   1243 	char entrytable[XT_TABLE_SIZE];
   1244 };
   1245 
   1246 static struct ebt_table_desc ebt_tables[] = {
   1247     {.name = "filter"},
   1248     {.name = "nat"},
   1249     {.name = "broute"},
   1250 };
   1251 
   1252 static void checkpoint_ebtables(void)
   1253 {
   1254 	socklen_t optlen;
   1255 	unsigned i;
   1256 	int fd;
   1257 
   1258 	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
   1259 	if (fd == -1) {
   1260 		switch (errno) {
   1261 		case EAFNOSUPPORT:
   1262 		case ENOPROTOOPT:
   1263 			return;
   1264 		}
   1265 		fail("ebtable checkpoint: socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)");
   1266 	}
   1267 	for (i = 0; i < sizeof(ebt_tables) / sizeof(ebt_tables[0]); i++) {
   1268 		struct ebt_table_desc* table = &ebt_tables[i];
   1269 		strcpy(table->replace.name, table->name);
   1270 		optlen = sizeof(table->replace);
   1271 		if (getsockopt(fd, SOL_IP, EBT_SO_GET_INIT_INFO, &table->replace, &optlen)) {
   1272 			switch (errno) {
   1273 			case EPERM:
   1274 			case ENOENT:
   1275 			case ENOPROTOOPT:
   1276 				continue;
   1277 			}
   1278 			fail("ebtable checkpoint %s: getsockopt(EBT_SO_GET_INIT_INFO)", table->name);
   1279 		}
   1280 		debug("ebtable checkpoint %s: entries=%d hooks=%x size=%d\n",
   1281 		      table->name, table->replace.nentries, table->replace.valid_hooks,
   1282 		      table->replace.entries_size);
   1283 		if (table->replace.entries_size > sizeof(table->entrytable))
   1284 			fail("ebtable checkpoint %s: table size is too large: %u",
   1285 			     table->name, table->replace.entries_size);
   1286 		table->replace.num_counters = 0;
   1287 		table->replace.entries = table->entrytable;
   1288 		optlen = sizeof(table->replace) + table->replace.entries_size;
   1289 		if (getsockopt(fd, SOL_IP, EBT_SO_GET_INIT_ENTRIES, &table->replace, &optlen))
   1290 			fail("ebtable checkpoint %s: getsockopt(EBT_SO_GET_INIT_ENTRIES)", table->name);
   1291 	}
   1292 	close(fd);
   1293 }
   1294 
   1295 static void reset_ebtables()
   1296 {
   1297 	struct ebt_replace replace;
   1298 	char entrytable[XT_TABLE_SIZE];
   1299 	socklen_t optlen;
   1300 	unsigned i, j, h;
   1301 	int fd;
   1302 
   1303 	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
   1304 	if (fd == -1) {
   1305 		switch (errno) {
   1306 		case EAFNOSUPPORT:
   1307 		case ENOPROTOOPT:
   1308 			return;
   1309 		}
   1310 		fail("ebtable: socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)");
   1311 	}
   1312 	for (i = 0; i < sizeof(ebt_tables) / sizeof(ebt_tables[0]); i++) {
   1313 		struct ebt_table_desc* table = &ebt_tables[i];
   1314 		if (table->replace.valid_hooks == 0)
   1315 			continue;
   1316 		memset(&replace, 0, sizeof(replace));
   1317 		strcpy(replace.name, table->name);
   1318 		optlen = sizeof(replace);
   1319 		if (getsockopt(fd, SOL_IP, EBT_SO_GET_INFO, &replace, &optlen))
   1320 			fail("ebtable %s: getsockopt(EBT_SO_GET_INFO)", table->name);
   1321 		replace.num_counters = 0;
   1322 		table->replace.entries = 0;
   1323 		for (h = 0; h < NF_BR_NUMHOOKS; h++)
   1324 			table->replace.hook_entry[h] = 0;
   1325 		if (memcmp(&table->replace, &replace, sizeof(table->replace)) == 0) {
   1326 			memset(&entrytable, 0, sizeof(entrytable));
   1327 			replace.entries = entrytable;
   1328 			optlen = sizeof(replace) + replace.entries_size;
   1329 			if (getsockopt(fd, SOL_IP, EBT_SO_GET_ENTRIES, &replace, &optlen))
   1330 				fail("ebtable %s: getsockopt(EBT_SO_GET_ENTRIES)", table->name);
   1331 			if (memcmp(table->entrytable, entrytable, replace.entries_size) == 0)
   1332 				continue;
   1333 		}
   1334 		debug("ebtable %s: resetting\n", table->name);
   1335 		// Kernel does not seem to return actual entry points (wat?).
   1336 		for (j = 0, h = 0; h < NF_BR_NUMHOOKS; h++) {
   1337 			if (table->replace.valid_hooks & (1 << h)) {
   1338 				table->replace.hook_entry[h] = (struct ebt_entries*)table->entrytable + j;
   1339 				j++;
   1340 			}
   1341 		}
   1342 		table->replace.entries = table->entrytable;
   1343 		optlen = sizeof(table->replace) + table->replace.entries_size;
   1344 		if (setsockopt(fd, SOL_IP, EBT_SO_SET_ENTRIES, &table->replace, optlen))
   1345 			fail("ebtable %s: setsockopt(EBT_SO_SET_ENTRIES)", table->name);
   1346 	}
   1347 	close(fd);
   1348 }
   1349 
   1350 static void checkpoint_net_namespace(void)
   1351 {
   1352 #if SYZ_EXECUTOR
   1353 	if (flag_sandbox == sandbox_setuid)
   1354 		return;
   1355 #endif
   1356 	checkpoint_ebtables();
   1357 	checkpoint_arptables();
   1358 	checkpoint_iptables(ipv4_tables, sizeof(ipv4_tables) / sizeof(ipv4_tables[0]), AF_INET, SOL_IP);
   1359 	checkpoint_iptables(ipv6_tables, sizeof(ipv6_tables) / sizeof(ipv6_tables[0]), AF_INET6, SOL_IPV6);
   1360 }
   1361 
   1362 static void reset_net_namespace(void)
   1363 {
   1364 #if SYZ_EXECUTOR
   1365 	if (flag_sandbox == sandbox_setuid)
   1366 		return;
   1367 #endif
   1368 	reset_ebtables();
   1369 	reset_arptables();
   1370 	reset_iptables(ipv4_tables, sizeof(ipv4_tables) / sizeof(ipv4_tables[0]), AF_INET, SOL_IP);
   1371 	reset_iptables(ipv6_tables, sizeof(ipv6_tables) / sizeof(ipv6_tables[0]), AF_INET6, SOL_IPV6);
   1372 }
   1373 #endif
   1374 
   1375 #if SYZ_EXECUTOR || SYZ_ENABLE_CGROUPS
   1376 #include <fcntl.h>
   1377 #include <sys/mount.h>
   1378 #include <sys/stat.h>
   1379 #include <sys/types.h>
   1380 
   1381 static void setup_cgroups()
   1382 {
   1383 	if (mkdir("/syzcgroup", 0777)) {
   1384 		debug("mkdir(/syzcgroup) failed: %d\n", errno);
   1385 	}
   1386 	if (mkdir("/syzcgroup/unified", 0777)) {
   1387 		debug("mkdir(/syzcgroup/unified) failed: %d\n", errno);
   1388 	}
   1389 	if (mount("none", "/syzcgroup/unified", "cgroup2", 0, NULL)) {
   1390 		debug("mount(cgroup2) failed: %d\n", errno);
   1391 	}
   1392 	if (chmod("/syzcgroup/unified", 0777)) {
   1393 		debug("chmod(/syzcgroup/unified) failed: %d\n", errno);
   1394 	}
   1395 	if (!write_file("/syzcgroup/unified/cgroup.subtree_control", "+cpu +memory +io +pids +rdma")) {
   1396 		debug("write(cgroup.subtree_control) failed: %d\n", errno);
   1397 	}
   1398 	if (mkdir("/syzcgroup/cpu", 0777)) {
   1399 		debug("mkdir(/syzcgroup/cpu) failed: %d\n", errno);
   1400 	}
   1401 	if (mount("none", "/syzcgroup/cpu", "cgroup", 0, "cpuset,cpuacct,perf_event,hugetlb")) {
   1402 		debug("mount(cgroup cpu) failed: %d\n", errno);
   1403 	}
   1404 	if (!write_file("/syzcgroup/cpu/cgroup.clone_children", "1")) {
   1405 		debug("write(/syzcgroup/cpu/cgroup.clone_children) failed: %d\n", errno);
   1406 	}
   1407 	if (chmod("/syzcgroup/cpu", 0777)) {
   1408 		debug("chmod(/syzcgroup/cpu) failed: %d\n", errno);
   1409 	}
   1410 	if (mkdir("/syzcgroup/net", 0777)) {
   1411 		debug("mkdir(/syzcgroup/net) failed: %d\n", errno);
   1412 	}
   1413 	if (mount("none", "/syzcgroup/net", "cgroup", 0, "net_cls,net_prio,devices,freezer")) {
   1414 		debug("mount(cgroup net) failed: %d\n", errno);
   1415 	}
   1416 	if (chmod("/syzcgroup/net", 0777)) {
   1417 		debug("chmod(/syzcgroup/net) failed: %d\n", errno);
   1418 	}
   1419 }
   1420 
   1421 // TODO(dvyukov): this should be under a separate define for separate minimization,
   1422 // but for now we bundle this with cgroups.
   1423 static void setup_binfmt_misc()
   1424 {
   1425 	if (mount(0, "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, 0)) {
   1426 		debug("mount(binfmt_misc) failed: %d\n", errno);
   1427 	}
   1428 	if (!write_file("/proc/sys/fs/binfmt_misc/register", ":syz0:M:0:\x01::./file0:")) {
   1429 		debug("write(/proc/sys/fs/binfmt_misc/register, syz0) failed: %d\n", errno);
   1430 	}
   1431 	if (!write_file("/proc/sys/fs/binfmt_misc/register", ":syz1:M:1:\x02::./file0:POC")) {
   1432 		debug("write(/proc/sys/fs/binfmt_misc/register, syz1) failed: %d\n", errno);
   1433 	}
   1434 }
   1435 #endif
   1436 
   1437 #if SYZ_EXECUTOR || SYZ_SANDBOX_NONE || SYZ_SANDBOX_SETUID || SYZ_SANDBOX_NAMESPACE
   1438 #include <errno.h>
   1439 #include <sys/mount.h>
   1440 
   1441 static void setup_common()
   1442 {
   1443 	if (mount(0, "/sys/fs/fuse/connections", "fusectl", 0, 0)) {
   1444 		debug("mount(fusectl) failed: %d\n", errno);
   1445 	}
   1446 #if SYZ_EXECUTOR || SYZ_ENABLE_CGROUPS
   1447 	setup_cgroups();
   1448 	setup_binfmt_misc();
   1449 #endif
   1450 }
   1451 #endif
   1452 
   1453 #if SYZ_EXECUTOR || SYZ_SANDBOX_NONE || SYZ_SANDBOX_SETUID || SYZ_SANDBOX_NAMESPACE
   1454 #include <sched.h>
   1455 #include <sys/prctl.h>
   1456 #include <sys/resource.h>
   1457 #include <sys/time.h>
   1458 #include <sys/wait.h>
   1459 
   1460 static void loop();
   1461 
   1462 static void sandbox_common()
   1463 {
   1464 	prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
   1465 	setpgrp();
   1466 	setsid();
   1467 
   1468 #if SYZ_EXECUTOR || __NR_syz_init_net_socket
   1469 	int netns = open("/proc/self/ns/net", O_RDONLY);
   1470 	if (netns == -1)
   1471 		fail("open(/proc/self/ns/net) failed");
   1472 	if (dup2(netns, kInitNetNsFd) < 0)
   1473 		fail("dup2(netns, kInitNetNsFd) failed");
   1474 	close(netns);
   1475 #endif
   1476 
   1477 	struct rlimit rlim;
   1478 	rlim.rlim_cur = rlim.rlim_max = 160 << 20;
   1479 	setrlimit(RLIMIT_AS, &rlim);
   1480 	rlim.rlim_cur = rlim.rlim_max = 8 << 20;
   1481 	setrlimit(RLIMIT_MEMLOCK, &rlim);
   1482 	rlim.rlim_cur = rlim.rlim_max = 136 << 20;
   1483 	setrlimit(RLIMIT_FSIZE, &rlim);
   1484 	rlim.rlim_cur = rlim.rlim_max = 1 << 20;
   1485 	setrlimit(RLIMIT_STACK, &rlim);
   1486 	rlim.rlim_cur = rlim.rlim_max = 0;
   1487 	setrlimit(RLIMIT_CORE, &rlim);
   1488 	rlim.rlim_cur = rlim.rlim_max = 256; // see kMaxFd
   1489 	setrlimit(RLIMIT_NOFILE, &rlim);
   1490 
   1491 	// CLONE_NEWNS/NEWCGROUP cause EINVAL on some systems,
   1492 	// so we do them separately of clone in do_sandbox_namespace.
   1493 	if (unshare(CLONE_NEWNS)) {
   1494 		debug("unshare(CLONE_NEWNS): %d\n", errno);
   1495 	}
   1496 	if (unshare(CLONE_NEWIPC)) {
   1497 		debug("unshare(CLONE_NEWIPC): %d\n", errno);
   1498 	}
   1499 	if (unshare(0x02000000)) {
   1500 		debug("unshare(CLONE_NEWCGROUP): %d\n", errno);
   1501 	}
   1502 	if (unshare(CLONE_NEWUTS)) {
   1503 		debug("unshare(CLONE_NEWUTS): %d\n", errno);
   1504 	}
   1505 	if (unshare(CLONE_SYSVSEM)) {
   1506 		debug("unshare(CLONE_SYSVSEM): %d\n", errno);
   1507 	}
   1508 }
   1509 
   1510 int wait_for_loop(int pid)
   1511 {
   1512 	if (pid < 0)
   1513 		fail("sandbox fork failed");
   1514 	debug("spawned loop pid %d\n", pid);
   1515 	int status = 0;
   1516 	while (waitpid(-1, &status, __WALL) != pid) {
   1517 	}
   1518 	return WEXITSTATUS(status);
   1519 }
   1520 #endif
   1521 
   1522 #if SYZ_EXECUTOR || SYZ_SANDBOX_NONE
   1523 #include <sched.h>
   1524 #include <sys/types.h>
   1525 
   1526 static int do_sandbox_none(void)
   1527 {
   1528 	// CLONE_NEWPID takes effect for the first child of the current process,
   1529 	// so we do it before fork to make the loop "init" process of the namespace.
   1530 	// We ought to do fail here, but sandbox=none is used in pkg/ipc tests
   1531 	// and they are usually run under non-root.
   1532 	// Also since debug is stripped by pkg/csource, we need to do {}
   1533 	// even though we generally don't do {} around single statements.
   1534 	if (unshare(CLONE_NEWPID)) {
   1535 		debug("unshare(CLONE_NEWPID): %d\n", errno);
   1536 	}
   1537 	int pid = fork();
   1538 	if (pid != 0)
   1539 		return wait_for_loop(pid);
   1540 
   1541 	setup_common();
   1542 	sandbox_common();
   1543 	if (unshare(CLONE_NEWNET)) {
   1544 		debug("unshare(CLONE_NEWNET): %d\n", errno);
   1545 	}
   1546 #if SYZ_EXECUTOR || SYZ_TUN_ENABLE
   1547 	initialize_tun();
   1548 #endif
   1549 #if SYZ_EXECUTOR || SYZ_ENABLE_NETDEV
   1550 	initialize_netdevices();
   1551 #endif
   1552 	loop();
   1553 	doexit(1);
   1554 }
   1555 #endif
   1556 
   1557 #if SYZ_EXECUTOR || SYZ_SANDBOX_SETUID
   1558 #include <grp.h>
   1559 #include <sched.h>
   1560 #include <sys/prctl.h>
   1561 
   1562 static int do_sandbox_setuid(void)
   1563 {
   1564 	if (unshare(CLONE_NEWPID)) {
   1565 		debug("unshare(CLONE_NEWPID): %d\n", errno);
   1566 	}
   1567 	int pid = fork();
   1568 	if (pid != 0)
   1569 		return wait_for_loop(pid);
   1570 
   1571 	setup_common();
   1572 	sandbox_common();
   1573 	if (unshare(CLONE_NEWNET)) {
   1574 		debug("unshare(CLONE_NEWNET): %d\n", errno);
   1575 	}
   1576 #if SYZ_EXECUTOR || SYZ_TUN_ENABLE
   1577 	initialize_tun();
   1578 #endif
   1579 #if SYZ_EXECUTOR || SYZ_ENABLE_NETDEV
   1580 	initialize_netdevices();
   1581 #endif
   1582 
   1583 	const int nobody = 65534;
   1584 	if (setgroups(0, NULL))
   1585 		fail("failed to setgroups");
   1586 	if (syscall(SYS_setresgid, nobody, nobody, nobody))
   1587 		fail("failed to setresgid");
   1588 	if (syscall(SYS_setresuid, nobody, nobody, nobody))
   1589 		fail("failed to setresuid");
   1590 
   1591 	// This is required to open /proc/self/* files.
   1592 	// Otherwise they are owned by root and we can't open them after setuid.
   1593 	// See task_dump_owner function in kernel.
   1594 	prctl(PR_SET_DUMPABLE, 1, 0, 0, 0);
   1595 
   1596 	loop();
   1597 	doexit(1);
   1598 }
   1599 #endif
   1600 
   1601 #if SYZ_EXECUTOR || SYZ_SANDBOX_NAMESPACE
   1602 #include <linux/capability.h>
   1603 #include <sched.h>
   1604 #include <sys/mman.h>
   1605 #include <sys/mount.h>
   1606 
   1607 static int real_uid;
   1608 static int real_gid;
   1609 __attribute__((aligned(64 << 10))) static char sandbox_stack[1 << 20];
   1610 
   1611 static int namespace_sandbox_proc(void* arg)
   1612 {
   1613 	sandbox_common();
   1614 
   1615 	// /proc/self/setgroups is not present on some systems, ignore error.
   1616 	write_file("/proc/self/setgroups", "deny");
   1617 	if (!write_file("/proc/self/uid_map", "0 %d 1\n", real_uid))
   1618 		fail("write of /proc/self/uid_map failed");
   1619 	if (!write_file("/proc/self/gid_map", "0 %d 1\n", real_gid))
   1620 		fail("write of /proc/self/gid_map failed");
   1621 
   1622 	// CLONE_NEWNET must always happen before tun setup,
   1623 	// because we want the tun device in the test namespace.
   1624 	if (unshare(CLONE_NEWNET))
   1625 		fail("unshare(CLONE_NEWNET)");
   1626 #if SYZ_EXECUTOR || SYZ_TUN_ENABLE
   1627 	// We setup tun here as it needs to be in the test net namespace,
   1628 	// which in turn needs to be in the test user namespace.
   1629 	// However, IFF_NAPI_FRAGS will fail as we are not root already.
   1630 	// There does not seem to be a call sequence that would satisfy all of that.
   1631 	initialize_tun();
   1632 #endif
   1633 #if SYZ_EXECUTOR || SYZ_ENABLE_NETDEV
   1634 	initialize_netdevices();
   1635 #endif
   1636 
   1637 	if (mkdir("./syz-tmp", 0777))
   1638 		fail("mkdir(syz-tmp) failed");
   1639 	if (mount("", "./syz-tmp", "tmpfs", 0, NULL))
   1640 		fail("mount(tmpfs) failed");
   1641 	if (mkdir("./syz-tmp/newroot", 0777))
   1642 		fail("mkdir failed");
   1643 	if (mkdir("./syz-tmp/newroot/dev", 0700))
   1644 		fail("mkdir failed");
   1645 	unsigned bind_mount_flags = MS_BIND | MS_REC | MS_PRIVATE;
   1646 	if (mount("/dev", "./syz-tmp/newroot/dev", NULL, bind_mount_flags, NULL))
   1647 		fail("mount(dev) failed");
   1648 	if (mkdir("./syz-tmp/newroot/proc", 0700))
   1649 		fail("mkdir failed");
   1650 	if (mount(NULL, "./syz-tmp/newroot/proc", "proc", 0, NULL))
   1651 		fail("mount(proc) failed");
   1652 	if (mkdir("./syz-tmp/newroot/selinux", 0700))
   1653 		fail("mkdir failed");
   1654 	// selinux mount used to be at /selinux, but then moved to /sys/fs/selinux.
   1655 	const char* selinux_path = "./syz-tmp/newroot/selinux";
   1656 	if (mount("/selinux", selinux_path, NULL, bind_mount_flags, NULL)) {
   1657 		if (errno != ENOENT)
   1658 			fail("mount(/selinux) failed");
   1659 		if (mount("/sys/fs/selinux", selinux_path, NULL, bind_mount_flags, NULL) && errno != ENOENT)
   1660 			fail("mount(/sys/fs/selinux) failed");
   1661 	}
   1662 	if (mkdir("./syz-tmp/newroot/sys", 0700))
   1663 		fail("mkdir failed");
   1664 	if (mount("/sys", "./syz-tmp/newroot/sys", 0, bind_mount_flags, NULL))
   1665 		fail("mount(sysfs) failed");
   1666 #if SYZ_EXECUTOR || SYZ_ENABLE_CGROUPS
   1667 	if (mkdir("./syz-tmp/newroot/syzcgroup", 0700))
   1668 		fail("mkdir failed");
   1669 	if (mkdir("./syz-tmp/newroot/syzcgroup/unified", 0700))
   1670 		fail("mkdir failed");
   1671 	if (mkdir("./syz-tmp/newroot/syzcgroup/cpu", 0700))
   1672 		fail("mkdir failed");
   1673 	if (mkdir("./syz-tmp/newroot/syzcgroup/net", 0700))
   1674 		fail("mkdir failed");
   1675 	if (mount("/syzcgroup/unified", "./syz-tmp/newroot/syzcgroup/unified", NULL, bind_mount_flags, NULL)) {
   1676 		debug("mount(cgroup2, MS_BIND) failed: %d\n", errno);
   1677 	}
   1678 	if (mount("/syzcgroup/cpu", "./syz-tmp/newroot/syzcgroup/cpu", NULL, bind_mount_flags, NULL)) {
   1679 		debug("mount(cgroup/cpu, MS_BIND) failed: %d\n", errno);
   1680 	}
   1681 	if (mount("/syzcgroup/net", "./syz-tmp/newroot/syzcgroup/net", NULL, bind_mount_flags, NULL)) {
   1682 		debug("mount(cgroup/net, MS_BIND) failed: %d\n", errno);
   1683 	}
   1684 #endif
   1685 	if (mkdir("./syz-tmp/pivot", 0777))
   1686 		fail("mkdir failed");
   1687 	if (syscall(SYS_pivot_root, "./syz-tmp", "./syz-tmp/pivot")) {
   1688 		debug("pivot_root failed\n");
   1689 		if (chdir("./syz-tmp"))
   1690 			fail("chdir failed");
   1691 	} else {
   1692 		debug("pivot_root OK\n");
   1693 		if (chdir("/"))
   1694 			fail("chdir failed");
   1695 		if (umount2("./pivot", MNT_DETACH))
   1696 			fail("umount failed");
   1697 	}
   1698 	if (chroot("./newroot"))
   1699 		fail("chroot failed");
   1700 	if (chdir("/"))
   1701 		fail("chdir failed");
   1702 
   1703 	// Drop CAP_SYS_PTRACE so that test processes can't attach to parent processes.
   1704 	// Previously it lead to hangs because the loop process stopped due to SIGSTOP.
   1705 	// Note that a process can always ptrace its direct children, which is enough
   1706 	// for testing purposes.
   1707 	struct __user_cap_header_struct cap_hdr = {};
   1708 	struct __user_cap_data_struct cap_data[2] = {};
   1709 	cap_hdr.version = _LINUX_CAPABILITY_VERSION_3;
   1710 	cap_hdr.pid = getpid();
   1711 	if (syscall(SYS_capget, &cap_hdr, &cap_data))
   1712 		fail("capget failed");
   1713 	cap_data[0].effective &= ~(1 << CAP_SYS_PTRACE);
   1714 	cap_data[0].permitted &= ~(1 << CAP_SYS_PTRACE);
   1715 	cap_data[0].inheritable &= ~(1 << CAP_SYS_PTRACE);
   1716 	if (syscall(SYS_capset, &cap_hdr, &cap_data))
   1717 		fail("capset failed");
   1718 
   1719 	loop();
   1720 	doexit(1);
   1721 }
   1722 
   1723 static int do_sandbox_namespace(void)
   1724 {
   1725 	int pid;
   1726 
   1727 	setup_common();
   1728 	real_uid = getuid();
   1729 	real_gid = getgid();
   1730 	mprotect(sandbox_stack, 4096, PROT_NONE); // to catch stack underflows
   1731 	pid = clone(namespace_sandbox_proc, &sandbox_stack[sizeof(sandbox_stack) - 64],
   1732 		    CLONE_NEWUSER | CLONE_NEWPID, 0);
   1733 	return wait_for_loop(pid);
   1734 }
   1735 #endif
   1736 
   1737 #if SYZ_EXECUTOR || SYZ_REPEAT && SYZ_USE_TMP_DIR
   1738 #include <dirent.h>
   1739 #include <errno.h>
   1740 #include <string.h>
   1741 #include <sys/ioctl.h>
   1742 #include <sys/mount.h>
   1743 
   1744 #define FS_IOC_SETFLAGS _IOW('f', 2, long)
   1745 
   1746 // One does not simply remove a directory.
   1747 // There can be mounts, so we need to try to umount.
   1748 // Moreover, a mount can be mounted several times, so we need to try to umount in a loop.
   1749 // Moreover, after umount a dir can become non-empty again, so we need another loop.
   1750 // Moreover, a mount can be re-mounted as read-only and then we will fail to make a dir empty.
   1751 static void remove_dir(const char* dir)
   1752 {
   1753 	DIR* dp;
   1754 	struct dirent* ep;
   1755 	int iter = 0;
   1756 retry:
   1757 	while (umount2(dir, MNT_DETACH) == 0) {
   1758 		debug("umount(%s)\n", dir);
   1759 	}
   1760 	dp = opendir(dir);
   1761 	if (dp == NULL) {
   1762 		if (errno == EMFILE) {
   1763 			// This happens when the test process casts prlimit(NOFILE) on us.
   1764 			// Ideally we somehow prevent test processes from messing with parent processes.
   1765 			// But full sandboxing is expensive, so let's ignore this error for now.
   1766 			exitf("opendir(%s) failed due to NOFILE, exiting", dir);
   1767 		}
   1768 		exitf("opendir(%s) failed", dir);
   1769 	}
   1770 	while ((ep = readdir(dp))) {
   1771 		if (strcmp(ep->d_name, ".") == 0 || strcmp(ep->d_name, "..") == 0)
   1772 			continue;
   1773 		char filename[FILENAME_MAX];
   1774 		snprintf(filename, sizeof(filename), "%s/%s", dir, ep->d_name);
   1775 		// If it's 9p mount with broken transport, lstat will fail.
   1776 		// So try to umount first.
   1777 		while (umount2(filename, MNT_DETACH) == 0) {
   1778 			debug("umount(%s)\n", filename);
   1779 		}
   1780 		struct stat st;
   1781 		if (lstat(filename, &st))
   1782 			exitf("lstat(%s) failed", filename);
   1783 		if (S_ISDIR(st.st_mode)) {
   1784 			remove_dir(filename);
   1785 			continue;
   1786 		}
   1787 		int i;
   1788 		for (i = 0;; i++) {
   1789 			debug("unlink(%s)\n", filename);
   1790 			if (unlink(filename) == 0)
   1791 				break;
   1792 			if (errno == EPERM) {
   1793 				// Try to reset FS_XFLAG_IMMUTABLE.
   1794 				int fd = open(filename, O_RDONLY);
   1795 				if (fd != -1) {
   1796 					long flags = 0;
   1797 					if (ioctl(fd, FS_IOC_SETFLAGS, &flags) == 0)
   1798 						debug("reset FS_XFLAG_IMMUTABLE\n");
   1799 					close(fd);
   1800 					continue;
   1801 				}
   1802 			}
   1803 			if (errno == EROFS) {
   1804 				debug("ignoring EROFS\n");
   1805 				break;
   1806 			}
   1807 			if (errno != EBUSY || i > 100)
   1808 				exitf("unlink(%s) failed", filename);
   1809 			debug("umount(%s)\n", filename);
   1810 			if (umount2(filename, MNT_DETACH))
   1811 				exitf("umount(%s) failed", filename);
   1812 		}
   1813 	}
   1814 	closedir(dp);
   1815 	int i;
   1816 	for (i = 0;; i++) {
   1817 		debug("rmdir(%s)\n", dir);
   1818 		if (rmdir(dir) == 0)
   1819 			break;
   1820 		if (i < 100) {
   1821 			if (errno == EPERM) {
   1822 				// Try to reset FS_XFLAG_IMMUTABLE.
   1823 				int fd = open(dir, O_RDONLY);
   1824 				if (fd != -1) {
   1825 					long flags = 0;
   1826 					if (ioctl(fd, FS_IOC_SETFLAGS, &flags) == 0)
   1827 						debug("reset FS_XFLAG_IMMUTABLE\n");
   1828 					close(fd);
   1829 					continue;
   1830 				}
   1831 			}
   1832 			if (errno == EROFS) {
   1833 				debug("ignoring EROFS\n");
   1834 				break;
   1835 			}
   1836 			if (errno == EBUSY) {
   1837 				debug("umount(%s)\n", dir);
   1838 				if (umount2(dir, MNT_DETACH))
   1839 					exitf("umount(%s) failed", dir);
   1840 				continue;
   1841 			}
   1842 			if (errno == ENOTEMPTY) {
   1843 				if (iter < 100) {
   1844 					iter++;
   1845 					goto retry;
   1846 				}
   1847 			}
   1848 		}
   1849 		exitf("rmdir(%s) failed", dir);
   1850 	}
   1851 }
   1852 #endif
   1853 
   1854 #if SYZ_EXECUTOR || SYZ_FAULT_INJECTION
   1855 #include <fcntl.h>
   1856 #include <string.h>
   1857 #include <sys/stat.h>
   1858 #include <sys/types.h>
   1859 
   1860 static int inject_fault(int nth)
   1861 {
   1862 	int fd;
   1863 	char buf[16];
   1864 
   1865 	fd = open("/proc/thread-self/fail-nth", O_RDWR);
   1866 	// We treat errors here as temporal/non-critical because we see
   1867 	// occasional ENOENT/EACCES errors returned. It seems that fuzzer
   1868 	// somehow gets its hands to it.
   1869 	if (fd == -1)
   1870 		exitf("failed to open /proc/thread-self/fail-nth");
   1871 	sprintf(buf, "%d", nth + 1);
   1872 	if (write(fd, buf, strlen(buf)) != (ssize_t)strlen(buf))
   1873 		exitf("failed to write /proc/thread-self/fail-nth");
   1874 	return fd;
   1875 }
   1876 #endif
   1877 
   1878 #if SYZ_EXECUTOR
   1879 static int fault_injected(int fail_fd)
   1880 {
   1881 	char buf[16];
   1882 	int n = read(fail_fd, buf, sizeof(buf) - 1);
   1883 	if (n <= 0)
   1884 		exitf("failed to read /proc/thread-self/fail-nth");
   1885 	int res = n == 2 && buf[0] == '0' && buf[1] == '\n';
   1886 	buf[0] = '0';
   1887 	if (write(fail_fd, buf, 1) != 1)
   1888 		exitf("failed to write /proc/thread-self/fail-nth");
   1889 	close(fail_fd);
   1890 	return res;
   1891 }
   1892 #endif
   1893 
   1894 #if SYZ_EXECUTOR || SYZ_REPEAT
   1895 #include <dirent.h>
   1896 #include <errno.h>
   1897 #include <fcntl.h>
   1898 #include <signal.h>
   1899 #include <string.h>
   1900 #include <sys/stat.h>
   1901 #include <sys/types.h>
   1902 #include <sys/wait.h>
   1903 
   1904 static void kill_and_wait(int pid, int* status)
   1905 {
   1906 	kill(-pid, SIGKILL);
   1907 	kill(pid, SIGKILL);
   1908 	int i;
   1909 	// First, give it up to 100 ms to surrender.
   1910 	for (i = 0; i < 100; i++) {
   1911 		if (waitpid(-1, status, WNOHANG | __WALL) == pid)
   1912 			return;
   1913 		usleep(1000);
   1914 	}
   1915 	// Now, try to abort fuse connections as they cause deadlocks,
   1916 	// see Documentation/filesystems/fuse.txt for details.
   1917 	// There is no good way to figure out the right connections
   1918 	// provided that the process could use unshare(CLONE_NEWNS),
   1919 	// so we abort all.
   1920 	debug("kill is not working\n");
   1921 	DIR* dir = opendir("/sys/fs/fuse/connections");
   1922 	if (dir) {
   1923 		for (;;) {
   1924 			struct dirent* ent = readdir(dir);
   1925 			if (!ent)
   1926 				break;
   1927 			if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
   1928 				continue;
   1929 			char abort[300];
   1930 			snprintf(abort, sizeof(abort), "/sys/fs/fuse/connections/%s/abort", ent->d_name);
   1931 			int fd = open(abort, O_WRONLY);
   1932 			if (fd == -1) {
   1933 				debug("failed to open %s: %d\n", abort, errno);
   1934 				continue;
   1935 			}
   1936 			debug("aborting fuse conn %s\n", ent->d_name);
   1937 			if (write(fd, abort, 1) < 0) {
   1938 				debug("failed to abort: %d\n", errno);
   1939 			}
   1940 			close(fd);
   1941 		}
   1942 		closedir(dir);
   1943 	} else {
   1944 		debug("failed to open /sys/fs/fuse/connections: %d\n", errno);
   1945 	}
   1946 	// Now, just wait, no other options.
   1947 	while (waitpid(-1, status, __WALL) != pid) {
   1948 	}
   1949 }
   1950 #endif
   1951 
   1952 #if SYZ_EXECUTOR || SYZ_REPEAT && (SYZ_ENABLE_CGROUPS || SYZ_RESET_NET_NAMESPACE)
   1953 #include <fcntl.h>
   1954 #include <sys/ioctl.h>
   1955 #include <sys/stat.h>
   1956 #include <sys/types.h>
   1957 #include <unistd.h>
   1958 
   1959 #define SYZ_HAVE_SETUP_LOOP 1
   1960 static void setup_loop()
   1961 {
   1962 #if SYZ_EXECUTOR || SYZ_ENABLE_CGROUPS
   1963 	int pid = getpid();
   1964 	char cgroupdir[64];
   1965 	char procs_file[128];
   1966 	snprintf(cgroupdir, sizeof(cgroupdir), "/syzcgroup/unified/syz%llu", procid);
   1967 	if (mkdir(cgroupdir, 0777)) {
   1968 		debug("mkdir(%s) failed: %d\n", cgroupdir, errno);
   1969 	}
   1970 	snprintf(procs_file, sizeof(procs_file), "%s/cgroup.procs", cgroupdir);
   1971 	if (!write_file(procs_file, "%d", pid)) {
   1972 		debug("write(%s) failed: %d\n", procs_file, errno);
   1973 	}
   1974 	snprintf(cgroupdir, sizeof(cgroupdir), "/syzcgroup/cpu/syz%llu", procid);
   1975 	if (mkdir(cgroupdir, 0777)) {
   1976 		debug("mkdir(%s) failed: %d\n", cgroupdir, errno);
   1977 	}
   1978 	snprintf(procs_file, sizeof(procs_file), "%s/cgroup.procs", cgroupdir);
   1979 	if (!write_file(procs_file, "%d", pid)) {
   1980 		debug("write(%s) failed: %d\n", procs_file, errno);
   1981 	}
   1982 	snprintf(cgroupdir, sizeof(cgroupdir), "/syzcgroup/net/syz%llu", procid);
   1983 	if (mkdir(cgroupdir, 0777)) {
   1984 		debug("mkdir(%s) failed: %d\n", cgroupdir, errno);
   1985 	}
   1986 	snprintf(procs_file, sizeof(procs_file), "%s/cgroup.procs", cgroupdir);
   1987 	if (!write_file(procs_file, "%d", pid)) {
   1988 		debug("write(%s) failed: %d\n", procs_file, errno);
   1989 	}
   1990 #endif
   1991 #if SYZ_EXECUTOR || SYZ_RESET_NET_NAMESPACE
   1992 	checkpoint_net_namespace();
   1993 #endif
   1994 }
   1995 #endif
   1996 
   1997 #if SYZ_EXECUTOR || SYZ_REPEAT && (SYZ_RESET_NET_NAMESPACE || __NR_syz_mount_image || __NR_syz_read_part_table)
   1998 #define SYZ_HAVE_RESET_LOOP 1
   1999 static void reset_loop()
   2000 {
   2001 #if SYZ_EXECUTOR || __NR_syz_mount_image || __NR_syz_read_part_table
   2002 	char buf[64];
   2003 	snprintf(buf, sizeof(buf), "/dev/loop%llu", procid);
   2004 	int loopfd = open(buf, O_RDWR);
   2005 	if (loopfd != -1) {
   2006 		ioctl(loopfd, LOOP_CLR_FD, 0);
   2007 		close(loopfd);
   2008 	}
   2009 #endif
   2010 #if SYZ_EXECUTOR || SYZ_RESET_NET_NAMESPACE
   2011 	reset_net_namespace();
   2012 #endif
   2013 }
   2014 #endif
   2015 
   2016 #if SYZ_EXECUTOR || SYZ_REPEAT
   2017 #include <sys/prctl.h>
   2018 
   2019 #define SYZ_HAVE_SETUP_TEST 1
   2020 static void setup_test()
   2021 {
   2022 	prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
   2023 	setpgrp();
   2024 #if SYZ_EXECUTOR || SYZ_ENABLE_CGROUPS
   2025 	char cgroupdir[64];
   2026 	snprintf(cgroupdir, sizeof(cgroupdir), "/syzcgroup/unified/syz%llu", procid);
   2027 	if (symlink(cgroupdir, "./cgroup")) {
   2028 		debug("symlink(%s, ./cgroup) failed: %d\n", cgroupdir, errno);
   2029 	}
   2030 	snprintf(cgroupdir, sizeof(cgroupdir), "/syzcgroup/cpu/syz%llu", procid);
   2031 	if (symlink(cgroupdir, "./cgroup.cpu")) {
   2032 		debug("symlink(%s, ./cgroup.cpu) failed: %d\n", cgroupdir, errno);
   2033 	}
   2034 	snprintf(cgroupdir, sizeof(cgroupdir), "/syzcgroup/net/syz%llu", procid);
   2035 	if (symlink(cgroupdir, "./cgroup.net")) {
   2036 		debug("symlink(%s, ./cgroup.net) failed: %d\n", cgroupdir, errno);
   2037 	}
   2038 #endif
   2039 #if SYZ_EXECUTOR || SYZ_TUN_ENABLE
   2040 	// Read all remaining packets from tun to better
   2041 	// isolate consequently executing programs.
   2042 	flush_tun();
   2043 #endif
   2044 }
   2045 
   2046 #define SYZ_HAVE_RESET_TEST 1
   2047 static void reset_test()
   2048 {
   2049 	// Keeping a 9p transport pipe open will hang the proccess dead,
   2050 	// so close all opened file descriptors.
   2051 	int fd;
   2052 	for (fd = 3; fd < 30; fd++)
   2053 		close(fd);
   2054 }
   2055 #endif
   2056