Home | History | Annotate | Download | only in x86
      1 // SPDX-License-Identifier: GPL-2.0
      2 /*
      3  * Tests x86 Memory Protection Keys (see Documentation/x86/protection-keys.txt)
      4  *
      5  * There are examples in here of:
      6  *  * how to set protection keys on memory
      7  *  * how to set/clear bits in PKRU (the rights register)
      8  *  * how to handle SEGV_PKRU signals and extract pkey-relevant
      9  *    information from the siginfo
     10  *
     11  * Things to add:
     12  *	make sure KSM and KSM COW breaking works
     13  *	prefault pages in at malloc, or not
     14  *	protect MPX bounds tables with protection keys?
     15  *	make sure VMA splitting/merging is working correctly
     16  *	OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
     17  *	look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
     18  *	do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
     19  *
     20  * Compile like this:
     21  *	gcc      -o protection_keys    -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
     22  *	gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
     23  */
     24 #define _GNU_SOURCE
     25 #include <errno.h>
     26 #include <linux/futex.h>
     27 #include <sys/time.h>
     28 #include <sys/syscall.h>
     29 #include <string.h>
     30 #include <stdio.h>
     31 #include <stdint.h>
     32 #include <stdbool.h>
     33 #include <signal.h>
     34 #include <assert.h>
     35 #include <stdlib.h>
     36 #include <ucontext.h>
     37 #include <sys/mman.h>
     38 #include <sys/types.h>
     39 #include <sys/wait.h>
     40 #include <sys/stat.h>
     41 #include <fcntl.h>
     42 #include <unistd.h>
     43 #include <sys/ptrace.h>
     44 #include <setjmp.h>
     45 
     46 #include "pkey-helpers.h"
     47 
/* Iteration counter (starts at 1); printed by pkey_assert() on failure. */
int iteration_nr = 1;
/* Test number; printed by pkey_assert() on failure. */
int test_nr;

/*
 * Software copy of the PKRU value the tests expect.  It is updated next to
 * every PKRU change made in this file and written back into the register
 * by expected_pk_fault().
 */
unsigned int shadow_pkru;
     52 
     53 #define HPAGE_SIZE	(1UL<<21)
     54 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
     55 #define ALIGN_UP(x, align_to)	(((x) + ((align_to)-1)) & ~((align_to)-1))
     56 #define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
     57 #define ALIGN_PTR_UP(p, ptr_align_to)	((typeof(p))ALIGN_UP((unsigned long)(p),	ptr_align_to))
     58 #define ALIGN_PTR_DOWN(p, ptr_align_to)	((typeof(p))ALIGN_DOWN((unsigned long)(p),	ptr_align_to))
     59 #define __stringify_1(x...)     #x
     60 #define __stringify(x...)       __stringify_1(x)
     61 
     62 #define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
     63 
/*
 * Flag raised by the signal handlers in this file while they run, plus a
 * scratch buffer sized by DPRINT_IN_SIGNAL_BUF_SIZE.
 * NOTE(review): presumably both are consumed by the dprintf*() helpers in
 * pkey-helpers.h to pick a signal-safe output path -- confirm there.
 */
int dprint_in_signal;
char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];

extern void abort_hooks(void);
/*
 * Test assertion: on failure, log where we were (file, line, test number,
 * iteration) and errno, run abort_hooks(), and only then trip the libc
 * assert().  'condition' is evaluated a second time by that final assert().
 */
#define pkey_assert(condition) do {		\
	if (!(condition)) {			\
		dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
				__FILE__, __LINE__,	\
				test_nr, iteration_nr);	\
		dprintf0("errno at assert: %d", errno);	\
		abort_hooks();			\
		assert(condition);		\
	}					\
} while (0)
/* Plain assert() for code that runs underneath pkey_assert()/abort_hooks(). */
#define raw_assert(cond) assert(cond)
     79 
     80 void cat_into_file(char *str, char *file)
     81 {
     82 	int fd = open(file, O_RDWR);
     83 	int ret;
     84 
     85 	dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
     86 	/*
     87 	 * these need to be raw because they are called under
     88 	 * pkey_assert()
     89 	 */
     90 	raw_assert(fd >= 0);
     91 	ret = write(fd, str, strlen(str));
     92 	if (ret != strlen(str)) {
     93 		perror("write to file failed");
     94 		fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
     95 		raw_assert(0);
     96 	}
     97 	close(fd);
     98 }
     99 
    100 #if CONTROL_TRACING > 0
    101 static int warned_tracing;
    102 int tracing_root_ok(void)
    103 {
    104 	if (geteuid() != 0) {
    105 		if (!warned_tracing)
    106 			fprintf(stderr, "WARNING: not run as root, "
    107 					"can not do tracing control\n");
    108 		warned_tracing = 1;
    109 		return 0;
    110 	}
    111 	return 1;
    112 }
    113 #endif
    114 
    115 void tracing_on(void)
    116 {
    117 #if CONTROL_TRACING > 0
    118 #define TRACEDIR "/sys/kernel/debug/tracing"
    119 	char pidstr[32];
    120 
    121 	if (!tracing_root_ok())
    122 		return;
    123 
    124 	sprintf(pidstr, "%d", getpid());
    125 	cat_into_file("0", TRACEDIR "/tracing_on");
    126 	cat_into_file("\n", TRACEDIR "/trace");
    127 	if (1) {
    128 		cat_into_file("function_graph", TRACEDIR "/current_tracer");
    129 		cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
    130 	} else {
    131 		cat_into_file("nop", TRACEDIR "/current_tracer");
    132 	}
    133 	cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
    134 	cat_into_file("1", TRACEDIR "/tracing_on");
    135 	dprintf1("enabled tracing\n");
    136 #endif
    137 }
    138 
    139 void tracing_off(void)
    140 {
    141 #if CONTROL_TRACING > 0
    142 	if (!tracing_root_ok())
    143 		return;
    144 	cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
    145 #endif
    146 }
    147 
    148 void abort_hooks(void)
    149 {
    150 	fprintf(stderr, "running %s()...\n", __func__);
    151 	tracing_off();
    152 #ifdef SLEEP_ON_ABORT
    153 	sleep(SLEEP_ON_ABORT);
    154 #endif
    155 }
    156 
/*
 * Emit 512 copies of a long-form nopl instruction at the call site, used
 * as padding around the store in lots_o_noops_around_write().
 */
static inline void __page_o_noops(void)
{
	/*
	 * 8 bytes per instruction * 512 instructions = 1 page (4096 bytes).
	 * NOTE(review): assumes this nopl form encodes to 8 bytes on every
	 * build (64-bit and -m32) -- confirm.
	 */
	asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
}
    162 
/*
 * This attempts to have roughly a page of instructions followed by a few
 * instructions that do a write, and another page of instructions.  That
 * way, we are pretty sure that the write is in the second page of
 * instructions and has at least a page of padding behind it.
 *
 * *That* lets us be sure to madvise() away the write instruction, which
 * will then fault, which makes sure that the fault code handles
 * execute-only memory properly.
 *
 * @write_to_me: location that receives a single int store.  The value
 *               stored is this file's line number of the store statement,
 *               so it changes whenever lines are added above it.
 */
__attribute__((__aligned__(PAGE_SIZE)))
void lots_o_noops_around_write(int *write_to_me)
{
	dprintf3("running %s()\n", __func__);
	/* first page of padding, in front of the store */
	__page_o_noops();
	/* Assume this happens in the second page of instructions: */
	*write_to_me = __LINE__;
	/* pad out by another page: */
	__page_o_noops();
	dprintf3("%s() done\n", __func__);
}
    184 
/* Define some kernel-like types */
#define  u8 uint8_t
#define u16 uint16_t
#define u32 uint32_t
#define u64 uint64_t

/*
 * Per-ABI constants.  The syscall numbers are fallbacks, used only when
 * the system headers do not already define them.  REG_IP_IDX is the
 * gregs[] index of the instruction pointer and si_pkey_offset is the byte
 * offset inside siginfo_t at which signal_handler() reads the pkey.
 */
#ifdef __i386__

#ifndef SYS_mprotect_key
# define SYS_mprotect_key 380
#endif
/* pkey_free is only given a fallback together with pkey_alloc */
#ifndef SYS_pkey_alloc
# define SYS_pkey_alloc	 381
# define SYS_pkey_free	 382
#endif
#define REG_IP_IDX REG_EIP
#define si_pkey_offset 0x14

#else

#ifndef SYS_mprotect_key
# define SYS_mprotect_key 329
#endif
/* pkey_free is only given a fallback together with pkey_alloc */
#ifndef SYS_pkey_alloc
# define SYS_pkey_alloc	 330
# define SYS_pkey_free	 331
#endif
#define REG_IP_IDX REG_RIP
#define si_pkey_offset 0x20

#endif
    216 
    217 void dump_mem(void *dumpme, int len_bytes)
    218 {
    219 	char *c = (void *)dumpme;
    220 	int i;
    221 
    222 	for (i = 0; i < len_bytes; i += sizeof(u64)) {
    223 		u64 *ptr = (u64 *)(c + i);
    224 		dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr);
    225 	}
    226 }
    227 
    228 #define SEGV_BNDERR     3  /* failed address bound checks */
    229 #define SEGV_PKUERR     4
    230 
    231 static char *si_code_str(int si_code)
    232 {
    233 	if (si_code == SEGV_MAPERR)
    234 		return "SEGV_MAPERR";
    235 	if (si_code == SEGV_ACCERR)
    236 		return "SEGV_ACCERR";
    237 	if (si_code == SEGV_BNDERR)
    238 		return "SEGV_BNDERR";
    239 	if (si_code == SEGV_PKUERR)
    240 		return "SEGV_PKUERR";
    241 	return "UNKNOWN";
    242 }
    243 
    244 int pkru_faults;
    245 int last_si_pkey = -1;
    246 void signal_handler(int signum, siginfo_t *si, void *vucontext)
    247 {
    248 	ucontext_t *uctxt = vucontext;
    249 	int trapno;
    250 	unsigned long ip;
    251 	char *fpregs;
    252 	u32 *pkru_ptr;
    253 	u64 siginfo_pkey;
    254 	u32 *si_pkey_ptr;
    255 	int pkru_offset;
    256 	fpregset_t fpregset;
    257 
    258 	dprint_in_signal = 1;
    259 	dprintf1(">>>>===============SIGSEGV============================\n");
    260 	dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__,
    261 			__rdpkru(), shadow_pkru);
    262 
    263 	trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
    264 	ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
    265 	fpregset = uctxt->uc_mcontext.fpregs;
    266 	fpregs = (void *)fpregset;
    267 
    268 	dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__,
    269 			trapno, ip, si_code_str(si->si_code), si->si_code);
    270 #ifdef __i386__
    271 	/*
    272 	 * 32-bit has some extra padding so that userspace can tell whether
    273 	 * the XSTATE header is present in addition to the "legacy" FPU
    274 	 * state.  We just assume that it is here.
    275 	 */
    276 	fpregs += 0x70;
    277 #endif
    278 	pkru_offset = pkru_xstate_offset();
    279 	pkru_ptr = (void *)(&fpregs[pkru_offset]);
    280 
    281 	dprintf1("siginfo: %p\n", si);
    282 	dprintf1(" fpregs: %p\n", fpregs);
    283 	/*
    284 	 * If we got a PKRU fault, we *HAVE* to have at least one bit set in
    285 	 * here.
    286 	 */
    287 	dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset());
    288 	if (DEBUG_LEVEL > 4)
    289 		dump_mem(pkru_ptr - 128, 256);
    290 	pkey_assert(*pkru_ptr);
    291 
    292 	si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
    293 	dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
    294 	dump_mem(si_pkey_ptr - 8, 24);
    295 	siginfo_pkey = *si_pkey_ptr;
    296 	pkey_assert(siginfo_pkey < NR_PKEYS);
    297 	last_si_pkey = siginfo_pkey;
    298 
    299 	if ((si->si_code == SEGV_MAPERR) ||
    300 	    (si->si_code == SEGV_ACCERR) ||
    301 	    (si->si_code == SEGV_BNDERR)) {
    302 		printf("non-PK si_code, exiting...\n");
    303 		exit(4);
    304 	}
    305 
    306 	dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr);
    307 	/* need __rdpkru() version so we do not do shadow_pkru checking */
    308 	dprintf1("signal pkru from  pkru: %08x\n", __rdpkru());
    309 	dprintf1("pkey from siginfo: %jx\n", siginfo_pkey);
    310 	*(u64 *)pkru_ptr = 0x00000000;
    311 	dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n");
    312 	pkru_faults++;
    313 	dprintf1("<<<<==================================================\n");
    314 	return;
    315 	if (trapno == 14) {
    316 		fprintf(stderr,
    317 			"ERROR: In signal handler, page fault, trapno = %d, ip = %016lx\n",
    318 			trapno, ip);
    319 		fprintf(stderr, "si_addr %p\n", si->si_addr);
    320 		fprintf(stderr, "REG_ERR: %lx\n",
    321 				(unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
    322 		exit(1);
    323 	} else {
    324 		fprintf(stderr, "unexpected trap %d! at 0x%lx\n", trapno, ip);
    325 		fprintf(stderr, "si_addr %p\n", si->si_addr);
    326 		fprintf(stderr, "REG_ERR: %lx\n",
    327 				(unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
    328 		exit(2);
    329 	}
    330 	dprint_in_signal = 0;
    331 }
    332 
    333 int wait_all_children(void)
    334 {
    335 	int status;
    336 	return waitpid(-1, &status, 0);
    337 }
    338 
    339 void sig_chld(int x)
    340 {
    341 	dprint_in_signal = 1;
    342 	dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
    343 	dprint_in_signal = 0;
    344 }
    345 
    346 void setup_sigsegv_handler(void)
    347 {
    348 	int r, rs;
    349 	struct sigaction newact;
    350 	struct sigaction oldact;
    351 
    352 	/* #PF is mapped to sigsegv */
    353 	int signum  = SIGSEGV;
    354 
    355 	newact.sa_handler = 0;
    356 	newact.sa_sigaction = signal_handler;
    357 
    358 	/*sigset_t - signals to block while in the handler */
    359 	/* get the old signal mask. */
    360 	rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
    361 	pkey_assert(rs == 0);
    362 
    363 	/* call sa_sigaction, not sa_handler*/
    364 	newact.sa_flags = SA_SIGINFO;
    365 
    366 	newact.sa_restorer = 0;  /* void(*)(), obsolete */
    367 	r = sigaction(signum, &newact, &oldact);
    368 	r = sigaction(SIGALRM, &newact, &oldact);
    369 	pkey_assert(r == 0);
    370 }
    371 
    372 void setup_handlers(void)
    373 {
    374 	signal(SIGCHLD, &sig_chld);
    375 	setup_sigsegv_handler();
    376 }
    377 
    378 pid_t fork_lazy_child(void)
    379 {
    380 	pid_t forkret;
    381 
    382 	forkret = fork();
    383 	pkey_assert(forkret >= 0);
    384 	dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
    385 
    386 	if (!forkret) {
    387 		/* in the child */
    388 		while (1) {
    389 			dprintf1("child sleeping...\n");
    390 			sleep(30);
    391 		}
    392 	}
    393 	return forkret;
    394 }
    395 
    396 void davecmp(void *_a, void *_b, int len)
    397 {
    398 	int i;
    399 	unsigned long *a = _a;
    400 	unsigned long *b = _b;
    401 
    402 	for (i = 0; i < len / sizeof(*a); i++) {
    403 		if (a[i] == b[i])
    404 			continue;
    405 
    406 		dprintf3("[%3d]: a: %016lx b: %016lx\n", i, a[i], b[i]);
    407 	}
    408 }
    409 
    410 void dumpit(char *f)
    411 {
    412 	int fd = open(f, O_RDONLY);
    413 	char buf[100];
    414 	int nr_read;
    415 
    416 	dprintf2("maps fd: %d\n", fd);
    417 	do {
    418 		nr_read = read(fd, &buf[0], sizeof(buf));
    419 		write(1, buf, nr_read);
    420 	} while (nr_read > 0);
    421 	close(fd);
    422 }
    423 
    424 #define PKEY_DISABLE_ACCESS    0x1
    425 #define PKEY_DISABLE_WRITE     0x2
    426 
    427 u32 pkey_get(int pkey, unsigned long flags)
    428 {
    429 	u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
    430 	u32 pkru = __rdpkru();
    431 	u32 shifted_pkru;
    432 	u32 masked_pkru;
    433 
    434 	dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
    435 			__func__, pkey, flags, 0, 0);
    436 	dprintf2("%s() raw pkru: %x\n", __func__, pkru);
    437 
    438 	shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY));
    439 	dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru);
    440 	masked_pkru = shifted_pkru & mask;
    441 	dprintf2("%s() masked  pkru: %x\n", __func__, masked_pkru);
    442 	/*
    443 	 * shift down the relevant bits to the lowest two, then
    444 	 * mask off all the other high bits.
    445 	 */
    446 	return masked_pkru;
    447 }
    448 
    449 int pkey_set(int pkey, unsigned long rights, unsigned long flags)
    450 {
    451 	u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
    452 	u32 old_pkru = __rdpkru();
    453 	u32 new_pkru;
    454 
    455 	/* make sure that 'rights' only contains the bits we expect: */
    456 	assert(!(rights & ~mask));
    457 
    458 	/* copy old pkru */
    459 	new_pkru = old_pkru;
    460 	/* mask out bits from pkey in old value: */
    461 	new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY));
    462 	/* OR in new bits for pkey: */
    463 	new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY));
    464 
    465 	__wrpkru(new_pkru);
    466 
    467 	dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n",
    468 			__func__, pkey, rights, flags, 0, __rdpkru(), old_pkru);
    469 	return 0;
    470 }
    471 
    472 void pkey_disable_set(int pkey, int flags)
    473 {
    474 	unsigned long syscall_flags = 0;
    475 	int ret;
    476 	int pkey_rights;
    477 	u32 orig_pkru = rdpkru();
    478 
    479 	dprintf1("START->%s(%d, 0x%x)\n", __func__,
    480 		pkey, flags);
    481 	pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
    482 
    483 	pkey_rights = pkey_get(pkey, syscall_flags);
    484 
    485 	dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
    486 			pkey, pkey, pkey_rights);
    487 	pkey_assert(pkey_rights >= 0);
    488 
    489 	pkey_rights |= flags;
    490 
    491 	ret = pkey_set(pkey, pkey_rights, syscall_flags);
    492 	assert(!ret);
    493 	/*pkru and flags have the same format */
    494 	shadow_pkru |= flags << (pkey * 2);
    495 	dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru);
    496 
    497 	pkey_assert(ret >= 0);
    498 
    499 	pkey_rights = pkey_get(pkey, syscall_flags);
    500 	dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
    501 			pkey, pkey, pkey_rights);
    502 
    503 	dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
    504 	if (flags)
    505 		pkey_assert(rdpkru() > orig_pkru);
    506 	dprintf1("END<---%s(%d, 0x%x)\n", __func__,
    507 		pkey, flags);
    508 }
    509 
    510 void pkey_disable_clear(int pkey, int flags)
    511 {
    512 	unsigned long syscall_flags = 0;
    513 	int ret;
    514 	int pkey_rights = pkey_get(pkey, syscall_flags);
    515 	u32 orig_pkru = rdpkru();
    516 
    517 	pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
    518 
    519 	dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
    520 			pkey, pkey, pkey_rights);
    521 	pkey_assert(pkey_rights >= 0);
    522 
    523 	pkey_rights |= flags;
    524 
    525 	ret = pkey_set(pkey, pkey_rights, 0);
    526 	/* pkru and flags have the same format */
    527 	shadow_pkru &= ~(flags << (pkey * 2));
    528 	pkey_assert(ret >= 0);
    529 
    530 	pkey_rights = pkey_get(pkey, syscall_flags);
    531 	dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
    532 			pkey, pkey, pkey_rights);
    533 
    534 	dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
    535 	if (flags)
    536 		assert(rdpkru() > orig_pkru);
    537 }
    538 
    539 void pkey_write_allow(int pkey)
    540 {
    541 	pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
    542 }
    543 void pkey_write_deny(int pkey)
    544 {
    545 	pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
    546 }
    547 void pkey_access_allow(int pkey)
    548 {
    549 	pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
    550 }
    551 void pkey_access_deny(int pkey)
    552 {
    553 	pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
    554 }
    555 
    556 int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
    557 		unsigned long pkey)
    558 {
    559 	int sret;
    560 
    561 	dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
    562 			ptr, size, orig_prot, pkey);
    563 
    564 	errno = 0;
    565 	sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
    566 	if (errno) {
    567 		dprintf2("SYS_mprotect_key sret: %d\n", sret);
    568 		dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
    569 		dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
    570 		if (DEBUG_LEVEL >= 2)
    571 			perror("SYS_mprotect_pkey");
    572 	}
    573 	return sret;
    574 }
    575 
    576 int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
    577 {
    578 	int ret = syscall(SYS_pkey_alloc, flags, init_val);
    579 	dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
    580 			__func__, flags, init_val, ret, errno);
    581 	return ret;
    582 }
    583 
    584 int alloc_pkey(void)
    585 {
    586 	int ret;
    587 	unsigned long init_val = 0x0;
    588 
    589 	dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n",
    590 			__LINE__, __rdpkru(), shadow_pkru);
    591 	ret = sys_pkey_alloc(0, init_val);
    592 	/*
    593 	 * pkey_alloc() sets PKRU, so we need to reflect it in
    594 	 * shadow_pkru:
    595 	 */
    596 	dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
    597 			__LINE__, ret, __rdpkru(), shadow_pkru);
    598 	if (ret) {
    599 		/* clear both the bits: */
    600 		shadow_pkru &= ~(0x3      << (ret * 2));
    601 		dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
    602 				__LINE__, ret, __rdpkru(), shadow_pkru);
    603 		/*
    604 		 * move the new state in from init_val
    605 		 * (remember, we cheated and init_val == pkru format)
    606 		 */
    607 		shadow_pkru |=  (init_val << (ret * 2));
    608 	}
    609 	dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
    610 			__LINE__, ret, __rdpkru(), shadow_pkru);
    611 	dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno);
    612 	/* for shadow checking: */
    613 	rdpkru();
    614 	dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
    615 			__LINE__, ret, __rdpkru(), shadow_pkru);
    616 	return ret;
    617 }
    618 
    619 int sys_pkey_free(unsigned long pkey)
    620 {
    621 	int ret = syscall(SYS_pkey_free, pkey);
    622 	dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
    623 	return ret;
    624 }
    625 
    626 /*
    627  * I had a bug where pkey bits could be set by mprotect() but
    628  * not cleared.  This ensures we get lots of random bit sets
    629  * and clears on the vma and pte pkey bits.
    630  */
    631 int alloc_random_pkey(void)
    632 {
    633 	int max_nr_pkey_allocs;
    634 	int ret;
    635 	int i;
    636 	int alloced_pkeys[NR_PKEYS];
    637 	int nr_alloced = 0;
    638 	int random_index;
    639 	memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
    640 
    641 	/* allocate every possible key and make a note of which ones we got */
    642 	max_nr_pkey_allocs = NR_PKEYS;
    643 	max_nr_pkey_allocs = 1;
    644 	for (i = 0; i < max_nr_pkey_allocs; i++) {
    645 		int new_pkey = alloc_pkey();
    646 		if (new_pkey < 0)
    647 			break;
    648 		alloced_pkeys[nr_alloced++] = new_pkey;
    649 	}
    650 
    651 	pkey_assert(nr_alloced > 0);
    652 	/* select a random one out of the allocated ones */
    653 	random_index = rand() % nr_alloced;
    654 	ret = alloced_pkeys[random_index];
    655 	/* now zero it out so we don't free it next */
    656 	alloced_pkeys[random_index] = 0;
    657 
    658 	/* go through the allocated ones that we did not want and free them */
    659 	for (i = 0; i < nr_alloced; i++) {
    660 		int free_ret;
    661 		if (!alloced_pkeys[i])
    662 			continue;
    663 		free_ret = sys_pkey_free(alloced_pkeys[i]);
    664 		pkey_assert(!free_ret);
    665 	}
    666 	dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
    667 			__LINE__, ret, __rdpkru(), shadow_pkru);
    668 	return ret;
    669 }
    670 
    671 int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
    672 		unsigned long pkey)
    673 {
    674 	int nr_iterations = random() % 100;
    675 	int ret;
    676 
    677 	while (0) {
    678 		int rpkey = alloc_random_pkey();
    679 		ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
    680 		dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
    681 				ptr, size, orig_prot, pkey, ret);
    682 		if (nr_iterations-- < 0)
    683 			break;
    684 
    685 		dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
    686 			__LINE__, ret, __rdpkru(), shadow_pkru);
    687 		sys_pkey_free(rpkey);
    688 		dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
    689 			__LINE__, ret, __rdpkru(), shadow_pkru);
    690 	}
    691 	pkey_assert(pkey < NR_PKEYS);
    692 
    693 	ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
    694 	dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
    695 			ptr, size, orig_prot, pkey, ret);
    696 	pkey_assert(!ret);
    697 	dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
    698 			__LINE__, ret, __rdpkru(), shadow_pkru);
    699 	return ret;
    700 }
    701 
    702 struct pkey_malloc_record {
    703 	void *ptr;
    704 	long size;
    705 };
    706 struct pkey_malloc_record *pkey_malloc_records;
    707 long nr_pkey_malloc_records;
    708 void record_pkey_malloc(void *ptr, long size)
    709 {
    710 	long i;
    711 	struct pkey_malloc_record *rec = NULL;
    712 
    713 	for (i = 0; i < nr_pkey_malloc_records; i++) {
    714 		rec = &pkey_malloc_records[i];
    715 		/* find a free record */
    716 		if (rec)
    717 			break;
    718 	}
    719 	if (!rec) {
    720 		/* every record is full */
    721 		size_t old_nr_records = nr_pkey_malloc_records;
    722 		size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
    723 		size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
    724 		dprintf2("new_nr_records: %zd\n", new_nr_records);
    725 		dprintf2("new_size: %zd\n", new_size);
    726 		pkey_malloc_records = realloc(pkey_malloc_records, new_size);
    727 		pkey_assert(pkey_malloc_records != NULL);
    728 		rec = &pkey_malloc_records[nr_pkey_malloc_records];
    729 		/*
    730 		 * realloc() does not initialize memory, so zero it from
    731 		 * the first new record all the way to the end.
    732 		 */
    733 		for (i = 0; i < new_nr_records - old_nr_records; i++)
    734 			memset(rec + i, 0, sizeof(*rec));
    735 	}
    736 	dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
    737 		(int)(rec - pkey_malloc_records), rec, ptr, size);
    738 	rec->ptr = ptr;
    739 	rec->size = size;
    740 	nr_pkey_malloc_records++;
    741 }
    742 
    743 void free_pkey_malloc(void *ptr)
    744 {
    745 	long i;
    746 	int ret;
    747 	dprintf3("%s(%p)\n", __func__, ptr);
    748 	for (i = 0; i < nr_pkey_malloc_records; i++) {
    749 		struct pkey_malloc_record *rec = &pkey_malloc_records[i];
    750 		dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
    751 				ptr, i, rec, rec->ptr, rec->size);
    752 		if ((ptr <  rec->ptr) ||
    753 		    (ptr >= rec->ptr + rec->size))
    754 			continue;
    755 
    756 		dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
    757 				ptr, i, rec, rec->ptr, rec->size);
    758 		nr_pkey_malloc_records--;
    759 		ret = munmap(rec->ptr, rec->size);
    760 		dprintf3("munmap ret: %d\n", ret);
    761 		pkey_assert(!ret);
    762 		dprintf3("clearing rec->ptr, rec: %p\n", rec);
    763 		rec->ptr = NULL;
    764 		dprintf3("done clearing rec->ptr, rec: %p\n", rec);
    765 		return;
    766 	}
    767 	pkey_assert(false);
    768 }
    769 
    770 
    771 void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
    772 {
    773 	void *ptr;
    774 	int ret;
    775 
    776 	rdpkru();
    777 	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
    778 			size, prot, pkey);
    779 	pkey_assert(pkey < NR_PKEYS);
    780 	ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
    781 	pkey_assert(ptr != (void *)-1);
    782 	ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
    783 	pkey_assert(!ret);
    784 	record_pkey_malloc(ptr, size);
    785 	rdpkru();
    786 
    787 	dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
    788 	return ptr;
    789 }
    790 
    791 void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
    792 {
    793 	int ret;
    794 	void *ptr;
    795 
    796 	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
    797 			size, prot, pkey);
    798 	/*
    799 	 * Guarantee we can fit at least one huge page in the resulting
    800 	 * allocation by allocating space for 2:
    801 	 */
    802 	size = ALIGN_UP(size, HPAGE_SIZE * 2);
    803 	ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
    804 	pkey_assert(ptr != (void *)-1);
    805 	record_pkey_malloc(ptr, size);
    806 	mprotect_pkey(ptr, size, prot, pkey);
    807 
    808 	dprintf1("unaligned ptr: %p\n", ptr);
    809 	ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
    810 	dprintf1("  aligned ptr: %p\n", ptr);
    811 	ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
    812 	dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
    813 	ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
    814 	dprintf1("MADV_WILLNEED ret: %d\n", ret);
    815 	memset(ptr, 0, HPAGE_SIZE);
    816 
    817 	dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
    818 	return ptr;
    819 }
    820 
    821 int hugetlb_setup_ok;
    822 #define GET_NR_HUGE_PAGES 10
    823 void setup_hugetlbfs(void)
    824 {
    825 	int err;
    826 	int fd;
    827 	char buf[] = "123";
    828 
    829 	if (geteuid() != 0) {
    830 		fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
    831 		return;
    832 	}
    833 
    834 	cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
    835 
    836 	/*
    837 	 * Now go make sure that we got the pages and that they
    838 	 * are 2M pages.  Someone might have made 1G the default.
    839 	 */
    840 	fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY);
    841 	if (fd < 0) {
    842 		perror("opening sysfs 2M hugetlb config");
    843 		return;
    844 	}
    845 
    846 	/* -1 to guarantee leaving the trailing \0 */
    847 	err = read(fd, buf, sizeof(buf)-1);
    848 	close(fd);
    849 	if (err <= 0) {
    850 		perror("reading sysfs 2M hugetlb config");
    851 		return;
    852 	}
    853 
    854 	if (atoi(buf) != GET_NR_HUGE_PAGES) {
    855 		fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n",
    856 			buf, GET_NR_HUGE_PAGES);
    857 		return;
    858 	}
    859 
    860 	hugetlb_setup_ok = 1;
    861 }
    862 
    863 void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
    864 {
    865 	void *ptr;
    866 	int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
    867 
    868 	if (!hugetlb_setup_ok)
    869 		return PTR_ERR_ENOTSUP;
    870 
    871 	dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
    872 	size = ALIGN_UP(size, HPAGE_SIZE * 2);
    873 	pkey_assert(pkey < NR_PKEYS);
    874 	ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
    875 	pkey_assert(ptr != (void *)-1);
    876 	mprotect_pkey(ptr, size, prot, pkey);
    877 
    878 	record_pkey_malloc(ptr, size);
    879 
    880 	dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
    881 	return ptr;
    882 }
    883 
    884 void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
    885 {
    886 	void *ptr;
    887 	int fd;
    888 
    889 	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
    890 			size, prot, pkey);
    891 	pkey_assert(pkey < NR_PKEYS);
    892 	fd = open("/dax/foo", O_RDWR);
    893 	pkey_assert(fd >= 0);
    894 
    895 	ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
    896 	pkey_assert(ptr != (void *)-1);
    897 
    898 	mprotect_pkey(ptr, size, prot, pkey);
    899 
    900 	record_pkey_malloc(ptr, size);
    901 
    902 	dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
    903 	close(fd);
    904 	return ptr;
    905 }
    906 
/*
 * Table of allocator flavours that malloc_pkey() cycles through.  Every
 * entry takes (size, prot, pkey) and returns the mapping, or
 * PTR_ERR_ENOTSUP when that flavour cannot be used on this system.
 */
void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {

	malloc_pkey_with_mprotect,
	malloc_pkey_anon_huge,
	malloc_pkey_hugetlb
/* can not do direct with the pkey_mprotect() API:
	malloc_pkey_mmap_direct,
	malloc_pkey_mmap_dax,
*/
};
    917 
    918 void *malloc_pkey(long size, int prot, u16 pkey)
    919 {
    920 	void *ret;
    921 	static int malloc_type;
    922 	int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
    923 
    924 	pkey_assert(pkey < NR_PKEYS);
    925 
    926 	while (1) {
    927 		pkey_assert(malloc_type < nr_malloc_types);
    928 
    929 		ret = pkey_malloc[malloc_type](size, prot, pkey);
    930 		pkey_assert(ret != (void *)-1);
    931 
    932 		malloc_type++;
    933 		if (malloc_type >= nr_malloc_types)
    934 			malloc_type = (random()%nr_malloc_types);
    935 
    936 		/* try again if the malloc_type we tried is unsupported */
    937 		if (ret == PTR_ERR_ENOTSUP)
    938 			continue;
    939 
    940 		break;
    941 	}
    942 
    943 	dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
    944 			size, prot, pkey, ret);
    945 	return ret;
    946 }
    947 
    948 int last_pkru_faults;
    949 void expected_pk_fault(int pkey)
    950 {
    951 	dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n",
    952 			__func__, last_pkru_faults, pkru_faults);
    953 	dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
    954 	pkey_assert(last_pkru_faults + 1 == pkru_faults);
    955 	pkey_assert(last_si_pkey == pkey);
    956 	/*
    957 	 * The signal handler shold have cleared out PKRU to let the
    958 	 * test program continue.  We now have to restore it.
    959 	 */
    960 	if (__rdpkru() != 0)
    961 		pkey_assert(0);
    962 
    963 	__wrpkru(shadow_pkru);
    964 	dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n",
    965 			__func__, shadow_pkru);
    966 	last_pkru_faults = pkru_faults;
    967 	last_si_pkey = -1;
    968 }
    969 
    970 void do_not_expect_pk_fault(void)
    971 {
    972 	pkey_assert(last_pkru_faults == pkru_faults);
    973 }
    974 
    975 int test_fds[10] = { -1 };
    976 int nr_test_fds;
    977 void __save_test_fd(int fd)
    978 {
    979 	pkey_assert(fd >= 0);
    980 	pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
    981 	test_fds[nr_test_fds] = fd;
    982 	nr_test_fds++;
    983 }
    984 
    985 int get_test_read_fd(void)
    986 {
    987 	int test_fd = open("/etc/passwd", O_RDONLY);
    988 	__save_test_fd(test_fd);
    989 	return test_fd;
    990 }
    991 
    992 void close_test_fds(void)
    993 {
    994 	int i;
    995 
    996 	for (i = 0; i < nr_test_fds; i++) {
    997 		if (test_fds[i] < 0)
    998 			continue;
    999 		close(test_fds[i]);
   1000 		test_fds[i] = -1;
   1001 	}
   1002 	nr_test_fds = 0;
   1003 }
   1004 
   1005 #define barrier() __asm__ __volatile__("": : :"memory")
   1006 __attribute__((noinline)) int read_ptr(int *ptr)
   1007 {
   1008 	/*
   1009 	 * Keep GCC from optimizing this away somehow
   1010 	 */
   1011 	barrier();
   1012 	return *ptr;
   1013 }
   1014 
   1015 void test_read_of_write_disabled_region(int *ptr, u16 pkey)
   1016 {
   1017 	int ptr_contents;
   1018 
   1019 	dprintf1("disabling write access to PKEY[1], doing read\n");
   1020 	pkey_write_deny(pkey);
   1021 	ptr_contents = read_ptr(ptr);
   1022 	dprintf1("*ptr: %d\n", ptr_contents);
   1023 	dprintf1("\n");
   1024 }
   1025 void test_read_of_access_disabled_region(int *ptr, u16 pkey)
   1026 {
   1027 	int ptr_contents;
   1028 
   1029 	dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
   1030 	rdpkru();
   1031 	pkey_access_deny(pkey);
   1032 	ptr_contents = read_ptr(ptr);
   1033 	dprintf1("*ptr: %d\n", ptr_contents);
   1034 	expected_pk_fault(pkey);
   1035 }
   1036 void test_write_of_write_disabled_region(int *ptr, u16 pkey)
   1037 {
   1038 	dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
   1039 	pkey_write_deny(pkey);
   1040 	*ptr = __LINE__;
   1041 	expected_pk_fault(pkey);
   1042 }
   1043 void test_write_of_access_disabled_region(int *ptr, u16 pkey)
   1044 {
   1045 	dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
   1046 	pkey_access_deny(pkey);
   1047 	*ptr = __LINE__;
   1048 	expected_pk_fault(pkey);
   1049 }
   1050 void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
   1051 {
   1052 	int ret;
   1053 	int test_fd = get_test_read_fd();
   1054 
   1055 	dprintf1("disabling access to PKEY[%02d], "
   1056 		 "having kernel read() to buffer\n", pkey);
   1057 	pkey_access_deny(pkey);
   1058 	ret = read(test_fd, ptr, 1);
   1059 	dprintf1("read ret: %d\n", ret);
   1060 	pkey_assert(ret);
   1061 }
   1062 void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
   1063 {
   1064 	int ret;
   1065 	int test_fd = get_test_read_fd();
   1066 
   1067 	pkey_write_deny(pkey);
   1068 	ret = read(test_fd, ptr, 100);
   1069 	dprintf1("read ret: %d\n", ret);
   1070 	if (ret < 0 && (DEBUG_LEVEL > 0))
   1071 		perror("verbose read result (OK for this to be bad)");
   1072 	pkey_assert(ret);
   1073 }
   1074 
   1075 void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
   1076 {
   1077 	int pipe_ret, vmsplice_ret;
   1078 	struct iovec iov;
   1079 	int pipe_fds[2];
   1080 
   1081 	pipe_ret = pipe(pipe_fds);
   1082 
   1083 	pkey_assert(pipe_ret == 0);
   1084 	dprintf1("disabling access to PKEY[%02d], "
   1085 		 "having kernel vmsplice from buffer\n", pkey);
   1086 	pkey_access_deny(pkey);
   1087 	iov.iov_base = ptr;
   1088 	iov.iov_len = PAGE_SIZE;
   1089 	vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
   1090 	dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
   1091 	pkey_assert(vmsplice_ret == -1);
   1092 
   1093 	close(pipe_fds[0]);
   1094 	close(pipe_fds[1]);
   1095 }
   1096 
   1097 void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
   1098 {
   1099 	int ignored = 0xdada;
   1100 	int futex_ret;
   1101 	int some_int = __LINE__;
   1102 
   1103 	dprintf1("disabling write to PKEY[%02d], "
   1104 		 "doing futex gunk in buffer\n", pkey);
   1105 	*ptr = some_int;
   1106 	pkey_write_deny(pkey);
   1107 	futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
   1108 			&ignored, ignored);
   1109 	if (DEBUG_LEVEL > 0)
   1110 		perror("futex");
   1111 	dprintf1("futex() ret: %d\n", futex_ret);
   1112 }
   1113 
   1114 /* Assumes that all pkeys other than 'pkey' are unallocated */
   1115 void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
   1116 {
   1117 	int err;
   1118 	int i;
   1119 
   1120 	/* Note: 0 is the default pkey, so don't mess with it */
   1121 	for (i = 1; i < NR_PKEYS; i++) {
   1122 		if (pkey == i)
   1123 			continue;
   1124 
   1125 		dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
   1126 		err = sys_pkey_free(i);
   1127 		pkey_assert(err);
   1128 
   1129 		err = sys_pkey_free(i);
   1130 		pkey_assert(err);
   1131 
   1132 		err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
   1133 		pkey_assert(err);
   1134 	}
   1135 }
   1136 
   1137 /* Assumes that all pkeys other than 'pkey' are unallocated */
   1138 void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
   1139 {
   1140 	int err;
   1141 	int bad_pkey = NR_PKEYS+99;
   1142 
   1143 	/* pass a known-invalid pkey in: */
   1144 	err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
   1145 	pkey_assert(err);
   1146 }
   1147 
   1148 /* Assumes that all pkeys other than 'pkey' are unallocated */
   1149 void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
   1150 {
   1151 	int err;
   1152 	int allocated_pkeys[NR_PKEYS] = {0};
   1153 	int nr_allocated_pkeys = 0;
   1154 	int i;
   1155 
   1156 	for (i = 0; i < NR_PKEYS*2; i++) {
   1157 		int new_pkey;
   1158 		dprintf1("%s() alloc loop: %d\n", __func__, i);
   1159 		new_pkey = alloc_pkey();
   1160 		dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__,
   1161 				__LINE__, err, __rdpkru(), shadow_pkru);
   1162 		rdpkru(); /* for shadow checking */
   1163 		dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
   1164 		if ((new_pkey == -1) && (errno == ENOSPC)) {
   1165 			dprintf2("%s() failed to allocate pkey after %d tries\n",
   1166 				__func__, nr_allocated_pkeys);
   1167 			break;
   1168 		}
   1169 		pkey_assert(nr_allocated_pkeys < NR_PKEYS);
   1170 		allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
   1171 	}
   1172 
   1173 	dprintf3("%s()::%d\n", __func__, __LINE__);
   1174 
   1175 	/*
   1176 	 * ensure it did not reach the end of the loop without
   1177 	 * failure:
   1178 	 */
   1179 	pkey_assert(i < NR_PKEYS*2);
   1180 
   1181 	/*
   1182 	 * There are 16 pkeys supported in hardware.  One is taken
   1183 	 * up for the default (0) and another can be taken up by
   1184 	 * an execute-only mapping.  Ensure that we can allocate
   1185 	 * at least 14 (16-2).
   1186 	 */
   1187 	pkey_assert(i >= NR_PKEYS-2);
   1188 
   1189 	for (i = 0; i < nr_allocated_pkeys; i++) {
   1190 		err = sys_pkey_free(allocated_pkeys[i]);
   1191 		pkey_assert(!err);
   1192 		rdpkru(); /* for shadow checking */
   1193 	}
   1194 }
   1195 
   1196 void test_ptrace_of_child(int *ptr, u16 pkey)
   1197 {
   1198 	__attribute__((__unused__)) int peek_result;
   1199 	pid_t child_pid;
   1200 	void *ignored = 0;
   1201 	long ret;
   1202 	int status;
   1203 	/*
   1204 	 * This is the "control" for our little expermient.  Make sure
   1205 	 * we can always access it when ptracing.
   1206 	 */
   1207 	int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
   1208 	int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
   1209 
   1210 	/*
   1211 	 * Fork a child which is an exact copy of this process, of course.
   1212 	 * That means we can do all of our tests via ptrace() and then plain
   1213 	 * memory access and ensure they work differently.
   1214 	 */
   1215 	child_pid = fork_lazy_child();
   1216 	dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
   1217 
   1218 	ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
   1219 	if (ret)
   1220 		perror("attach");
   1221 	dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
   1222 	pkey_assert(ret != -1);
   1223 	ret = waitpid(child_pid, &status, WUNTRACED);
   1224 	if ((ret != child_pid) || !(WIFSTOPPED(status))) {
   1225 		fprintf(stderr, "weird waitpid result %ld stat %x\n",
   1226 				ret, status);
   1227 		pkey_assert(0);
   1228 	}
   1229 	dprintf2("waitpid ret: %ld\n", ret);
   1230 	dprintf2("waitpid status: %d\n", status);
   1231 
   1232 	pkey_access_deny(pkey);
   1233 	pkey_write_deny(pkey);
   1234 
   1235 	/* Write access, untested for now:
   1236 	ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
   1237 	pkey_assert(ret != -1);
   1238 	dprintf1("poke at %p: %ld\n", peek_at, ret);
   1239 	*/
   1240 
   1241 	/*
   1242 	 * Try to access the pkey-protected "ptr" via ptrace:
   1243 	 */
   1244 	ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
   1245 	/* expect it to work, without an error: */
   1246 	pkey_assert(ret != -1);
   1247 	/* Now access from the current task, and expect an exception: */
   1248 	peek_result = read_ptr(ptr);
   1249 	expected_pk_fault(pkey);
   1250 
   1251 	/*
   1252 	 * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
   1253 	 */
   1254 	ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
   1255 	/* expect it to work, without an error: */
   1256 	pkey_assert(ret != -1);
   1257 	/* Now access from the current task, and expect NO exception: */
   1258 	peek_result = read_ptr(plain_ptr);
   1259 	do_not_expect_pk_fault();
   1260 
   1261 	ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
   1262 	pkey_assert(ret != -1);
   1263 
   1264 	ret = kill(child_pid, SIGKILL);
   1265 	pkey_assert(ret != -1);
   1266 
   1267 	wait(&status);
   1268 
   1269 	free(plain_ptr_unaligned);
   1270 }
   1271 
   1272 void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
   1273 {
   1274 	void *p1;
   1275 	int scratch;
   1276 	int ptr_contents;
   1277 	int ret;
   1278 
   1279 	p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
   1280 	dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
   1281 	/* lots_o_noops_around_write should be page-aligned already */
   1282 	assert(p1 == &lots_o_noops_around_write);
   1283 
   1284 	/* Point 'p1' at the *second* page of the function: */
   1285 	p1 += PAGE_SIZE;
   1286 
   1287 	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
   1288 	lots_o_noops_around_write(&scratch);
   1289 	ptr_contents = read_ptr(p1);
   1290 	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
   1291 
   1292 	ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
   1293 	pkey_assert(!ret);
   1294 	pkey_access_deny(pkey);
   1295 
   1296 	dprintf2("pkru: %x\n", rdpkru());
   1297 
   1298 	/*
   1299 	 * Make sure this is an *instruction* fault
   1300 	 */
   1301 	madvise(p1, PAGE_SIZE, MADV_DONTNEED);
   1302 	lots_o_noops_around_write(&scratch);
   1303 	do_not_expect_pk_fault();
   1304 	ptr_contents = read_ptr(p1);
   1305 	dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
   1306 	expected_pk_fault(pkey);
   1307 }
   1308 
   1309 void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
   1310 {
   1311 	int size = PAGE_SIZE;
   1312 	int sret;
   1313 
   1314 	if (cpu_has_pku()) {
   1315 		dprintf1("SKIP: %s: no CPU support\n", __func__);
   1316 		return;
   1317 	}
   1318 
   1319 	sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
   1320 	pkey_assert(sret < 0);
   1321 }
   1322 
   1323 void (*pkey_tests[])(int *ptr, u16 pkey) = {
   1324 	test_read_of_write_disabled_region,
   1325 	test_read_of_access_disabled_region,
   1326 	test_write_of_write_disabled_region,
   1327 	test_write_of_access_disabled_region,
   1328 	test_kernel_write_of_access_disabled_region,
   1329 	test_kernel_write_of_write_disabled_region,
   1330 	test_kernel_gup_of_access_disabled_region,
   1331 	test_kernel_gup_write_to_write_disabled_region,
   1332 	test_executing_on_unreadable_memory,
   1333 	test_ptrace_of_child,
   1334 	test_pkey_syscalls_on_non_allocated_pkey,
   1335 	test_pkey_syscalls_bad_args,
   1336 	test_pkey_alloc_exhaust,
   1337 };
   1338 
   1339 void run_tests_once(void)
   1340 {
   1341 	int *ptr;
   1342 	int prot = PROT_READ|PROT_WRITE;
   1343 
   1344 	for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
   1345 		int pkey;
   1346 		int orig_pkru_faults = pkru_faults;
   1347 
   1348 		dprintf1("======================\n");
   1349 		dprintf1("test %d preparing...\n", test_nr);
   1350 
   1351 		tracing_on();
   1352 		pkey = alloc_random_pkey();
   1353 		dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
   1354 		ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
   1355 		dprintf1("test %d starting...\n", test_nr);
   1356 		pkey_tests[test_nr](ptr, pkey);
   1357 		dprintf1("freeing test memory: %p\n", ptr);
   1358 		free_pkey_malloc(ptr);
   1359 		sys_pkey_free(pkey);
   1360 
   1361 		dprintf1("pkru_faults: %d\n", pkru_faults);
   1362 		dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults);
   1363 
   1364 		tracing_off();
   1365 		close_test_fds();
   1366 
   1367 		printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
   1368 		dprintf1("======================\n\n");
   1369 	}
   1370 	iteration_nr++;
   1371 }
   1372 
   1373 void pkey_setup_shadow(void)
   1374 {
   1375 	shadow_pkru = __rdpkru();
   1376 }
   1377 
   1378 int main(void)
   1379 {
   1380 	int nr_iterations = 22;
   1381 
   1382 	setup_handlers();
   1383 
   1384 	printf("has pku: %d\n", cpu_has_pku());
   1385 
   1386 	if (!cpu_has_pku()) {
   1387 		int size = PAGE_SIZE;
   1388 		int *ptr;
   1389 
   1390 		printf("running PKEY tests for unsupported CPU/OS\n");
   1391 
   1392 		ptr  = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
   1393 		assert(ptr != (void *)-1);
   1394 		test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
   1395 		exit(0);
   1396 	}
   1397 
   1398 	pkey_setup_shadow();
   1399 	printf("startup pkru: %x\n", rdpkru());
   1400 	setup_hugetlbfs();
   1401 
   1402 	while (nr_iterations-- > 0)
   1403 		run_tests_once();
   1404 
   1405 	printf("done (all tests OK)\n");
   1406 	return 0;
   1407 }
   1408