/* Source: cpuset_lib (code-viewer navigation header removed) */
      1 /*
      2  * cpuset user library implementation.
      3  *
      4  * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved.
      5  *
      6  * Paul Jackson <pj (at) sgi.com>
      7  */
      8 
      9 /*
     10  *  This program is free software; you can redistribute it and/or modify
     11  *  it under the terms of the GNU Lesser General Public License as published by
     12  *  the Free Software Foundation; either version 2.1 of the License, or
     13  *  (at your option) any later version.
     14  *
     15  *  This program is distributed in the hope that it will be useful,
     16  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     18  *  GNU Lesser General Public License for more details.
     19  *
     20  *  You should have received a copy of the GNU Lesser General Public License
     21  *  along with this program; if not, write to the Free Software
     22  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
     23  */
     24 
     25 #define _XOPEN_SOURCE 500	/* need to see pread() */
     26 #define _BSD_SOURCE 1		/* need to see syscall() */
     27 #include <unistd.h>
     28 
     29 #include <ctype.h>
     30 #include <dirent.h>
     31 #include <errno.h>
     32 #include <fcntl.h>
     33 #include <fts.h>
     34 #include <limits.h>
     35 #include <signal.h>
     36 #include <stdint.h>
     37 #include <stdio.h>
     38 #include <stdlib.h>
     39 #include <string.h>
     40 #include <sys/stat.h>
     41 #include <sys/syscall.h>
     42 #include <sys/types.h>
     43 #include <time.h>
     44 #include <utime.h>
     45 #include <sys/utsname.h>	/* for cpuset_would_crash_kernel() */
     46 
     47 #include "bitmask.h"
     48 #include "cpuset.h"
     49 #include "common.h"
     50 #include "test.h"
     51 #include "linux_syscall_numbers.h"
     52 #include "config.h"
     53 #if HAVE_LINUX_MEMPOLICY_H
     54 #include <linux/mempolicy.h>
     55 
     56 /* Bump version, and update Change History, when libcpuset API changes */
     57 #define CPUSET_VERSION 3
     58 
     59 /*
     60  * For a history of what changed in each version, see the "Change
     61  * History" section, at the end of the libcpuset master document.
     62  */
     63 
     64 int cpuset_version(void)
     65 {
     66 	return CPUSET_VERSION;
     67 }
     68 
/*
 * In-memory image of one cpuset's attributes.
 *
 * A struct cpuset is filled in via the cpuset_set*() routines and
 * read back via the cpuset_get*() routines (see below in this file).
 */
struct cpuset {
	struct bitmask *cpus;	/* cpuset's "cpus" mask */
	struct bitmask *mems;	/* cpuset's "mems" mask */
	/*
	 * Each 'char' below is a 0/1 flag mirroring the cpuset file
	 * of the same name; see cpuset_set_iopt()/cpuset_get_iopt().
	 */
	char cpu_exclusive;
	char mem_exclusive;
	char mem_hardwall;
	char notify_on_release;
	char memory_migrate;
	char memory_pressure_enabled;
	char memory_spread_page;
	char memory_spread_slab;
	char sched_load_balance;
	int sched_relax_domain_level;	/* plain integer option, not a 0/1 flag */

	/*
	 * Each field 'x' above gets an 'x_valid' field below.
	 * The apply_cpuset_settings() will only set those fields whose
	 * corresponding *_valid flags are set.  The cpuset_alloc()
	 * routine clears these flags as part of the clear in calloc(),
	 * and the various cpuset_set*() routines set these flags when
	 * setting the corresponding value.
	 *
	 * The purpose of these valid fields is to ensure that when
	 * we create a new cpuset, we don't accidentally overwrite
	 * some non-zero kernel default, such as an inherited
	 * memory_spread_* flag, just because the user application
	 * code didn't override the default zero settings resulting
	 * from the calloc() call in cpuset_alloc().
	 *
	 * The choice of 'char' for the type of the flags above,
	 * but a bitfield for the flags below, is somewhat capricious.
	 */
	unsigned cpus_valid:1;
	unsigned mems_valid:1;
	unsigned cpu_exclusive_valid:1;
	unsigned mem_exclusive_valid:1;
	unsigned mem_hardwall_valid:1;
	unsigned notify_on_release_valid:1;
	unsigned memory_migrate_valid:1;
	unsigned memory_pressure_enabled_valid:1;
	unsigned memory_spread_page_valid:1;
	unsigned memory_spread_slab_valid:1;
	unsigned sched_load_balance_valid:1;
	unsigned sched_relax_domain_level_valid:1;

	/*
	 * if the relative variable was modified, use following flags
	 * to put a mark
	 */
	unsigned cpus_dirty:1;
	unsigned mems_dirty:1;
	unsigned cpu_exclusive_dirty:1;
	unsigned mem_exclusive_dirty:1;
	unsigned mem_hardwall_dirty:1;
	unsigned notify_on_release_dirty:1;
	unsigned memory_migrate_dirty:1;
	unsigned memory_pressure_enabled_dirty:1;
	unsigned memory_spread_page_dirty:1;
	unsigned memory_spread_slab_dirty:1;
	unsigned sched_load_balance_dirty:1;
	unsigned sched_relax_domain_level_dirty:1;
};
    131 
    132 /* Presumed cpuset file system mount point */
    133 static const char *cpusetmnt = "/dev/cpuset";
    134 
    135 /* Stashed copy of cpunodemap[], mapping each cpu to its node. */
    136 static const char *mapfile = "/var/run/cpunodemap";
    137 
    138 /* The primary source for the cpunodemap[] is available below here. */
    139 static const char *sysdevices = "/sys/devices/system";
    140 
    141 #define max(a,b) ((a) > (b) ? (a) : (b))
    142 #define min(a,b) ((a) < (b) ? (a) : (b))
    143 
    144 /* small buffer size - for reading boolean flags or map file (1 or 2 ints) */
    145 #define SMALL_BUFSZ 16
    146 
    147 /*
    148  * The 'mask_size_file' is used to ferrit out the kernel cpumask_t
    149  * and nodemask_t sizes.  The lines in this file that begin with the
    150  * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask
    151  * and nodemask string, respectively.  The lengths of these strings
    152  * reflect the kernel's internal cpumask_t and nodemask_t sizes,
    153  * which sizes are needed to correctly call the sched_setaffinity
    154  * and set_mempolicy system calls, and to size user level
    155  * bitmasks to match the kernels.
    156  */
    157 
    158 static const char *mask_size_file = "/proc/self/status";
    159 static const char *cpumask_prefix = "Cpus_allowed:\t";
    160 static const char *nodemask_prefix = "Mems_allowed:\t";
    161 
    162 /*
    163  * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits.
    164  *
    165  * The first time we need these, we parse the Cpus_allowed and
    166  * Mems_allowed lines from mask_size_file ("/proc/self/status").
    167  */
    168 
    169 static int cpumask_sz;
    170 static int nodemask_sz;
    171 
    172 /*
    173  * These defaults only kick in if we fail to size the kernel
    174  * cpumask and nodemask by reading the Cpus_allowed and
    175  * Mems_allowed fields from the /proc/self/status file.
    176  */
    177 
    178 #define DEFCPUBITS (512)
    179 #define DEFNODEBITS (DEFCPUBITS/2)
    180 
    181 /*
    182  * Arch-neutral API for obtaining NUMA distances between CPUs
    183  * and Memory Nodes, via the files:
    184  *	/sys/devices/system/node/nodeN/distance
    185  * which have lines such as:
    186  *	46 66 10 20
    187  * which say that for cpu on node N (from the path above), the
    188  * distance to nodes 0, 1, 2, and 3 are 44, 66, 10, and 20,
    189  * respectively.
    190  */
    191 
    192 static const char *distance_directory = "/sys/devices/system/node";
    193 
    194 /*
    195  * Someday, we should disable, then later discard, the SN code
    196  * marked ALTERNATE_SN_DISTMAP.
    197  */
    198 
    199 #define ALTERNATE_SN_DISTMAP 1
    200 #ifdef ALTERNATE_SN_DISTMAP
    201 
    202 /*
    203  * Alternative SN (SGI ia64) architecture specific API for obtaining
    204  * NUMA distances between CPUs and Memory Nodes is via the file
    205  * /proc/sgi_sn/sn_topology, which has lines such as:
    206  *
    207  *   node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
    208  *
    209  * which says that for each CPU on node 2, the distance to nodes
    210  * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
    211  *
    212  * This file has other lines as well, which start with other
    213  * keywords than "node".  Ignore these other lines.
    214  */
    215 
    216 static const char *sn_topology = "/proc/sgi_sn/sn_topology";
    217 static const char *sn_top_node_prefix = "node ";
    218 
    219 #endif
    220 
    221 /*
    222  * Check that cpusets supported, /dev/cpuset mounted.
    223  * If ok, return 0.
    224  * If not, return -1 and set errno:
    225  *	ENOSYS - kernel doesn't support cpusets
    226  *	ENODEV - /dev/cpuset not mounted
    227  */
    228 
    229 static enum {
    230 	check_notdone,
    231 	check_enosys,
    232 	check_enodev,
    233 	check_ok
    234 } check_state = check_notdone;
    235 
    236 static int check()
    237 {
    238 	if (check_state == check_notdone) {
    239 		struct stat statbuf;
    240 
    241 		if (stat("/proc/self/cpuset", &statbuf) < 0) {
    242 			check_state = check_enosys;
    243 			goto done;
    244 		}
    245 
    246 		if (stat("/dev/cpuset/tasks", &statbuf) < 0) {
    247 			check_state = check_enodev;
    248 			goto done;
    249 		}
    250 
    251 		check_state = check_ok;
    252 	}
    253 done:
    254 	switch (check_state) {
    255 	case check_enosys:
    256 		errno = ENOSYS;
    257 		return -1;
    258 	case check_enodev:
    259 		errno = ENODEV;
    260 		return -1;
    261 	default:
    262 		break;
    263 	}
    264 	return 0;
    265 }
    266 
    267 static void chomp(char *s)
    268 {
    269 	char *t;
    270 
    271 	for (t = s + strlen(s) - 1; t >= s; t--) {
    272 		if (*t == '\n' || *t == '\r')
    273 			*t = '\0';
    274 		else
    275 			break;
    276 	}
    277 }
    278 
    279 /*
    280  * Determine number of bytes in a seekable open file, without
    281  * assuming that stat(2) on that file has a useful size.
    282  * Has side affect of leaving the file rewound to the beginnning.
    283  */
    284 static int filesize(FILE * fp)
    285 {
    286 	int sz = 0;
    287 	rewind(fp);
    288 	while (fgetc(fp) != EOF)
    289 		sz++;
    290 	rewind(fp);
    291 	return sz;
    292 }
    293 
    294 /* Are strings s1 and s2 equal? */
    295 static int streq(const char *s1, const char *s2)
    296 {
    297 	return strcmp(s1, s2) == 0;
    298 }
    299 
    300 /* Is string 'pre' a prefix of string 's'? */
    301 static int strprefix(const char *s, const char *pre)
    302 {
    303 	return strncmp(s, pre, strlen(pre)) == 0;
    304 }
    305 
    306 /*
    307  * char *flgets(char *buf, int buflen, FILE *fp)
    308  *
    309  * Obtain one line from input file fp.  Copy up to first
    310  * buflen-1 chars of line into buffer buf, discarding any remainder
    311  * of line.  Stop reading at newline, discarding newline.
    312  * Nul terminate result and return pointer to buffer buf
    313  * on success, or NULL if nothing more to read or failure.
    314  */
    315 
    316 static char *flgets(char *buf, int buflen, FILE * fp)
    317 {
    318 	int c = -1;
    319 	char *bp;
    320 
    321 	bp = buf;
    322 	while ((--buflen > 0) && ((c = getc(fp)) >= 0)) {
    323 		if (c == '\n')
    324 			goto newline;
    325 		*bp++ = c;
    326 	}
    327 	if ((c < 0) && (bp == buf))
    328 		return NULL;
    329 
    330 	if (c > 0) {
    331 		while ((c = getc(fp)) >= 0) {
    332 			if (c == '\n')
    333 				break;
    334 		}
    335 	}
    336 
    337 newline:
    338 	*bp++ = '\0';
    339 	return buf;
    340 }
    341 
    342 /*
    343  * sgetc(const char *inputbuf, int *offsetptr)
    344  *
    345  * Return next char from nul-terminated input buffer inputbuf,
    346  * starting at offset *offsetptr.  Increment *offsetptr.
    347  * If next char would be nul ('\0'), return EOF and don't
    348  * increment *offsetptr.
    349  */
    350 
    351 static int sgetc(const char *inputbuf, int *offsetptr)
    352 {
    353 	char c;
    354 
    355 	if ((c = inputbuf[*offsetptr]) != 0) {
    356 		*offsetptr = *offsetptr + 1;
    357 		return c;
    358 	} else {
    359 		return EOF;
    360 	}
    361 }
    362 
    363 /*
    364  * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
    365  *
    366  * Obtain next line from nul-terminated input buffer 'inputbuf',
    367  * starting at offset *offsetptr.  Copy up to first buflen-1
    368  * chars of line into output buffer buf, discarding any remainder
    369  * of line.  Stop reading at newline, discarding newline.
    370  * Nul terminate result and return pointer to output buffer
    371  * buf on success, or NULL if nothing more to read.
    372  */
    373 
    374 static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
    375 {
    376 	int c = -1;
    377 	char *bp;
    378 
    379 	bp = buf;
    380 	while ((--buflen > 0) && ((c = sgetc(inputbuf, offsetptr)) >= 0)) {
    381 		if (c == '\n')
    382 			goto newline;
    383 		*bp++ = c;
    384 	}
    385 	if ((c < 0) && (bp == buf))
    386 		return NULL;
    387 
    388 	if (c > 0) {
    389 		while ((c = sgetc(inputbuf, offsetptr)) >= 0) {
    390 			if (c == '\n')
    391 				break;
    392 		}
    393 	}
    394 
    395 newline:
    396 	*bp++ = '\0';
    397 	return buf;
    398 }
    399 
    400 /*
    401  * time_t get_mtime(char *path)
    402  *
    403  * Return modtime of file at location path, else return 0.
    404  */
    405 
    406 static time_t get_mtime(const char *path)
    407 {
    408 	struct stat statbuf;
    409 
    410 	if (stat(path, &statbuf) != 0)
    411 		return 0;
    412 	return statbuf.st_mtime;
    413 }
    414 
    415 /*
    416  * int set_mtime(const char *path, time_t mtime)
    417  *
    418  * Set modtime of file 'path' to 'mtime'.  Return 0 on success,
    419  * or -1 on error, setting errno.
    420  */
    421 
    422 static int set_mtime(const char *path, time_t mtime)
    423 {
    424 	struct utimbuf times;
    425 
    426 	times.actime = mtime;
    427 	times.modtime = mtime;
    428 	return utime(path, &times);
    429 }
    430 
    431 /*
    432  * True if two pathnames resolve to same file.
    433  * False if either path can not be stat'd,
    434  * or if the two paths resolve to a different file.
    435  */
    436 
    437 static int samefile(const char *path1, const char *path2)
    438 {
    439 	struct stat sb1, sb2;
    440 
    441 	if (stat(path1, &sb1) != 0)
    442 		return 0;
    443 	if (stat(path2, &sb2) != 0)
    444 		return 0;
    445 	return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev;
    446 }
    447 
#define slash(c) (*(c) == '/')	/* char at c is a path separator? */
#define eocomp(c) (slash(c) || !*(c))	/* char at c ends a path component? */
#define dot1(c) (*(c) == '.' && eocomp(c+1))	/* component at c is exactly "." */

/*
 * In place path compression.  Remove extra dots and slashes.
 *
 * Collapses runs of slashes to a single slash, drops "." components,
 * and keeps at most one leading slash.  Does NOT resolve ".."
 * components or symlinks.  An input that compresses to nothing
 * (e.g. "." or "./") yields ".".  Returns its argument p.
 *
 * Two cursors walk the same buffer: 'a' scans ahead over the
 * original text while 'b' writes the compressed form behind it
 * (b never overtakes a, so the rewrite is safe in place).
 */
static char *pathcomp(char *p)
{
	char *a = p;
	char *b = p;

	if (!p || !*p)
		return p;
	if (slash(p))
		*b++ = *a++;	/* keep a single leading slash */
	for (;;) {
		if (slash(a))
			while (slash(++a))
				continue;	/* skip runs of slashes */
		if (!*a) {
			if (b == p)
				*b++ = '.';	/* everything compressed away */
			*b = '\0';
			return (p);
		} else if (dot1(a)) {
			a++;	/* drop a "." component */
		} else {
			if ((b != p) && !slash(b - 1))
				*b++ = '/';	/* separator before next component */
			while (!eocomp(a))
				*b++ = *a++;	/* copy the component */
		}
	}
}

#undef slash
#undef eocomp
#undef dot1
    485 
/*
 * pathcat2(buf, buflen, name1, name2)
 *
 * Return buf, of length buflen, with name1/name2 stored in it.
 * The result is run through pathcomp(), so duplicate slashes and
 * "." components in either name are removed.  snprintf truncation
 * is deliberately ignored; callers pass PATH_MAX sized buffers.
 */

static char *pathcat2(char *buf, int buflen, const char *name1,
		      const char *name2)
{
	(void)snprintf(buf, buflen, "%s/%s", name1, name2);
	return pathcomp(buf);
}
    498 
/*
 * pathcat3(buf, buflen, name1, name2, name3)
 *
 * Return buf, of length buflen, with name1/name2/name3 stored in it.
 * The result is run through pathcomp(), as in pathcat2().  snprintf
 * truncation is deliberately ignored; callers pass PATH_MAX buffers.
 */

static char *pathcat3(char *buf, int buflen, const char *name1,
		      const char *name2, const char *name3)
{
	(void)snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
	return pathcomp(buf);
}
    511 
    512 /*
    513  * fullpath(buf, buflen, name)
    514  *
    515  * Put full path of cpuset 'name' in buffer 'buf'.  If name
    516  * starts with a slash (``/``) character, then this a path
    517  * relative to ``/dev/cpuset``, otherwise it is relative to
    518  * the current tasks cpuset.  Return 0 on success, else
    519  * -1 on error, setting errno.
    520  */
    521 
    522 static int fullpath(char *buf, int buflen, const char *name)
    523 {
    524 	int len;
    525 
    526 	/* easy case */
    527 	if (*name == '/') {
    528 		pathcat2(buf, buflen, cpusetmnt, name);
    529 		pathcomp(buf);
    530 		return 0;
    531 	}
    532 
    533 	/* hard case */
    534 	snprintf(buf, buflen, "%s/", cpusetmnt);
    535 	len = strlen(buf);
    536 	if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL)
    537 		return -1;
    538 	if (strlen(buf) >= buflen - 1 - strlen(name)) {
    539 		errno = E2BIG;
    540 		return -1;
    541 	}
    542 	strcat(buf, "/");
    543 	strcat(buf, name);
    544 	pathcomp(buf);
    545 	return 0;
    546 }
    547 
    548 /*
    549  * fullpath2(buf, buflen, name1, name2)
    550  *
    551  * Like fullpath(), only concatenate two pathname components on end.
    552  */
    553 
    554 static int fullpath2(char *buf, int buflen, const char *name1,
    555 		     const char *name2)
    556 {
    557 	if (fullpath(buf, buflen, name1) < 0)
    558 		return -1;
    559 	if (strlen(buf) >= buflen - 1 - strlen(name2)) {
    560 		errno = E2BIG;
    561 		return -1;
    562 	}
    563 	strcat(buf, "/");
    564 	strcat(buf, name2);
    565 	pathcomp(buf);
    566 	return 0;
    567 }
    568 
    569 /*
    570  * Convert the string length of an ascii hex mask to the number
    571  * of bits represented by that mask.
    572  *
    573  * The cpumask and nodemask values in /proc/self/status are in an
    574  * ascii format that uses 9 characters for each 32 bits of mask.
    575  */
    576 static int s2nbits(const char *s)
    577 {
    578 	return strlen(s) * 32 / 9;
    579 }
    580 
    581 static void update_mask_sizes()
    582 {
    583 	FILE *fp = NULL;
    584 	char *buf = NULL;
    585 	int fsize;
    586 
    587 	if ((fp = fopen(mask_size_file, "r")) == NULL)
    588 		goto done;
    589 	fsize = filesize(fp);
    590 	if ((buf = malloc(fsize)) == NULL)
    591 		goto done;
    592 
    593 	/*
    594 	 * Beware: mask sizing arithmetic is fussy.
    595 	 * The trailing newline left by fgets() is required.
    596 	 */
    597 	while (fgets(buf, fsize, fp)) {
    598 		if (strprefix(buf, cpumask_prefix))
    599 			cpumask_sz = s2nbits(buf + strlen(cpumask_prefix));
    600 		if (strprefix(buf, nodemask_prefix))
    601 			nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
    602 	}
    603 done:
    604 	free(buf);
    605 	if (fp != NULL)
    606 		fclose(fp);
    607 	if (cpumask_sz == 0)
    608 		cpumask_sz = DEFCPUBITS;
    609 	if (nodemask_sz == 0)
    610 		nodemask_sz = DEFNODEBITS;
    611 }
    612 
    613 /* Allocate a new struct cpuset */
    614 struct cpuset *cpuset_alloc()
    615 {
    616 	struct cpuset *cp = NULL;
    617 	int nbits;
    618 
    619 	if ((cp = calloc(1, sizeof(struct cpuset))) == NULL)
    620 		goto err;
    621 
    622 	nbits = cpuset_cpus_nbits();
    623 	if ((cp->cpus = bitmask_alloc(nbits)) == NULL)
    624 		goto err;
    625 
    626 	nbits = cpuset_mems_nbits();
    627 	if ((cp->mems = bitmask_alloc(nbits)) == NULL)
    628 		goto err;
    629 
    630 	return cp;
    631 err:
    632 	if (cp && cp->cpus)
    633 		bitmask_free(cp->cpus);
    634 	if (cp && cp->mems)
    635 		bitmask_free(cp->mems);
    636 	free(cp);
    637 	return NULL;
    638 }
    639 
    640 /* Free struct cpuset *cp */
    641 void cpuset_free(struct cpuset *cp)
    642 {
    643 	if (!cp)
    644 		return;
    645 	if (cp->cpus)
    646 		bitmask_free(cp->cpus);
    647 	if (cp->mems)
    648 		bitmask_free(cp->mems);
    649 	free(cp);
    650 }
    651 
    652 /* Number of bits in a CPU bitmask on current system */
    653 int cpuset_cpus_nbits()
    654 {
    655 	if (cpumask_sz == 0)
    656 		update_mask_sizes();
    657 	return cpumask_sz;
    658 }
    659 
    660 /* Number of bits in a Memory bitmask on current system */
    661 int cpuset_mems_nbits()
    662 {
    663 	if (nodemask_sz == 0)
    664 		update_mask_sizes();
    665 	return nodemask_sz;
    666 }
    667 
    668 /* Set CPUs in cpuset cp to bitmask cpus */
    669 int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus)
    670 {
    671 	if (cp->cpus)
    672 		bitmask_free(cp->cpus);
    673 	cp->cpus = bitmask_alloc(bitmask_nbits(cpus));
    674 	if (cp->cpus == NULL)
    675 		return -1;
    676 	bitmask_copy(cp->cpus, cpus);
    677 	cp->cpus_valid = 1;
    678 	cp->cpus_dirty = 1;
    679 	return 0;
    680 }
    681 
    682 /* Set Memory Nodes in cpuset cp to bitmask mems */
    683 int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems)
    684 {
    685 	if (cp->mems)
    686 		bitmask_free(cp->mems);
    687 	cp->mems = bitmask_alloc(bitmask_nbits(mems));
    688 	if (cp->mems == NULL)
    689 		return -1;
    690 	bitmask_copy(cp->mems, mems);
    691 	cp->mems_valid = 1;
    692 	cp->mems_dirty = 1;
    693 	return 0;
    694 }
    695 
    696 /* Set integer value optname of cpuset cp */
    697 int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value)
    698 {
    699 	if (streq(optionname, "cpu_exclusive")) {
    700 		cp->cpu_exclusive = ! !value;
    701 		cp->cpu_exclusive_valid = 1;
    702 		cp->cpu_exclusive_dirty = 1;
    703 	} else if (streq(optionname, "mem_exclusive")) {
    704 		cp->mem_exclusive = ! !value;
    705 		cp->mem_exclusive_valid = 1;
    706 		cp->mem_exclusive_dirty = 1;
    707 	} else if (streq(optionname, "mem_hardwall")) {
    708 		cp->mem_hardwall = ! !value;
    709 		cp->mem_hardwall_valid = 1;
    710 		cp->mem_hardwall_dirty = 1;
    711 	} else if (streq(optionname, "notify_on_release")) {
    712 		cp->notify_on_release = ! !value;
    713 		cp->notify_on_release_valid = 1;
    714 		cp->notify_on_release_dirty = 1;
    715 	} else if (streq(optionname, "memory_pressure_enabled")) {
    716 		cp->memory_pressure_enabled = ! !value;
    717 		cp->memory_pressure_enabled_valid = 1;
    718 		cp->memory_pressure_enabled_dirty = 1;
    719 	} else if (streq(optionname, "memory_migrate")) {
    720 		cp->memory_migrate = ! !value;
    721 		cp->memory_migrate_valid = 1;
    722 		cp->memory_migrate_dirty = 1;
    723 	} else if (streq(optionname, "memory_spread_page")) {
    724 		cp->memory_spread_page = ! !value;
    725 		cp->memory_spread_page_valid = 1;
    726 		cp->memory_spread_page_dirty = 1;
    727 	} else if (streq(optionname, "memory_spread_slab")) {
    728 		cp->memory_spread_slab = ! !value;
    729 		cp->memory_spread_slab_valid = 1;
    730 		cp->memory_spread_slab_dirty = 1;
    731 	} else if (streq(optionname, "sched_load_balance")) {
    732 		cp->sched_load_balance = ! !value;
    733 		cp->sched_load_balance_valid = 1;
    734 		cp->sched_load_balance_dirty = 1;
    735 	} else if (streq(optionname, "sched_relax_domain_level")) {
    736 		cp->sched_relax_domain_level = value;
    737 		cp->sched_relax_domain_level_valid = 1;
    738 		cp->sched_relax_domain_level_dirty = 1;
    739 	} else
    740 		return -2;	/* optionname not recognized */
    741 	return 0;
    742 }
    743 
/*
 * [optional] Set string value optname.
 *
 * No string-valued options are currently supported, so this always
 * returns -2, the same "option name not recognized" code used by
 * cpuset_set_iopt().
 */
int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname,
		    UNUSED const char *value)
{
	return -2;		/* For now, all string options unrecognized */
}
    750 
    751 /* Return handle for reading memory_pressure. */
    752 int cpuset_open_memory_pressure(const char *cpusetpath)
    753 {
    754 	char buf[PATH_MAX];
    755 
    756 	fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure");
    757 	return open(buf, O_RDONLY);
    758 }
    759 
    760 /* Return current memory_pressure of cpuset. */
    761 int cpuset_read_memory_pressure(int han)
    762 {
    763 	char buf[SMALL_BUFSZ];
    764 
    765 	if (pread(han, buf, sizeof(buf), 0L) < 0)
    766 		return -1;
    767 	return atoi(buf);
    768 }
    769 
/*
 * Close handle for reading memory pressure.
 * Any error from close() is deliberately ignored; the handle was
 * opened read-only by cpuset_open_memory_pressure().
 */
void cpuset_close_memory_pressure(int han)
{
	close(han);
}
    775 
    776 /*
    777  * Resolve cpuset pointer (to that of current task if cp == NULL).
    778  *
    779  * If cp not NULL, just return it.  If cp is NULL, return pointer
    780  * to temporary cpuset for current task, and set *cp_tofree to
    781  * pointer to that same temporary cpuset, to be freed later.
    782  *
    783  * Return NULL and set errno on error.  Errors can occur when
    784  * resolving the current tasks cpuset.
    785  */
    786 static const struct cpuset *resolve_cp(const struct cpuset *cp,
    787 				       struct cpuset **cp_tofree)
    788 {
    789 	const struct cpuset *rcp;
    790 
    791 	if (cp) {
    792 		rcp = cp;
    793 	} else {
    794 		struct cpuset *cp1 = cpuset_alloc();
    795 		if (cp1 == NULL)
    796 			goto err;
    797 		if (cpuset_cpusetofpid(cp1, 0) < 0) {
    798 			cpuset_free(cp1);
    799 			goto err;
    800 		}
    801 		*cp_tofree = cp1;
    802 		rcp = cp1;
    803 	}
    804 	return rcp;
    805 err:
    806 	return NULL;
    807 }
    808 
    809 /* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */
    810 int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus)
    811 {
    812 	struct cpuset *cp_tofree = NULL;
    813 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
    814 
    815 	if (!cp1)
    816 		goto err;
    817 	if (cp1->cpus == NULL) {
    818 		errno = EINVAL;
    819 		goto err;
    820 	}
    821 	bitmask_copy(cpus, cp1->cpus);
    822 	cpuset_free(cp_tofree);
    823 	return 0;
    824 err:
    825 	cpuset_free(cp_tofree);
    826 	return -1;
    827 }
    828 
    829 /* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */
    830 int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems)
    831 {
    832 	struct cpuset *cp_tofree = NULL;
    833 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
    834 
    835 	if (!cp1)
    836 		goto err;
    837 	if (cp1->mems == NULL) {
    838 		errno = EINVAL;
    839 		goto err;
    840 	}
    841 	bitmask_copy(mems, cp1->mems);
    842 	cpuset_free(cp_tofree);
    843 	return 0;
    844 err:
    845 	cpuset_free(cp_tofree);
    846 	return -1;
    847 }
    848 
    849 /* Return number of CPUs in cpuset cp (current task if cp == NULL) */
    850 int cpuset_cpus_weight(const struct cpuset *cp)
    851 {
    852 	struct cpuset *cp_tofree = NULL;
    853 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
    854 	int w = -1;
    855 
    856 	if (!cp1)
    857 		goto err;
    858 	if (cp1->cpus == NULL) {
    859 		errno = EINVAL;
    860 		goto err;
    861 	}
    862 	w = bitmask_weight(cp1->cpus);
    863 	/* fall into ... */
    864 err:
    865 	cpuset_free(cp_tofree);
    866 	return w;
    867 }
    868 
    869 /* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */
    870 int cpuset_mems_weight(const struct cpuset *cp)
    871 {
    872 	struct cpuset *cp_tofree = NULL;
    873 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
    874 	int w = -1;
    875 
    876 	if (!cp1)
    877 		goto err;
    878 	if (cp1->mems == NULL) {
    879 		errno = EINVAL;
    880 		goto err;
    881 	}
    882 	w = bitmask_weight(cp1->mems);
    883 	/* fall into ... */
    884 err:
    885 	cpuset_free(cp_tofree);
    886 	return w;
    887 }
    888 
    889 /* Return integer value of option optname in cp */
    890 int cpuset_get_iopt(const struct cpuset *cp, const char *optionname)
    891 {
    892 	if (streq(optionname, "cpu_exclusive"))
    893 		return cp->cpu_exclusive;
    894 	else if (streq(optionname, "mem_exclusive"))
    895 		return cp->mem_exclusive;
    896 	else if (streq(optionname, "mem_hardwall"))
    897 		return cp->mem_hardwall;
    898 	else if (streq(optionname, "notify_on_release"))
    899 		return cp->notify_on_release;
    900 	else if (streq(optionname, "memory_pressure_enabled"))
    901 		return cp->memory_pressure_enabled;
    902 	else if (streq(optionname, "memory_migrate"))
    903 		return cp->memory_migrate;
    904 	else if (streq(optionname, "memory_spread_page"))
    905 		return cp->memory_spread_page;
    906 	else if (streq(optionname, "memory_spread_slab"))
    907 		return cp->memory_spread_slab;
    908 	else if (streq(optionname, "sched_load_balance"))
    909 		return cp->sched_load_balance;
    910 	else if (streq(optionname, "sched_relax_domain_level"))
    911 		return cp->sched_relax_domain_level;
    912 	else
    913 		return -2;	/* optionname not recognized */
    914 }
    915 
/*
 * [optional] Return string value of optname.
 *
 * No string-valued options are currently supported, so this always
 * returns NULL, mirroring cpuset_set_sopt().
 */
const char *cpuset_get_sopt(UNUSED const struct cpuset *cp,
			    UNUSED const char *optionname)
{
	return NULL;		/* For now, all string options unrecognized */
}
    922 
    923 static int read_flag(const char *filepath, char *flagp)
    924 {
    925 	char buf[SMALL_BUFSZ];	/* buffer a "0" or "1" flag line */
    926 	int fd = -1;
    927 
    928 	if ((fd = open(filepath, O_RDONLY)) < 0)
    929 		goto err;
    930 	if (read(fd, buf, sizeof(buf)) < 1)
    931 		goto err;
    932 	if (atoi(buf))
    933 		*flagp = 1;
    934 	else
    935 		*flagp = 0;
    936 	close(fd);
    937 	return 0;
    938 err:
    939 	if (fd >= 0)
    940 		close(fd);
    941 	return -1;
    942 }
    943 
    944 static int load_flag(const char *path, char *flagp, const char *flag)
    945 {
    946 	char buf[PATH_MAX];
    947 
    948 	pathcat2(buf, sizeof(buf), path, flag);
    949 	return read_flag(buf, flagp);
    950 }
    951 
    952 static int read_number(const char *filepath, int *numberp)
    953 {
    954 	char buf[SMALL_BUFSZ];
    955 	int fd = -1;
    956 
    957 	if ((fd = open(filepath, O_RDONLY)) < 0)
    958 		goto err;
    959 	if (read(fd, buf, sizeof(buf)) < 1)
    960 		goto err;
    961 	*numberp = atoi(buf);
    962 	close(fd);
    963 	return 0;
    964 err:
    965 	if (fd >= 0)
    966 		close(fd);
    967 	return -1;
    968 }
    969 
    970 static int load_number(const char *path, int *numberp, const char *file)
    971 {
    972 	char buf[PATH_MAX];
    973 
    974 	pathcat2(buf, sizeof(buf), path, file);
    975 	return read_number(buf, numberp);
    976 }
    977 
    978 static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits)
    979 {
    980 	FILE *fp = NULL;
    981 	char *buf = NULL;
    982 	int buflen;
    983 	struct bitmask *bmp = NULL;
    984 
    985 	if ((fp = fopen(filepath, "r")) == NULL)
    986 		goto err;
    987 	buflen = filesize(fp) + 1;	/* + 1 for nul term */
    988 	if ((buf = malloc(buflen)) == NULL)
    989 		goto err;
    990 	if (flgets(buf, buflen, fp) == NULL)
    991 		goto err;
    992 	fclose(fp);
    993 	fp = NULL;
    994 
    995 	if ((bmp = bitmask_alloc(nbits)) == NULL)
    996 		goto err;
    997 	if (*buf && bitmask_parselist(buf, bmp) < 0)
    998 		goto err;
    999 	if (*bmpp)
   1000 		bitmask_free(*bmpp);
   1001 	*bmpp = bmp;
   1002 	free(buf);
   1003 	buf = NULL;
   1004 	return 0;
   1005 err:
   1006 	if (buf != NULL)
   1007 		free(buf);
   1008 	if (fp != NULL)
   1009 		fclose(fp);
   1010 	if (bmp != NULL)
   1011 		bitmask_free(bmp);
   1012 	return -1;
   1013 }
   1014 
   1015 static int load_mask(const char *path, struct bitmask **bmpp,
   1016 		     int nbits, const char *mask)
   1017 {
   1018 	char buf[PATH_MAX];
   1019 
   1020 	pathcat2(buf, sizeof(buf), path, mask);
   1021 	return read_mask(buf, bmpp, nbits);
   1022 }
   1023 
   1024 /* Write string to file at given filepath.  Create or truncate file. */
   1025 static int write_string_file(const char *filepath, const char *str)
   1026 {
   1027 	int fd = -1;
   1028 
   1029 	if ((fd = open(filepath, O_WRONLY | O_CREAT, 0644)) < 0)
   1030 		goto err;
   1031 	if (write(fd, str, strlen(str)) < 0)
   1032 		goto err;
   1033 	close(fd);
   1034 	return 0;
   1035 err:
   1036 	if (fd >= 0)
   1037 		close(fd);
   1038 	return -1;
   1039 }
   1040 
   1041 /* Size and allocate buffer.  Write bitmask into it.  Caller must free */
   1042 static char *sprint_mask_buf(const struct bitmask *bmp)
   1043 {
   1044 	char *buf = NULL;
   1045 	int buflen;
   1046 	char c;
   1047 
   1048 	/* First bitmask_displaylist() call just to get the length */
   1049 	buflen = bitmask_displaylist(&c, 1, bmp) + 1;	/* "+ 1" for nul */
   1050 	if ((buf = malloc(buflen)) == NULL)
   1051 		return NULL;
   1052 	bitmask_displaylist(buf, buflen, bmp);
   1053 	return buf;
   1054 }
   1055 
   1056 static int exists_flag(const char *path, const char *flag)
   1057 {
   1058 	char buf[PATH_MAX];
   1059 	struct stat statbuf;
   1060 	int rc;
   1061 
   1062 	pathcat2(buf, sizeof(buf), path, flag);
   1063 	rc = (stat(buf, &statbuf) == 0);
   1064 	errno = 0;
   1065 	return rc;
   1066 }
   1067 
   1068 static int store_flag(const char *path, const char *flag, int val)
   1069 {
   1070 	char buf[PATH_MAX];
   1071 
   1072 	pathcat2(buf, sizeof(buf), path, flag);
   1073 	return write_string_file(buf, val ? "1" : "0");
   1074 }
   1075 
   1076 static int store_number(const char *path, const char *file, int val)
   1077 {
   1078 	char buf[PATH_MAX];
   1079 	char data[SMALL_BUFSZ];
   1080 
   1081 	memset(data, 0, sizeof(data));
   1082 	pathcat2(buf, sizeof(buf), path, file);
   1083 	snprintf(data, sizeof(data), "%d", val);
   1084 	return write_string_file(buf, data);
   1085 }
   1086 
   1087 static int store_mask(const char *path, const char *mask,
   1088 		      const struct bitmask *bmp)
   1089 {
   1090 	char maskpath[PATH_MAX];
   1091 	char *bp = NULL;
   1092 	int rc;
   1093 
   1094 	if (bmp == NULL)
   1095 		return 0;
   1096 	pathcat2(maskpath, sizeof(maskpath), path, mask);
   1097 	if ((bp = sprint_mask_buf(bmp)) == NULL)
   1098 		return -1;
   1099 	rc = write_string_file(maskpath, bp);
   1100 	free(bp);
   1101 	return rc;
   1102 }
   1103 
   1104 /*
   1105  * Return 1 if 'cpu' is online, else 0 if offline.  Tests the file
   1106  * /sys/devices/system/cpu/cpuN/online file for 0 or 1 contents
 * where N == cpu number.
   1108  */
   1109 
   1110 char cpu_online(unsigned int cpu)
   1111 {
   1112 	char online;
   1113 	char cpupath[PATH_MAX];
   1114 
   1115 	(void)snprintf(cpupath, sizeof(cpupath),
   1116 		       "/sys/devices/system/cpu/cpu%d/online", cpu);
   1117 	if (read_flag(cpupath, &online) < 0)
   1118 		return 0;	/* oops - guess that cpu's not there */
   1119 	return online;
   1120 }
   1121 
   1122 /*
   1123  * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()),
   1124  * to the node on which that cpu resides or cpuset_mems_nbits().
   1125  *
   1126  * To avoid every user having to recalculate this relation
   1127  * from various clues in the sysfs file system (below the
   1128  * path /sys/devices/system) a copy of this map is kept at
   1129  * /var/run/cpunodemap.
   1130  *
   1131  * The system automatically cleans out files below
   1132  * /var/run on each system reboot (see the init script
   1133  * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry
   1134  * about stale data in this file across reboots.  If the file
   1135  * is missing, let the first process that needs it, and has
   1136  * permission to write in the /var/run directory, rebuild it.
   1137  *
   1138  * If using this cached data, remember the mtime of the mapfile
   1139  * the last time we read it in case something like a hotplug
   1140  * event results in the file being removed and rebuilt, so we
   1141  * can detect if we're using a stale cache, and need to reload.
   1142  *
   1143  * The mtime of this file is set to the time when we did
   1144  * the recalculation of the map, from the clues beneath
   1145  * /sys/devices/system.  This is done so that a program
   1146  * won't see the mapfile it just wrote as being newer than what
   1147  * it just wrote out (store_map) and read the same map back in
 * (load_map).
   1149  */
   1150 
   1151 /*
   1152  * Hold flockfile(stdin) while using cpunodemap for posix thread safety.
   1153  *
   1154  * Note on locking and flockfile(FILE *):
   1155  *
   1156  *  We use flockfile() and funlockfile() instead of directly
   1157  *  calling pthread_mutex_lock and pthread_mutex_unlock on
   1158  *  a pthread_mutex_t, because this avoids forcing the app
   1159  *  to link with libpthread.  The glibc implementation of
   1160  *  flockfile/funlockfile will fall back to no-ops if libpthread
   1161  *  doesn't happen to be linked.
   1162  *
   1163  *  Since flockfile already has the moderately convoluted
   1164  *  combination of weak and strong symbols required to accomplish
   1165  *  this, it is easier to use flockfile() on some handy FILE *
   1166  *  stream as a surrogate for pthread locking than it is to so
   1167  *  re-invent that wheel.
   1168  *
   1169  *  Forcing all apps that use cpusets to link with libpthread
   1170  *  would force non-transparent initialization on apps that
   1171  *  might not be prepared to handle it.
   1172  *
   1173  *  The application using libcpuset should never notice this
   1174  *  odd use of flockfile(), because we never return to the
   1175  *  application from any libcpuset call with any such lock held.
   1176  *  We just use this locking for guarding some non-atomic cached
   1177  *  data updates and accesses, internal to some libcpuset calls.
   1178  *  Also, flockfile() allows recursive nesting, so if the app
   1179  *  calls libcpuset holding such a file lock, we won't deadlock
   1180  *  if we go to acquire the same lock.  We'll just get the lock
   1181  *  and increment its counter while we hold it.
   1182  */
   1183 
/*
 * Process-wide cached <cpu, node> map plus the mapfile modification
 * time observed when the cache was last synchronized with it.
 */
static struct cpunodemap {
	int *map;		/* map[cpumask_sz]: maps cpu to its node */
	time_t mtime;		/* modtime of mapfile when last read */
} cpunodemap;
   1188 
   1189 /*
   1190  * rebuild_map() - Rebuild cpunodemap[] from scratch.
   1191  *
   1192  * Situation:
   1193  *	Neither our in-memory cpunodemap[] array nor the
   1194  *	cache of it in mapfile is current.
   1195  * Action:
   1196  *	Rebuild it from first principles and the information
   1197  *	available below /sys/devices/system.
   1198  */
   1199 
   1200 static void rebuild_map()
   1201 {
   1202 	char buf[PATH_MAX];
   1203 	DIR *dir1, *dir2;
   1204 	struct dirent *dent1, *dent2;
   1205 	int ncpus = cpuset_cpus_nbits();
   1206 	int nmems = cpuset_mems_nbits();
   1207 	unsigned int cpu, mem;
   1208 
   1209 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
   1210 		cpunodemap.map[cpu] = -1;
   1211 	pathcat2(buf, sizeof(buf), sysdevices, "node");
   1212 	if ((dir1 = opendir(buf)) == NULL)
   1213 		return;
   1214 	while ((dent1 = readdir(dir1)) != NULL) {
   1215 		if (sscanf(dent1->d_name, "node%u", &mem) < 1)
   1216 			continue;
   1217 		pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name);
   1218 		if ((dir2 = opendir(buf)) == NULL)
   1219 			continue;
   1220 		while ((dent2 = readdir(dir2)) != NULL) {
   1221 			if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
   1222 				continue;
   1223 			if (cpu >= (unsigned int)ncpus
   1224 			    || mem >= (unsigned int)nmems)
   1225 				continue;
   1226 			cpunodemap.map[cpu] = mem;
   1227 		}
   1228 		closedir(dir2);
   1229 	}
   1230 	closedir(dir1);
   1231 	cpunodemap.mtime = time(0);
   1232 }
   1233 
   1234 /*
   1235  * load_map() - Load cpunodemap[] from mapfile.
   1236  *
   1237  * Situation:
   1238  *	The cpunodemap in mapfile is more recent than
   1239  *	what we have in the cpunodemap[] array.
   1240  * Action:
   1241  *	Reload the cpunodemap[] array from the file.
   1242  */
   1243 
   1244 static void load_map()
   1245 {
   1246 	char buf[SMALL_BUFSZ];	/* buffer 1 line of mapfile */
   1247 	FILE *mapfp;		/* File stream on mapfile */
   1248 	int ncpus = cpuset_cpus_nbits();
   1249 	int nmems = cpuset_mems_nbits();
   1250 	unsigned int cpu, mem;
   1251 
   1252 	if ((cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL)
   1253 		return;
   1254 	cpunodemap.mtime = get_mtime(mapfile);
   1255 	if ((mapfp = fopen(mapfile, "r")) == NULL)
   1256 		return;
   1257 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
   1258 		cpunodemap.map[cpu] = nmems;
   1259 	while (flgets(buf, sizeof(buf), mapfp) != NULL) {
   1260 		if (sscanf(buf, "%u %u", &cpu, &mem) < 2)
   1261 			continue;
   1262 		if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems)
   1263 			continue;
   1264 		cpunodemap.map[cpu] = mem;
   1265 	}
   1266 	fclose(mapfp);
   1267 }
   1268 
   1269 /*
   1270  * store_map() - Write cpunodemap[] out to mapfile.
   1271  *
   1272  * Situation:
   1273  *	The cpunodemap in the cpunodemap[] array is
   1274  *	more recent than the one in mapfile.
   1275  * Action:
   1276  *	Write cpunodemap[] out to mapfile.
   1277  */
   1278 
   1279 static void store_map()
   1280 {
   1281 	char buf[PATH_MAX];
   1282 	int fd = -1;
   1283 	FILE *mapfp = NULL;
   1284 	int ncpus = cpuset_cpus_nbits();
   1285 	int nmems = cpuset_mems_nbits();
   1286 	unsigned int cpu, mem;
   1287 
   1288 	snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX");
   1289 	if ((fd = mkstemp(buf)) < 0)
   1290 		goto err;
   1291 	if ((mapfp = fdopen(fd, "w")) == NULL)
   1292 		goto err;
   1293 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
   1294 		mem = cpunodemap.map[cpu];
   1295 		if (mem < (unsigned int)nmems)
   1296 			fprintf(mapfp, "%u %u\n", cpu, mem);
   1297 	}
   1298 	fclose(mapfp);
   1299 	set_mtime(buf, cpunodemap.mtime);
   1300 	if (rename(buf, mapfile) < 0)
   1301 		goto err;
   1302 	/* mkstemp() creates mode 0600 - change to world readable */
   1303 	(void)chmod(mapfile, 0444);
   1304 	return;
   1305 err:
   1306 	if (mapfp != NULL) {
   1307 		fclose(mapfp);
   1308 		fd = -1;
   1309 	}
   1310 	if (fd >= 0)
   1311 		close(fd);
   1312 	(void)unlink(buf);
   1313 }
   1314 
   1315 /*
   1316  * Load and gain thread safe access to the <cpu, node> map.
   1317  *
   1318  * Return 0 on success with flockfile(stdin) held.
   1319  * Each successful get_map() call must be matched with a
   1320  * following put_map() call to release the lock.
   1321  *
   1322  * On error, return -1 with errno set and no lock held.
   1323  */
   1324 
static int get_map()
{
	time_t file_mtime;

	/* Serialize all cpunodemap access; released by put_map() */
	flockfile(stdin);

	/* First use in this process: allocate the in-memory map */
	if (cpunodemap.map == NULL) {
		cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int));
		if (cpunodemap.map == NULL)
			goto err;
	}

	/* If no one has a good cpunodemap, rebuild from scratch */
	file_mtime = get_mtime(mapfile);
	if (cpunodemap.mtime == 0 && file_mtime == 0)
		rebuild_map();

	/* If either cpunodemap[] or mapfile newer, update other with it */
	file_mtime = get_mtime(mapfile);
	if (cpunodemap.mtime < file_mtime)
		load_map();
	else if (cpunodemap.mtime > file_mtime)
		store_map();
	return 0;
err:
	/* failure: drop the lock before returning, per the contract above */
	funlockfile(stdin);
	return -1;
}
   1353 
/* Release the cpunodemap lock acquired by a successful get_map() */
static void put_map()
{
	funlockfile(stdin);
}
   1358 
   1359 /* Set cpus to those local to Memory Nodes mems */
   1360 int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus)
   1361 {
   1362 	int ncpus = cpuset_cpus_nbits();
   1363 	unsigned int cpu;
   1364 
   1365 	if (check() < 0)
   1366 		return -1;
   1367 
   1368 	get_map();
   1369 	bitmask_clearall(cpus);
   1370 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
   1371 		if (bitmask_isbitset(mems, cpunodemap.map[cpu]))
   1372 			bitmask_setbit(cpus, cpu);
   1373 	}
   1374 	put_map();
   1375 	return 0;
   1376 }
   1377 
   1378 /* Set mems to those local to CPUs cpus */
   1379 int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems)
   1380 {
   1381 	int ncpus = cpuset_cpus_nbits();
   1382 	unsigned int cpu;
   1383 
   1384 	if (check() < 0)
   1385 		return -1;
   1386 
   1387 	get_map();
   1388 	bitmask_clearall(mems);
   1389 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
   1390 		if (bitmask_isbitset(cpus, cpu))
   1391 			bitmask_setbit(mems, cpunodemap.map[cpu]);
   1392 	}
   1393 	put_map();
   1394 	return 0;
   1395 }
   1396 
   1397 /*
   1398  * distmap[]
   1399  *
   1400  * Array of ints of size cpumask_sz by nodemask_sz.
   1401  *
   1402  * Element distmap[cpu][mem] is the distance between CPU cpu
   1403  * and Memory Node mem.  Distances are weighted to roughly
   1404  * approximate the cost of memory references, and scaled so that
   1405  * the distance from a CPU to its local Memory Node is ten (10).
   1406  *
   1407  * The first call to cpuset_cpumemdist() builds this map, from
   1408  * whatever means the kernel provides to obtain these distances.
   1409  *
   1410  * These distances derive from ACPI SLIT table entries, which are
   1411  * eight bits in size.
   1412  *
   1413  * Hold flockfile(stdout) while using distmap for posix thread safety.
   1414  */
   1415 
typedef unsigned char distmap_entry_t;	/* type of distmap[] entries */

static distmap_entry_t *distmap;	/* maps <cpu, mem> to distance */

#define DISTMAP_MAX UCHAR_MAX	/* maximum value in distmap[] */

/* Flattens <i, j> into a distmap[] index; expands 'nmems' from the
 * enclosing scope, so only usable where a local 'nmems' is defined. */
#define I(i,j) ((i) * nmems + (j))	/* 2-D array index simulation */
   1423 
   1424 /*
   1425  * Parse arch neutral lines from 'distance' files of form:
   1426  *
   1427  *	46 66 10 20
   1428  *
   1429  * The lines contain a space separated list of distances, which is parsed
   1430  * into array dists[] of each nodes distance from the specified node.
   1431  *
   1432  * Result is placed in distmap[ncpus][nmems]:
   1433  *
   1434  *	For each cpu c on node:
   1435  *		For each node position n in list of distances:
   1436  *			distmap[c][n] = dists[n]
   1437  */
   1438 
   1439 static int parse_distmap_line(unsigned int node, char *buf)
   1440 {
   1441 	char *p, *q;
   1442 	int ncpus = cpuset_cpus_nbits();
   1443 	int nmems = cpuset_mems_nbits();
   1444 	unsigned int c, n;
   1445 	distmap_entry_t *dists = NULL;
   1446 	struct bitmask *cpus = NULL, *mems = NULL;
   1447 	int ret = -1;
   1448 
   1449 	p = buf;
   1450 	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
   1451 		goto err;
   1452 	for (n = 0; n < (unsigned int)nmems; n++)
   1453 		dists[n] = DISTMAP_MAX;
   1454 
   1455 	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
   1456 		unsigned int d;
   1457 
   1458 		if ((p = strpbrk(p, "0123456789")) == NULL)
   1459 			break;
   1460 		d = strtoul(p, &q, 10);
   1461 		if (p == q)
   1462 			break;
   1463 		if (d < DISTMAP_MAX)
   1464 			dists[n] = (distmap_entry_t) d;
   1465 	}
   1466 
   1467 	if ((mems = bitmask_alloc(nmems)) == NULL)
   1468 		goto err;
   1469 	bitmask_setbit(mems, node);
   1470 
   1471 	if ((cpus = bitmask_alloc(ncpus)) == NULL)
   1472 		goto err;
   1473 	cpuset_localcpus(mems, cpus);
   1474 
   1475 	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
   1476 	     c = bitmask_next(cpus, c + 1))
   1477 		for (n = 0; n < (unsigned int)nmems; n++)
   1478 			distmap[I(c, n)] = dists[n];
   1479 	ret = 0;
   1480 	/* fall into ... */
   1481 err:
   1482 	bitmask_free(mems);
   1483 	bitmask_free(cpus);
   1484 	free(dists);
   1485 	return ret;
   1486 }
   1487 
   1488 static int parse_distance_file(unsigned int node, const char *path)
   1489 {
   1490 	FILE *fp;
   1491 	char *buf = NULL;
   1492 	int buflen;
   1493 
   1494 	if ((fp = fopen(path, "r")) == NULL)
   1495 		goto err;
   1496 
   1497 	buflen = filesize(fp);
   1498 
   1499 	if ((buf = malloc(buflen)) == NULL)
   1500 		goto err;
   1501 
   1502 	if (flgets(buf, buflen, fp) == NULL)
   1503 		goto err;
   1504 
   1505 	if (parse_distmap_line(node, buf) < 0)
   1506 		goto err;
   1507 
   1508 	free(buf);
   1509 	fclose(fp);
   1510 	return 0;
   1511 err:
   1512 	free(buf);
   1513 	if (fp)
   1514 		fclose(fp);
   1515 	return -1;
   1516 }
   1517 
/*
 * Lazily build distmap[] from the per-node sysfs "distance" files.
 * One-shot: a static guard ensures we only ever try once per process;
 * on any failure distmap is freed and left NULL so callers can fall
 * back (e.g. to build_distmap_sn()).
 */
static void build_distmap()
{
	static int tried_before = 0;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	DIR *dir = NULL;
	struct dirent *dent;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	/* initialize every <cpu, mem> pair to "infinitely far" */
	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	if ((dir = opendir(distance_directory)) == NULL)
		goto err;
	while ((dent = readdir(dir)) != NULL) {
		char buf[PATH_MAX];
		unsigned int node;

		if (sscanf(dent->d_name, "node%u", &node) < 1)
			continue;
		pathcat3(buf, sizeof(buf), distance_directory, dent->d_name,
			 "distance");
		if (parse_distance_file(node, buf) < 0)
			goto err;
	}
	closedir(dir);
	return;
err:
	if (dir)
		closedir(dir);
	free(distmap);
	distmap = NULL;
}
   1559 
   1560 #ifdef ALTERNATE_SN_DISTMAP
   1561 
   1562 /*
   1563  * Parse SN architecture specific line of form:
   1564  *
   1565  *	node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10
   1566  *
   1567  * Second field is node number.  The "dist" field is the colon separated list
   1568  * of distances, which is parsed into array dists[] of each nodes distance
   1569  * from that node.
   1570  *
   1571  * Result is placed in distmap[ncpus][nmems]:
   1572  *
   1573  *	For each cpu c on that node:
   1574  *		For each node position n in list of distances:
   1575  *			distmap[c][n] = dists[n]
   1576  */
   1577 
static void parse_distmap_line_sn(char *buf)
{
	char *p, *pend, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned long c, n, node;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;

	/* the second (space separated) field is the node number */
	if ((p = strchr(buf, ' ')) == NULL)
		goto err;
	if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems)
		goto err;
	if ((p = strstr(q, " dist ")) == NULL)
		goto err;
	p += strlen(" dist ");
	/* terminate the colon-separated distance list at the next blank */
	if ((pend = strchr(p, ' ')) != NULL)
		*pend = '\0';
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	/* nodes not mentioned in the list stay "infinitely far" */
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned long d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t) d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	/* apply this node's distance row to every cpu on the node */
	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
}
   1631 
   1632 static void build_distmap_sn()
   1633 {
   1634 	int ncpus = cpuset_cpus_nbits();
   1635 	int nmems = cpuset_mems_nbits();
   1636 	int c, m;
   1637 	static int tried_before = 0;
   1638 	FILE *fp = NULL;
   1639 	char *buf = NULL;
   1640 	int buflen;
   1641 
   1642 	if (tried_before)
   1643 		goto err;
   1644 	tried_before = 1;
   1645 
   1646 	if ((fp = fopen(sn_topology, "r")) == NULL)
   1647 		goto err;
   1648 
   1649 	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
   1650 		goto err;
   1651 
   1652 	for (c = 0; c < ncpus; c++)
   1653 		for (m = 0; m < nmems; m++)
   1654 			distmap[I(c, m)] = DISTMAP_MAX;
   1655 
   1656 	buflen = filesize(fp);
   1657 	if ((buf = malloc(buflen)) == NULL)
   1658 		goto err;
   1659 
   1660 	while (flgets(buf, buflen, fp) != NULL)
   1661 		if (strprefix(buf, sn_top_node_prefix))
   1662 			parse_distmap_line_sn(buf);
   1663 
   1664 	free(buf);
   1665 	fclose(fp);
   1666 	return;
   1667 err:
   1668 	free(buf);
   1669 	free(distmap);
   1670 	distmap = NULL;
   1671 	if (fp)
   1672 		fclose(fp);
   1673 }
   1674 
   1675 #endif
   1676 
   1677 /* [optional] Hardware distance from CPU to Memory Node */
   1678 unsigned int cpuset_cpumemdist(int cpu, int mem)
   1679 {
   1680 	int ncpus = cpuset_cpus_nbits();
   1681 	int nmems = cpuset_mems_nbits();
   1682 	distmap_entry_t r = DISTMAP_MAX;
   1683 
   1684 	flockfile(stdout);
   1685 
   1686 	if (check() < 0)
   1687 		goto err;
   1688 
   1689 	if (distmap == NULL)
   1690 		build_distmap();
   1691 
   1692 #ifdef ALTERNATE_SN_DISTMAP
   1693 	if (distmap == NULL)
   1694 		build_distmap_sn();
   1695 #endif
   1696 
   1697 	if (distmap == NULL)
   1698 		goto err;
   1699 
   1700 	if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems)
   1701 		goto err;
   1702 
   1703 	r = distmap[I(cpu, mem)];
   1704 	/* fall into ... */
   1705 err:
   1706 	funlockfile(stdout);
   1707 	return r;
   1708 }
   1709 
   1710 /* [optional] Return Memory Node closest to cpu */
   1711 int cpuset_cpu2node(int cpu)
   1712 {
   1713 	int ncpus = cpuset_cpus_nbits();
   1714 	int nmems = cpuset_mems_nbits();
   1715 	struct bitmask *cpus = NULL, *mems = NULL;
   1716 	int r = -1;
   1717 
   1718 	if (check() < 0)
   1719 		goto err;
   1720 
   1721 	if ((cpus = bitmask_alloc(ncpus)) == NULL)
   1722 		goto err;
   1723 	bitmask_setbit(cpus, cpu);
   1724 
   1725 	if ((mems = bitmask_alloc(nmems)) == NULL)
   1726 		goto err;
   1727 	cpuset_localmems(cpus, mems);
   1728 	r = bitmask_first(mems);
   1729 	/* fall into ... */
   1730 err:
   1731 	bitmask_free(cpus);
   1732 	bitmask_free(mems);
   1733 	return r;
   1734 }
   1735 
/*
 * Write every valid-and-dirty field of *cp out to the cpuset
 * directory 'path'.  Optional kernel files (memory_migrate,
 * memory_pressure_enabled, memory_spread_*, sched_*) are only
 * written if they exist on this kernel.  Flags are stored before
 * the cpus and mems masks.  Returns 0 on success, -1 on the first
 * failed store (errno set by the failing write).
 */
static int apply_cpuset_settings(const char *path, const struct cpuset *cp)
{
	if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) {
		if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0)
			goto err;
	}

	if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) {
		if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0)
			goto err;
	}

	if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) {
		if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0)
			goto err;
	}

	if (cp->notify_on_release_valid && cp->notify_on_release_dirty) {
		if (store_flag(path, "notify_on_release", cp->notify_on_release)
		    < 0)
			goto err;
	}

	if (cp->memory_migrate_valid &&
	    cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) {
		if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0)
			goto err;
	}

	if (cp->memory_pressure_enabled_valid &&
	    cp->memory_pressure_enabled_dirty &&
	    exists_flag(path, "memory_pressure_enabled")) {
		if (store_flag
		    (path, "memory_pressure_enabled",
		     cp->memory_pressure_enabled) < 0)
			goto err;
	}

	if (cp->memory_spread_page_valid &&
	    cp->memory_spread_page_dirty &&
	    exists_flag(path, "memory_spread_page")) {
		if (store_flag
		    (path, "memory_spread_page", cp->memory_spread_page) < 0)
			goto err;
	}

	if (cp->memory_spread_slab_valid &&
	    cp->memory_spread_slab_dirty &&
	    exists_flag(path, "memory_spread_slab")) {
		if (store_flag
		    (path, "memory_spread_slab", cp->memory_spread_slab) < 0)
			goto err;
	}

	if (cp->sched_load_balance_valid &&
	    cp->sched_load_balance_dirty &&
	    exists_flag(path, "sched_load_balance")) {
		if (store_flag
		    (path, "sched_load_balance", cp->sched_load_balance) < 0)
			goto err;
	}

	if (cp->sched_relax_domain_level_valid &&
	    cp->sched_relax_domain_level_dirty &&
	    exists_flag(path, "sched_relax_domain_level")) {
		if (store_number
		    (path, "sched_relax_domain_level",
		     cp->sched_relax_domain_level) < 0)
			goto err;
	}

	if (cp->cpus_valid && cp->cpus_dirty) {
		if (store_mask(path, "cpus", cp->cpus) < 0)
			goto err;
	}

	if (cp->mems_valid && cp->mems_dirty) {
		if (store_mask(path, "mems", cp->mems) < 0)
			goto err;
	}
	return 0;
err:
	return -1;
}
   1820 
   1821 /*
   1822  * get_siblings() - helper routine for cpuset_would_crash_kernel(), below.
   1823  *
   1824  * Extract max value of any 'siblings' field in /proc/cpuinfo.
   1825  * Cache the result - only need to extract once in lifetime of task.
   1826  *
   1827  * The siblings field is the number of logical CPUs in a physical
   1828  * processor package.  It is equal to the product of the number of
   1829  * cores in that package, times the number of hyper-threads per core.
   1830  * The bug that cpuset_would_crash_kernel() is detecting arises
   1831  * when a cpu_exclusive cpuset tries to include just some, not all,
   1832  * of the sibling logical CPUs available in a processor package.
   1833  *
   1834  * In the improbable case that a system has mixed values of siblings
   1835  * (some processor packages have more than others, perhaps due to
   1836  * partially enabling Hyper-Threading), we take the worse case value,
   1837  * the largest siblings value.  This might be overkill.  I don't know
   1838  * if this kernel bug considers each processor package's siblings
   1839  * separately or not.  But it sure is easier this way ...
   1840  *
   1841  * This routine takes about 0.7 msecs on a 4 CPU 2.8 MHz Xeon, from
   1842  * open to close, the first time called.
   1843  */
   1844 
   1845 static int get_siblings()
   1846 {
   1847 	static int siblings;
   1848 	char buf[32];		/* big enough for one 'siblings' line */
   1849 	FILE *fp;
   1850 
   1851 	if (siblings)
   1852 		return siblings;
   1853 
   1854 	if ((fp = fopen("/proc/cpuinfo", "r")) == NULL)
   1855 		return 4;	/* wing it - /proc not mounted ? */
   1856 	while (flgets(buf, sizeof(buf), fp) != NULL) {
   1857 		int s;
   1858 
   1859 		if (sscanf(buf, "siblings : %d", &s) < 1)
   1860 			continue;
   1861 		if (s > siblings)
   1862 			siblings = s;
   1863 	}
   1864 	fclose(fp);
   1865 	if (siblings == 0)
   1866 		siblings = 1;	/* old kernel, no siblings, default to 1 */
   1867 	return siblings;
   1868 }
   1869 
   1870 /*
   1871  * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic
   1872  * scheduler domain code invoked for cpu_exclusive cpusets that causes
   1873  * the kernel to freeze, requiring a hardware reset.
   1874  *
   1875  * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive'
   1876  * cpuset is defined where that cpusets 'cpus' are not on package
   1877  * boundaries then the kernel will freeze, usually as soon as this
   1878  * cpuset is created, requiring a hardware reset.
   1879  *
   1880  * A cpusets 'cpus' are not on package boundaries if the cpuset
   1881  * includes a proper non-empty subset (some, but not all) of the
   1882  * logical cpus on a processor package.  This requires multiple
   1883  * logical CPUs per package, available with either Hyper-Thread or
   1884  * Multi-Core support.  Without one of these features, there is only
   1885  * one logical CPU per physical package, and it's not possible to
   1886  * have a proper, non-empty subset of a set of cardinality one.
   1887  *
   1888  * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC
   1889  * on i386 and x86_64 arch's.
   1890  *
   1891  * The objective of this routine cpuset_would_crash_kernel() is to
   1892  * determine if a proposed cpuset setting would crash the kernel due
   1893  * to this bug, so that the caller can avoid the crash.
   1894  *
   1895  * Ideally we'd check for exactly these conditions here, but computing
   1896  * the package (identified by the 'physical id' field of /proc/cpuinfo)
   1897  * of each cpu in a cpuset is more effort than it's worth here.
   1898  *
   1899  * Also there is no obvious way to identify exactly whether the kernel
   1900  * one is executing on has this bug, short of trying it, and seeing
   1901  * if the kernel just crashed.
   1902  *
   1903  * So for now, we look for a simpler set of conditions, that meets
   1904  * our immediate need - avoid this crash on SUSE SLES10 systems that
   1905  * are susceptible to it.  We look for the kernel version 2.6.16.*,
   1906  * which is the base kernel of SUSE SLES10, and for i386 or x86_64
   1907  * processors, which had CONFIG_SCHED_MC enabled.
   1908  *
   1909  * If these simpler conditions are met, we further simplify the check,
   1910  * by presuming that the logical CPUs are numbered on processor
   1911  * package boundaries.  If each package has S siblings, we assume
   1912  * that CPUs numbered N through N + S -1 are on the same package,
   1913  * for any CPU N such that N mod S == 0.
   1914  *
   1915  * Yes, this is a hack, focused on avoiding kernel freezes on
   1916  * susceptible SUSE SLES10 systems.
   1917  */
   1918 
/*
 * Return 1 if applying *cp would trigger the 2.6.16 cpu_exclusive
 * scheduler-domain kernel freeze described above, else 0.  The
 * "is this kernel susceptible" answer is cached in a static after
 * the first call.
 */
static int cpuset_would_crash_kernel(const struct cpuset *cp)
{
	static int susceptible_system = -1;

	/* only cpu_exclusive cpusets can trigger the bug */
	if (!cp->cpu_exclusive)
		goto ok;

	if (susceptible_system == -1) {
		struct utsname u;
		int rel_2_6_16, arch_i386, arch_x86_64;

		if (uname(&u) < 0)
			goto fail;
		rel_2_6_16 = strprefix(u.release, "2.6.16.");
		arch_i386 = streq(u.machine, "i386");
		arch_x86_64 = streq(u.machine, "x86_64");
		susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64);
	}

	if (susceptible_system) {
		int ncpus = cpuset_cpus_nbits();
		int siblings = get_siblings();
		unsigned int cpu;

		/* assume packages are [N, N+siblings) for N mod siblings == 0,
		 * per the simplifying assumption documented above */
		for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) {
			int s, num_set = 0;

			for (s = 0; s < siblings; s++) {
				if (bitmask_isbitset(cp->cpus, cpu + s))
					num_set++;
			}

			/* If none or all siblings set, we're still ok */
			if (num_set == 0 || num_set == siblings)
				continue;

			/* Found one that would crash kernel.  Fail.  */
			errno = ENXIO;
			goto fail;
		}
	}
	/* If not susceptible, or if all ok, fall into "ok" ... */
ok:
	return 0;		/* would not crash */
fail:
	return 1;		/* would crash */
}
   1966 
   1967 /* compare two cpuset and mark the dirty variable */
   1968 static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2)
   1969 {
   1970 	if (cp1->cpu_exclusive_valid &&
   1971 	    cp1->cpu_exclusive != cp2->cpu_exclusive)
   1972 		cp1->cpu_exclusive_dirty = 1;
   1973 
   1974 	if (cp1->mem_exclusive_valid &&
   1975 	    cp1->mem_exclusive != cp2->mem_exclusive)
   1976 		cp1->mem_exclusive_dirty = 1;
   1977 
   1978 	if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall)
   1979 		cp1->mem_hardwall_dirty = 1;
   1980 
   1981 	if (cp1->notify_on_release_valid &&
   1982 	    cp1->notify_on_release != cp2->notify_on_release)
   1983 		cp1->notify_on_release_dirty = 1;
   1984 
   1985 	if (cp1->memory_migrate_valid &&
   1986 	    cp1->memory_migrate != cp2->memory_migrate)
   1987 		cp1->memory_migrate_dirty = 1;
   1988 
   1989 	if (cp1->memory_pressure_enabled_valid &&
   1990 	    cp1->memory_pressure_enabled != cp2->memory_pressure_enabled)
   1991 		cp1->memory_pressure_enabled_dirty = 1;
   1992 
   1993 	if (cp1->memory_spread_page_valid &&
   1994 	    cp1->memory_spread_page != cp2->memory_spread_page)
   1995 		cp1->memory_spread_page_dirty = 1;
   1996 
   1997 	if (cp1->memory_spread_slab_valid &&
   1998 	    cp1->memory_spread_slab != cp2->memory_spread_slab)
   1999 		cp1->memory_spread_slab_dirty = 1;
   2000 
   2001 	if (cp1->sched_load_balance_valid &&
   2002 	    cp1->sched_load_balance != cp2->sched_load_balance)
   2003 		cp1->sched_load_balance_dirty = 1;
   2004 
   2005 	if (cp1->sched_relax_domain_level_valid &&
   2006 	    cp1->sched_relax_domain_level != cp2->sched_relax_domain_level)
   2007 		cp1->sched_relax_domain_level_dirty = 1;
   2008 
   2009 	if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus))
   2010 		cp1->cpus_dirty = 1;
   2011 	if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems))
   2012 		cp1->mems_dirty = 1;
   2013 }
   2014 
/*
 * Create (if new set) or modify cpuset 'cp' at location 'relpath'.
 *
 * On failure this routine unwinds whatever it did: a newly created
 * directory is removed, and for a modify, the settings about to be
 * changed (flagged by mark_dirty_variable()) are restored from the
 * snapshot taken in cp_sav.  Returns 0 on success, -1 with errno set.
 */
static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new)
{
	char buf[PATH_MAX];
	int do_rmdir_on_err = 0;
	int do_restore_cp_sav_on_err = 0;
	struct cpuset *cp_sav = NULL;
	int sav_errno;

	if (check() < 0)
		goto err;

	/* refuse settings that would tickle the SLES10 crash (see above) */
	if (cpuset_would_crash_kernel(cp))
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	if (new) {
		if (mkdir(buf, 0755) < 0)
			goto err;
		/* we made it, so we should remove it on error */
		do_rmdir_on_err = 1;
	}

	/* snapshot current settings so a failed modify can be rolled back */
	if ((cp_sav = cpuset_alloc()) == NULL)
		goto err;
	if (cpuset_query(cp_sav, relpath) < 0)
		goto err;
	/* we have old settings to restore on error */
	do_restore_cp_sav_on_err = 1;

	/* check which variable need to restore on error */
	mark_dirty_variable(cp_sav, cp);

	if (apply_cpuset_settings(buf, cp) < 0)
		goto err;

	cpuset_free(cp_sav);
	return 0;
err:
	/* preserve the causal errno across the cleanup calls below */
	sav_errno = errno;
	if (do_restore_cp_sav_on_err)
		(void)apply_cpuset_settings(buf, cp_sav);
	if (cp_sav)
		cpuset_free(cp_sav);
	if (do_rmdir_on_err)
		(void)rmdir(buf);
	errno = sav_errno;
	return -1;
}
   2065 
/*
 * Create cpuset 'cp' at location 'relpath'.
 * Returns 0 on success, -1 with errno set on failure; on error any
 * partially created directory is removed by cr_or_mod().
 */
int cpuset_create(const char *relpath, const struct cpuset *cp)
{
	return cr_or_mod(relpath, cp, 1);
}
   2071 
   2072 /* Delete cpuset at location 'path' (if empty) */
   2073 int cpuset_delete(const char *relpath)
   2074 {
   2075 	char buf[PATH_MAX];
   2076 
   2077 	if (check() < 0)
   2078 		goto err;
   2079 
   2080 	fullpath(buf, sizeof(buf), relpath);
   2081 	if (rmdir(buf) < 0)
   2082 		goto err;
   2083 
   2084 	return 0;
   2085 err:
   2086 	return -1;
   2087 }
   2088 
/*
 * Set cpuset cp to the cpuset at location 'relpath'.
 *
 * Reads each per-cpuset control file and marks the corresponding
 * *_valid flag in 'cp' as it goes.  Core files (cpu_exclusive,
 * mem_exclusive, notify_on_release, cpus, mems) are read
 * unconditionally; newer, kernel-version-dependent files are probed
 * with exists_flag() first and simply skipped when absent.
 * Returns 0 on success, -1 with errno set on failure.
 */
int cpuset_query(struct cpuset *cp, const char *relpath)
{
	char buf[PATH_MAX];

	if (check() < 0)
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	if (load_flag(buf, &cp->cpu_exclusive, "cpu_exclusive") < 0)
		goto err;
	cp->cpu_exclusive_valid = 1;

	if (load_flag(buf, &cp->mem_exclusive, "mem_exclusive") < 0)
		goto err;
	cp->mem_exclusive_valid = 1;

	if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0)
		goto err;
	cp->notify_on_release_valid = 1;

	/* optional files below: present only on some kernel versions */
	if (exists_flag(buf, "memory_migrate")) {
		if (load_flag(buf, &cp->memory_migrate, "memory_migrate") < 0)
			goto err;
		cp->memory_migrate_valid = 1;
	}

	if (exists_flag(buf, "mem_hardwall")) {
		if (load_flag(buf, &cp->mem_hardwall, "mem_hardwall") < 0)
			goto err;
		cp->mem_hardwall_valid = 1;
	}

	/* memory_pressure_enabled exists only on the top cpuset */
	if (exists_flag(buf, "memory_pressure_enabled")) {
		if (load_flag
		    (buf, &cp->memory_pressure_enabled,
		     "memory_pressure_enabled") < 0)
			goto err;
		cp->memory_pressure_enabled_valid = 1;
	}

	if (exists_flag(buf, "memory_spread_page")) {
		if (load_flag
		    (buf, &cp->memory_spread_page, "memory_spread_page") < 0)
			goto err;
		cp->memory_spread_page_valid = 1;
	}

	if (exists_flag(buf, "memory_spread_slab")) {
		if (load_flag
		    (buf, &cp->memory_spread_slab, "memory_spread_slab") < 0)
			goto err;
		cp->memory_spread_slab_valid = 1;
	}

	if (exists_flag(buf, "sched_load_balance")) {
		if (load_flag
		    (buf, &cp->sched_load_balance, "sched_load_balance") < 0)
			goto err;
		cp->sched_load_balance_valid = 1;
	}

	/* this one is a small integer, not a boolean flag */
	if (exists_flag(buf, "sched_relax_domain_level")) {
		if (load_number
		    (buf, &cp->sched_relax_domain_level,
		     "sched_relax_domain_level") < 0)
			goto err;
		cp->sched_relax_domain_level_valid = 1;
	}

	if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpus") < 0)
		goto err;
	cp->cpus_valid = 1;

	if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "mems") < 0)
		goto err;
	cp->mems_valid = 1;

	return 0;
err:
	return -1;
}
   2172 
/*
 * Modify cpuset at location 'relpath' to values of 'cp'.
 * Returns 0 on success, -1 with errno set; on failure the previous
 * settings are restored by cr_or_mod().
 */
int cpuset_modify(const char *relpath, const struct cpuset *cp)
{
	return cr_or_mod(relpath, cp, 0);
}
   2178 
/*
 * Get cpuset path of pid into buf (pid == 0 means the calling task).
 * Returns buf on success; NULL with errno set on failure:
 *  E2BIG  - buf too small for the path or its contents
 *  ESRCH  - no such pid
 *  ENOSYS - kernel apparently lacks cpuset support
 */
char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size)
{
	int fd;			/* dual use: cpuset file for pid and self */
	int rc;			/* dual use: snprintf and read return codes */

	if (check() < 0)
		return NULL;

	/* borrow result buf[] to build cpuset file path */
	if (pid == 0)
		rc = snprintf(buf, size, "/proc/self/cpuset");
	else
		rc = snprintf(buf, size, "/proc/%d/cpuset", pid);
	if (rc >= (int)size) {
		errno = E2BIG;
		return NULL;
	}
	if ((fd = open(buf, O_RDONLY)) < 0) {
		int e = errno;
		/* a missing per-pid file means the pid is gone */
		if (e == ENOENT)
			e = ESRCH;
		/*
		 * Probe /proc/self/cpuset to distinguish "pid gone"
		 * from "kernel lacks cpuset support" (ENOSYS).
		 */
		if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0)
			e = ENOSYS;
		else
			close(fd);
		errno = e;
		return NULL;
	}
	rc = read(fd, buf, size);
	close(fd);
	if (rc < 0)
		return NULL;
	/* content filled the whole buffer: no room for the terminator */
	if (rc >= (int)size) {
		errno = E2BIG;
		return NULL;
	}
	buf[rc] = 0;
	chomp(buf);	/* chomp() presumably trims the trailing newline */
	return buf;

}
   2221 
   2222 /* Get cpuset 'cp' of pid */
   2223 int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid)
   2224 {
   2225 	char buf[PATH_MAX];
   2226 
   2227 	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
   2228 		return -1;
   2229 	if (cpuset_query(cp, buf) < 0)
   2230 		return -1;
   2231 	return 0;
   2232 }
   2233 
   2234 /* [optional] Return mountpoint of cpuset filesystem */
   2235 const char *cpuset_mountpoint()
   2236 {
   2237 	if (check() < 0) {
   2238 		switch (errno) {
   2239 		case ENODEV:
   2240 			return "[cpuset filesystem not mounted]";
   2241 		default:
   2242 			return "[cpuset filesystem not supported]";
   2243 		}
   2244 	}
   2245 	return cpusetmnt;
   2246 }
   2247 
   2248 /* Return true if path is a directory. */
   2249 static int isdir(const char *path)
   2250 {
   2251 	struct stat statbuf;
   2252 
   2253 	if (stat(path, &statbuf) < 0)
   2254 		return 0;
   2255 	return S_ISDIR(statbuf.st_mode);
   2256 }
   2257 
   2258 /*
   2259  * [optional] cpuset_collides_exclusive() - True if would collide exclusive.
   2260  *
   2261  * Return true iff the specified cpuset would overlap with any
   2262  * sibling cpusets in either cpus or mems, where either this
   2263  * cpuset or the sibling is cpu_exclusive or mem_exclusive.
   2264  *
   2265  * cpuset_create() fails with errno == EINVAL if the requested cpuset
   2266  * would overlap with any sibling, where either one is cpu_exclusive or
   2267  * mem_exclusive.  This is a common, and not obvious error.  The
   2268  * following routine checks for this particular case, so that code
   2269  * creating cpusets can better identify the situation, perhaps to issue
   2270  * a more informative error message.
   2271  *
   2272  * Can also be used to diagnose cpuset_modify failures.  This
   2273  * routine ignores any existing cpuset with the same path as the
   2274  * given 'cpusetpath', and only looks for exclusive collisions with
   2275  * sibling cpusets of that path.
   2276  *
   2277  * In case of any error, returns (0) -- does not collide.  Presumably
   2278  * any actual attempt to create or modify a cpuset will encounter the
   2279  * same error, and report it usefully.
   2280  *
   2281  * This routine is not particularly efficient; most likely code creating or
   2282  * modifying a cpuset will want to try the operation first, and then if that
   2283  * fails with errno EINVAL, perhaps call this routine to determine if an
   2284  * exclusive cpuset collision caused the error.
   2285  */
   2286 
   2287 int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1)
   2288 {
   2289 	char parent[PATH_MAX];
   2290 	char *p;
   2291 	char *pathcopy = NULL;
   2292 	char *base;
   2293 	DIR *dir = NULL;
   2294 	struct dirent *dent;
   2295 	struct cpuset *cp2 = NULL;
   2296 	struct bitmask *cpus1 = NULL, *cpus2 = NULL;
   2297 	struct bitmask *mems1 = NULL, *mems2 = NULL;
   2298 	int ret;
   2299 
   2300 	if (check() < 0)
   2301 		goto err;
   2302 
   2303 	fullpath(parent, sizeof(parent), cpusetpath);
   2304 	if (streq(parent, cpusetmnt))
   2305 		goto err;	/* only one cpuset root - can't collide */
   2306 	pathcopy = strdup(parent);
   2307 	p = strrchr(parent, '/');
   2308 	if (!p)
   2309 		goto err;	/* huh? - impossible - run and hide */
   2310 	*p = 0;			/* now parent is dirname of fullpath */
   2311 
   2312 	p = strrchr(pathcopy, '/');
   2313 	base = p + 1;		/* now base is basename of fullpath */
   2314 	if (!*base)
   2315 		goto err;	/* this is also impossible - run away */
   2316 
   2317 	if ((dir = opendir(parent)) == NULL)
   2318 		goto err;
   2319 	if ((cp2 = cpuset_alloc()) == NULL)
   2320 		goto err;
   2321 	if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
   2322 		goto err;
   2323 	if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
   2324 		goto err;
   2325 	if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
   2326 		goto err;
   2327 	if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
   2328 		goto err;
   2329 
   2330 	while ((dent = readdir(dir)) != NULL) {
   2331 		char child[PATH_MAX];
   2332 
   2333 		if (streq(dent->d_name, ".") || streq(dent->d_name, ".."))
   2334 			continue;
   2335 		if (streq(dent->d_name, base))
   2336 			continue;
   2337 		pathcat2(child, sizeof(child), parent, dent->d_name);
   2338 		if (!isdir(child))
   2339 			continue;
   2340 		if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0)
   2341 			goto err;
   2342 		if (cp1->cpu_exclusive || cp2->cpu_exclusive) {
   2343 			cpuset_getcpus(cp1, cpus1);
   2344 			cpuset_getcpus(cp2, cpus2);
   2345 			if (bitmask_intersects(cpus1, cpus2))
   2346 				goto collides;
   2347 		}
   2348 		if (cp1->mem_exclusive || cp2->mem_exclusive) {
   2349 			cpuset_getmems(cp1, mems1);
   2350 			cpuset_getmems(cp2, mems2);
   2351 			if (bitmask_intersects(mems1, mems2))
   2352 				goto collides;
   2353 		}
   2354 	}
   2355 err:
   2356 	/* error, or did not collide */
   2357 	ret = 0;
   2358 	goto done;
   2359 collides:
   2360 	/* collides */
   2361 	ret = 1;
   2362 	/* fall into ... */
   2363 done:
   2364 	if (dir)
   2365 		closedir(dir);
   2366 	cpuset_free(cp2);
   2367 	free(pathcopy);
   2368 	bitmask_free(cpus1);
   2369 	bitmask_free(cpus2);
   2370 	bitmask_free(mems1);
   2371 	bitmask_free(mems2);
   2372 	return ret;
   2373 }
   2374 
   2375 /*
   2376  * [optional] cpuset_nuke() - Remove cpuset anyway possible
   2377  *
   2378  * Remove a cpuset, including killing tasks in it, and
   2379  * removing any descendent cpusets and killing their tasks.
   2380  *
   2381  * Tasks can take a long time (minutes on some configurations)
   2382  * to exit.  Loop up to 'seconds' seconds, trying to kill them.
   2383  *
   2384  * How we do it:
   2385  *	1) First, kill all the pids, looping until there are
   2386  *	   no more pids in this cpuset or below, or until the
   2387  *	   'seconds' timeout limit is exceeded.
   2388  *	2) Then depth first recursively rmdir the cpuset directories.
   2389  *	3) If by this point the original cpuset is gone, we succeeded.
   2390  *
   2391  * If the timeout is exceeded, and tasks still exist, fail with
   2392  * errno == ETIME.
   2393  *
   2394  * We sleep a variable amount of time.  After the first attempt to
   2395  * kill all the tasks in the cpuset or its descendents, we sleep 1
   2396  * second, the next time 2 seconds, increasing 1 second each loop
   2397  * up to a max of 10 seconds.  If more loops past 10 are required
   2398  * to kill all the tasks, we sleep 10 seconds each subsequent loop.
   2399  * In any case, before the last loop, we sleep however many seconds
   2400  * remain of the original timeout 'seconds' requested.  The total
   2401  * time of all sleeps will be no more than the requested 'seconds'.
   2402  *
   2403  * If the cpuset started out empty of any tasks, or if the passed in
   2404  * 'seconds' was zero, then this routine will return quickly, having
   2405  * not slept at all.  Otherwise, this routine will at a minimum send
   2406  * a SIGKILL to all the tasks in this cpuset subtree, then sleep one
   2407  * second, before looking to see if any tasks remain.  If tasks remain
   2408  * in the cpuset subtree, and a longer 'seconds' timeout was requested
   2409  * (more than one), it will continue to kill remaining tasks and sleep,
   2410  * in a loop, for as long as time and tasks remain.
   2411  *
   2412  * The signal sent for the kill is hardcoded to SIGKILL (9).  If some
   2413  * other signal should be sent first, use a separate code loop,
   2414  * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to
   2415  * scan the task pids in a cpuset.  If SIGKILL should -not- be sent,
   2416  * this cpuset_nuke() routine can still be called to recursively
   2417  * remove a cpuset subtree, by specifying a timeout of zero 'seconds'.
   2418  *
   2419  * On success, returns 0 with errno == 0.
   2420  *
   2421  * On failure, returns -1, with errno possibly one of:
   2422  *  EACCES - search permission denied on intervening directory
   2423  *  ETIME - timed out - tasks remain after 'seconds' timeout
   2424  *  EMFILE - too many open files
   2425  *  ENODEV - /dev/cpuset not mounted
   2426  *  ENOENT - component of cpuset path doesn't exist
   2427  *  ENOMEM - out of memory
   2428  *  ENOSYS - kernel doesn't support cpusets
   2429  *  ENOTDIR - component of cpuset path is not a directory
   2430  *  EPERM - lacked permission to kill a task
   2431  *  EPERM - lacked permission to read cpusets or files therein
   2432  */
   2433 
   2434 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree);
   2435 
   2436 int cpuset_nuke(const char *relpath, unsigned int seconds)
   2437 {
   2438 	unsigned int secs_left = seconds;	/* total sleep seconds left */
   2439 	unsigned int secs_loop = 1;	/* how much sleep next loop */
   2440 	unsigned int secs_slept;	/* seconds slept in sleep() */
   2441 	struct cpuset_pidlist *pl = NULL;	/* pids in cpuset subtree */
   2442 	struct cpuset_fts_tree *cs_tree;
   2443 	const struct cpuset_fts_entry *cs_entry;
   2444 	int ret, sav_errno = 0;
   2445 
   2446 	if (check() < 0)
   2447 		return -1;
   2448 
   2449 	if (seconds == 0)
   2450 		goto rmdir_cpusets;
   2451 
   2452 	while (1) {
   2453 		int plen, j;
   2454 
   2455 		if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) {
   2456 			/* missing cpuset is as good as if already nuked */
   2457 			if (errno == ENOENT) {
   2458 				ret = 0;
   2459 				goto no_more_cpuset;
   2460 			}
   2461 
   2462 			/* other problems reading cpuset are bad news */
   2463 			sav_errno = errno;
   2464 			goto failed;
   2465 		}
   2466 
   2467 		if ((plen = cpuset_pidlist_length(pl)) == 0)
   2468 			goto rmdir_cpusets;
   2469 
   2470 		for (j = 0; j < plen; j++) {
   2471 			pid_t pid;
   2472 
   2473 			if ((pid = cpuset_get_pidlist(pl, j)) > 1) {
   2474 				if (kill(pid, SIGKILL) < 0 && errno != ESRCH) {
   2475 					sav_errno = errno;
   2476 					goto failed;
   2477 				}
   2478 			}
   2479 		}
   2480 
   2481 		if (secs_left == 0)
   2482 			goto took_too_long;
   2483 
   2484 		cpuset_freepidlist(pl);
   2485 		pl = NULL;
   2486 
   2487 		secs_slept = secs_loop - sleep(secs_loop);
   2488 
   2489 		/* Ensure forward progress */
   2490 		if (secs_slept == 0)
   2491 			secs_slept = 1;
   2492 
   2493 		/* Ensure sane sleep() return (unnecessary?) */
   2494 		if (secs_slept > secs_loop)
   2495 			secs_slept = secs_loop;
   2496 
   2497 		secs_left -= secs_slept;
   2498 
   2499 		if (secs_loop < 10)
   2500 			secs_loop++;
   2501 
   2502 		secs_loop = min(secs_left, secs_loop);
   2503 	}
   2504 
   2505 took_too_long:
   2506 	sav_errno = ETIME;
   2507 	/* fall into ... */
   2508 failed:
   2509 	cpuset_freepidlist(pl);
   2510 	errno = sav_errno;
   2511 	return -1;
   2512 
   2513 rmdir_cpusets:
   2514 	/* Let's try removing cpuset(s) now. */
   2515 	cpuset_freepidlist(pl);
   2516 
   2517 	if ((cs_tree = cpuset_fts_open(relpath)) == NULL && errno != ENOENT)
   2518 		return -1;
   2519 	ret = 0;
   2520 	cpuset_fts_reverse(cs_tree);	/* rmdir's must be done bottom up */
   2521 	while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
   2522 		char buf[PATH_MAX];
   2523 
   2524 		fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry));
   2525 		if (rmdir(buf) < 0 && errno != ENOENT) {
   2526 			sav_errno = errno;
   2527 			ret = -1;
   2528 		}
   2529 	}
   2530 	cpuset_fts_close(cs_tree);
   2531 	/* fall into ... */
   2532 no_more_cpuset:
   2533 	if (ret == 0)
   2534 		errno = 0;
   2535 	else
   2536 		errno = sav_errno;
   2537 	return ret;
   2538 }
   2539 
   2540 /*
   2541  * When recursively reading all the tasks files from a subtree,
   2542  * chain together the read results, one pidblock per tasks file,
   2543  * containing the raw unprocessed ascii as read(2) in.  After
   2544  * we gather up this raw data, we then go back to count how
   2545  * many pid's there are in total, allocate an array of pid_t
   2546  * of that size, and transform the raw ascii data into this
   2547  * array of pid_t's.
   2548  */
   2549 
struct pidblock {
	char *buf;		/* raw ascii pid text, nul-terminated */
	int buflen;		/* bytes of pid data in buf (excl. nul) */
	struct pidblock *next;	/* next tasks-file block in the chain */
};
   2555 
   2556 /*
   2557  * Chain the raw contents of a file onto the pbhead list.
   2558  *
   2559  * We malloc "+ 1" extra byte for a nul-terminator, so that
   2560  * the strtoul() loop in pid_transform() won't scan past
   2561  * the end of pb->buf[] and accidentally find more pids.
   2562  */
   2563 static void add_pidblock(const char *file, struct pidblock **ppbhead)
   2564 {
   2565 	FILE *fp = NULL;
   2566 	struct pidblock *pb = NULL;
   2567 	int fsz;
   2568 
   2569 	if ((fp = fopen(file, "r")) == NULL)
   2570 		goto err;
   2571 	fsz = filesize(fp);
   2572 	if (fsz == 0)
   2573 		goto err;
   2574 	if ((pb = calloc(1, sizeof(*pb))) == NULL)
   2575 		goto err;
   2576 	pb->buflen = fsz;
   2577 	if ((pb->buf = malloc(pb->buflen + 1)) == NULL)
   2578 		goto err;
   2579 	if (fread(pb->buf, 1, pb->buflen, fp) > 0) {
   2580 		pb->buf[pb->buflen] = '\0';
   2581 		pb->next = *ppbhead;
   2582 		*ppbhead = pb;
   2583 	}
   2584 	fclose(fp);
   2585 	return;
   2586 err:
   2587 	if (fp)
   2588 		fclose(fp);
   2589 	free(pb);
   2590 }
   2591 
/* Chain the raw contents of relpath's tasks file onto *ppbhead */
static void read_task_file(const char *relpath, struct pidblock **ppbhead)
{
	char buf[PATH_MAX];

	fullpath2(buf, sizeof(buf), relpath, "tasks");
	add_pidblock(buf, ppbhead);
}
   2599 
struct cpuset_pidlist {
	pid_t *pids;		/* array of task pids, sorted ascending */
	int npids;		/* number of entries in pids[] */
};
   2604 
   2605 /* Count how many pids in buf (one per line - just count newlines) */
   2606 static int pidcount(const char *buf, int buflen)
   2607 {
   2608 	int n = 0;
   2609 	const char *cp;
   2610 
   2611 	for (cp = buf; cp < buf + buflen; cp++) {
   2612 		if (*cp == '\n')
   2613 			n++;
   2614 	}
   2615 	return n;
   2616 }
   2617 
   2618 /* Transform one-per-line ascii pids in pb to pid_t entries in pl */
   2619 static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n)
   2620 {
   2621 	char *a, *b;
   2622 
   2623 	for (a = pb->buf; a < pb->buf + pb->buflen; a = b) {
   2624 		pid_t p = strtoul(a, &b, 10);
   2625 		if (a == b)
   2626 			break;
   2627 		pl->pids[n++] = p;
   2628 	}
   2629 	return n;
   2630 }
   2631 
   2632 static void free_pidblocks(struct pidblock *pbhead)
   2633 {
   2634 	struct pidblock *pb, *nextpb;
   2635 
   2636 	for (pb = pbhead; pb; pb = nextpb) {
   2637 		nextpb = pb->next;
   2638 		free(pb->buf);
   2639 		free(pb);
   2640 	}
   2641 }
   2642 
   2643 /* numeric comparison routine for qsort */
   2644 static int numericsort(const void *m1, const void *m2)
   2645 {
   2646 	pid_t p1 = *(pid_t *) m1;
   2647 	pid_t p2 = *(pid_t *) m2;
   2648 
   2649 	return p1 - p2;
   2650 }
   2651 
   2652 /* Return list pids in cpuset 'path' */
   2653 struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath,
   2654 					   int recursiveflag)
   2655 {
   2656 	struct pidblock *pb = NULL;
   2657 	struct cpuset_pidlist *pl = NULL;
   2658 	struct pidblock *pbhead = NULL;
   2659 	int n;
   2660 
   2661 	if (check() < 0)
   2662 		goto err;
   2663 
   2664 	if (recursiveflag) {
   2665 		struct cpuset_fts_tree *cs_tree;
   2666 		const struct cpuset_fts_entry *cs_entry;
   2667 
   2668 		if ((cs_tree = cpuset_fts_open(relpath)) == NULL)
   2669 			goto err;
   2670 		while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
   2671 			if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET)
   2672 				continue;
   2673 			read_task_file(cpuset_fts_get_path(cs_entry), &pbhead);
   2674 		}
   2675 		cpuset_fts_close(cs_tree);
   2676 	} else {
   2677 		read_task_file(relpath, &pbhead);
   2678 	}
   2679 
   2680 	if ((pl = calloc(1, sizeof(*pl))) == NULL)
   2681 		goto err;
   2682 	pl->npids = 0;
   2683 	for (pb = pbhead; pb; pb = pb->next)
   2684 		pl->npids += pidcount(pb->buf, pb->buflen);
   2685 	if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL)
   2686 		goto err;
   2687 	n = 0;
   2688 	for (pb = pbhead; pb; pb = pb->next)
   2689 		n = pid_transform(pb, pl, n);
   2690 	free_pidblocks(pbhead);
   2691 	qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort);
   2692 	return pl;
   2693 err:
   2694 	cpuset_freepidlist(pl);
   2695 	free_pidblocks(pbhead);
   2696 	return NULL;
   2697 }
   2698 
   2699 /* Return number of elements in pidlist */
   2700 int cpuset_pidlist_length(const struct cpuset_pidlist *pl)
   2701 {
   2702 	if (pl)
   2703 		return pl->npids;
   2704 	else
   2705 		return 0;
   2706 }
   2707 
   2708 /* Return i'th element of pidlist */
   2709 pid_t cpuset_get_pidlist(const struct cpuset_pidlist * pl, int i)
   2710 {
   2711 	if (pl && i >= 0 && i < pl->npids)
   2712 		return pl->pids[i];
   2713 	else
   2714 		return (pid_t) - 1;
   2715 }
   2716 
   2717 /* Free pidlist */
   2718 void cpuset_freepidlist(struct cpuset_pidlist *pl)
   2719 {
   2720 	if (pl && pl->pids)
   2721 		free(pl->pids);
   2722 	free(pl);
   2723 }
   2724 
   2725 static int __cpuset_move(pid_t pid, const char *path)
   2726 {
   2727 	char buf[SMALL_BUFSZ];
   2728 
   2729 	snprintf(buf, sizeof(buf), "%u", pid);
   2730 	return write_string_file(path, buf);
   2731 }
   2732 
   2733 /* Move task (pid == 0 for current) to a cpuset */
   2734 int cpuset_move(pid_t pid, const char *relpath)
   2735 {
   2736 	char buf[PATH_MAX];
   2737 
   2738 	if (check() < 0)
   2739 		return -1;
   2740 
   2741 	if (pid == 0)
   2742 		pid = getpid();
   2743 
   2744 	fullpath2(buf, sizeof(buf), relpath, "tasks");
   2745 	return __cpuset_move(pid, buf);
   2746 }
   2747 
   2748 /* Move all tasks in pidlist to a cpuset */
   2749 int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath)
   2750 {
   2751 	int i;
   2752 	char buf[PATH_MAX];
   2753 	int ret;
   2754 
   2755 	if (check() < 0)
   2756 		return -1;
   2757 
   2758 	fullpath2(buf, sizeof(buf), relpath, "tasks");
   2759 
   2760 	ret = 0;
   2761 	for (i = 0; i < pl->npids; i++)
   2762 		if (__cpuset_move(pl->pids[i], buf) < 0)
   2763 			ret = -1;
   2764 	return ret;
   2765 }
   2766 
   2767 /*
   2768  * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a
   2769  *                                      cpuset to another cpuset
   2770  *
   2771  * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may
   2772  * race with tasks being added to or forking into fromrelpath. Loop
   2773  * repeatedly, reading the tasks file of cpuset fromrelpath and writing
   2774  * any task pid's found there to the tasks file of cpuset torelpath,
   2775  * up to ten attempts, or until the tasks file of cpuset fromrelpath
   2776  * is empty, or until fromrelpath is no longer present.
   2777  *
   2778  * Returns 0 with errno == 0 if able to empty the tasks file of cpuset
   2779  * fromrelpath. Of course it is still possible that some independent
   2780  * task could add another task to cpuset fromrelpath at the same time
   2781  * that such a successful result is being returned, so there can be
   2782  * no guarantee that a successful return means that fromrelpath is
   2783  * still empty of tasks.
   2784  *
   2785  * We are careful to allow for the possibility that the cpuset
   2786  * fromrelpath might disappear out from under us, perhaps because it
   2787  * has notify_on_release set and gets automatically removed as soon
   2788  * as we detach its last task from it.  Consider a missing fromrelpath
   2789  * to be a successful move.
   2790  *
   2791  * If called with fromrelpath and torelpath pathnames that evaluate to
   2792  * the same cpuset, then treat that as if cpuset_reattach() was called,
   2793  * rebinding each task in this cpuset one time, and return success or
   2794  * failure depending on the return of that cpuset_reattach() call.
   2795  *
   2796  * On failure, returns -1, with errno possibly one of:
   2797  *  EACCES - search permission denied on intervening directory
   2798  *  ENOTEMPTY - tasks remain after multiple attempts to move them
   2799  *  EMFILE - too many open files
   2800  *  ENODEV - /dev/cpuset not mounted
   2801  *  ENOENT - component of cpuset path doesn't exist
   2802  *  ENOMEM - out of memory
   2803  *  ENOSYS - kernel doesn't support cpusets
   2804  *  ENOTDIR - component of cpuset path is not a directory
   2805  *  EPERM - lacked permission to kill a task
   2806  *  EPERM - lacked permission to read cpusets or files therein
   2807  *
   2808  * This is an [optional] function. Use cpuset_function to invoke it.
   2809  */
   2810 
   2811 #define NUMBER_MOVE_TASK_ATTEMPTS 10
   2812 
int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath)
{
	char fromfullpath[PATH_MAX];
	char tofullpath[PATH_MAX];
	int i;
	struct cpuset_pidlist *pl = NULL;
	int sav_errno;

	fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath);
	fullpath(tofullpath, sizeof(tofullpath), torelpath);

	/* moving a cpuset onto itself degenerates to a reattach */
	if (samefile(fromfullpath, tofullpath))
		return cpuset_reattach(fromrelpath);

	/* retry: tasks may fork into 'from' while we are draining it */
	for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) {
		int plen, j;

		if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) {
			/* missing cpuset is as good as if all moved */
			if (errno == ENOENT)
				goto no_more_cpuset;

			/* other problems reading cpuset are bad news */
			sav_errno = errno;
			goto failed;
		}

		if ((plen = cpuset_pidlist_length(pl)) == 0)
			goto no_more_pids;

		for (j = 0; j < plen; j++) {
			pid_t pid;

			pid = cpuset_get_pidlist(pl, j);
			if (cpuset_move(pid, torelpath) < 0) {
				/* missing task is as good as if moved */
				if (errno == ESRCH)
					continue;

				/* other per-task errors are bad news */
				sav_errno = errno;
				goto failed;
			}
		}

		cpuset_freepidlist(pl);
		pl = NULL;
	}

	/* attempts exhausted and tasks still keep appearing */
	sav_errno = ENOTEMPTY;
	/* fall into ... */
failed:
	cpuset_freepidlist(pl);
	errno = sav_errno;
	return -1;

no_more_pids:
no_more_cpuset:
	/* Success - all tasks (or entire cpuset ;) gone. */
	cpuset_freepidlist(pl);
	errno = 0;
	return 0;
}
   2876 
   2877 /* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */
   2878 int cpuset_migrate(pid_t pid, const char *relpath)
   2879 {
   2880 	char buf[PATH_MAX];
   2881 	char buf2[PATH_MAX];
   2882 	char memory_migrate_flag;
   2883 	int r;
   2884 
   2885 	if (check() < 0)
   2886 		return -1;
   2887 
   2888 	if (pid == 0)
   2889 		pid = getpid();
   2890 
   2891 	fullpath(buf2, sizeof(buf2), relpath);
   2892 
   2893 	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
   2894 		return -1;
   2895 	if (store_flag(buf2, "memory_migrate", 1) < 0)
   2896 		return -1;
   2897 
   2898 	fullpath2(buf, sizeof(buf), relpath, "tasks");
   2899 
   2900 	r = __cpuset_move(pid, buf);
   2901 
   2902 	store_flag(buf2, "memory_migrate", memory_migrate_flag);
   2903 	return r;
   2904 }
   2905 
   2906 /* Migrate all tasks in pidlist to a cpuset (moves task and memory) */
   2907 int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath)
   2908 {
   2909 	int i;
   2910 	char buf[PATH_MAX];
   2911 	char buf2[PATH_MAX];
   2912 	char memory_migrate_flag;
   2913 	int ret;
   2914 
   2915 	if (check() < 0)
   2916 		return -1;
   2917 
   2918 	fullpath(buf2, sizeof(buf2), relpath);
   2919 
   2920 	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
   2921 		return -1;
   2922 	if (store_flag(buf2, "memory_migrate", 1) < 0)
   2923 		return -1;
   2924 
   2925 	fullpath2(buf, sizeof(buf), relpath, "tasks");
   2926 
   2927 	ret = 0;
   2928 	for (i = 0; i < pl->npids; i++)
   2929 		if (__cpuset_move(pl->pids[i], buf) < 0)
   2930 			ret = -1;
   2931 
   2932 	if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0)
   2933 		ret = -1;
   2934 	return ret;
   2935 }
   2936 
   2937 /* Rebind cpus_allowed of each task in cpuset 'path' */
   2938 int cpuset_reattach(const char *relpath)
   2939 {
   2940 	struct cpuset_pidlist *pl;
   2941 	int rc;
   2942 
   2943 	if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL)
   2944 		return -1;
   2945 	rc = cpuset_move_all(pl, relpath);
   2946 	cpuset_freepidlist(pl);
   2947 	return rc;
   2948 }
   2949 
   2950 /* Map cpuset relative cpu number to system wide cpu number */
   2951 int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu)
   2952 {
   2953 	struct cpuset *cp_tofree = NULL;
   2954 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
   2955 	int pos = -1;
   2956 
   2957 	if (!cp1)
   2958 		goto err;
   2959 	pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu);
   2960 	/* fall into ... */
   2961 err:
   2962 	cpuset_free(cp_tofree);
   2963 	return pos;
   2964 }
   2965 
   2966 /* Map system wide cpu number to cpuset relative cpu number */
   2967 int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu)
   2968 {
   2969 	struct cpuset *cp_tofree = NULL;
   2970 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
   2971 	int pos = -1;
   2972 
   2973 	if (!cp1)
   2974 		goto err;
   2975 	pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu);
   2976 	/* fall into ... */
   2977 err:
   2978 	cpuset_free(cp_tofree);
   2979 	return pos;
   2980 }
   2981 
   2982 /* Map cpuset relative mem number to system wide mem number */
   2983 int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem)
   2984 {
   2985 	struct cpuset *cp_tofree = NULL;
   2986 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
   2987 	int pos = -1;
   2988 
   2989 	if (!cp1)
   2990 		goto err;
   2991 	pos = bitmask_rel_to_abs_pos(cp1->mems, mem);
   2992 	/* fall into ... */
   2993 err:
   2994 	cpuset_free(cp_tofree);
   2995 	return pos;
   2996 }
   2997 
   2998 /* Map system wide mem number to cpuset relative mem number */
   2999 int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem)
   3000 {
   3001 	struct cpuset *cp_tofree = NULL;
   3002 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
   3003 	int pos = -1;
   3004 
   3005 	if (!cp1)
   3006 		goto err;
   3007 	pos = bitmask_abs_to_rel_pos(cp1->mems, mem);
   3008 	/* fall into ... */
   3009 err:
   3010 	cpuset_free(cp_tofree);
   3011 	return pos;
   3012 }
   3013 
   3014 /* Map pid's cpuset relative cpu number to system wide cpu number */
   3015 int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu)
   3016 {
   3017 	struct cpuset *cp;
   3018 	int rc = -1;
   3019 
   3020 	if ((cp = cpuset_alloc()) == NULL)
   3021 		goto done;
   3022 	if (cpuset_cpusetofpid(cp, pid) < 0)
   3023 		goto done;
   3024 	rc = cpuset_c_rel_to_sys_cpu(cp, cpu);
   3025 done:
   3026 	cpuset_free(cp);
   3027 	return rc;
   3028 }
   3029 
   3030 /* Map system wide cpu number to pid's cpuset relative cpu number */
   3031 int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu)
   3032 {
   3033 	struct cpuset *cp;
   3034 	int rc = -1;
   3035 
   3036 	if ((cp = cpuset_alloc()) == NULL)
   3037 		goto done;
   3038 	if (cpuset_cpusetofpid(cp, pid) < 0)
   3039 		goto done;
   3040 	rc = cpuset_c_sys_to_rel_cpu(cp, cpu);
   3041 done:
   3042 	cpuset_free(cp);
   3043 	return rc;
   3044 }
   3045 
   3046 /* Map pid's cpuset relative mem number to system wide mem number */
   3047 int cpuset_p_rel_to_sys_mem(pid_t pid, int mem)
   3048 {
   3049 	struct cpuset *cp;
   3050 	int rc = -1;
   3051 
   3052 	if ((cp = cpuset_alloc()) == NULL)
   3053 		goto done;
   3054 	if (cpuset_cpusetofpid(cp, pid) < 0)
   3055 		goto done;
   3056 	rc = cpuset_c_rel_to_sys_mem(cp, mem);
   3057 done:
   3058 	cpuset_free(cp);
   3059 	return rc;
   3060 }
   3061 
   3062 /* Map system wide mem number to pid's cpuset relative mem number */
   3063 int cpuset_p_sys_to_rel_mem(pid_t pid, int mem)
   3064 {
   3065 	struct cpuset *cp;
   3066 	int rc = -1;
   3067 
   3068 	if ((cp = cpuset_alloc()) == NULL)
   3069 		goto done;
   3070 	if (cpuset_cpusetofpid(cp, pid) < 0)
   3071 		goto done;
   3072 	rc = cpuset_c_sys_to_rel_mem(cp, mem);
   3073 done:
   3074 	cpuset_free(cp);
   3075 	return rc;
   3076 }
   3077 
   3078 /*
   3079  * Override glibc's calls for get/set affinity - they have
   3080  * something using cpu_set_t that will die when NR_CPUS > 1024.
   3081  * Go directly to the 'real' system calls.  Also override calls
   3082  * for get_mempolicy and set_mempolicy.  None of these
   3083  * calls are yet (July 2004) guaranteed to be in all glibc versions
   3084  * that we care about.
   3085  */
   3086 
/*
 * Direct syscall wrapper: glibc's sched_setaffinity() works in terms
 * of cpu_set_t, which breaks when NR_CPUS > 1024, so bypass it.
 */
static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask)
{
	return ltp_syscall(__NR_sched_setaffinity, pid, len, mask);
}
   3091 
#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
/* Direct syscall wrapper for get_mempolicy(2); see comment above */
static int get_mempolicy(int *policy, unsigned long *nmask,
			 unsigned long maxnode, void *addr, int flags)
{
	return ltp_syscall(__NR_get_mempolicy, policy, nmask, maxnode,
		addr, flags);
}
#endif
   3100 
#if HAVE_DECL_MPOL_BIND || HAVE_DECL_MPOL_DEFAULT
/* Direct syscall wrapper for set_mempolicy(2); see comment above */
static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode)
{
	return ltp_syscall(__NR_set_mempolicy, mode, nmask, maxnode);
}
#endif
   3107 
/* Snapshot of a task's cpuset placement, used to detect migration */
struct cpuset_placement {
	struct bitmask *cpus;	/* CPUs allowed at time of snapshot */
	struct bitmask *mems;	/* memory nodes allowed at time of snapshot */
	char *path;		/* cpuset-relative path of the task's cpuset */
};
   3113 
   3114 /* Allocate and fill in a placement struct - cpatures current placement */
   3115 struct cpuset_placement *cpuset_get_placement(pid_t pid)
   3116 {
   3117 	struct cpuset_placement *plc;
   3118 	struct cpuset *cp = NULL;
   3119 	char buf[PATH_MAX];
   3120 	int nbits;
   3121 
   3122 	if ((plc = calloc(1, sizeof(*plc))) == NULL)
   3123 		goto err;
   3124 
   3125 	nbits = cpuset_cpus_nbits();
   3126 	if ((plc->cpus = bitmask_alloc(nbits)) == NULL)
   3127 		goto err;
   3128 
   3129 	nbits = cpuset_mems_nbits();
   3130 	if ((plc->mems = bitmask_alloc(nbits)) == NULL)
   3131 		goto err;
   3132 
   3133 	if ((cp = cpuset_alloc()) == NULL)
   3134 		goto err;
   3135 	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
   3136 		goto err;
   3137 	if (cpuset_query(cp, buf) < 0)
   3138 		goto err;
   3139 
   3140 	bitmask_copy(plc->cpus, cp->cpus);
   3141 	bitmask_copy(plc->mems, cp->mems);
   3142 	plc->path = strdup(buf);
   3143 
   3144 	cpuset_free(cp);
   3145 	return plc;
   3146 err:
   3147 	cpuset_free(cp);
   3148 	cpuset_free_placement(plc);
   3149 	return NULL;
   3150 }
   3151 
   3152 /* Compare two placement structs - use to detect changes in placement */
   3153 int cpuset_equal_placement(const struct cpuset_placement *plc1,
   3154 			   const struct cpuset_placement *plc2)
   3155 {
   3156 	return bitmask_equal(plc1->cpus, plc2->cpus) &&
   3157 	    bitmask_equal(plc1->mems, plc2->mems) &&
   3158 	    streq(plc1->path, plc2->path);
   3159 }
   3160 
   3161 /* Free a placement struct */
   3162 void cpuset_free_placement(struct cpuset_placement *plc)
   3163 {
   3164 	if (!plc)
   3165 		return;
   3166 	bitmask_free(plc->cpus);
   3167 	bitmask_free(plc->mems);
   3168 	free(plc->path);
   3169 	free(plc);
   3170 }
   3171 
   3172 /*
   3173  * A cpuset_fts_open() call constructs a linked list of entries
   3174  * called a "cpuset_fts_tree", with one entry per cpuset below
   3175  * the specified path.  The cpuset_fts_read() routine returns the
   3176  * next entry on this list.  The various cpuset_fts_get_*() calls
   3177  * return attributes of the specified entry.  The cpuset_fts_close()
   3178  * call frees the linked list and all associated data.  All cpuset
   3179  * entries and attributes for the cpuset_fts_tree returned from a
   3180  * given cpuset_fts_open() call remain allocated and unchanged until
   3181  * that cpuset_fts_tree is closed by a cpuset_fts_close() call.  Any
   3182  * subsequent changes to the cpuset filesystem will go unnoticed
   3183  * (not affect open cpuset_fts_tree's.)
   3184  */
   3185 
struct cpuset_fts_entry;
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree);

/* Handle on a snapshot of a cpuset hierarchy (see comment above) */
struct cpuset_fts_tree {
	struct cpuset_fts_entry *head;	/* head of linked entry list */
	struct cpuset_fts_entry *next;	/* cpuset_fts_read() offset */
};

/* One cpuset directory in the snapshot */
struct cpuset_fts_entry {
	struct cpuset_fts_entry *next;	/* linked entry list chain */
	struct cpuset *cpuset;	/* NULL if cpuset could not be queried */
	struct stat *stat;	/* NULL if directory was unreadable */
	char *path;		/* cpuset-relative path ("/" for root) */
	int info;		/* CPUSET_FTS_* status of this entry */
	int err;		/* errno value if info reports an error */
};
   3202 
   3203 /* Open a handle on a cpuset hierarchy.  All the real work is done here. */
   3204 struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
   3205 {
   3206 	FTS *fts = NULL;
   3207 	FTSENT *ftsent;
   3208 	char *path_argv[2];
   3209 	char buf[PATH_MAX];
   3210 	struct cpuset_fts_tree *cs_tree = NULL;
   3211 	struct cpuset_fts_entry *ep;	/* the latest new list entry */
   3212 	struct cpuset_fts_entry **pnlep;	/* ptr to next list entry ptr */
   3213 	char *relpath;
   3214 	int fts_flags;
   3215 
   3216 	fullpath(buf, sizeof(buf), cpusetpath);
   3217 	path_argv[0] = buf;
   3218 	path_argv[1] = NULL;
   3219 
   3220 	fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
   3221 	fts = fts_open(path_argv, fts_flags, NULL);
   3222 	if (fts == NULL)
   3223 		goto err;
   3224 
   3225 	cs_tree = malloc(sizeof(*cs_tree));
   3226 	if (cs_tree == NULL)
   3227 		goto err;
   3228 	pnlep = &cs_tree->head;
   3229 	*pnlep = NULL;
   3230 
   3231 	while ((ftsent = fts_read(fts)) != NULL) {
   3232 		if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
   3233 			continue;
   3234 
   3235 		/* ftsent is a directory (perhaps unreadable) ==> cpuset */
   3236 		ep = calloc(1, sizeof(*ep));
   3237 		if (ep == NULL)
   3238 			goto err;
   3239 		*pnlep = ep;
   3240 		pnlep = &ep->next;
   3241 
   3242 		/* Set entry's path, and if DNR, error */
   3243 		relpath = ftsent->fts_path + strlen(cpusetmnt);
   3244 		if (strlen(relpath) == 0)
   3245 			relpath = "/";
   3246 		ep->path = strdup(relpath);
   3247 		if (ep->path == NULL)
   3248 			goto err;
   3249 		if (ftsent->fts_info == FTS_DNR) {
   3250 			ep->info = CPUSET_FTS_ERR_DNR;
   3251 			ep->err = ftsent->fts_errno;
   3252 			continue;
   3253 		}
   3254 
   3255 		/* ftsent is a -readable- cpuset: set entry's stat, etc */
   3256 		ep->stat = calloc(1, sizeof(struct stat));
   3257 		if (ep->stat == NULL)
   3258 			goto err;
   3259 		if (stat(ftsent->fts_path, ep->stat) < 0) {
   3260 			ep->info = CPUSET_FTS_ERR_STAT;
   3261 			ep->err = ftsent->fts_errno;
   3262 			continue;
   3263 		}
   3264 
   3265 		ep->cpuset = calloc(1, sizeof(struct cpuset));
   3266 		if (ep->cpuset == NULL)
   3267 			goto err;
   3268 		if (cpuset_query(ep->cpuset, relpath) < 0) {
   3269 			ep->info = CPUSET_FTS_ERR_CPUSET;
   3270 			ep->err = errno;
   3271 			continue;
   3272 		}
   3273 		ep->info = CPUSET_FTS_CPUSET;
   3274 	}
   3275 
   3276 	(void)fts_close(fts);
   3277 	cpuset_fts_rewind(cs_tree);
   3278 	return cs_tree;
   3279 
   3280 err:
   3281 	if (cs_tree)
   3282 		cpuset_fts_close(cs_tree);
   3283 	if (fts)
   3284 		(void)fts_close(fts);
   3285 	return NULL;
   3286 }
   3287 
   3288 /* Return pointer to next cpuset entry in hierarchy */
   3289 const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
   3290 {
   3291 	const struct cpuset_fts_entry *cs_entry = cs_tree->next;
   3292 	if (cs_tree->next != NULL)	/* seek to next entry */
   3293 		cs_tree->next = cs_tree->next->next;
   3294 	return cs_entry;
   3295 }
   3296 
   3297 /* Reverse list of cpusets, in place.  Simulates pre-order/post-order flip. */
   3298 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
   3299 {
   3300 	struct cpuset_fts_entry *cs1, *cs2, *cs3;
   3301 
   3302 	/*
   3303 	 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
   3304 	 * is redirected from cs3 to cs1.
   3305 	 */
   3306 
   3307 	cs1 = cs2 = NULL;
   3308 	cs3 = cs_tree->head;
   3309 	while (cs3) {
   3310 		cs1 = cs2;
   3311 		cs2 = cs3;
   3312 		cs3 = cs3->next;
   3313 		cs2->next = cs1;
   3314 	}
   3315 	cs_tree->head = cs2;
   3316 	cpuset_fts_rewind(cs_tree);
   3317 }
   3318 
/* Rewind cpuset list to beginning, so cpuset_fts_read() starts over */
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
{
	cs_tree->next = cs_tree->head;
}

/* Return pointer to nul-terminated cpuset path of entry in hierarchy */
const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->path;
}

/* Return pointer to stat(2) structure of a cpuset entry's directory */
const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->stat;
}

/* Return pointer to cpuset structure of a cpuset entry */
const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry
					   *cs_entry)
{
	return cs_entry->cpuset;
}

/* Return value of errno (0 if no error) on attempted cpuset operations */
int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->err;
}

/* Return operation identity (CPUSET_FTS_*) causing error */
int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->info;
}
   3355 
   3356 /* Close a cpuset hierarchy handle (free's all associated memory) */
   3357 void cpuset_fts_close(struct cpuset_fts_tree *cs_tree)
   3358 {
   3359 	struct cpuset_fts_entry *cs_entry = cs_tree->head;
   3360 
   3361 	while (cs_entry) {
   3362 		struct cpuset_fts_entry *ep = cs_entry;
   3363 
   3364 		cs_entry = cs_entry->next;
   3365 		free(ep->path);
   3366 		free(ep->stat);
   3367 		cpuset_free(ep->cpuset);
   3368 		free(ep);
   3369 	}
   3370 	free(cs_tree);
   3371 }
   3372 
   3373 /* Bind current task to cpu (uses sched_setaffinity(2)) */
   3374 int cpuset_cpubind(int cpu)
   3375 {
   3376 	struct bitmask *bmp;
   3377 	int r;
   3378 
   3379 	if ((bmp = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
   3380 		return -1;
   3381 	bitmask_setbit(bmp, cpu);
   3382 	r = sched_setaffinity(0, bitmask_nbytes(bmp), bitmask_mask(bmp));
   3383 	bitmask_free(bmp);
   3384 	return r;
   3385 }
   3386 
   3387 /*
   3388  * int cpuset_latestcpu(pid_t pid)
   3389  *
   3390  * Return most recent CPU on which task pid executed.  If pid == 0,
   3391  * examine current task.
   3392  *
   3393  * The last used CPU is visible for a given pid as field #39 (starting
   3394  * with #1) in the file /proc/pid/stat.  Currently this file has 41
   3395  * fields, in which case this is the 3rd to the last field.
   3396  *
   3397  * Unfortunately field #2 is a command name and might have embedded
   3398  * whitespace.  So we can't just count white space separated fields.
   3399  * Fortunately, this command name is surrounded by parentheses, as
   3400  * for example "(sh)", and that closing parenthesis is the last ')'
   3401  * character in the line.  No remaining fields can have embedded
   3402  * whitespace or parentheses.  So instead of looking for the 39th
   3403  * white space separated field, we can look for the 37th white space
   3404  * separated field past the last ')' character on the line.
   3405  */
   3406 
   3407 /* Return most recent CPU on which task pid executed */
   3408 int cpuset_latestcpu(pid_t pid)
   3409 {
   3410 	char buf[PATH_MAX];
   3411 	char *bp;
   3412 	int fd = -1;
   3413 	int cpu = -1;
   3414 
   3415 	if (pid == 0)
   3416 		snprintf(buf, sizeof(buf), "/proc/self/stat");
   3417 	else
   3418 		snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
   3419 
   3420 	if ((fd = open(buf, O_RDONLY)) < 0)
   3421 		goto err;
   3422 	if (read(fd, buf, sizeof(buf)) < 1)
   3423 		goto err;
   3424 	close(fd);
   3425 
   3426 	bp = strrchr(buf, ')');
   3427 	if (bp)
   3428 		sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %u",	/* 37th field past ')' */
   3429 		       &cpu);
   3430 	if (cpu < 0)
   3431 		errno = EINVAL;
   3432 	return cpu;
   3433 err:
   3434 	if (fd >= 0)
   3435 		close(fd);
   3436 	return -1;
   3437 }
   3438 
/* Bind current task to memory node 'mem' (uses set_mempolicy(2)) */
int cpuset_membind(int mem)
{
	struct bitmask *bmp;
	int r;

	if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		return -1;
	bitmask_setbit(bmp, mem);
#if HAVE_DECL_MPOL_BIND
	/* maxnode is one past the highest bit, hence nbits + 1 */
	r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), bitmask_nbits(bmp) + 1);
#else
	/* kernel headers lack mempolicy support */
	r = -1;
	errno = ENOSYS;
#endif
	bitmask_free(bmp);
	return r;
}
   3457 
/*
 * [optional] Return Memory Node holding page at specified addr,
 * or -1 if unavailable (no mempolicy support, or lookup failed).
 */
int cpuset_addr2node(void *addr)
{
	int node = -1;

#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR)) {
		/* I realize this seems redundant, but I _want_ to make sure
		 * that this value is -1. */
		node = -1;
	}
#endif
	return node;
}
   3472 
   3473 /*
   3474  * Transform cpuset into Text Format Representation in buffer 'buf',
   3475  * of length 'buflen', nul-terminated if space allows.  Return number
   3476  * of characters that would have been written, if enough space had
   3477  * been available, in the same way that snprintf() does.
   3478  */
   3479 
   3480 /* Export cpuset settings to a regular file */
   3481 int cpuset_export(const struct cpuset *cp, char *buf, int buflen)
   3482 {
   3483 	char *tmp = NULL;
   3484 	int n = 0;
   3485 
   3486 	if (cp->cpu_exclusive)
   3487 		n += snprintf(buf + n, max(buflen - n, 0), "cpu_exclusive\n");
   3488 
   3489 	if (cp->mem_exclusive)
   3490 		n += snprintf(buf + n, max(buflen - n, 0), "mem_exclusive\n");
   3491 
   3492 	if (cp->notify_on_release)
   3493 		n += snprintf(buf + n, max(buflen - n, 0),
   3494 			      "notify_on_release\n");
   3495 
   3496 	if (cp->memory_pressure_enabled)
   3497 		n += snprintf(buf + n, max(buflen - n, 0),
   3498 			      "memory_pressure_enabled\n");
   3499 
   3500 	if (cp->memory_migrate)
   3501 		n += snprintf(buf + n, max(buflen - n, 0), "memory_migrate\n");
   3502 
   3503 	if (cp->memory_spread_page)
   3504 		n += snprintf(buf + n, max(buflen - n, 0),
   3505 			      "memory_spread_page\n");
   3506 
   3507 	if (cp->memory_spread_slab)
   3508 		n += snprintf(buf + n, max(buflen - n, 0),
   3509 			      "memory_spread_slab\n");
   3510 
   3511 	if ((tmp = sprint_mask_buf(cp->cpus)) == NULL)
   3512 		return -1;
   3513 	n += snprintf(buf + n, max(buflen - n, 0), "cpus %s\n", tmp);
   3514 	free(tmp);
   3515 	tmp = NULL;
   3516 
   3517 	if ((tmp = sprint_mask_buf(cp->mems)) == NULL)
   3518 		return -1;
   3519 	n += snprintf(buf + n, max(buflen - n, 0), "mems %s\n", tmp);
   3520 	free(tmp);
   3521 	tmp = NULL;
   3522 
   3523 	return n;
   3524 }
   3525 
   3526 static int import_list(UNUSED const char *tok, const char *arg,
   3527 		       struct bitmask *bmp, char *emsg, int elen)
   3528 {
   3529 	if (bitmask_parselist(arg, bmp) < 0) {
   3530 		if (emsg)
   3531 			snprintf(emsg, elen, "Invalid list format: %s", arg);
   3532 		return -1;
   3533 	}
   3534 	return 0;
   3535 }
   3536 
   3537 static void stolower(char *s)
   3538 {
   3539 	while (*s) {
   3540 		unsigned char c = *s;
   3541 		*s = tolower(c);
   3542 		s++;
   3543 	}
   3544 }
   3545 
/*
 * Import cpuset settings from the Text Format Representation in 'buf'
 * (the inverse of cpuset_export).  On failure returns -1, stores the
 * offending line number in *elinenum (if non-NULL) and a message in
 * emsg (if non-NULL, at most elen bytes).  Returns 0 on success.
 */
int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum,
		  char *emsg, int elen)
{
	char *linebuf = NULL;
	int linebuflen;
	int linenum = 0;
	int offset = 0;		/* cursor into buf for slgets() */

	linebuflen = strlen(buf) + 1;
	if ((linebuf = malloc(linebuflen)) == NULL) {
		if (emsg)
			snprintf(emsg, elen, "Insufficient memory");
		goto err;
	}

	/* process buf one line at a time */
	while (slgets(linebuf, linebuflen, buf, &offset)) {
		char *tok, *arg;
		char *ptr;	/* for strtok_r */

		linenum++;
		/* strip trailing '#' comment */
		if ((tok = strchr(linebuf, '#')) != NULL)
			*tok = 0;
		if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL)
			continue;	/* blank or comment-only line */
		stolower(tok);

		/* optional argument; NOTE(review): arg is NULL when a
		 * cpus/mems line has no value - presumably
		 * bitmask_parselist rejects that; verify */
		arg = strtok_r(0, " \t", &ptr);

		/* boolean flag tokens take no argument */
		if (streq(tok, "cpu_exclusive")) {
			cp->cpu_exclusive = 1;
			goto eol;
		}
		if (streq(tok, "mem_exclusive")) {
			cp->mem_exclusive = 1;
			goto eol;
		}
		if (streq(tok, "notify_on_release")) {
			cp->notify_on_release = 1;
			goto eol;
		}
		if (streq(tok, "memory_pressure_enabled")) {
			cp->memory_pressure_enabled = 1;
			goto eol;
		}
		if (streq(tok, "memory_migrate")) {
			cp->memory_migrate = 1;
			goto eol;
		}
		if (streq(tok, "memory_spread_page")) {
			cp->memory_spread_page = 1;
			goto eol;
		}
		if (streq(tok, "memory_spread_slab")) {
			cp->memory_spread_slab = 1;
			goto eol;
		}
		if (streq(tok, "cpu") || streq(tok, "cpus")) {
			if (import_list(tok, arg, cp->cpus, emsg, elen) < 0)
				goto err;
			goto eol;
		}
		if (streq(tok, "mem") || streq(tok, "mems")) {
			if (import_list(tok, arg, cp->mems, emsg, elen) < 0)
				goto err;
			goto eol;
		}
		if (emsg)
			snprintf(emsg, elen, "Unrecognized token: '%s'", tok);
		goto err;
eol:
		/* nothing else may follow a recognized token (and its arg) */
		if ((tok = strtok_r(0, " \t", &ptr)) != NULL) {
			if (emsg)
				snprintf(emsg, elen, "Surplus token: '%s'",
					 tok);
			goto err;
		}
		continue;
	}

	free(linebuf);

	/* if only one of cpus/mems was given, derive the other locally */
	if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems))
		cpuset_localcpus(cp->mems, cp->cpus);
	else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems))
		cpuset_localmems(cp->cpus, cp->mems);

	/*
	 * All cpuset attributes are determined in an import.
	 * Those that aren't explicitly specified are presumed
	 * to be unchanged (zero, if it's a freshly allocated
	 * struct cpuset.)
	 */

	cp->cpus_valid = 1;
	cp->mems_valid = 1;
	cp->cpu_exclusive_valid = 1;
	cp->mem_exclusive_valid = 1;
	cp->notify_on_release_valid = 1;
	cp->memory_migrate_valid = 1;
	cp->memory_pressure_enabled_valid = 1;
	cp->memory_spread_page_valid = 1;
	cp->memory_spread_slab_valid = 1;

	return 0;
err:
	if (elinenum)
		*elinenum = linenum;
	free(linebuf);
	return -1;
}
   3657 
   3658 /* Pin current task CPU (and memory) */
   3659 int cpuset_pin(int relcpu)
   3660 {
   3661 	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
   3662 	int cpu, r;
   3663 
   3664 	if (check() < 0)
   3665 		return -1;
   3666 
   3667 	do {
   3668 		cpuset_free_placement(plc1);
   3669 		plc1 = cpuset_get_placement(0);
   3670 
   3671 		r = 0;
   3672 		if (cpuset_unpin() < 0)
   3673 			r = -1;
   3674 		cpu = cpuset_p_rel_to_sys_cpu(0, relcpu);
   3675 		if (cpuset_cpubind(cpu) < 0)
   3676 			r = -1;
   3677 
   3678 		cpuset_free_placement(plc2);
   3679 		plc2 = cpuset_get_placement(0);
   3680 	} while (!cpuset_equal_placement(plc1, plc2));
   3681 
   3682 	cpuset_free_placement(plc1);
   3683 	cpuset_free_placement(plc2);
   3684 	return r;
   3685 }
   3686 
   3687 /* Return number CPUs in current tasks cpuset */
   3688 int cpuset_size()
   3689 {
   3690 	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
   3691 	int r;
   3692 
   3693 	if (check() < 0)
   3694 		return -1;
   3695 
   3696 	do {
   3697 		cpuset_free_placement(plc1);
   3698 		plc1 = cpuset_get_placement(0);
   3699 
   3700 		r = cpuset_cpus_weight(0);
   3701 
   3702 		cpuset_free_placement(plc2);
   3703 		plc2 = cpuset_get_placement(0);
   3704 	} while (!cpuset_equal_placement(plc1, plc2));
   3705 
   3706 	cpuset_free_placement(plc1);
   3707 	cpuset_free_placement(plc2);
   3708 	return r;
   3709 }
   3710 
   3711 /* Return relative CPU number, within current cpuset, last executed on */
   3712 int cpuset_where()
   3713 {
   3714 	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
   3715 	int r;
   3716 
   3717 	if (check() < 0)
   3718 		return -1;
   3719 
   3720 	do {
   3721 		cpuset_free_placement(plc1);
   3722 		plc1 = cpuset_get_placement(0);
   3723 
   3724 		r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0));
   3725 
   3726 		cpuset_free_placement(plc2);
   3727 		plc2 = cpuset_get_placement(0);
   3728 	} while (!cpuset_equal_placement(plc1, plc2));
   3729 
   3730 	cpuset_free_placement(plc1);
   3731 	cpuset_free_placement(plc2);
   3732 	return r;
   3733 }
   3734 
/*
 * Undo cpuset_pin - let current task have the run of all CPUs in its
 * cpuset.  Returns 0 on success, -1 on failure (including when built
 * without MPOL_DEFAULT support, in which case r is never set to 0).
 */
int cpuset_unpin()
{
	struct bitmask *cpus = NULL, *mems = NULL;
	int r = -1;

	if (check() < 0)
		goto err;

	/*
	 * Don't need cpuset_*_placement() guard against concurrent
	 * cpuset migration, because none of the following depends
	 * on the tasks cpuset placement.
	 */

	/* allow all CPUs; kernel clips to the cpuset's cpus_allowed */
	if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
		goto err;
	bitmask_setall(cpus);
	if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0)
		goto err;

	/* reset memory policy to the default (mems mask is ignored) */
	if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		goto err;
#if HAVE_DECL_MPOL_DEFAULT
	if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems),
			  bitmask_nbits(mems) + 1) < 0)
		goto err;
	r = 0;
#endif
	/* fall into ... */
err:
	bitmask_free(cpus);
	bitmask_free(mems);
	return r;

}
   3771 
   3772 struct cpuset_function_list {
   3773 	const char *fname;
   3774 	void *func;
   3775 } flist[] = {
   3776 	{
   3777 	"cpuset_version", cpuset_version}, {
   3778 	"cpuset_alloc", cpuset_alloc}, {
   3779 	"cpuset_free", cpuset_free}, {
   3780 	"cpuset_cpus_nbits", cpuset_cpus_nbits}, {
   3781 	"cpuset_mems_nbits", cpuset_mems_nbits}, {
   3782 	"cpuset_setcpus", cpuset_setcpus}, {
   3783 	"cpuset_setmems", cpuset_setmems}, {
   3784 	"cpuset_set_iopt", cpuset_set_iopt}, {
   3785 	"cpuset_set_sopt", cpuset_set_sopt}, {
   3786 	"cpuset_getcpus", cpuset_getcpus}, {
   3787 	"cpuset_getmems", cpuset_getmems}, {
   3788 	"cpuset_cpus_weight", cpuset_cpus_weight}, {
   3789 	"cpuset_mems_weight", cpuset_mems_weight}, {
   3790 	"cpuset_get_iopt", cpuset_get_iopt}, {
   3791 	"cpuset_get_sopt", cpuset_get_sopt}, {
   3792 	"cpuset_localcpus", cpuset_localcpus}, {
   3793 	"cpuset_localmems", cpuset_localmems}, {
   3794 	"cpuset_cpumemdist", cpuset_cpumemdist}, {
   3795 	"cpuset_cpu2node", cpuset_cpu2node}, {
   3796 	"cpuset_addr2node", cpuset_addr2node}, {
   3797 	"cpuset_create", cpuset_create}, {
   3798 	"cpuset_delete", cpuset_delete}, {
   3799 	"cpuset_query", cpuset_query}, {
   3800 	"cpuset_modify", cpuset_modify}, {
   3801 	"cpuset_getcpusetpath", cpuset_getcpusetpath}, {
   3802 	"cpuset_cpusetofpid", cpuset_cpusetofpid}, {
   3803 	"cpuset_mountpoint", cpuset_mountpoint}, {
   3804 	"cpuset_collides_exclusive", cpuset_collides_exclusive}, {
   3805 	"cpuset_nuke", cpuset_nuke}, {
   3806 	"cpuset_init_pidlist", cpuset_init_pidlist}, {
   3807 	"cpuset_pidlist_length", cpuset_pidlist_length}, {
   3808 	"cpuset_get_pidlist", cpuset_get_pidlist}, {
   3809 	"cpuset_freepidlist", cpuset_freepidlist}, {
   3810 	"cpuset_move", cpuset_move}, {
   3811 	"cpuset_move_all", cpuset_move_all}, {
   3812 	"cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks}, {
   3813 	"cpuset_migrate", cpuset_migrate}, {
   3814 	"cpuset_migrate_all", cpuset_migrate_all}, {
   3815 	"cpuset_reattach", cpuset_reattach}, {
   3816 	"cpuset_open_memory_pressure", cpuset_open_memory_pressure}, {
   3817 	"cpuset_read_memory_pressure", cpuset_read_memory_pressure}, {
   3818 	"cpuset_close_memory_pressure", cpuset_close_memory_pressure}, {
   3819 	"cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu}, {
   3820 	"cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu}, {
   3821 	"cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem}, {
   3822 	"cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem}, {
   3823 	"cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu}, {
   3824 	"cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu}, {
   3825 	"cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem}, {
   3826 	"cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem}, {
   3827 	"cpuset_get_placement", cpuset_get_placement}, {
   3828 	"cpuset_equal_placement", cpuset_equal_placement}, {
   3829 	"cpuset_free_placement", cpuset_free_placement}, {
   3830 	"cpuset_fts_open", cpuset_fts_open}, {
   3831 	"cpuset_fts_read", cpuset_fts_read}, {
   3832 	"cpuset_fts_reverse", cpuset_fts_reverse}, {
   3833 	"cpuset_fts_rewind", cpuset_fts_rewind}, {
   3834 	"cpuset_fts_get_path", cpuset_fts_get_path}, {
   3835 	"cpuset_fts_get_stat", cpuset_fts_get_stat}, {
   3836 	"cpuset_fts_get_cpuset", cpuset_fts_get_cpuset}, {
   3837 	"cpuset_fts_get_errno", cpuset_fts_get_errno}, {
   3838 	"cpuset_fts_get_info", cpuset_fts_get_info}, {
   3839 	"cpuset_fts_close", cpuset_fts_close}, {
   3840 	"cpuset_cpubind", cpuset_cpubind}, {
   3841 	"cpuset_latestcpu", cpuset_latestcpu}, {
   3842 	"cpuset_membind", cpuset_membind}, {
   3843 	"cpuset_export", cpuset_export}, {
   3844 	"cpuset_import", cpuset_import}, {
   3845 	"cpuset_function", cpuset_function}, {
   3846 	"cpuset_pin", cpuset_pin}, {
   3847 	"cpuset_size", cpuset_size}, {
   3848 	"cpuset_where", cpuset_where}, {
   3849 "cpuset_unpin", cpuset_unpin},};
   3850 
   3851 /* Return pointer to a libcpuset.so function, or NULL */
   3852 void *cpuset_function(const char *function_name)
   3853 {
   3854 	unsigned int i;
   3855 
   3856 	for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++)
   3857 		if (streq(function_name, flist[i].fname))
   3858 			return flist[i].func;
   3859 	return NULL;
   3860 }
   3861 
/* Fortran interface to basic cpuset routines (arguments by reference) */
int cpuset_pin_(int *ptr_relcpu)
{
	return cpuset_pin(*ptr_relcpu);
}

int cpuset_size_(void)
{
	return cpuset_size();
}

int cpuset_where_(void)
{
	return cpuset_where();
}

int cpuset_unpin_(void)
{
	return cpuset_unpin();
}
   3882 
   3883 #endif /* HAVE_LINUX_MEMPOLICY_H */
   3884