/*
 * cpuset user library implementation.
 *
 * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved.
 *
 * Paul Jackson <pj (at) sgi.com>
 */

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#define _GNU_SOURCE	/* need to see pread() and syscall() */
#include <unistd.h>

#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fts.h>
#include <limits.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <sys/utsname.h>	/* for cpuset_would_crash_kernel() */

#include "bitmask.h"
#include "cpuset.h"
#include "common.h"
#include "test.h"
#include "lapi/syscalls.h"
#include "config.h"

#if HAVE_LINUX_MEMPOLICY_H
#include <linux/mempolicy.h>

/* Bump version, and update Change History, when libcpuset API changes */
#define CPUSET_VERSION 3

/*
 * For a history of what changed in each version, see the "Change
 * History" section, at the end of the libcpuset master document.
 */

int cpuset_version(void)
{
	return CPUSET_VERSION;
}

struct cpuset {
	struct bitmask *cpus;
	struct bitmask *mems;
	char cpu_exclusive;
	char mem_exclusive;
	char mem_hardwall;
	char notify_on_release;
	char memory_migrate;
	char memory_pressure_enabled;
	char memory_spread_page;
	char memory_spread_slab;
	char sched_load_balance;
	int sched_relax_domain_level;

	/*
	 * Each field 'x' above gets an 'x_valid' field below.
	 * The apply_cpuset_settings() routine will only set those
	 * fields whose corresponding *_valid flags are set.  The
	 * cpuset_alloc() routine clears these flags as part of the
	 * clear in calloc(), and the various cpuset_set*() routines
	 * set these flags when setting the corresponding value.
	 *
	 * The purpose of these valid fields is to ensure that when
	 * we create a new cpuset, we don't accidentally overwrite
	 * some non-zero kernel default, such as an inherited
	 * memory_spread_* flag, just because the user application
	 * code didn't override the default zero settings resulting
	 * from the calloc() call in cpuset_alloc().
	 *
	 * The choice of 'char' for the type of the flags above,
	 * but a bitfield for the flags below, is somewhat capricious.
	 */
	unsigned cpus_valid:1;
	unsigned mems_valid:1;
	unsigned cpu_exclusive_valid:1;
	unsigned mem_exclusive_valid:1;
	unsigned mem_hardwall_valid:1;
	unsigned notify_on_release_valid:1;
	unsigned memory_migrate_valid:1;
	unsigned memory_pressure_enabled_valid:1;
	unsigned memory_spread_page_valid:1;
	unsigned memory_spread_slab_valid:1;
	unsigned sched_load_balance_valid:1;
	unsigned sched_relax_domain_level_valid:1;

	/*
	 * If the corresponding field above was modified, use the
	 * following flags to mark it dirty.
	 */
	unsigned cpus_dirty:1;
	unsigned mems_dirty:1;
	unsigned cpu_exclusive_dirty:1;
	unsigned mem_exclusive_dirty:1;
	unsigned mem_hardwall_dirty:1;
	unsigned notify_on_release_dirty:1;
	unsigned memory_migrate_dirty:1;
	unsigned memory_pressure_enabled_dirty:1;
	unsigned memory_spread_page_dirty:1;
	unsigned memory_spread_slab_dirty:1;
	unsigned sched_load_balance_dirty:1;
	unsigned sched_relax_domain_level_dirty:1;
};

/* Presumed cpuset file system mount point */
static const char *cpusetmnt = "/dev/cpuset";

/* Stashed copy of cpunodemap[], mapping each cpu to its node. */
static const char *mapfile = "/var/run/cpunodemap";

/* The primary source for the cpunodemap[] is available below here. */
static const char *sysdevices = "/sys/devices/system";

/* small buffer size - for reading boolean flags or map file (1 or 2 ints) */
#define SMALL_BUFSZ 16

/*
 * The 'mask_size_file' is used to ferret out the kernel cpumask_t
 * and nodemask_t sizes.  The lines in this file that begin with the
 * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask
 * and nodemask string, respectively.  The lengths of these strings
 * reflect the kernel's internal cpumask_t and nodemask_t sizes,
 * which sizes are needed to correctly call the sched_setaffinity
 * and set_mempolicy system calls, and to size user level
 * bitmasks to match the kernels.
 */

static const char *mask_size_file = "/proc/self/status";
static const char *cpumask_prefix = "Cpus_allowed:\t";
static const char *nodemask_prefix = "Mems_allowed:\t";

/*
 * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits.
 *
 * The first time we need these, we parse the Cpus_allowed and
 * Mems_allowed lines from mask_size_file ("/proc/self/status").
 */

static int cpumask_sz;
static int nodemask_sz;

/*
 * These defaults only kick in if we fail to size the kernel
 * cpumask and nodemask by reading the Cpus_allowed and
 * Mems_allowed fields from the /proc/self/status file.
 */

#define DEFCPUBITS (512)
#define DEFNODEBITS (DEFCPUBITS/2)

/*
 * Arch-neutral API for obtaining NUMA distances between CPUs
 * and Memory Nodes, via the files:
 *	/sys/devices/system/node/nodeN/distance
 * which have lines such as:
 *	46 66 10 20
 * which say that for a cpu on node N (from the path above), the
 * distance to nodes 0, 1, 2, and 3 are 46, 66, 10, and 20,
 * respectively.
 */

static const char *distance_directory = "/sys/devices/system/node";

/*
 * Someday, we should disable, then later discard, the SN code
 * marked ALTERNATE_SN_DISTMAP.
 */

#define ALTERNATE_SN_DISTMAP 1
#ifdef ALTERNATE_SN_DISTMAP

/*
 * Alternative SN (SGI ia64) architecture specific API for obtaining
 * NUMA distances between CPUs and Memory Nodes is via the file
 * /proc/sgi_sn/sn_topology, which has lines such as:
 *
 *   node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
 *
 * which says that for each CPU on node 2, the distance to nodes
 * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
 *
 * This file has other lines as well, which start with other
 * keywords than "node".  Ignore these other lines.
 */

static const char *sn_topology = "/proc/sgi_sn/sn_topology";
static const char *sn_top_node_prefix = "node ";

#endif

/*
 * Check that cpusets are supported and /dev/cpuset is mounted.
 * If ok, return 0.
 * If not, return -1 and set errno:
 *	ENOSYS - kernel doesn't support cpusets
 *	ENODEV - /dev/cpuset not mounted
 */

static enum {
	check_notdone,
	check_enosys,
	check_enodev,
	check_ok
} check_state = check_notdone;

static int check(void)
{
	if (check_state == check_notdone) {
		struct stat statbuf;

		if (stat("/proc/self/cpuset", &statbuf) < 0) {
			check_state = check_enosys;
			goto done;
		}

		if (stat("/dev/cpuset/tasks", &statbuf) < 0) {
			check_state = check_enodev;
			goto done;
		}

		check_state = check_ok;
	}
done:
	switch (check_state) {
	case check_enosys:
		errno = ENOSYS;
		return -1;
	case check_enodev:
		errno = ENODEV;
		return -1;
	default:
		break;
	}
	return 0;
}

static void chomp(char *s)
{
	char *t;

	for (t = s + strlen(s) - 1; t >= s; t--) {
		if (*t == '\n' || *t == '\r')
			*t = '\0';
		else
			break;
	}
}

/*
 * Determine number of bytes in a seekable open file, without
 * assuming that stat(2) on that file has a useful size.
 * Has the side effect of leaving the file rewound to the beginning.
 */
static int filesize(FILE *fp)
{
	int sz = 0;

	rewind(fp);
	while (fgetc(fp) != EOF)
		sz++;
	rewind(fp);
	return sz;
}

/* Are strings s1 and s2 equal? */
static int streq(const char *s1, const char *s2)
{
	return strcmp(s1, s2) == 0;
}

/* Is string 'pre' a prefix of string 's'? */
static int strprefix(const char *s, const char *pre)
{
	return strncmp(s, pre, strlen(pre)) == 0;
}

/*
 * char *flgets(char *buf, int buflen, FILE *fp)
 *
 * Obtain one line from input file fp.  Copy up to first
 * buflen-1 chars of line into buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to buffer buf
 * on success, or NULL if nothing more to read or failure.
 */

static char *flgets(char *buf, int buflen, FILE *fp)
{
	int c = -1;
	char *bp;

	bp = buf;
	while ((--buflen > 0) && ((c = getc(fp)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	if ((c < 0) && (bp == buf))
		return NULL;

	if (c > 0) {
		while ((c = getc(fp)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}

/*
 * sgetc(const char *inputbuf, int *offsetptr)
 *
 * Return next char from nul-terminated input buffer inputbuf,
 * starting at offset *offsetptr.  Increment *offsetptr.
 * If next char would be nul ('\0'), return EOF and don't
 * increment *offsetptr.
 */

static int sgetc(const char *inputbuf, int *offsetptr)
{
	char c;

	if ((c = inputbuf[*offsetptr]) != 0) {
		*offsetptr = *offsetptr + 1;
		return c;
	} else {
		return EOF;
	}
}

/*
 * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
 *
 * Obtain next line from nul-terminated input buffer 'inputbuf',
 * starting at offset *offsetptr.  Copy up to first buflen-1
 * chars of line into output buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to output buffer
 * buf on success, or NULL if nothing more to read.
 */

static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
{
	int c = -1;
	char *bp;

	bp = buf;
	while ((--buflen > 0) && ((c = sgetc(inputbuf, offsetptr)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	if ((c < 0) && (bp == buf))
		return NULL;

	if (c > 0) {
		while ((c = sgetc(inputbuf, offsetptr)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}

/*
 * time_t get_mtime(char *path)
 *
 * Return modtime of file at location path, else return 0.
 */

static time_t get_mtime(const char *path)
{
	struct stat statbuf;

	if (stat(path, &statbuf) != 0)
		return 0;
	return statbuf.st_mtime;
}

/*
 * int set_mtime(const char *path, time_t mtime)
 *
 * Set modtime of file 'path' to 'mtime'.  Return 0 on success,
 * or -1 on error, setting errno.
 */

static int set_mtime(const char *path, time_t mtime)
{
	struct utimbuf times;

	times.actime = mtime;
	times.modtime = mtime;
	return utime(path, &times);
}

/*
 * True if two pathnames resolve to the same file.
 * False if either path cannot be stat'd,
 * or if the two paths resolve to a different file.
 */

static int samefile(const char *path1, const char *path2)
{
	struct stat sb1, sb2;

	if (stat(path1, &sb1) != 0)
		return 0;
	if (stat(path2, &sb2) != 0)
		return 0;
	return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev;
}

#define slash(c) (*(c) == '/')
#define eocomp(c) (slash(c) || !*(c))
#define dot1(c) (*(c) == '.' && eocomp(c+1))

/* In place path compression.  Remove extra dots and slashes. */
static char *pathcomp(char *p)
{
	char *a = p;
	char *b = p;

	if (!p || !*p)
		return p;
	if (slash(p))
		*b++ = *a++;
	for (;;) {
		if (slash(a))
			while (slash(++a))
				continue;
		if (!*a) {
			if (b == p)
				*b++ = '.';
			*b = '\0';
			return (p);
		} else if (dot1(a)) {
			a++;
		} else {
			if ((b != p) && !slash(b - 1))
				*b++ = '/';
			while (!eocomp(a))
				*b++ = *a++;
		}
	}
}

#undef slash
#undef eocomp
#undef dot1

/*
 * pathcat2(buf, buflen, name1, name2)
 *
 * Return buf, of length buflen, with name1/name2 stored in it.
 */

static char *pathcat2(char *buf, int buflen, const char *name1,
		      const char *name2)
{
	(void)snprintf(buf, buflen, "%s/%s", name1, name2);
	return pathcomp(buf);
}
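
/*
 * Illustrative sketch (not part of the library): how pathcomp()
 * and pathcat2() behave on a few inputs.  The sample strings are
 * assumptions based on reading the code above, not output captured
 * from a test run.
 *
 *	char buf[PATH_MAX];
 *
 *	strcpy(buf, "/dev//cpuset/./top/");
 *	pathcomp(buf);			// buf is now "/dev/cpuset/top"
 *
 *	pathcat2(buf, sizeof(buf), "/dev/cpuset", "top");
 *					// buf is now "/dev/cpuset/top"
 */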

/*
 * pathcat3(buf, buflen, name1, name2, name3)
 *
 * Return buf, of length buflen, with name1/name2/name3 stored in it.
 */

static char *pathcat3(char *buf, int buflen, const char *name1,
		      const char *name2, const char *name3)
{
	(void)snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
	return pathcomp(buf);
}

/*
 * fullpath(buf, buflen, name)
 *
 * Put full path of cpuset 'name' in buffer 'buf'.  If name
 * starts with a slash ('/') character, then this is a path
 * relative to /dev/cpuset, otherwise it is relative to
 * the current task's cpuset.  Return 0 on success, else
 * -1 on error, setting errno.
 */

static int fullpath(char *buf, int buflen, const char *name)
{
	int len;

	/* easy case */
	if (*name == '/') {
		pathcat2(buf, buflen, cpusetmnt, name);
		pathcomp(buf);
		return 0;
	}

	/* hard case */
	snprintf(buf, buflen, "%s/", cpusetmnt);
	len = strlen(buf);
	if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL)
		return -1;
	if (strlen(buf) >= buflen - 1 - strlen(name)) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name);
	pathcomp(buf);
	return 0;
}

/*
 * fullpath2(buf, buflen, name1, name2)
 *
 * Like fullpath(), only concatenate two pathname components on end.
 */

static int fullpath2(char *buf, int buflen, const char *name1,
		     const char *name2)
{
	if (fullpath(buf, buflen, name1) < 0)
		return -1;
	if (strlen(buf) >= buflen - 1 - strlen(name2)) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name2);
	pathcomp(buf);
	return 0;
}

/*
 * Convert the string length of an ascii hex mask to the number
 * of bits represented by that mask.
 *
 * The cpumask and nodemask values in /proc/self/status are in an
 * ascii format that uses 9 characters for each 32 bits of mask.
 */
static int s2nbits(const char *s)
{
	return strlen(s) * 32 / 9;
}
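
/*
 * Worked example (an assumption based on the format described
 * above, not a captured kernel line): on a system with a 64-bit
 * kernel cpumask_t, /proc/self/status contains a line such as:
 *
 *	Cpus_allowed:\tffffffff,ffffffff\n
 *
 * After the "Cpus_allowed:\t" prefix there are 18 characters: two
 * groups of 8 hex digits, the comma separating them, and the
 * trailing newline that fgets() leaves in place.  That gives
 * s2nbits() 18 * 32 / 9 == 64 bits, the kernel cpumask_t size.
 * This is why update_mask_sizes() below must not chomp() the
 * newline off the line before measuring it.
 */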

static void update_mask_sizes(void)
{
	FILE *fp = NULL;
	char *buf = NULL;
	int fsize;

	if ((fp = fopen(mask_size_file, "r")) == NULL)
		goto done;
	fsize = filesize(fp);
	if ((buf = malloc(fsize)) == NULL)
		goto done;

	/*
	 * Beware: mask sizing arithmetic is fussy.
	 * The trailing newline left by fgets() is required.
	 */
	while (fgets(buf, fsize, fp)) {
		if (strprefix(buf, cpumask_prefix))
			cpumask_sz = s2nbits(buf + strlen(cpumask_prefix));
		if (strprefix(buf, nodemask_prefix))
			nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
	}
done:
	free(buf);
	if (fp != NULL)
		fclose(fp);
	if (cpumask_sz == 0)
		cpumask_sz = DEFCPUBITS;
	if (nodemask_sz == 0)
		nodemask_sz = DEFNODEBITS;
}

/* Allocate a new struct cpuset */
struct cpuset *cpuset_alloc(void)
{
	struct cpuset *cp = NULL;
	int nbits;

	if ((cp = calloc(1, sizeof(struct cpuset))) == NULL)
		goto err;

	nbits = cpuset_cpus_nbits();
	if ((cp->cpus = bitmask_alloc(nbits)) == NULL)
		goto err;

	nbits = cpuset_mems_nbits();
	if ((cp->mems = bitmask_alloc(nbits)) == NULL)
		goto err;

	return cp;
err:
	if (cp && cp->cpus)
		bitmask_free(cp->cpus);
	if (cp && cp->mems)
		bitmask_free(cp->mems);
	free(cp);
	return NULL;
}

/* Free struct cpuset *cp */
void cpuset_free(struct cpuset *cp)
{
	if (!cp)
		return;
	if (cp->cpus)
		bitmask_free(cp->cpus);
	if (cp->mems)
		bitmask_free(cp->mems);
	free(cp);
}

/* Number of bits in a CPU bitmask on current system */
int cpuset_cpus_nbits(void)
{
	if (cpumask_sz == 0)
		update_mask_sizes();
	return cpumask_sz;
}

/* Number of bits in a Memory bitmask on current system */
int cpuset_mems_nbits(void)
{
	if (nodemask_sz == 0)
		update_mask_sizes();
	return nodemask_sz;
}

/* Set CPUs in cpuset cp to bitmask cpus */
int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus)
{
	if (cp->cpus)
		bitmask_free(cp->cpus);
	cp->cpus = bitmask_alloc(bitmask_nbits(cpus));
	if (cp->cpus == NULL)
		return -1;
	bitmask_copy(cp->cpus, cpus);
	cp->cpus_valid = 1;
	cp->cpus_dirty = 1;
	return 0;
}

/* Set Memory Nodes in cpuset cp to bitmask mems */
int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems)
{
	if (cp->mems)
		bitmask_free(cp->mems);
	cp->mems = bitmask_alloc(bitmask_nbits(mems));
	if (cp->mems == NULL)
		return -1;
	bitmask_copy(cp->mems, mems);
	cp->mems_valid = 1;
	cp->mems_dirty = 1;
	return 0;
}
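
/*
 * Illustrative sketch (not part of the library): typical caller
 * sequence for building up an in-memory cpuset description with
 * the routines above and the setters below.  Error handling is
 * abbreviated.
 *
 *	struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *	struct cpuset *cp = cpuset_alloc();
 *
 *	bitmask_setbit(cpus, 0);	// just CPU 0
 *	cpuset_setcpus(cp, cpus);	// marks cpus_valid, cpus_dirty
 *	// ...
 *	cpuset_free(cp);
 *	bitmask_free(cpus);
 */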

/* Set integer value optname of cpuset cp */
int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value)
{
	if (streq(optionname, "cpu_exclusive")) {
		cp->cpu_exclusive = !!value;
		cp->cpu_exclusive_valid = 1;
		cp->cpu_exclusive_dirty = 1;
	} else if (streq(optionname, "mem_exclusive")) {
		cp->mem_exclusive = !!value;
		cp->mem_exclusive_valid = 1;
		cp->mem_exclusive_dirty = 1;
	} else if (streq(optionname, "mem_hardwall")) {
		cp->mem_hardwall = !!value;
		cp->mem_hardwall_valid = 1;
		cp->mem_hardwall_dirty = 1;
	} else if (streq(optionname, "notify_on_release")) {
		cp->notify_on_release = !!value;
		cp->notify_on_release_valid = 1;
		cp->notify_on_release_dirty = 1;
	} else if (streq(optionname, "memory_pressure_enabled")) {
		cp->memory_pressure_enabled = !!value;
		cp->memory_pressure_enabled_valid = 1;
		cp->memory_pressure_enabled_dirty = 1;
	} else if (streq(optionname, "memory_migrate")) {
		cp->memory_migrate = !!value;
		cp->memory_migrate_valid = 1;
		cp->memory_migrate_dirty = 1;
	} else if (streq(optionname, "memory_spread_page")) {
		cp->memory_spread_page = !!value;
		cp->memory_spread_page_valid = 1;
		cp->memory_spread_page_dirty = 1;
	} else if (streq(optionname, "memory_spread_slab")) {
		cp->memory_spread_slab = !!value;
		cp->memory_spread_slab_valid = 1;
		cp->memory_spread_slab_dirty = 1;
	} else if (streq(optionname, "sched_load_balance")) {
		cp->sched_load_balance = !!value;
		cp->sched_load_balance_valid = 1;
		cp->sched_load_balance_dirty = 1;
	} else if (streq(optionname, "sched_relax_domain_level")) {
		cp->sched_relax_domain_level = value;
		cp->sched_relax_domain_level_valid = 1;
		cp->sched_relax_domain_level_dirty = 1;
	} else
		return -2;	/* optionname not recognized */
	return 0;
}

/* [optional] Set string value optname */
int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname,
		    UNUSED const char *value)
{
	return -2;	/* For now, all string options unrecognized */
}

/* Return handle for reading memory_pressure. */
int cpuset_open_memory_pressure(const char *cpusetpath)
{
	char buf[PATH_MAX];

	fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure");
	return open(buf, O_RDONLY);
}

/* Return current memory_pressure of cpuset. */
int cpuset_read_memory_pressure(int han)
{
	char buf[SMALL_BUFSZ];
	ssize_t n;

	if ((n = pread(han, buf, sizeof(buf) - 1, 0L)) < 0)
		return -1;
	buf[n] = '\0';		/* nul terminate before atoi() */
	return atoi(buf);
}

/* Close handle for reading memory pressure. */
void cpuset_close_memory_pressure(int han)
{
	close(han);
}
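
/*
 * Illustrative sketch (not part of the library): polling the
 * memory_pressure of a cpuset with the three routines above.
 * The cpuset path "/foo" is a made-up example.
 *
 *	int han = cpuset_open_memory_pressure("/foo");
 *
 *	if (han >= 0) {
 *		int p = cpuset_read_memory_pressure(han);
 *		// ... p is the current memory_pressure value ...
 *		cpuset_close_memory_pressure(han);
 *	}
 */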

/*
 * Resolve cpuset pointer (to that of current task if cp == NULL).
 *
 * If cp not NULL, just return it.  If cp is NULL, return pointer
 * to temporary cpuset for current task, and set *cp_tofree to
 * pointer to that same temporary cpuset, to be freed later.
 *
 * Return NULL and set errno on error.  Errors can occur when
 * resolving the current task's cpuset.
 */

static const struct cpuset *resolve_cp(const struct cpuset *cp,
				       struct cpuset **cp_tofree)
{
	const struct cpuset *rcp;

	if (cp) {
		rcp = cp;
	} else {
		struct cpuset *cp1 = cpuset_alloc();

		if (cp1 == NULL)
			goto err;
		if (cpuset_cpusetofpid(cp1, 0) < 0) {
			cpuset_free(cp1);
			goto err;
		}
		*cp_tofree = cp1;
		rcp = cp1;
	}
	return rcp;
err:
	return NULL;
}

/* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */
int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

	if (!cp1)
		goto err;
	if (cp1->cpus == NULL) {
		errno = EINVAL;
		goto err;
	}
	bitmask_copy(cpus, cp1->cpus);
	cpuset_free(cp_tofree);
	return 0;
err:
	cpuset_free(cp_tofree);
	return -1;
}

/* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */
int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

	if (!cp1)
		goto err;
	if (cp1->mems == NULL) {
		errno = EINVAL;
		goto err;
	}
	bitmask_copy(mems, cp1->mems);
	cpuset_free(cp_tofree);
	return 0;
err:
	cpuset_free(cp_tofree);
	return -1;
}
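
/*
 * Illustrative sketch (not part of the library): reading the CPUs
 * of the current task's cpuset with the routine above.  Passing
 * NULL for the cpuset selects the current task, per resolve_cp().
 *
 *	struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *
 *	if (cpus != NULL && cpuset_getcpus(NULL, cpus) == 0) {
 *		// ... bitmask_weight(cpus) CPUs are available ...
 *	}
 *	bitmask_free(cpus);
 */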

/* Return number of CPUs in cpuset cp (current task if cp == NULL) */
int cpuset_cpus_weight(const struct cpuset *cp)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int w = -1;

	if (!cp1)
		goto err;
	if (cp1->cpus == NULL) {
		errno = EINVAL;
		goto err;
	}
	w = bitmask_weight(cp1->cpus);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return w;
}

/* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */
int cpuset_mems_weight(const struct cpuset *cp)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int w = -1;

	if (!cp1)
		goto err;
	if (cp1->mems == NULL) {
		errno = EINVAL;
		goto err;
	}
	w = bitmask_weight(cp1->mems);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return w;
}

/* Return integer value of option optname in cp */
int cpuset_get_iopt(const struct cpuset *cp, const char *optionname)
{
	if (streq(optionname, "cpu_exclusive"))
		return cp->cpu_exclusive;
	else if (streq(optionname, "mem_exclusive"))
		return cp->mem_exclusive;
	else if (streq(optionname, "mem_hardwall"))
		return cp->mem_hardwall;
	else if (streq(optionname, "notify_on_release"))
		return cp->notify_on_release;
	else if (streq(optionname, "memory_pressure_enabled"))
		return cp->memory_pressure_enabled;
	else if (streq(optionname, "memory_migrate"))
		return cp->memory_migrate;
	else if (streq(optionname, "memory_spread_page"))
		return cp->memory_spread_page;
	else if (streq(optionname, "memory_spread_slab"))
		return cp->memory_spread_slab;
	else if (streq(optionname, "sched_load_balance"))
		return cp->sched_load_balance;
	else if (streq(optionname, "sched_relax_domain_level"))
		return cp->sched_relax_domain_level;
	else
		return -2;	/* optionname not recognized */
}

/* [optional] Return string value of optname */
const char *cpuset_get_sopt(UNUSED const struct cpuset *cp,
			    UNUSED const char *optionname)
{
	return NULL;	/* For now, all string options unrecognized */
}

static int read_flag(const char *filepath, char *flagp)
{
	char buf[SMALL_BUFSZ];	/* buffer a "0" or "1" flag line */
	int fd = -1;
	ssize_t n;

	if ((fd = open(filepath, O_RDONLY)) < 0)
		goto err;
	if ((n = read(fd, buf, sizeof(buf) - 1)) < 1)
		goto err;
	buf[n] = '\0';		/* nul terminate before atoi() */
	if (atoi(buf))
		*flagp = 1;
	else
		*flagp = 0;
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

static int load_flag(const char *path, char *flagp, const char *flag)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, flag);
	return read_flag(buf, flagp);
}

static int read_number(const char *filepath, int *numberp)
{
	char buf[SMALL_BUFSZ];
	int fd = -1;
	ssize_t n;

	if ((fd = open(filepath, O_RDONLY)) < 0)
		goto err;
	if ((n = read(fd, buf, sizeof(buf) - 1)) < 1)
		goto err;
	buf[n] = '\0';		/* nul terminate before atoi() */
	*numberp = atoi(buf);
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

static int load_number(const char *path, int *numberp, const char *file)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, file);
	return read_number(buf, numberp);
}

static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits)
{
	FILE *fp = NULL;
	char *buf = NULL;
	int buflen;
	struct bitmask *bmp = NULL;

	if ((fp = fopen(filepath, "r")) == NULL)
		goto err;
	buflen = filesize(fp) + 1;	/* + 1 for nul term */
	if ((buf = malloc(buflen)) == NULL)
		goto err;
	if (flgets(buf, buflen, fp) == NULL)
		goto err;
	fclose(fp);
	fp = NULL;

	if ((bmp = bitmask_alloc(nbits)) == NULL)
		goto err;
	if (*buf && bitmask_parselist(buf, bmp) < 0)
		goto err;
	if (*bmpp)
		bitmask_free(*bmpp);
	*bmpp = bmp;
	free(buf);
	buf = NULL;
	return 0;
err:
	if (buf != NULL)
		free(buf);
	if (fp != NULL)
		fclose(fp);
	if (bmp != NULL)
		bitmask_free(bmp);
	return -1;
}

static int load_mask(const char *path, struct bitmask **bmpp,
		     int nbits, const char *mask)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, mask);
	return read_mask(buf, bmpp, nbits);
}

/* Write string to file at given filepath.  Create or truncate file. */
static int write_string_file(const char *filepath, const char *str)
{
	int fd = -1;

	if ((fd = open(filepath, O_WRONLY | O_CREAT, 0644)) < 0)
		goto err;
	if (write(fd, str, strlen(str)) < 0)
		goto err;
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

/* Size and allocate buffer.  Write bitmask into it.  Caller must free */
static char *sprint_mask_buf(const struct bitmask *bmp)
{
	char *buf = NULL;
	int buflen;
	char c;

	/* First bitmask_displaylist() call just to get the length */
	buflen = bitmask_displaylist(&c, 1, bmp) + 1;	/* "+ 1" for nul */
	if ((buf = malloc(buflen)) == NULL)
		return NULL;
	bitmask_displaylist(buf, buflen, bmp);
	return buf;
}

static int exists_flag(const char *path, const char *flag)
{
	char buf[PATH_MAX];
	struct stat statbuf;
	int rc;

	pathcat2(buf, sizeof(buf), path, flag);
	rc = (stat(buf, &statbuf) == 0);
	errno = 0;
	return rc;
}

static int store_flag(const char *path, const char *flag, int val)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, flag);
	return write_string_file(buf, val ? "1" : "0");
}

static int store_number(const char *path, const char *file, int val)
{
	char buf[PATH_MAX];
	char data[SMALL_BUFSZ];

	memset(data, 0, sizeof(data));
	pathcat2(buf, sizeof(buf), path, file);
	snprintf(data, sizeof(data), "%d", val);
	return write_string_file(buf, data);
}

static int store_mask(const char *path, const char *mask,
		      const struct bitmask *bmp)
{
	char maskpath[PATH_MAX];
	char *bp = NULL;
	int rc;

	if (bmp == NULL)
		return 0;
	pathcat2(maskpath, sizeof(maskpath), path, mask);
	if ((bp = sprint_mask_buf(bmp)) == NULL)
		return -1;
	rc = write_string_file(maskpath, bp);
	free(bp);
	return rc;
}

/*
 * Return 1 if 'cpu' is online, else 0 if offline.  Tests the file
 * /sys/devices/system/cpu/cpuN/online for "0" or "1" contents,
 * where N == cpu number.
 */

char cpu_online(unsigned int cpu)
{
	char online;
	char cpupath[PATH_MAX];

	(void)snprintf(cpupath, sizeof(cpupath),
		       "/sys/devices/system/cpu/cpu%d/online", cpu);
	if (read_flag(cpupath, &online) < 0)
		return 0;	/* oops - guess that cpu's not there */
	return online;
}
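
/*
 * Illustrative sketch (not part of the library): counting the
 * online CPUs with cpu_online() above.
 *
 *	int ncpus = cpuset_cpus_nbits();
 *	int cpu, nonline = 0;
 *
 *	for (cpu = 0; cpu < ncpus; cpu++)
 *		if (cpu_online(cpu))
 *			nonline++;
 */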

/*
 * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits())
 * to the node on which that cpu resides, or to cpuset_mems_nbits()
 * if the mapping is not known.
 *
 * To avoid every user having to recalculate this relation
 * from various clues in the sysfs file system (below the
 * path /sys/devices/system) a copy of this map is kept at
 * /var/run/cpunodemap.
 *
 * The system automatically cleans out files below
 * /var/run on each system reboot (see the init script
 * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry
 * about stale data in this file across reboots.  If the file
 * is missing, let the first process that needs it, and has
 * permission to write in the /var/run directory, rebuild it.
 *
 * If using this cached data, remember the mtime of the mapfile
 * the last time we read it in case something like a hotplug
 * event results in the file being removed and rebuilt, so we
 * can detect if we're using a stale cache, and need to reload.
 *
 * The mtime of this file is set to the time when we did
 * the recalculation of the map, from the clues beneath
 * /sys/devices/system.  This is done so that a program
 * won't see the mapfile it just wrote as being newer than what
 * it just wrote out (store_map) and read the same map back in
 * (load_map).
 */

/*
 * Hold flockfile(stdin) while using cpunodemap for posix thread safety.
 *
 * Note on locking and flockfile(FILE *):
 *
 * We use flockfile() and funlockfile() instead of directly
 * calling pthread_mutex_lock and pthread_mutex_unlock on
 * a pthread_mutex_t, because this avoids forcing the app
 * to link with libpthread.  The glibc implementation of
 * flockfile/funlockfile will fall back to no-ops if libpthread
 * doesn't happen to be linked.
 *
 * Since flockfile already has the moderately convoluted
 * combination of weak and strong symbols required to accomplish
 * this, it is easier to use flockfile() on some handy FILE *
 * stream as a surrogate for pthread locking than to re-invent
 * that wheel.
 *
 * Forcing all apps that use cpusets to link with libpthread
 * would force non-transparent initialization on apps that
 * might not be prepared to handle it.
 *
 * The application using libcpuset should never notice this
 * odd use of flockfile(), because we never return to the
 * application from any libcpuset call with any such lock held.
 * We just use this locking for guarding some non-atomic cached
 * data updates and accesses, internal to some libcpuset calls.
 * Also, flockfile() allows recursive nesting, so if the app
 * calls libcpuset holding such a file lock, we won't deadlock
 * if we go to acquire the same lock.  We'll just get the lock
 * and increment its counter while we hold it.
 */

static struct cpunodemap {
	int *map;	/* map[cpumask_sz]: maps cpu to its node */
	time_t mtime;	/* modtime of mapfile when last read */
} cpunodemap;

/*
 * rebuild_map() - Rebuild cpunodemap[] from scratch.
 *
 * Situation:
 *	Neither our in-memory cpunodemap[] array nor the
 *	cache of it in mapfile is current.
 * Action:
 *	Rebuild it from first principles and the information
 *	available below /sys/devices/system.
 */

static void rebuild_map(void)
{
	char buf[PATH_MAX];
	DIR *dir1, *dir2;
	struct dirent *dent1, *dent2;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
		cpunodemap.map[cpu] = -1;
	pathcat2(buf, sizeof(buf), sysdevices, "node");
	if ((dir1 = opendir(buf)) == NULL)
		return;
	while ((dent1 = readdir(dir1)) != NULL) {
		if (sscanf(dent1->d_name, "node%u", &mem) < 1)
			continue;
		pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name);
		if ((dir2 = opendir(buf)) == NULL)
			continue;
		while ((dent2 = readdir(dir2)) != NULL) {
			if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
				continue;
			if (cpu >= (unsigned int)ncpus
			    || mem >= (unsigned int)nmems)
				continue;
			cpunodemap.map[cpu] = mem;
		}
		closedir(dir2);
	}
	closedir(dir1);
	cpunodemap.mtime = time(0);
}

/*
 * load_map() - Load cpunodemap[] from mapfile.
 *
 * Situation:
 *	The cpunodemap in mapfile is more recent than
 *	what we have in the cpunodemap[] array.
 * Action:
 *	Reload the cpunodemap[] array from the file.
 */

static void load_map(void)
{
	char buf[SMALL_BUFSZ];	/* buffer 1 line of mapfile */
	FILE *mapfp;		/* File stream on mapfile */
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	if ((cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL)
		return;
	cpunodemap.mtime = get_mtime(mapfile);
	if ((mapfp = fopen(mapfile, "r")) == NULL)
		return;
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
		cpunodemap.map[cpu] = nmems;
	while (flgets(buf, sizeof(buf), mapfp) != NULL) {
		if (sscanf(buf, "%u %u", &cpu, &mem) < 2)
			continue;
		if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems)
			continue;
		cpunodemap.map[cpu] = mem;
	}
	fclose(mapfp);
}

/*
 * store_map() - Write cpunodemap[] out to mapfile.
 *
 * Situation:
 *	The cpunodemap in the cpunodemap[] array is
 *	more recent than the one in mapfile.
 * Action:
 *	Write cpunodemap[] out to mapfile.
 */

static void store_map(void)
{
	char buf[PATH_MAX];
	int fd = -1;
	FILE *mapfp = NULL;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX");
	if ((fd = mkstemp(buf)) < 0)
		goto err;
	if ((mapfp = fdopen(fd, "w")) == NULL)
		goto err;
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		mem = cpunodemap.map[cpu];
		if (mem < (unsigned int)nmems)
			fprintf(mapfp, "%u %u\n", cpu, mem);
	}
	fclose(mapfp);
	set_mtime(buf, cpunodemap.mtime);
	if (rename(buf, mapfile) < 0)
		goto err;
	/* mkstemp() creates mode 0600 - change to world readable */
	(void)chmod(mapfile, 0444);
	return;
err:
	if (mapfp != NULL) {
		fclose(mapfp);
		fd = -1;
	}
	if (fd >= 0)
		close(fd);
	(void)unlink(buf);
}
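
/*
 * Sketch of the mapfile format, as written by store_map() above:
 * one "cpu node" pair per line, in decimal.  For example, a
 * hypothetical 4-CPU, 2-node system with two CPUs per node would
 * be cached in /var/run/cpunodemap as:
 *
 *	0 0
 *	1 0
 *	2 1
 *	3 1
 *
 * load_map() parses these lines back with sscanf("%u %u").
 */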

/*
 * Load and gain thread safe access to the <cpu, node> map.
 *
 * Return 0 on success with flockfile(stdin) held.
 * Each successful get_map() call must be matched with a
 * following put_map() call to release the lock.
 *
 * On error, return -1 with errno set and no lock held.
 */

static int get_map(void)
{
	time_t file_mtime;

	flockfile(stdin);

	if (cpunodemap.map == NULL) {
		cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int));
		if (cpunodemap.map == NULL)
			goto err;
	}

	/* If no one has a good cpunodemap, rebuild from scratch */
	file_mtime = get_mtime(mapfile);
	if (cpunodemap.mtime == 0 && file_mtime == 0)
		rebuild_map();

	/* If either cpunodemap[] or mapfile newer, update other with it */
	file_mtime = get_mtime(mapfile);
	if (cpunodemap.mtime < file_mtime)
		load_map();
	else if (cpunodemap.mtime > file_mtime)
		store_map();
	return 0;
err:
	funlockfile(stdin);
	return -1;
}

static void put_map(void)
{
	funlockfile(stdin);
}

/* Set cpus to those local to Memory Nodes mems */
int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus)
{
	int ncpus = cpuset_cpus_nbits();
	unsigned int cpu;

	if (check() < 0)
		return -1;

	get_map();
	bitmask_clearall(cpus);
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		if (bitmask_isbitset(mems, cpunodemap.map[cpu]))
			bitmask_setbit(cpus, cpu);
	}
	put_map();
	return 0;
}

/* Set mems to those local to CPUs cpus */
int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems)
{
	int ncpus = cpuset_cpus_nbits();
	unsigned int cpu;

	if (check() < 0)
		return -1;

	get_map();
	bitmask_clearall(mems);
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		if (bitmask_isbitset(cpus, cpu))
			bitmask_setbit(mems, cpunodemap.map[cpu]);
	}
	put_map();
	return 0;
}
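
/*
 * Illustrative sketch (not part of the library): finding the CPUs
 * local to Memory Node 0 with cpuset_localcpus() above.
 *
 *	struct bitmask *mems = bitmask_alloc(cpuset_mems_nbits());
 *	struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *
 *	bitmask_setbit(mems, 0);
 *	if (cpuset_localcpus(mems, cpus) == 0) {
 *		// ... cpus now holds the CPUs on node 0 ...
 *	}
 *	bitmask_free(mems);
 *	bitmask_free(cpus);
 */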

/*
 * distmap[]
 *
 * Array of ints of size cpumask_sz by nodemask_sz.
 *
 * Element distmap[cpu][mem] is the distance between CPU cpu
 * and Memory Node mem.  Distances are weighted to roughly
 * approximate the cost of memory references, and scaled so that
 * the distance from a CPU to its local Memory Node is ten (10).
 *
 * The first call to cpuset_cpumemdist() builds this map, from
 * whatever means the kernel provides to obtain these distances.
 *
 * These distances derive from ACPI SLIT table entries, which are
 * eight bits in size.
 *
 * Hold flockfile(stdout) while using distmap for posix thread safety.
 */

typedef unsigned char distmap_entry_t;	/* type of distmap[] entries */

static distmap_entry_t *distmap;	/* maps <cpu, mem> to distance */

#define DISTMAP_MAX UCHAR_MAX	/* maximum value in distmap[] */

#define I(i,j) ((i) * nmems + (j))	/* 2-D array index simulation */

/*
 * Parse arch neutral lines from 'distance' files of form:
 *
 *	46 66 10 20
 *
 * The lines contain a space separated list of distances, which is
 * parsed into array dists[] of each node's distance from the
 * specified node.
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 *	For each cpu c on node:
 *		For each node position n in list of distances:
 *			distmap[c][n] = dists[n]
 */

static int parse_distmap_line(unsigned int node, char *buf)
{
	char *p, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int c, n;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;
	int ret = -1;

	p = buf;
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned int d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t) d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	ret = 0;
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
	return ret;
}

static int parse_distance_file(unsigned int node, const char *path)
{
	FILE *fp;
	char *buf = NULL;
	int buflen;

	if ((fp = fopen(path, "r")) == NULL)
		goto err;

	buflen = filesize(fp);

	if ((buf = malloc(buflen)) == NULL)
		goto err;

	if (flgets(buf, buflen, fp) == NULL)
		goto err;

	if (parse_distmap_line(node, buf) < 0)
		goto err;

	free(buf);
	fclose(fp);
	return 0;
err:
	free(buf);
	if (fp)
		fclose(fp);
	return -1;
}

static void build_distmap(void)
{
	static int tried_before = 0;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	DIR *dir = NULL;
	struct dirent *dent;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	if ((dir = opendir(distance_directory)) == NULL)
		goto err;
	while ((dent = readdir(dir)) != NULL) {
		char buf[PATH_MAX];
		unsigned int node;

		if (sscanf(dent->d_name, "node%u", &node) < 1)
			continue;
		pathcat3(buf, sizeof(buf), distance_directory, dent->d_name,
			 "distance");
		if (parse_distance_file(node, buf) < 0)
			goto err;
	}
	closedir(dir);
	return;
err:
	if (dir)
		closedir(dir);
	free(distmap);
	distmap = NULL;
}
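
/*
 * Worked example (hypothetical values, following the comment above
 * parse_distmap_line()): if /sys/devices/system/node/node2/distance
 * contains the line
 *
 *	46 66 10 20
 *
 * then for every cpu c local to node 2, build_distmap() records
 * distmap[c][0] = 46, distmap[c][1] = 66, distmap[c][2] = 10 and
 * distmap[c][3] = 20, i.e. node 2's own memory is nearest (10).
 */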

#ifdef ALTERNATE_SN_DISTMAP

/*
 * Parse SN architecture specific line of form:
 *
 *	node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10
 *
 * Second field is node number.  The "dist" field is the colon
 * separated list of distances, which is parsed into array dists[]
 * of each node's distance from that node.
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 *	For each cpu c on that node:
 *		For each node position n in list of distances:
 *			distmap[c][n] = dists[n]
 */

static void parse_distmap_line_sn(char *buf)
{
	char *p, *pend, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned long c, n, node;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;

	if ((p = strchr(buf, ' ')) == NULL)
		goto err;
	if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems)
		goto err;
	if ((p = strstr(q, " dist ")) == NULL)
		goto err;
	p += strlen(" dist ");
	if ((pend = strchr(p, ' ')) != NULL)
		*pend = '\0';
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned long d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t) d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
}

static void build_distmap_sn(void)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	static int tried_before = 0;
	FILE *fp = NULL;
	char *buf = NULL;
	int buflen;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((fp = fopen(sn_topology, "r")) == NULL)
		goto err;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	buflen = filesize(fp);
	if ((buf = malloc(buflen)) == NULL)
		goto err;

	while (flgets(buf, buflen, fp) != NULL)
		if (strprefix(buf, sn_top_node_prefix))
			parse_distmap_line_sn(buf);

	free(buf);
	fclose(fp);
	return;
err:
	free(buf);
	free(distmap);
	distmap = NULL;
	if (fp)
		fclose(fp);
}

#endif

/* [optional] Hardware distance from CPU to Memory Node */
unsigned int cpuset_cpumemdist(int cpu, int mem)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	distmap_entry_t r = DISTMAP_MAX;

	flockfile(stdout);

	if (check() < 0)
		goto err;

	if (distmap == NULL)
		build_distmap();

#ifdef ALTERNATE_SN_DISTMAP
	if (distmap == NULL)
		build_distmap_sn();
#endif

	if (distmap == NULL)
		goto err;

	if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems)
		goto err;

	r = distmap[I(cpu, mem)];
	/* fall into ... */
err:
	funlockfile(stdout);
	return r;
}

/* [optional] Return Memory Node closest to cpu */
int cpuset_cpu2node(int cpu)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	struct bitmask *cpus = NULL, *mems = NULL;
	int r = -1;

	if (check() < 0)
		goto err;

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	bitmask_setbit(cpus, cpu);

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	cpuset_localmems(cpus, mems);
	r = bitmask_first(mems);
	/* fall into ... */
err:
	bitmask_free(cpus);
	bitmask_free(mems);
	return r;
}
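
/*
 * Illustrative sketch (not part of the library): combining the two
 * routines above to compare local and remote memory distance for
 * CPU 0.  Per the distmap[] scaling comment, the local distance
 * should come back as 10.
 *
 *	int node = cpuset_cpu2node(0);
 *	unsigned int dlocal = cpuset_cpumemdist(0, node);
 *	unsigned int dremote = cpuset_cpumemdist(0, node == 0 ? 1 : 0);
 *	// expect dlocal == 10 and dremote > dlocal on a NUMA system
 */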

static int apply_cpuset_settings(const char *path, const struct cpuset *cp)
{
	if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) {
		if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0)
			goto err;
	}

	if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) {
		if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0)
			goto err;
	}

	if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) {
		if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0)
			goto err;
	}

	if (cp->notify_on_release_valid && cp->notify_on_release_dirty) {
		if (store_flag(path, "notify_on_release", cp->notify_on_release)
		    < 0)
			goto err;
	}

	if (cp->memory_migrate_valid &&
	    cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) {
		if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0)
			goto err;
	}

	if (cp->memory_pressure_enabled_valid &&
	    cp->memory_pressure_enabled_dirty &&
	    exists_flag(path, "memory_pressure_enabled")) {
		if (store_flag
		    (path, "memory_pressure_enabled",
		     cp->memory_pressure_enabled) < 0)
			goto err;
	}

	if (cp->memory_spread_page_valid &&
	    cp->memory_spread_page_dirty &&
	    exists_flag(path, "memory_spread_page")) {
		if (store_flag
		    (path, "memory_spread_page", cp->memory_spread_page) < 0)
			goto err;
	}

	if (cp->memory_spread_slab_valid &&
	    cp->memory_spread_slab_dirty &&
	    exists_flag(path, "memory_spread_slab")) {
		if (store_flag
		    (path, "memory_spread_slab", cp->memory_spread_slab) < 0)
			goto err;
	}

	if (cp->sched_load_balance_valid &&
	    cp->sched_load_balance_dirty &&
	    exists_flag(path, "sched_load_balance")) {
		if (store_flag
		    (path, "sched_load_balance", cp->sched_load_balance) < 0)
			goto err;
	}

	if (cp->sched_relax_domain_level_valid &&
	    cp->sched_relax_domain_level_dirty &&
	    exists_flag(path, "sched_relax_domain_level")) {
		if (store_number
		    (path, "sched_relax_domain_level",
		     cp->sched_relax_domain_level) < 0)
			goto err;
	}

	if (cp->cpus_valid && cp->cpus_dirty) {
		if (store_mask(path, "cpus", cp->cpus) < 0)
			goto err;
	}

	if (cp->mems_valid && cp->mems_dirty) {
		if (store_mask(path, "mems", cp->mems) < 0)
			goto err;
	}
	return 0;
err:
	return -1;
}

/*
 * get_siblings() - helper routine for cpuset_would_crash_kernel(), below.
 *
 * Extract max value of any 'siblings' field in /proc/cpuinfo.
 * Cache the result - only need to extract once in lifetime of task.
 *
 * The siblings field is the number of logical CPUs in a physical
 * processor package.  It is equal to the product of the number of
 * cores in that package, times the number of hyper-threads per core.
 * The bug that cpuset_would_crash_kernel() is detecting arises
 * when a cpu_exclusive cpuset tries to include just some, not all,
 * of the sibling logical CPUs available in a processor package.
 *
 * In the improbable case that a system has mixed values of siblings
 * (some processor packages have more than others, perhaps due to
 * partially enabling Hyper-Threading), we take the worst case value,
 * the largest siblings value.  This might be overkill.  I don't know
 * if this kernel bug considers each processor package's siblings
 * separately or not.  But it sure is easier this way ...
 *
 * This routine takes about 0.7 msecs on a 4 CPU 2.8 GHz Xeon, from
 * open to close, the first time called.
 */

static int get_siblings(void)
{
	static int siblings;
	char buf[32];	/* big enough for one 'siblings' line */
	FILE *fp;

	if (siblings)
		return siblings;

	if ((fp = fopen("/proc/cpuinfo", "r")) == NULL)
		return 4;	/* wing it - /proc not mounted ? */
	while (flgets(buf, sizeof(buf), fp) != NULL) {
		int s;

		if (sscanf(buf, "siblings : %d", &s) < 1)
			continue;
		if (s > siblings)
			siblings = s;
	}
	fclose(fp);
	if (siblings == 0)
		siblings = 1;	/* old kernel, no siblings, default to 1 */
	return siblings;
}

/*
 * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic
 * scheduler domain code invoked for cpu_exclusive cpusets that causes
 * the kernel to freeze, requiring a hardware reset.
 *
 * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive'
 * cpuset is defined where that cpuset's 'cpus' are not on package
 * boundaries then the kernel will freeze, usually as soon as this
 * cpuset is created, requiring a hardware reset.
 *
 * A cpuset's 'cpus' are not on package boundaries if the cpuset
 * includes a proper non-empty subset (some, but not all) of the
 * logical cpus on a processor package.  This requires multiple
 * logical CPUs per package, available with either Hyper-Thread or
 * Multi-Core support.  Without one of these features, there is only
 * one logical CPU per physical package, and it's not possible to
 * have a proper, non-empty subset of a set of cardinality one.
 *
 * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC
 * on i386 and x86_64 arch's.
 *
 * The objective of this routine cpuset_would_crash_kernel() is to
 * determine if a proposed cpuset setting would crash the kernel due
 * to this bug, so that the caller can avoid the crash.
 *
 * Ideally we'd check for exactly these conditions here, but computing
 * the package (identified by the 'physical id' field of /proc/cpuinfo)
 * of each cpu in a cpuset is more effort than it's worth here.
 *
 * Also there is no obvious way to identify exactly whether the kernel
 * one is executing on has this bug, short of trying it, and seeing
 * if the kernel just crashed.
 *
 * So for now, we look for a simpler set of conditions, that meets
 * our immediate need - avoid this crash on SUSE SLES10 systems that
 * are susceptible to it.  We look for the kernel version 2.6.16.*,
 * which is the base kernel of SUSE SLES10, and for i386 or x86_64
 * processors, which had CONFIG_SCHED_MC enabled.
 *
 * If these simpler conditions are met, we further simplify the check,
 * by presuming that the logical CPUs are numbered on processor
 * package boundaries.  If each package has S siblings, we assume
 * that CPUs numbered N through N + S - 1 are on the same package,
 * for any CPU N such that N mod S == 0.
 *
 * Yes, this is a hack, focused on avoiding kernel freezes on
 * susceptible SUSE SLES10 systems.
 */

static int cpuset_would_crash_kernel(const struct cpuset *cp)
{
	static int susceptible_system = -1;

	if (!cp->cpu_exclusive)
		goto ok;

	if (susceptible_system == -1) {
		struct utsname u;
		int rel_2_6_16, arch_i386, arch_x86_64;

		if (uname(&u) < 0)
			goto fail;
		rel_2_6_16 = strprefix(u.release, "2.6.16.");
		arch_i386 = streq(u.machine, "i386");
		arch_x86_64 = streq(u.machine, "x86_64");
		susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64);
	}

	if (susceptible_system) {
		int ncpus = cpuset_cpus_nbits();
		int siblings = get_siblings();
		unsigned int cpu;

		for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) {
			int s, num_set = 0;

			for (s = 0; s < siblings; s++) {
				if (bitmask_isbitset(cp->cpus, cpu + s))
					num_set++;
			}

			/* If none or all siblings set, we're still ok */
			if (num_set == 0 || num_set == siblings)
				continue;

			/* Found one that would crash kernel.  Fail.  */
			errno = ENXIO;
			goto fail;
		}
	}
	/* If not susceptible, or if all ok, fall into "ok" ... */
ok:
	return 0;	/* would not crash */
fail:
	return 1;	/* would crash */
}
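
/*
 * Worked example (hypothetical machine, following the checks above):
 * on a susceptible system with siblings == 2, logical CPUs {0,1}
 * share one package and {2,3} share another.  A cpu_exclusive cpuset
 * with cpus 0-1 or 0-3 passes (each package contributes none or all
 * of its siblings), but cpus 0-2 fails the check: package {2,3} has
 * num_set == 1 of siblings == 2, so the routine returns 1 and sets
 * errno to ENXIO.
 */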

/* Compare two cpusets and mark the dirty variables */
static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2)
{
	if (cp1->cpu_exclusive_valid &&
	    cp1->cpu_exclusive != cp2->cpu_exclusive)
		cp1->cpu_exclusive_dirty = 1;

	if (cp1->mem_exclusive_valid &&
	    cp1->mem_exclusive != cp2->mem_exclusive)
		cp1->mem_exclusive_dirty = 1;

	if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall)
		cp1->mem_hardwall_dirty = 1;

	if (cp1->notify_on_release_valid &&
	    cp1->notify_on_release != cp2->notify_on_release)
		cp1->notify_on_release_dirty = 1;

	if (cp1->memory_migrate_valid &&
	    cp1->memory_migrate != cp2->memory_migrate)
		cp1->memory_migrate_dirty = 1;

	if (cp1->memory_pressure_enabled_valid &&
	    cp1->memory_pressure_enabled != cp2->memory_pressure_enabled)
		cp1->memory_pressure_enabled_dirty = 1;

	if (cp1->memory_spread_page_valid &&
	    cp1->memory_spread_page != cp2->memory_spread_page)
		cp1->memory_spread_page_dirty = 1;

	if (cp1->memory_spread_slab_valid &&
	    cp1->memory_spread_slab != cp2->memory_spread_slab)
		cp1->memory_spread_slab_dirty = 1;

	if (cp1->sched_load_balance_valid &&
	    cp1->sched_load_balance != cp2->sched_load_balance)
		cp1->sched_load_balance_dirty = 1;

	if (cp1->sched_relax_domain_level_valid &&
	    cp1->sched_relax_domain_level != cp2->sched_relax_domain_level)
		cp1->sched_relax_domain_level_dirty = 1;

	if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus))
		cp1->cpus_dirty = 1;
	if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems))
		cp1->mems_dirty = 1;
}

/* Create (if new set) or modify cpuset 'cp' at location 'relpath' */
static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new)
{
	char buf[PATH_MAX];
	int do_rmdir_on_err = 0;
	int do_restore_cp_sav_on_err = 0;
	struct cpuset *cp_sav = NULL;
	int sav_errno;

	if (check() < 0)
		goto err;

	if (cpuset_would_crash_kernel(cp))
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	if (new) {
		if (mkdir(buf, 0755) < 0)
			goto err;
		/* we made it, so we should remove it on error */
		do_rmdir_on_err = 1;
	}

	if ((cp_sav = cpuset_alloc()) == NULL)
		goto err;
	if (cpuset_query(cp_sav, relpath) < 0)
		goto err;
	/* we have old settings to restore on error */
	do_restore_cp_sav_on_err = 1;

	/* check which variables need to be restored on error */
	mark_dirty_variable(cp_sav, cp);

	if (apply_cpuset_settings(buf, cp) < 0)
		goto err;

	cpuset_free(cp_sav);
	return 0;
err:
	sav_errno = errno;
	if (do_restore_cp_sav_on_err)
		(void)apply_cpuset_settings(buf, cp_sav);
	if (cp_sav)
		cpuset_free(cp_sav);
	if (do_rmdir_on_err)
		(void)rmdir(buf);
	errno = sav_errno;
	return -1;
}

/* Create cpuset 'cp' at location 'relpath' */
int cpuset_create(const char *relpath, const struct cpuset *cp)
{
	return cr_or_mod(relpath, cp, 1);
}
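
/*
 * Illustrative sketch (not part of the library): creating a cpuset
 * named "/demo" (a made-up path, relative to /dev/cpuset) with the
 * routines above and the setters earlier in this file.
 *
 *	struct cpuset *cp = cpuset_alloc();
 *	struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *	struct bitmask *mems = bitmask_alloc(cpuset_mems_nbits());
 *
 *	bitmask_setbit(cpus, 0);
 *	bitmask_setbit(mems, 0);
 *	cpuset_setcpus(cp, cpus);
 *	cpuset_setmems(cp, mems);
 *	if (cpuset_create("/demo", cp) < 0)
 *		perror("cpuset_create");
 *	cpuset_free(cp);
 */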
/* Delete cpuset at location 'relpath' (if empty) */
int cpuset_delete(const char *relpath)
{
        char buf[PATH_MAX];

        if (check() < 0)
                goto err;

        fullpath(buf, sizeof(buf), relpath);
        if (rmdir(buf) < 0)
                goto err;

        return 0;
err:
        return -1;
}

/* Set cpuset 'cp' to the cpuset at location 'relpath' */
int cpuset_query(struct cpuset *cp, const char *relpath)
{
        char buf[PATH_MAX];

        if (check() < 0)
                goto err;

        fullpath(buf, sizeof(buf), relpath);

        if (load_flag(buf, &cp->cpu_exclusive, "cpu_exclusive") < 0)
                goto err;
        cp->cpu_exclusive_valid = 1;

        if (load_flag(buf, &cp->mem_exclusive, "mem_exclusive") < 0)
                goto err;
        cp->mem_exclusive_valid = 1;

        if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0)
                goto err;
        cp->notify_on_release_valid = 1;

        if (exists_flag(buf, "memory_migrate")) {
                if (load_flag(buf, &cp->memory_migrate, "memory_migrate") < 0)
                        goto err;
                cp->memory_migrate_valid = 1;
        }

        if (exists_flag(buf, "mem_hardwall")) {
                if (load_flag(buf, &cp->mem_hardwall, "mem_hardwall") < 0)
                        goto err;
                cp->mem_hardwall_valid = 1;
        }

        if (exists_flag(buf, "memory_pressure_enabled")) {
                if (load_flag(buf, &cp->memory_pressure_enabled,
                              "memory_pressure_enabled") < 0)
                        goto err;
                cp->memory_pressure_enabled_valid = 1;
        }

        if (exists_flag(buf, "memory_spread_page")) {
                if (load_flag(buf, &cp->memory_spread_page,
                              "memory_spread_page") < 0)
                        goto err;
                cp->memory_spread_page_valid = 1;
        }

        if (exists_flag(buf, "memory_spread_slab")) {
                if (load_flag(buf, &cp->memory_spread_slab,
                              "memory_spread_slab") < 0)
                        goto err;
                cp->memory_spread_slab_valid = 1;
        }

        if (exists_flag(buf, "sched_load_balance")) {
                if (load_flag(buf, &cp->sched_load_balance,
                              "sched_load_balance") < 0)
                        goto err;
                cp->sched_load_balance_valid = 1;
        }

        if (exists_flag(buf, "sched_relax_domain_level")) {
                if (load_number(buf, &cp->sched_relax_domain_level,
                                "sched_relax_domain_level") < 0)
                        goto err;
                cp->sched_relax_domain_level_valid = 1;
        }

        if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpus") < 0)
                goto err;
        cp->cpus_valid = 1;

        if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "mems") < 0)
                goto err;
        cp->mems_valid = 1;

        return 0;
err:
        return -1;
}

/* Modify cpuset at location 'relpath' to values of 'cp' */
int cpuset_modify(const char *relpath, const struct cpuset *cp)
{
        return cr_or_mod(relpath, cp, 0);
}

/* Get cpuset path of pid into buf */
char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size)
{
        int fd;                 /* dual use: cpuset file for pid and self */
        int rc;                 /* dual use: snprintf and read return codes */

        if (check() < 0)
                return NULL;

        /* borrow result buf[] to build cpuset file path */
        if (pid == 0)
                rc = snprintf(buf, size, "/proc/self/cpuset");
        else
                rc = snprintf(buf, size, "/proc/%d/cpuset", pid);
        if (rc >= (int)size) {
                errno = E2BIG;
                return NULL;
        }
        if ((fd = open(buf, O_RDONLY)) < 0) {
                int e = errno;
                if (e == ENOENT)
                        e = ESRCH;
                if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0)
                        e = ENOSYS;
                else
                        close(fd);
                errno = e;
                return NULL;
        }
        rc = read(fd, buf, size);
        close(fd);
        if (rc < 0)
                return NULL;
        if (rc >= (int)size) {
                errno = E2BIG;
                return NULL;
        }
        buf[rc] = 0;
        chomp(buf);
        return buf;
}
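/*
 * Example (illustrative): print the cpuset of the current task.
 *
 *      char buf[PATH_MAX];
 *
 *      if (cpuset_getcpusetpath(0, buf, sizeof(buf)) != NULL)
 *              printf("current cpuset: %s\n", buf);
 */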
/* Get cpuset 'cp' of pid */
int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid)
{
        char buf[PATH_MAX];

        if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
                return -1;
        if (cpuset_query(cp, buf) < 0)
                return -1;
        return 0;
}

/* [optional] Return mountpoint of cpuset filesystem */
const char *cpuset_mountpoint(void)
{
        if (check() < 0) {
                switch (errno) {
                case ENODEV:
                        return "[cpuset filesystem not mounted]";
                default:
                        return "[cpuset filesystem not supported]";
                }
        }
        return cpusetmnt;
}

/* Return true if path is a directory. */
static int isdir(const char *path)
{
        struct stat statbuf;

        if (stat(path, &statbuf) < 0)
                return 0;
        return S_ISDIR(statbuf.st_mode);
}

/*
 * [optional] cpuset_collides_exclusive() - True if would collide exclusive.
 *
 * Return true iff the specified cpuset would overlap with any
 * sibling cpusets in either cpus or mems, where either this
 * cpuset or the sibling is cpu_exclusive or mem_exclusive.
 *
 * cpuset_create() fails with errno == EINVAL if the requested cpuset
 * would overlap with any sibling, where either one is cpu_exclusive or
 * mem_exclusive.  This is a common, and not obvious, error.  The
 * following routine checks for this particular case, so that code
 * creating cpusets can better identify the situation, perhaps to issue
 * a more informative error message.
 *
 * Can also be used to diagnose cpuset_modify failures.  This
 * routine ignores any existing cpuset with the same path as the
 * given 'cpusetpath', and only looks for exclusive collisions with
 * sibling cpusets of that path.
 *
 * In case of any error, returns (0) -- does not collide.  Presumably
 * any actual attempt to create or modify a cpuset will encounter the
 * same error, and report it usefully.
 *
 * This routine is not particularly efficient; most likely code creating or
 * modifying a cpuset will want to try the operation first, and then if that
 * fails with errno EINVAL, perhaps call this routine to determine if an
 * exclusive cpuset collision caused the error.
 */
int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1)
{
        char parent[PATH_MAX];
        char *p;
        char *pathcopy = NULL;
        char *base;
        DIR *dir = NULL;
        struct dirent *dent;
        struct cpuset *cp2 = NULL;
        struct bitmask *cpus1 = NULL, *cpus2 = NULL;
        struct bitmask *mems1 = NULL, *mems2 = NULL;
        int ret;

        if (check() < 0)
                goto err;

        fullpath(parent, sizeof(parent), cpusetpath);
        if (streq(parent, cpusetmnt))
                goto err;       /* only one cpuset root - can't collide */
        pathcopy = strdup(parent);
        if (!pathcopy)
                goto err;       /* out of memory - treat as no collision */
        p = strrchr(parent, '/');
        if (!p)
                goto err;       /* huh? - impossible - run and hide */
        *p = 0;                 /* now parent is dirname of fullpath */

        p = strrchr(pathcopy, '/');
        base = p + 1;           /* now base is basename of fullpath */
        if (!*base)
                goto err;       /* this is also impossible - run away */

        if ((dir = opendir(parent)) == NULL)
                goto err;
        if ((cp2 = cpuset_alloc()) == NULL)
                goto err;
        if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
                goto err;
        if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
                goto err;
        if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
                goto err;
        if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
                goto err;

        while ((dent = readdir(dir)) != NULL) {
                char child[PATH_MAX];

                if (streq(dent->d_name, ".") || streq(dent->d_name, ".."))
                        continue;
                if (streq(dent->d_name, base))
                        continue;
                pathcat2(child, sizeof(child), parent, dent->d_name);
                if (!isdir(child))
                        continue;
                if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0)
                        goto err;
                if (cp1->cpu_exclusive || cp2->cpu_exclusive) {
                        cpuset_getcpus(cp1, cpus1);
                        cpuset_getcpus(cp2, cpus2);
                        if (bitmask_intersects(cpus1, cpus2))
                                goto collides;
                }
                if (cp1->mem_exclusive || cp2->mem_exclusive) {
                        cpuset_getmems(cp1, mems1);
                        cpuset_getmems(cp2, mems2);
                        if (bitmask_intersects(mems1, mems2))
                                goto collides;
                }
        }
err:
        /* error, or did not collide */
        ret = 0;
        goto done;
collides:
        /* collides */
        ret = 1;
        /* fall into ... */
done:
        if (dir)
                closedir(dir);
        cpuset_free(cp2);
        free(pathcopy);
        bitmask_free(cpus1);
        bitmask_free(cpus2);
        bitmask_free(mems1);
        bitmask_free(mems2);
        return ret;
}
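/*
 * Example (illustrative): diagnose an EINVAL from cpuset_create(),
 * using the hypothetical path from the earlier examples.
 *
 *      if (cpuset_create("/mypool", cp) < 0) {
 *              if (errno == EINVAL &&
 *                  cpuset_collides_exclusive("/mypool", cp))
 *                      fprintf(stderr, "overlaps an exclusive sibling\n");
 *      }
 */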
/*
 * [optional] cpuset_nuke() - Remove cpuset any way possible
 *
 * Remove a cpuset, including killing tasks in it, and
 * removing any descendent cpusets and killing their tasks.
 *
 * Tasks can take a long time (minutes on some configurations)
 * to exit.  Loop up to 'seconds' seconds, trying to kill them.
 *
 * How we do it:
 *	1) First, kill all the pids, looping until there are
 *	   no more pids in this cpuset or below, or until the
 *	   'seconds' timeout limit is exceeded.
 *	2) Then depth first recursively rmdir the cpuset directories.
 *	3) If by this point the original cpuset is gone, we succeeded.
 *
 * If the timeout is exceeded, and tasks still exist, fail with
 * errno == ETIME.
 *
 * We sleep a variable amount of time.  After the first attempt to
 * kill all the tasks in the cpuset or its descendents, we sleep 1
 * second, the next time 2 seconds, increasing 1 second each loop
 * up to a max of 10 seconds.  If more loops past 10 are required
 * to kill all the tasks, we sleep 10 seconds each subsequent loop.
 * In any case, before the last loop, we sleep however many seconds
 * remain of the original timeout 'seconds' requested.  The total
 * time of all sleeps will be no more than the requested 'seconds'.
 *
 * If the cpuset started out empty of any tasks, or if the passed in
 * 'seconds' was zero, then this routine will return quickly, having
 * not slept at all.  Otherwise, this routine will at a minimum send
 * a SIGKILL to all the tasks in this cpuset subtree, then sleep one
 * second, before looking to see if any tasks remain.  If tasks remain
 * in the cpuset subtree, and a longer 'seconds' timeout was requested
 * (more than one), it will continue to kill remaining tasks and sleep,
 * in a loop, for as long as time and tasks remain.
 *
 * The signal sent for the kill is hardcoded to SIGKILL (9).  If some
 * other signal should be sent first, use a separate code loop,
 * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to
 * scan the task pids in a cpuset.  If SIGKILL should -not- be sent,
 * this cpuset_nuke() routine can still be called to recursively
 * remove a cpuset subtree, by specifying a timeout of zero 'seconds'.
 *
 * On success, returns 0 with errno == 0.
 *
 * On failure, returns -1, with errno possibly one of:
 *	EACCES - search permission denied on intervening directory
 *	ETIME - timed out - tasks remain after 'seconds' timeout
 *	EMFILE - too many open files
 *	ENODEV - /dev/cpuset not mounted
 *	ENOENT - component of cpuset path doesn't exist
 *	ENOMEM - out of memory
 *	ENOSYS - kernel doesn't support cpusets
 *	ENOTDIR - component of cpuset path is not a directory
 *	EPERM - lacked permission to kill a task
 *	EPERM - lacked permission to read cpusets or files therein
 */

void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree);

int cpuset_nuke(const char *relpath, unsigned int seconds)
{
        unsigned int secs_left = seconds;       /* total sleep seconds left */
        unsigned int secs_loop = 1;             /* how much sleep next loop */
        unsigned int secs_slept;                /* seconds slept in sleep() */
        struct cpuset_pidlist *pl = NULL;       /* pids in cpuset subtree */
        struct cpuset_fts_tree *cs_tree;
        const struct cpuset_fts_entry *cs_entry;
        int ret, sav_errno = 0;

        if (check() < 0)
                return -1;

        if (seconds == 0)
                goto rmdir_cpusets;

        while (1) {
                int plen, j;

                if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) {
                        /* missing cpuset is as good as if already nuked */
                        if (errno == ENOENT) {
                                ret = 0;
                                goto no_more_cpuset;
                        }

                        /* other problems reading cpuset are bad news */
                        sav_errno = errno;
                        goto failed;
                }

                if ((plen = cpuset_pidlist_length(pl)) == 0)
                        goto rmdir_cpusets;

                for (j = 0; j < plen; j++) {
                        pid_t pid;

                        if ((pid = cpuset_get_pidlist(pl, j)) > 1) {
                                if (kill(pid, SIGKILL) < 0 && errno != ESRCH) {
                                        sav_errno = errno;
                                        goto failed;
                                }
                        }
                }

                if (secs_left == 0)
                        goto took_too_long;

                cpuset_freepidlist(pl);
                pl = NULL;

                secs_slept = secs_loop - sleep(secs_loop);

                /* Ensure forward progress */
                if (secs_slept == 0)
                        secs_slept = 1;

                /* Ensure sane sleep() return (unnecessary?) */
                if (secs_slept > secs_loop)
                        secs_slept = secs_loop;

                secs_left -= secs_slept;

                if (secs_loop < 10)
                        secs_loop++;

                secs_loop = MIN(secs_left, secs_loop);
        }

took_too_long:
        sav_errno = ETIME;
        /* fall into ... */
failed:
        cpuset_freepidlist(pl);
        errno = sav_errno;
        return -1;

rmdir_cpusets:
        /* Let's try removing cpuset(s) now. */
        cpuset_freepidlist(pl);

        if ((cs_tree = cpuset_fts_open(relpath)) == NULL) {
                if (errno != ENOENT)
                        return -1;
                /* missing cpuset is as good as if already nuked */
                ret = 0;
                goto no_more_cpuset;
        }
        ret = 0;
        cpuset_fts_reverse(cs_tree);    /* rmdir's must be done bottom up */
        while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
                char buf[PATH_MAX];

                fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry));
                if (rmdir(buf) < 0 && errno != ENOENT) {
                        sav_errno = errno;
                        ret = -1;
                }
        }
        cpuset_fts_close(cs_tree);
        /* fall into ... */
no_more_cpuset:
        if (ret == 0)
                errno = 0;
        else
                errno = sav_errno;
        return ret;
}
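/*
 * Example (illustrative): tear down a cpuset subtree, allowing its
 * tasks up to 30 seconds to die.
 *
 *      if (cpuset_nuke("/mypool", 30) < 0) {
 *              if (errno == ETIME)
 *                      fprintf(stderr, "tasks would not die\n");
 *      }
 */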
/*
 * When recursively reading all the tasks files from a subtree,
 * chain together the read results, one pidblock per tasks file,
 * containing the raw unprocessed ascii as read(2) in.  After
 * we gather up this raw data, we then go back to count how
 * many pid's there are in total, allocate an array of pid_t
 * of that size, and transform the raw ascii data into this
 * array of pid_t's.
 */

struct pidblock {
        char *buf;
        int buflen;
        struct pidblock *next;
};

/*
 * Chain the raw contents of a file onto the pbhead list.
 *
 * We malloc "+ 1" extra byte for a nul-terminator, so that
 * the strtoul() loop in pid_transform() won't scan past
 * the end of pb->buf[] and accidentally find more pids.
 */
static void add_pidblock(const char *file, struct pidblock **ppbhead)
{
        FILE *fp = NULL;
        struct pidblock *pb = NULL;
        int fsz;

        if ((fp = fopen(file, "r")) == NULL)
                goto err;
        fsz = filesize(fp);
        if (fsz == 0)
                goto err;
        if ((pb = calloc(1, sizeof(*pb))) == NULL)
                goto err;
        pb->buflen = fsz;
        if ((pb->buf = malloc(pb->buflen + 1)) == NULL)
                goto err;
        if (fread(pb->buf, 1, pb->buflen, fp) > 0) {
                pb->buf[pb->buflen] = '\0';
                pb->next = *ppbhead;
                *ppbhead = pb;
        } else {
                /* nothing read - don't leak the unused block */
                free(pb->buf);
                free(pb);
        }
        fclose(fp);
        return;
err:
        if (fp)
                fclose(fp);
        free(pb);
}

static void read_task_file(const char *relpath, struct pidblock **ppbhead)
{
        char buf[PATH_MAX];

        fullpath2(buf, sizeof(buf), relpath, "tasks");
        add_pidblock(buf, ppbhead);
}

struct cpuset_pidlist {
        pid_t *pids;
        int npids;
};

/* Count how many pids in buf (one per line - just count newlines) */
static int pidcount(const char *buf, int buflen)
{
        int n = 0;
        const char *cp;

        for (cp = buf; cp < buf + buflen; cp++) {
                if (*cp == '\n')
                        n++;
        }
        return n;
}

/* Transform one-per-line ascii pids in pb to pid_t entries in pl */
static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n)
{
        char *a, *b;

        for (a = pb->buf; a < pb->buf + pb->buflen; a = b) {
                pid_t p = strtoul(a, &b, 10);
                if (a == b)
                        break;
                pl->pids[n++] = p;
        }
        return n;
}

static void free_pidblocks(struct pidblock *pbhead)
{
        struct pidblock *pb, *nextpb;

        for (pb = pbhead; pb; pb = nextpb) {
                nextpb = pb->next;
                free(pb->buf);
                free(pb);
        }
}

/* Numeric comparison routine for qsort (pids are positive, so the
 * subtraction cannot overflow) */
static int numericsort(const void *m1, const void *m2)
{
        pid_t p1 = *(pid_t *) m1;
        pid_t p2 = *(pid_t *) m2;

        return p1 - p2;
}
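/*
 * Worked example (illustrative): a tasks file reading "42\n7\n" gives
 * pidcount() == 2; pid_transform() then stores {42, 7}, and the qsort
 * in cpuset_init_pidlist() below yields the final list {7, 42}.
 */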
/* Return list of pids in cpuset 'relpath' */
struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath,
                                           int recursiveflag)
{
        struct pidblock *pb = NULL;
        struct cpuset_pidlist *pl = NULL;
        struct pidblock *pbhead = NULL;
        int n;

        if (check() < 0)
                goto err;

        if (recursiveflag) {
                struct cpuset_fts_tree *cs_tree;
                const struct cpuset_fts_entry *cs_entry;

                if ((cs_tree = cpuset_fts_open(relpath)) == NULL)
                        goto err;
                while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
                        if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET)
                                continue;
                        read_task_file(cpuset_fts_get_path(cs_entry), &pbhead);
                }
                cpuset_fts_close(cs_tree);
        } else {
                read_task_file(relpath, &pbhead);
        }

        if ((pl = calloc(1, sizeof(*pl))) == NULL)
                goto err;
        pl->npids = 0;
        for (pb = pbhead; pb; pb = pb->next)
                pl->npids += pidcount(pb->buf, pb->buflen);
        if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL)
                goto err;
        n = 0;
        for (pb = pbhead; pb; pb = pb->next)
                n = pid_transform(pb, pl, n);
        free_pidblocks(pbhead);
        qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort);
        return pl;
err:
        cpuset_freepidlist(pl);
        free_pidblocks(pbhead);
        return NULL;
}

/* Return number of elements in pidlist */
int cpuset_pidlist_length(const struct cpuset_pidlist *pl)
{
        if (pl)
                return pl->npids;
        else
                return 0;
}

/* Return i'th element of pidlist */
pid_t cpuset_get_pidlist(const struct cpuset_pidlist *pl, int i)
{
        if (pl && i >= 0 && i < pl->npids)
                return pl->pids[i];
        else
                return (pid_t)-1;
}

/* Free pidlist */
void cpuset_freepidlist(struct cpuset_pidlist *pl)
{
        if (pl && pl->pids)
                free(pl->pids);
        free(pl);
}

static int __cpuset_move(pid_t pid, const char *path)
{
        char buf[SMALL_BUFSZ];

        snprintf(buf, sizeof(buf), "%d", (int)pid);
        return write_string_file(path, buf);
}

/* Move task (pid == 0 for current) to a cpuset */
int cpuset_move(pid_t pid, const char *relpath)
{
        char buf[PATH_MAX];

        if (check() < 0)
                return -1;

        if (pid == 0)
                pid = getpid();

        fullpath2(buf, sizeof(buf), relpath, "tasks");
        return __cpuset_move(pid, buf);
}

/* Move all tasks in pidlist to a cpuset */
int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath)
{
        int i;
        char buf[PATH_MAX];
        int ret;

        if (check() < 0)
                return -1;

        fullpath2(buf, sizeof(buf), relpath, "tasks");

        ret = 0;
        for (i = 0; i < pl->npids; i++)
                if (__cpuset_move(pl->pids[i], buf) < 0)
                        ret = -1;
        return ret;
}
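/*
 * Example (illustrative): list every task in a cpuset subtree,
 * again using the hypothetical "/mypool" path.
 *
 *      struct cpuset_pidlist *pl;
 *      int i;
 *
 *      if ((pl = cpuset_init_pidlist("/mypool", 1)) != NULL) {
 *              for (i = 0; i < cpuset_pidlist_length(pl); i++)
 *                      printf("%d\n", (int)cpuset_get_pidlist(pl, i));
 *              cpuset_freepidlist(pl);
 *      }
 */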
/*
 * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a
 *                                         cpuset to another cpuset
 *
 * Move all tasks in cpuset fromrelpath to cpuset torelpath.  This may
 * race with tasks being added to or forking into fromrelpath.  Loop
 * repeatedly, reading the tasks file of cpuset fromrelpath and writing
 * any task pid's found there to the tasks file of cpuset torelpath,
 * up to ten attempts, or until the tasks file of cpuset fromrelpath
 * is empty, or until fromrelpath is no longer present.
 *
 * Returns 0 with errno == 0 if able to empty the tasks file of cpuset
 * fromrelpath.  Of course it is still possible that some independent
 * task could add another task to cpuset fromrelpath at the same time
 * that such a successful result is being returned, so there can be
 * no guarantee that a successful return means that fromrelpath is
 * still empty of tasks.
 *
 * We are careful to allow for the possibility that the cpuset
 * fromrelpath might disappear out from under us, perhaps because it
 * has notify_on_release set and gets automatically removed as soon
 * as we detach its last task from it.  Consider a missing fromrelpath
 * to be a successful move.
 *
 * If called with fromrelpath and torelpath pathnames that evaluate to
 * the same cpuset, then treat that as if cpuset_reattach() was called,
 * rebinding each task in this cpuset one time, and return success or
 * failure depending on the return of that cpuset_reattach() call.
 *
 * On failure, returns -1, with errno possibly one of:
 *	EACCES - search permission denied on intervening directory
 *	ENOTEMPTY - tasks remain after multiple attempts to move them
 *	EMFILE - too many open files
 *	ENODEV - /dev/cpuset not mounted
 *	ENOENT - component of cpuset path doesn't exist
 *	ENOMEM - out of memory
 *	ENOSYS - kernel doesn't support cpusets
 *	ENOTDIR - component of cpuset path is not a directory
 *	EPERM - lacked permission to move a task
 *	EPERM - lacked permission to read cpusets or files therein
 *
 * This is an [optional] function.  Use cpuset_function to invoke it.
 */

#define NUMBER_MOVE_TASK_ATTEMPTS 10

int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath)
{
        char fromfullpath[PATH_MAX];
        char tofullpath[PATH_MAX];
        int i;
        struct cpuset_pidlist *pl = NULL;
        int sav_errno;

        fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath);
        fullpath(tofullpath, sizeof(tofullpath), torelpath);

        if (samefile(fromfullpath, tofullpath))
                return cpuset_reattach(fromrelpath);

        for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) {
                int plen, j;

                if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) {
                        /* missing cpuset is as good as if all moved */
                        if (errno == ENOENT)
                                goto no_more_cpuset;

                        /* other problems reading cpuset are bad news */
                        sav_errno = errno;
                        goto failed;
                }

                if ((plen = cpuset_pidlist_length(pl)) == 0)
                        goto no_more_pids;

                for (j = 0; j < plen; j++) {
                        pid_t pid;

                        pid = cpuset_get_pidlist(pl, j);
                        if (cpuset_move(pid, torelpath) < 0) {
                                /* missing task is as good as if moved */
                                if (errno == ESRCH)
                                        continue;

                                /* other per-task errors are bad news */
                                sav_errno = errno;
                                goto failed;
                        }
                }

                cpuset_freepidlist(pl);
                pl = NULL;
        }

        sav_errno = ENOTEMPTY;
        /* fall into ... */
failed:
        cpuset_freepidlist(pl);
        errno = sav_errno;
        return -1;

no_more_pids:
no_more_cpuset:
        /* Success - all tasks (or entire cpuset ;) gone. */
        cpuset_freepidlist(pl);
        errno = 0;
        return 0;
}
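/*
 * Example (illustrative, hypothetical paths): drain a cpuset into its
 * parent, then remove it.
 *
 *      if (cpuset_move_cpuset_tasks("/mypool/batch", "/mypool") == 0)
 *              (void)cpuset_delete("/mypool/batch");
 */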
/* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */
int cpuset_migrate(pid_t pid, const char *relpath)
{
        char buf[PATH_MAX];
        char buf2[PATH_MAX];
        char memory_migrate_flag;
        int r;

        if (check() < 0)
                return -1;

        if (pid == 0)
                pid = getpid();

        fullpath(buf2, sizeof(buf2), relpath);

        if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
                return -1;
        if (store_flag(buf2, "memory_migrate", 1) < 0)
                return -1;

        fullpath2(buf, sizeof(buf), relpath, "tasks");

        r = __cpuset_move(pid, buf);

        store_flag(buf2, "memory_migrate", memory_migrate_flag);
        return r;
}

/* Migrate all tasks in pidlist to a cpuset (moves task and memory) */
int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath)
{
        int i;
        char buf[PATH_MAX];
        char buf2[PATH_MAX];
        char memory_migrate_flag;
        int ret;

        if (check() < 0)
                return -1;

        fullpath(buf2, sizeof(buf2), relpath);

        if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
                return -1;
        if (store_flag(buf2, "memory_migrate", 1) < 0)
                return -1;

        fullpath2(buf, sizeof(buf), relpath, "tasks");

        ret = 0;
        for (i = 0; i < pl->npids; i++)
                if (__cpuset_move(pl->pids[i], buf) < 0)
                        ret = -1;

        if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0)
                ret = -1;
        return ret;
}

/* Rebind cpus_allowed of each task in cpuset 'relpath' */
int cpuset_reattach(const char *relpath)
{
        struct cpuset_pidlist *pl;
        int rc;

        if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL)
                return -1;
        rc = cpuset_move_all(pl, relpath);
        cpuset_freepidlist(pl);
        return rc;
}

/* Map cpuset relative cpu number to system wide cpu number */
int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu)
{
        struct cpuset *cp_tofree = NULL;
        const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
        int pos = -1;

        if (!cp1)
                goto err;
        pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu);
        /* fall into ... */
err:
        cpuset_free(cp_tofree);
        return pos;
}

/* Map system wide cpu number to cpuset relative cpu number */
int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu)
{
        struct cpuset *cp_tofree = NULL;
        const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
        int pos = -1;

        if (!cp1)
                goto err;
        pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu);
        /* fall into ... */
err:
        cpuset_free(cp_tofree);
        return pos;
}

/* Map cpuset relative mem number to system wide mem number */
int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem)
{
        struct cpuset *cp_tofree = NULL;
        const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
        int pos = -1;

        if (!cp1)
                goto err;
        pos = bitmask_rel_to_abs_pos(cp1->mems, mem);
        /* fall into ... */
err:
        cpuset_free(cp_tofree);
        return pos;
}
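/*
 * Worked example of the relative/system mappings (illustrative): in a
 * cpuset whose 'cpus' is 4-7, cpuset_c_rel_to_sys_cpu(cp, 0) == 4 and
 * cpuset_c_sys_to_rel_cpu(cp, 6) == 2.
 */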
/* Map system wide mem number to cpuset relative mem number */
int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem)
{
        struct cpuset *cp_tofree = NULL;
        const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
        int pos = -1;

        if (!cp1)
                goto err;
        pos = bitmask_abs_to_rel_pos(cp1->mems, mem);
        /* fall into ... */
err:
        cpuset_free(cp_tofree);
        return pos;
}

/* Map pid's cpuset relative cpu number to system wide cpu number */
int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu)
{
        struct cpuset *cp;
        int rc = -1;

        if ((cp = cpuset_alloc()) == NULL)
                goto done;
        if (cpuset_cpusetofpid(cp, pid) < 0)
                goto done;
        rc = cpuset_c_rel_to_sys_cpu(cp, cpu);
done:
        cpuset_free(cp);
        return rc;
}

/* Map system wide cpu number to pid's cpuset relative cpu number */
int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu)
{
        struct cpuset *cp;
        int rc = -1;

        if ((cp = cpuset_alloc()) == NULL)
                goto done;
        if (cpuset_cpusetofpid(cp, pid) < 0)
                goto done;
        rc = cpuset_c_sys_to_rel_cpu(cp, cpu);
done:
        cpuset_free(cp);
        return rc;
}

/* Map pid's cpuset relative mem number to system wide mem number */
int cpuset_p_rel_to_sys_mem(pid_t pid, int mem)
{
        struct cpuset *cp;
        int rc = -1;

        if ((cp = cpuset_alloc()) == NULL)
                goto done;
        if (cpuset_cpusetofpid(cp, pid) < 0)
                goto done;
        rc = cpuset_c_rel_to_sys_mem(cp, mem);
done:
        cpuset_free(cp);
        return rc;
}

/* Map system wide mem number to pid's cpuset relative mem number */
int cpuset_p_sys_to_rel_mem(pid_t pid, int mem)
{
        struct cpuset *cp;
        int rc = -1;

        if ((cp = cpuset_alloc()) == NULL)
                goto done;
        if (cpuset_cpusetofpid(cp, pid) < 0)
                goto done;
        rc = cpuset_c_sys_to_rel_mem(cp, mem);
done:
        cpuset_free(cp);
        return rc;
}

/*
 * Override glibc's calls for get/set affinity - their wrappers use a
 * fixed-size cpu_set_t that will die when NR_CPUS > 1024.  Go directly
 * to the 'real' system calls.  Also override calls for get_mempolicy
 * and set_mempolicy.  None of these calls are yet (July 2004)
 * guaranteed to be in all glibc versions that we care about.
 */
static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask)
{
        return ltp_syscall(__NR_sched_setaffinity, pid, len, mask);
}

static int get_mempolicy(int *policy, unsigned long *nmask,
                         unsigned long maxnode, void *addr, int flags)
{
        return ltp_syscall(__NR_get_mempolicy, policy, nmask, maxnode,
                           addr, flags);
}

static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode)
{
        return ltp_syscall(__NR_set_mempolicy, mode, nmask, maxnode);
}

struct cpuset_placement {
        struct bitmask *cpus;
        struct bitmask *mems;
        char *path;
};

/* Allocate and fill in a placement struct - captures current placement */
struct cpuset_placement *cpuset_get_placement(pid_t pid)
{
        struct cpuset_placement *plc;
        struct cpuset *cp = NULL;
        char buf[PATH_MAX];
        int nbits;

        if ((plc = calloc(1, sizeof(*plc))) == NULL)
                goto err;

        nbits = cpuset_cpus_nbits();
        if ((plc->cpus = bitmask_alloc(nbits)) == NULL)
                goto err;

        nbits = cpuset_mems_nbits();
        if ((plc->mems = bitmask_alloc(nbits)) == NULL)
                goto err;

        if ((cp = cpuset_alloc()) == NULL)
                goto err;
        if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
                goto err;
        if (cpuset_query(cp, buf) < 0)
                goto err;

        bitmask_copy(plc->cpus, cp->cpus);
        bitmask_copy(plc->mems, cp->mems);
        if ((plc->path = strdup(buf)) == NULL)
                goto err;

        cpuset_free(cp);
        return plc;
err:
        cpuset_free(cp);
        cpuset_free_placement(plc);
        return NULL;
}

/* Compare two placement structs - use to detect changes in placement */
int cpuset_equal_placement(const struct cpuset_placement *plc1,
                           const struct cpuset_placement *plc2)
{
        return bitmask_equal(plc1->cpus, plc2->cpus) &&
            bitmask_equal(plc1->mems, plc2->mems) &&
            streq(plc1->path, plc2->path);
}

/* Free a placement struct */
void cpuset_free_placement(struct cpuset_placement *plc)
{
        if (!plc)
                return;
        bitmask_free(plc->cpus);
        bitmask_free(plc->mems);
        free(plc->path);
        free(plc);
}

/*
 * A cpuset_fts_open() call constructs a linked list of entries
 * called a "cpuset_fts_tree", with one entry per cpuset below
 * the specified path.  The cpuset_fts_read() routine returns the
 * next entry on this list.  The various cpuset_fts_get_*() calls
 * return attributes of the specified entry.  The cpuset_fts_close()
 * call frees the linked list and all associated data.  All cpuset
 * entries and attributes for the cpuset_fts_tree returned from a
 * given cpuset_fts_open() call remain allocated and unchanged until
 * that cpuset_fts_tree is closed by a cpuset_fts_close() call.  Any
 * subsequent changes to the cpuset filesystem will go unnoticed
 * (not affect open cpuset_fts_tree's.)
 */

struct cpuset_fts_entry;
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree);

struct cpuset_fts_tree {
        struct cpuset_fts_entry *head;  /* head of linked entry list */
        struct cpuset_fts_entry *next;  /* cpuset_fts_read() offset */
};

struct cpuset_fts_entry {
        struct cpuset_fts_entry *next;  /* linked entry list chain */
        struct cpuset *cpuset;
        struct stat *stat;
        char *path;
        int info;
        int err;
};
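/*
 * Example (illustrative): walk every cpuset below the root and print
 * its path, using the API described above.
 *
 *      struct cpuset_fts_tree *cs_tree;
 *      const struct cpuset_fts_entry *cs_entry;
 *
 *      if ((cs_tree = cpuset_fts_open("/")) != NULL) {
 *              while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
 *                      if (cpuset_fts_get_info(cs_entry) == CPUSET_FTS_CPUSET)
 *                              puts(cpuset_fts_get_path(cs_entry));
 *              }
 *              cpuset_fts_close(cs_tree);
 *      }
 */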
/* Open a handle on a cpuset hierarchy.  All the real work is done here. */
struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
{
        FTS *fts = NULL;
        FTSENT *ftsent;
        char *path_argv[2];
        char buf[PATH_MAX];
        struct cpuset_fts_tree *cs_tree = NULL;
        struct cpuset_fts_entry *ep;    /* the latest new list entry */
        struct cpuset_fts_entry **pnlep;        /* ptr to next list entry ptr */
        char *relpath;
        int fts_flags;

        fullpath(buf, sizeof(buf), cpusetpath);
        path_argv[0] = buf;
        path_argv[1] = NULL;

        fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
        fts = fts_open(path_argv, fts_flags, NULL);
        if (fts == NULL)
                goto err;

        cs_tree = malloc(sizeof(*cs_tree));
        if (cs_tree == NULL)
                goto err;
        pnlep = &cs_tree->head;
        *pnlep = NULL;

        while ((ftsent = fts_read(fts)) != NULL) {
                if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
                        continue;

                /* ftsent is a directory (perhaps unreadable) ==> cpuset */
                ep = calloc(1, sizeof(*ep));
                if (ep == NULL)
                        goto err;
                *pnlep = ep;
                pnlep = &ep->next;

                /* Set entry's path, and if DNR, error */
                relpath = ftsent->fts_path + strlen(cpusetmnt);
                if (strlen(relpath) == 0)
                        relpath = "/";
                ep->path = strdup(relpath);
                if (ep->path == NULL)
                        goto err;
                if (ftsent->fts_info == FTS_DNR) {
                        ep->info = CPUSET_FTS_ERR_DNR;
                        ep->err = ftsent->fts_errno;
                        continue;
                }

                /* ftsent is a -readable- cpuset: set entry's stat, etc */
                ep->stat = calloc(1, sizeof(struct stat));
                if (ep->stat == NULL)
                        goto err;
                if (stat(ftsent->fts_path, ep->stat) < 0) {
                        ep->info = CPUSET_FTS_ERR_STAT;
                        ep->err = ftsent->fts_errno;
                        continue;
                }

                ep->cpuset = calloc(1, sizeof(struct cpuset));
                if (ep->cpuset == NULL)
                        goto err;
                if (cpuset_query(ep->cpuset, relpath) < 0) {
                        ep->info = CPUSET_FTS_ERR_CPUSET;
                        ep->err = errno;
                        continue;
                }
                ep->info = CPUSET_FTS_CPUSET;
        }

        (void)fts_close(fts);
        cpuset_fts_rewind(cs_tree);
        return cs_tree;

err:
        if (cs_tree)
                cpuset_fts_close(cs_tree);
        if (fts)
                (void)fts_close(fts);
        return NULL;
}

/* Return pointer to next cpuset entry in hierarchy */
const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
{
        const struct cpuset_fts_entry *cs_entry = cs_tree->next;

        if (cs_tree->next != NULL)      /* seek to next entry */
                cs_tree->next = cs_tree->next->next;
        return cs_entry;
}
/* Reverse list of cpusets, in place.  Simulates pre-order/post-order flip. */
void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
{
        struct cpuset_fts_entry *cs1, *cs2, *cs3;

        /*
         * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
         * is redirected from cs3 to cs1.
         */

        cs1 = cs2 = NULL;
        cs3 = cs_tree->head;
        while (cs3) {
                cs1 = cs2;
                cs2 = cs3;
                cs3 = cs3->next;
                cs2->next = cs1;
        }
        cs_tree->head = cs2;
        cpuset_fts_rewind(cs_tree);
}

/* Rewind cpuset list to beginning */
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
{
        cs_tree->next = cs_tree->head;
}

/* Return pointer to nul-terminated cpuset path of entry in hierarchy */
const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
{
        return cs_entry->path;
}

/* Return pointer to stat(2) structure of a cpuset entry's directory */
const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
{
        return cs_entry->stat;
}

/* Return pointer to cpuset structure of a cpuset entry */
const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry
                                           *cs_entry)
{
        return cs_entry->cpuset;
}

/* Return value of errno (0 if no error) on attempted cpuset operations */
int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry)
{
        return cs_entry->err;
}

/* Return the entry's info field, identifying which operation (if any) failed */
int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry)
{
        return cs_entry->info;
}

/* Close a cpuset hierarchy handle (frees all associated memory) */
void cpuset_fts_close(struct cpuset_fts_tree *cs_tree)
{
        struct cpuset_fts_entry *cs_entry = cs_tree->head;

        while (cs_entry) {
                struct cpuset_fts_entry *ep = cs_entry;

                cs_entry = cs_entry->next;
                free(ep->path);
                free(ep->stat);
                cpuset_free(ep->cpuset);
                free(ep);
        }
        free(cs_tree);
}

/* Bind current task to cpu (uses sched_setaffinity(2)) */
int cpuset_cpubind(int cpu)
{
        struct bitmask *bmp;
        int r;

        if ((bmp = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
                return -1;
        bitmask_setbit(bmp, cpu);
        r = sched_setaffinity(0, bitmask_nbytes(bmp), bitmask_mask(bmp));
        bitmask_free(bmp);
        return r;
}

/*
 * int cpuset_latestcpu(pid_t pid)
 *
 * Return most recent CPU on which task pid executed.  If pid == 0,
 * examine current task.
 *
 * The last used CPU is visible for a given pid as field #39 (starting
 * with #1) in the file /proc/pid/stat.  Currently this file has 41
 * fields, in which case this is the 3rd to the last field.
 *
 * Unfortunately field #2 is a command name and might have embedded
 * whitespace.  So we can't just count whitespace-separated fields.
 * Fortunately, this command name is surrounded by parentheses, as
 * for example "(sh)", and that closing parenthesis is the last ')'
 * character in the line.  No remaining fields can have embedded
 * whitespace or parentheses.  So instead of looking for the 39th
 * whitespace-separated field, we can look for the 37th whitespace-
 * separated field past the last ')' character on the line.
 */
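/*
 * Example /proc/<pid>/stat line (illustrative, middle fields elided):
 *
 *      1234 (sh) S 1 1234 1234 ... 0 3 ...
 *
 * Everything after the last ')' is fixed-format, so counting 37
 * whitespace-separated fields from there lands on the CPU number.
 */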
/* Return most recent CPU on which task pid executed */
int cpuset_latestcpu(pid_t pid)
{
        char buf[PATH_MAX];
        char *bp;
        int fd = -1;
        int nread;
        int cpu = -1;

        if (pid == 0)
                snprintf(buf, sizeof(buf), "/proc/self/stat");
        else
                snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);

        if ((fd = open(buf, O_RDONLY)) < 0)
                goto err;
        if ((nread = read(fd, buf, sizeof(buf) - 1)) < 1)
                goto err;
        close(fd);
        buf[nread] = '\0';      /* ensure strrchr() stays within buf[] */

        bp = strrchr(buf, ')');
        if (bp)
                sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u "
                       "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u "
                       "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u "
                       "%*u %*u %*u %*u %*u %*u %*u %*u %u",
                       &cpu);   /* 37th field past ')' */
        if (cpu < 0)
                errno = EINVAL;
        return cpu;
err:
        if (fd >= 0)
                close(fd);
        return -1;
}

/* Bind current task to memory (uses set_mempolicy(2)) */
int cpuset_membind(int mem)
{
        struct bitmask *bmp;
        int r;

        if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL)
                return -1;
        bitmask_setbit(bmp, mem);
        r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), bitmask_nbits(bmp) + 1);
        bitmask_free(bmp);
        return r;
}

/* [optional] Return Memory Node holding page at specified addr */
int cpuset_addr2node(void *addr)
{
        int node = -1;

        if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR)) {
                /* I realize this seems redundant, but I _want_ to make sure
                 * that this value is -1. */
                node = -1;
        }
        return node;
}

/*
 * Transform cpuset into Text Format Representation in buffer 'buf',
 * of length 'buflen', nul-terminated if space allows.  Return number
 * of characters that would have been written, if enough space had
 * been available, in the same way that snprintf() does.
 */
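/*
 * Example of the Text Format Representation (illustrative values):
 *
 *      cpu_exclusive
 *      memory_migrate
 *      cpus 4-7
 *      mems 1
 */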
/* Export cpuset settings to a regular file */
int cpuset_export(const struct cpuset *cp, char *buf, int buflen)
{
        char *tmp = NULL;
        int n = 0;

        if (cp->cpu_exclusive)
                n += snprintf(buf + n, MAX(buflen - n, 0), "cpu_exclusive\n");

        if (cp->mem_exclusive)
                n += snprintf(buf + n, MAX(buflen - n, 0), "mem_exclusive\n");

        if (cp->notify_on_release)
                n += snprintf(buf + n, MAX(buflen - n, 0),
                              "notify_on_release\n");

        if (cp->memory_pressure_enabled)
                n += snprintf(buf + n, MAX(buflen - n, 0),
                              "memory_pressure_enabled\n");

        if (cp->memory_migrate)
                n += snprintf(buf + n, MAX(buflen - n, 0), "memory_migrate\n");

        if (cp->memory_spread_page)
                n += snprintf(buf + n, MAX(buflen - n, 0),
                              "memory_spread_page\n");

        if (cp->memory_spread_slab)
                n += snprintf(buf + n, MAX(buflen - n, 0),
                              "memory_spread_slab\n");

        if ((tmp = sprint_mask_buf(cp->cpus)) == NULL)
                return -1;
        n += snprintf(buf + n, MAX(buflen - n, 0), "cpus %s\n", tmp);
        free(tmp);
        tmp = NULL;

        if ((tmp = sprint_mask_buf(cp->mems)) == NULL)
                return -1;
        n += snprintf(buf + n, MAX(buflen - n, 0), "mems %s\n", tmp);
        free(tmp);
        tmp = NULL;

        return n;
}

static int import_list(UNUSED const char *tok, const char *arg,
                       struct bitmask *bmp, char *emsg, int elen)
{
        if (bitmask_parselist(arg, bmp) < 0) {
                if (emsg)
                        snprintf(emsg, elen, "Invalid list format: %s", arg);
                return -1;
        }
        return 0;
}

static void stolower(char *s)
{
        while (*s) {
                unsigned char c = *s;
                *s = tolower(c);
                s++;
        }
}

/* Import cpuset settings from a regular file */
int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum,
                  char *emsg, int elen)
{
        char *linebuf = NULL;
        int linebuflen;
        int linenum = 0;
        int offset = 0;

        linebuflen = strlen(buf) + 1;
        if ((linebuf = malloc(linebuflen)) == NULL) {
                if (emsg)
                        snprintf(emsg, elen, "Insufficient memory");
                goto err;
        }

        while (slgets(linebuf, linebuflen, buf, &offset)) {
                char *tok, *arg;
                char *ptr;      /* for strtok_r */

                linenum++;
                if ((tok = strchr(linebuf, '#')) != NULL)
                        *tok = 0;
                if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL)
                        continue;
                stolower(tok);

                arg = strtok_r(NULL, " \t", &ptr);

                if (streq(tok, "cpu_exclusive")) {
                        cp->cpu_exclusive = 1;
                        goto eol;
                }
                if (streq(tok, "mem_exclusive")) {
                        cp->mem_exclusive = 1;
                        goto eol;
                }
                if (streq(tok, "notify_on_release")) {
                        cp->notify_on_release = 1;
                        goto eol;
                }
                if (streq(tok, "memory_pressure_enabled")) {
                        cp->memory_pressure_enabled = 1;
                        goto eol;
                }
                if (streq(tok, "memory_migrate")) {
                        cp->memory_migrate = 1;
                        goto eol;
                }
                if (streq(tok, "memory_spread_page")) {
                        cp->memory_spread_page = 1;
                        goto eol;
                }
                if (streq(tok, "memory_spread_slab")) {
                        cp->memory_spread_slab = 1;
                        goto eol;
                }
                if (streq(tok, "cpu") || streq(tok, "cpus")) {
                        if (import_list(tok, arg, cp->cpus, emsg, elen) < 0)
                                goto err;
                        goto eol;
                }
                if (streq(tok, "mem") || streq(tok, "mems")) {
                        if (import_list(tok, arg, cp->mems, emsg, elen) < 0)
                                goto err;
                        goto eol;
                }
                if (emsg)
                        snprintf(emsg, elen, "Unrecognized token: '%s'", tok);
                goto err;
eol:
                if ((tok = strtok_r(NULL, " \t", &ptr)) != NULL) {
                        if (emsg)
                                snprintf(emsg, elen, "Surplus token: '%s'",
                                         tok);
                        goto err;
                }
                continue;
        }

        free(linebuf);

        if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems))
                cpuset_localcpus(cp->mems, cp->cpus);
        else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems))
                cpuset_localmems(cp->cpus, cp->mems);

        /*
         * All cpuset attributes are determined in an import.
         * Those that aren't explicitly specified are presumed
         * to be unchanged (zero, if it's a freshly allocated
         * struct cpuset.)
         */

        cp->cpus_valid = 1;
        cp->mems_valid = 1;
        cp->cpu_exclusive_valid = 1;
        cp->mem_exclusive_valid = 1;
        cp->notify_on_release_valid = 1;
        cp->memory_migrate_valid = 1;
        cp->memory_pressure_enabled_valid = 1;
        cp->memory_spread_page_valid = 1;
        cp->memory_spread_slab_valid = 1;

        return 0;
err:
        if (elinenum)
                *elinenum = linenum;
        free(linebuf);
        return -1;
}
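/*
 * Example (illustrative): build a cpuset from its text form, and
 * report where a parse error occurred.
 *
 *      char emsg[80];
 *      int eline;
 *      struct cpuset *cp = cpuset_alloc();
 *
 *      if (cpuset_import(cp, "cpus 4-7\nmems 1\n", &eline, emsg,
 *                        sizeof(emsg)) < 0)
 *              fprintf(stderr, "line %d: %s\n", eline, emsg);
 */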
"Unrecognized token: '%s'", tok); 3601 goto err; 3602 eol: 3603 if ((tok = strtok_r(0, " \t", &ptr)) != NULL) { 3604 if (emsg) 3605 snprintf(emsg, elen, "Surplus token: '%s'", 3606 tok); 3607 goto err; 3608 } 3609 continue; 3610 } 3611 3612 free(linebuf); 3613 3614 if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems)) 3615 cpuset_localcpus(cp->mems, cp->cpus); 3616 else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems)) 3617 cpuset_localmems(cp->cpus, cp->mems); 3618 3619 /* 3620 * All cpuset attributes are determined in an import. 3621 * Those that aren't explicitly specified are presumed 3622 * to be unchanged (zero, if it's a freshly allocated 3623 * struct cpuset.) 3624 */ 3625 3626 cp->cpus_valid = 1; 3627 cp->mems_valid = 1; 3628 cp->cpu_exclusive_valid = 1; 3629 cp->mem_exclusive_valid = 1; 3630 cp->notify_on_release_valid = 1; 3631 cp->memory_migrate_valid = 1; 3632 cp->memory_pressure_enabled_valid = 1; 3633 cp->memory_spread_page_valid = 1; 3634 cp->memory_spread_slab_valid = 1; 3635 3636 return 0; 3637 err: 3638 if (elinenum) 3639 *elinenum = linenum; 3640 free(linebuf); 3641 return -1; 3642 } 3643 3644 /* Pin current task CPU (and memory) */ 3645 int cpuset_pin(int relcpu) 3646 { 3647 struct cpuset_placement *plc1 = NULL, *plc2 = NULL; 3648 int cpu, r; 3649 3650 if (check() < 0) 3651 return -1; 3652 3653 do { 3654 cpuset_free_placement(plc1); 3655 plc1 = cpuset_get_placement(0); 3656 3657 r = 0; 3658 if (cpuset_unpin() < 0) 3659 r = -1; 3660 cpu = cpuset_p_rel_to_sys_cpu(0, relcpu); 3661 if (cpuset_cpubind(cpu) < 0) 3662 r = -1; 3663 3664 cpuset_free_placement(plc2); 3665 plc2 = cpuset_get_placement(0); 3666 } while (!cpuset_equal_placement(plc1, plc2)); 3667 3668 cpuset_free_placement(plc1); 3669 cpuset_free_placement(plc2); 3670 return r; 3671 } 3672 3673 /* Return number CPUs in current tasks cpuset */ 3674 int cpuset_size(void) 3675 { 3676 struct cpuset_placement *plc1 = NULL, *plc2 = NULL; 3677 int r; 3678 3679 if (check() < 0) 3680 return -1; 3681 3682 do { 3683 cpuset_free_placement(plc1); 3684 plc1 = cpuset_get_placement(0); 3685 3686 r = cpuset_cpus_weight(0); 3687 3688 cpuset_free_placement(plc2); 3689 plc2 = cpuset_get_placement(0); 3690 } while (!cpuset_equal_placement(plc1, plc2)); 3691 3692 cpuset_free_placement(plc1); 3693 cpuset_free_placement(plc2); 3694 return r; 3695 } 3696 3697 /* Return relative CPU number, within current cpuset, last executed on */ 3698 int cpuset_where(void) 3699 { 3700 struct cpuset_placement *plc1 = NULL, *plc2 = NULL; 3701 int r; 3702 3703 if (check() < 0) 3704 return -1; 3705 3706 do { 3707 cpuset_free_placement(plc1); 3708 plc1 = cpuset_get_placement(0); 3709 3710 r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0)); 3711 3712 cpuset_free_placement(plc2); 3713 plc2 = cpuset_get_placement(0); 3714 } while (!cpuset_equal_placement(plc1, plc2)); 3715 3716 cpuset_free_placement(plc1); 3717 cpuset_free_placement(plc2); 3718 return r; 3719 } 3720 3721 /* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */ 3722 int cpuset_unpin(void) 3723 { 3724 struct bitmask *cpus = NULL, *mems = NULL; 3725 int r = -1; 3726 3727 if (check() < 0) 3728 goto err; 3729 3730 /* 3731 * Don't need cpuset_*_placement() guard against concurrent 3732 * cpuset migration, because none of the following depends 3733 * on the tasks cpuset placement. 
/* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */
int cpuset_unpin(void)
{
        struct bitmask *cpus = NULL, *mems = NULL;
        int r = -1;

        if (check() < 0)
                goto err;

        /*
         * We don't need the cpuset_*_placement() guard against
         * concurrent cpuset migration here, because none of the
         * following depends on the task's cpuset placement.
         */

        if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
                goto err;
        bitmask_setall(cpus);
        if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0)
                goto err;

        if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL)
                goto err;
        if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems),
                          bitmask_nbits(mems) + 1) < 0)
                goto err;
        r = 0;
        /* fall into ... */
err:
        bitmask_free(cpus);
        bitmask_free(mems);
        return r;
}

struct cpuset_function_list {
        const char *fname;
        void *func;
} flist[] = {
        { "cpuset_version", cpuset_version },
        { "cpuset_alloc", cpuset_alloc },
        { "cpuset_free", cpuset_free },
        { "cpuset_cpus_nbits", cpuset_cpus_nbits },
        { "cpuset_mems_nbits", cpuset_mems_nbits },
        { "cpuset_setcpus", cpuset_setcpus },
        { "cpuset_setmems", cpuset_setmems },
        { "cpuset_set_iopt", cpuset_set_iopt },
        { "cpuset_set_sopt", cpuset_set_sopt },
        { "cpuset_getcpus", cpuset_getcpus },
        { "cpuset_getmems", cpuset_getmems },
        { "cpuset_cpus_weight", cpuset_cpus_weight },
        { "cpuset_mems_weight", cpuset_mems_weight },
        { "cpuset_get_iopt", cpuset_get_iopt },
        { "cpuset_get_sopt", cpuset_get_sopt },
        { "cpuset_localcpus", cpuset_localcpus },
        { "cpuset_localmems", cpuset_localmems },
        { "cpuset_cpumemdist", cpuset_cpumemdist },
        { "cpuset_cpu2node", cpuset_cpu2node },
        { "cpuset_addr2node", cpuset_addr2node },
        { "cpuset_create", cpuset_create },
        { "cpuset_delete", cpuset_delete },
        { "cpuset_query", cpuset_query },
        { "cpuset_modify", cpuset_modify },
        { "cpuset_getcpusetpath", cpuset_getcpusetpath },
        { "cpuset_cpusetofpid", cpuset_cpusetofpid },
        { "cpuset_mountpoint", cpuset_mountpoint },
        { "cpuset_collides_exclusive", cpuset_collides_exclusive },
        { "cpuset_nuke", cpuset_nuke },
        { "cpuset_init_pidlist", cpuset_init_pidlist },
        { "cpuset_pidlist_length", cpuset_pidlist_length },
        { "cpuset_get_pidlist", cpuset_get_pidlist },
        { "cpuset_freepidlist", cpuset_freepidlist },
        { "cpuset_move", cpuset_move },
        { "cpuset_move_all", cpuset_move_all },
        { "cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks },
        { "cpuset_migrate", cpuset_migrate },
        { "cpuset_migrate_all", cpuset_migrate_all },
        { "cpuset_reattach", cpuset_reattach },
        { "cpuset_open_memory_pressure", cpuset_open_memory_pressure },
        { "cpuset_read_memory_pressure", cpuset_read_memory_pressure },
        { "cpuset_close_memory_pressure", cpuset_close_memory_pressure },
        { "cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu },
        { "cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu },
        { "cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem },
        { "cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem },
        { "cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu },
        { "cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu },
        { "cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem },
        { "cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem },
        { "cpuset_get_placement", cpuset_get_placement },
        { "cpuset_equal_placement", cpuset_equal_placement },
        { "cpuset_free_placement", cpuset_free_placement },
        { "cpuset_fts_open", cpuset_fts_open },
        { "cpuset_fts_read", cpuset_fts_read },
        { "cpuset_fts_reverse", cpuset_fts_reverse },
        { "cpuset_fts_rewind", cpuset_fts_rewind },
        { "cpuset_fts_get_path", cpuset_fts_get_path },
        { "cpuset_fts_get_stat", cpuset_fts_get_stat },
        { "cpuset_fts_get_cpuset", cpuset_fts_get_cpuset },
        { "cpuset_fts_get_errno", cpuset_fts_get_errno },
        { "cpuset_fts_get_info", cpuset_fts_get_info },
        { "cpuset_fts_close", cpuset_fts_close },
        { "cpuset_cpubind", cpuset_cpubind },
        { "cpuset_latestcpu", cpuset_latestcpu },
        { "cpuset_membind", cpuset_membind },
        { "cpuset_export", cpuset_export },
        { "cpuset_import", cpuset_import },
        { "cpuset_function", cpuset_function },
        { "cpuset_pin", cpuset_pin },
        { "cpuset_size", cpuset_size },
        { "cpuset_where", cpuset_where },
        { "cpuset_unpin", cpuset_unpin },
};

/* Return pointer to a libcpuset.so function, or NULL */
void *cpuset_function(const char *function_name)
{
        unsigned int i;

        for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++)
                if (streq(function_name, flist[i].fname))
                        return flist[i].func;
        return NULL;
}
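/*
 * Example (illustrative): look up an [optional] function at run time,
 * so the caller degrades gracefully if the library lacks it.
 *
 *      int (*nuke)(const char *, unsigned int);
 *
 *      nuke = (int (*)(const char *, unsigned int))
 *              cpuset_function("cpuset_nuke");
 *      if (nuke)
 *              (*nuke)("/mypool", 10);
 */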
3820 "cpuset_fts_get_cpuset", cpuset_fts_get_cpuset}, { 3821 "cpuset_fts_get_errno", cpuset_fts_get_errno}, { 3822 "cpuset_fts_get_info", cpuset_fts_get_info}, { 3823 "cpuset_fts_close", cpuset_fts_close}, { 3824 "cpuset_cpubind", cpuset_cpubind}, { 3825 "cpuset_latestcpu", cpuset_latestcpu}, { 3826 "cpuset_membind", cpuset_membind}, { 3827 "cpuset_export", cpuset_export}, { 3828 "cpuset_import", cpuset_import}, { 3829 "cpuset_function", cpuset_function}, { 3830 "cpuset_pin", cpuset_pin}, { 3831 "cpuset_size", cpuset_size}, { 3832 "cpuset_where", cpuset_where}, { 3833 "cpuset_unpin", cpuset_unpin},}; 3834 3835 /* Return pointer to a libcpuset.so function, or NULL */ 3836 void *cpuset_function(const char *function_name) 3837 { 3838 unsigned int i; 3839 3840 for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++) 3841 if (streq(function_name, flist[i].fname)) 3842 return flist[i].func; 3843 return NULL; 3844 } 3845 3846 /* Fortran interface to basic cpuset routines */ 3847 int cpuset_pin_(int *ptr_relcpu) 3848 { 3849 return cpuset_pin(*ptr_relcpu); 3850 } 3851 3852 int cpuset_size_(void) 3853 { 3854 return cpuset_size(); 3855 } 3856 3857 int cpuset_where_(void) 3858 { 3859 return cpuset_where(); 3860 } 3861 3862 int cpuset_unpin_(void) 3863 { 3864 return cpuset_unpin(); 3865 } 3866 3867 #endif /* HAVE_LINUX_MEMPOLICY_H */ 3868