1 /* 2 * cpuset user library implementation. 3 * 4 * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved. 5 * 6 * Paul Jackson <pj (at) sgi.com> 7 */ 8 9 /* 10 * This program is free software; you can redistribute it and/or modify 11 * it under the terms of the GNU Lesser General Public License as published by 12 * the Free Software Foundation; either version 2.1 of the License, or 13 * (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License for more details. 19 * 20 * You should have received a copy of the GNU Lesser General Public License 21 * along with this program; if not, write to the Free Software 22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 */ 24 25 #define _XOPEN_SOURCE 500 /* need to see pread() */ 26 #define _BSD_SOURCE 1 /* need to see syscall() */ 27 #include <unistd.h> 28 29 #include <ctype.h> 30 #include <dirent.h> 31 #include <errno.h> 32 #include <fcntl.h> 33 #include <fts.h> 34 #include <limits.h> 35 #include <signal.h> 36 #include <stdint.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <string.h> 40 #include <sys/stat.h> 41 #include <sys/syscall.h> 42 #include <sys/types.h> 43 #include <time.h> 44 #include <utime.h> 45 #include <sys/utsname.h> /* for cpuset_would_crash_kernel() */ 46 47 #include "bitmask.h" 48 #include "cpuset.h" 49 #include "common.h" 50 #include "test.h" 51 #include "linux_syscall_numbers.h" 52 #include "config.h" 53 #if HAVE_LINUX_MEMPOLICY_H 54 #include <linux/mempolicy.h> 55 56 /* Bump version, and update Change History, when libcpuset API changes */ 57 #define CPUSET_VERSION 3 58 59 /* 60 * For a history of what changed in each version, see the "Change 61 * History" section, at the end of the libcpuset master document. 
62 */ 63 64 int cpuset_version(void) 65 { 66 return CPUSET_VERSION; 67 } 68 69 struct cpuset { 70 struct bitmask *cpus; 71 struct bitmask *mems; 72 char cpu_exclusive; 73 char mem_exclusive; 74 char mem_hardwall; 75 char notify_on_release; 76 char memory_migrate; 77 char memory_pressure_enabled; 78 char memory_spread_page; 79 char memory_spread_slab; 80 char sched_load_balance; 81 int sched_relax_domain_level; 82 83 /* 84 * Each field 'x' above gets an 'x_valid' field below. 85 * The apply_cpuset_settings() will only set those fields whose 86 * corresponding *_valid flags are set. The cpuset_alloc() 87 * routine clears these flags as part of the clear in calloc(), 88 * and the various cpuset_set*() routines set these flags when 89 * setting the corresponding value. 90 * 91 * The purpose of these valid fields is to ensure that when 92 * we create a new cpuset, we don't accidentally overwrite 93 * some non-zero kernel default, such as an inherited 94 * memory_spread_* flag, just because the user application 95 * code didn't override the default zero settings resulting 96 * from the calloc() call in cpuset_alloc(). 97 * 98 * The choice of 'char' for the type of the flags above, 99 * but a bitfield for the flags below, is somewhat capricious. 
100 */ 101 unsigned cpus_valid:1; 102 unsigned mems_valid:1; 103 unsigned cpu_exclusive_valid:1; 104 unsigned mem_exclusive_valid:1; 105 unsigned mem_hardwall_valid:1; 106 unsigned notify_on_release_valid:1; 107 unsigned memory_migrate_valid:1; 108 unsigned memory_pressure_enabled_valid:1; 109 unsigned memory_spread_page_valid:1; 110 unsigned memory_spread_slab_valid:1; 111 unsigned sched_load_balance_valid:1; 112 unsigned sched_relax_domain_level_valid:1; 113 114 /* 115 * if the relative variable was modified, use following flags 116 * to put a mark 117 */ 118 unsigned cpus_dirty:1; 119 unsigned mems_dirty:1; 120 unsigned cpu_exclusive_dirty:1; 121 unsigned mem_exclusive_dirty:1; 122 unsigned mem_hardwall_dirty:1; 123 unsigned notify_on_release_dirty:1; 124 unsigned memory_migrate_dirty:1; 125 unsigned memory_pressure_enabled_dirty:1; 126 unsigned memory_spread_page_dirty:1; 127 unsigned memory_spread_slab_dirty:1; 128 unsigned sched_load_balance_dirty:1; 129 unsigned sched_relax_domain_level_dirty:1; 130 }; 131 132 /* Presumed cpuset file system mount point */ 133 static const char *cpusetmnt = "/dev/cpuset"; 134 135 /* Stashed copy of cpunodemap[], mapping each cpu to its node. */ 136 static const char *mapfile = "/var/run/cpunodemap"; 137 138 /* The primary source for the cpunodemap[] is available below here. */ 139 static const char *sysdevices = "/sys/devices/system"; 140 141 #define max(a,b) ((a) > (b) ? (a) : (b)) 142 #define min(a,b) ((a) < (b) ? (a) : (b)) 143 144 /* small buffer size - for reading boolean flags or map file (1 or 2 ints) */ 145 #define SMALL_BUFSZ 16 146 147 /* 148 * The 'mask_size_file' is used to ferrit out the kernel cpumask_t 149 * and nodemask_t sizes. The lines in this file that begin with the 150 * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask 151 * and nodemask string, respectively. 
The lengths of these strings 152 * reflect the kernel's internal cpumask_t and nodemask_t sizes, 153 * which sizes are needed to correctly call the sched_setaffinity 154 * and set_mempolicy system calls, and to size user level 155 * bitmasks to match the kernels. 156 */ 157 158 static const char *mask_size_file = "/proc/self/status"; 159 static const char *cpumask_prefix = "Cpus_allowed:\t"; 160 static const char *nodemask_prefix = "Mems_allowed:\t"; 161 162 /* 163 * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits. 164 * 165 * The first time we need these, we parse the Cpus_allowed and 166 * Mems_allowed lines from mask_size_file ("/proc/self/status"). 167 */ 168 169 static int cpumask_sz; 170 static int nodemask_sz; 171 172 /* 173 * These defaults only kick in if we fail to size the kernel 174 * cpumask and nodemask by reading the Cpus_allowed and 175 * Mems_allowed fields from the /proc/self/status file. 176 */ 177 178 #define DEFCPUBITS (512) 179 #define DEFNODEBITS (DEFCPUBITS/2) 180 181 /* 182 * Arch-neutral API for obtaining NUMA distances between CPUs 183 * and Memory Nodes, via the files: 184 * /sys/devices/system/node/nodeN/distance 185 * which have lines such as: 186 * 46 66 10 20 187 * which say that for cpu on node N (from the path above), the 188 * distance to nodes 0, 1, 2, and 3 are 44, 66, 10, and 20, 189 * respectively. 190 */ 191 192 static const char *distance_directory = "/sys/devices/system/node"; 193 194 /* 195 * Someday, we should disable, then later discard, the SN code 196 * marked ALTERNATE_SN_DISTMAP. 
197 */ 198 199 #define ALTERNATE_SN_DISTMAP 1 200 #ifdef ALTERNATE_SN_DISTMAP 201 202 /* 203 * Alternative SN (SGI ia64) architecture specific API for obtaining 204 * NUMA distances between CPUs and Memory Nodes is via the file 205 * /proc/sgi_sn/sn_topology, which has lines such as: 206 * 207 * node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20 208 * 209 * which says that for each CPU on node 2, the distance to nodes 210 * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively. 211 * 212 * This file has other lines as well, which start with other 213 * keywords than "node". Ignore these other lines. 214 */ 215 216 static const char *sn_topology = "/proc/sgi_sn/sn_topology"; 217 static const char *sn_top_node_prefix = "node "; 218 219 #endif 220 221 /* 222 * Check that cpusets supported, /dev/cpuset mounted. 223 * If ok, return 0. 224 * If not, return -1 and set errno: 225 * ENOSYS - kernel doesn't support cpusets 226 * ENODEV - /dev/cpuset not mounted 227 */ 228 229 static enum { 230 check_notdone, 231 check_enosys, 232 check_enodev, 233 check_ok 234 } check_state = check_notdone; 235 236 static int check() 237 { 238 if (check_state == check_notdone) { 239 struct stat statbuf; 240 241 if (stat("/proc/self/cpuset", &statbuf) < 0) { 242 check_state = check_enosys; 243 goto done; 244 } 245 246 if (stat("/dev/cpuset/tasks", &statbuf) < 0) { 247 check_state = check_enodev; 248 goto done; 249 } 250 251 check_state = check_ok; 252 } 253 done: 254 switch (check_state) { 255 case check_enosys: 256 errno = ENOSYS; 257 return -1; 258 case check_enodev: 259 errno = ENODEV; 260 return -1; 261 default: 262 break; 263 } 264 return 0; 265 } 266 267 static void chomp(char *s) 268 { 269 char *t; 270 271 for (t = s + strlen(s) - 1; t >= s; t--) { 272 if (*t == '\n' || *t == '\r') 273 *t = '\0'; 274 else 275 break; 276 } 277 } 278 279 /* 280 * Determine number of bytes in a seekable open file, without 281 * assuming that stat(2) on that file has a useful size. 
282 * Has side affect of leaving the file rewound to the beginnning. 283 */ 284 static int filesize(FILE * fp) 285 { 286 int sz = 0; 287 rewind(fp); 288 while (fgetc(fp) != EOF) 289 sz++; 290 rewind(fp); 291 return sz; 292 } 293 294 /* Are strings s1 and s2 equal? */ 295 static int streq(const char *s1, const char *s2) 296 { 297 return strcmp(s1, s2) == 0; 298 } 299 300 /* Is string 'pre' a prefix of string 's'? */ 301 static int strprefix(const char *s, const char *pre) 302 { 303 return strncmp(s, pre, strlen(pre)) == 0; 304 } 305 306 /* 307 * char *flgets(char *buf, int buflen, FILE *fp) 308 * 309 * Obtain one line from input file fp. Copy up to first 310 * buflen-1 chars of line into buffer buf, discarding any remainder 311 * of line. Stop reading at newline, discarding newline. 312 * Nul terminate result and return pointer to buffer buf 313 * on success, or NULL if nothing more to read or failure. 314 */ 315 316 static char *flgets(char *buf, int buflen, FILE * fp) 317 { 318 int c = -1; 319 char *bp; 320 321 bp = buf; 322 while ((--buflen > 0) && ((c = getc(fp)) >= 0)) { 323 if (c == '\n') 324 goto newline; 325 *bp++ = c; 326 } 327 if ((c < 0) && (bp == buf)) 328 return NULL; 329 330 if (c > 0) { 331 while ((c = getc(fp)) >= 0) { 332 if (c == '\n') 333 break; 334 } 335 } 336 337 newline: 338 *bp++ = '\0'; 339 return buf; 340 } 341 342 /* 343 * sgetc(const char *inputbuf, int *offsetptr) 344 * 345 * Return next char from nul-terminated input buffer inputbuf, 346 * starting at offset *offsetptr. Increment *offsetptr. 347 * If next char would be nul ('\0'), return EOF and don't 348 * increment *offsetptr. 
349 */ 350 351 static int sgetc(const char *inputbuf, int *offsetptr) 352 { 353 char c; 354 355 if ((c = inputbuf[*offsetptr]) != 0) { 356 *offsetptr = *offsetptr + 1; 357 return c; 358 } else { 359 return EOF; 360 } 361 } 362 363 /* 364 * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr) 365 * 366 * Obtain next line from nul-terminated input buffer 'inputbuf', 367 * starting at offset *offsetptr. Copy up to first buflen-1 368 * chars of line into output buffer buf, discarding any remainder 369 * of line. Stop reading at newline, discarding newline. 370 * Nul terminate result and return pointer to output buffer 371 * buf on success, or NULL if nothing more to read. 372 */ 373 374 static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr) 375 { 376 int c = -1; 377 char *bp; 378 379 bp = buf; 380 while ((--buflen > 0) && ((c = sgetc(inputbuf, offsetptr)) >= 0)) { 381 if (c == '\n') 382 goto newline; 383 *bp++ = c; 384 } 385 if ((c < 0) && (bp == buf)) 386 return NULL; 387 388 if (c > 0) { 389 while ((c = sgetc(inputbuf, offsetptr)) >= 0) { 390 if (c == '\n') 391 break; 392 } 393 } 394 395 newline: 396 *bp++ = '\0'; 397 return buf; 398 } 399 400 /* 401 * time_t get_mtime(char *path) 402 * 403 * Return modtime of file at location path, else return 0. 404 */ 405 406 static time_t get_mtime(const char *path) 407 { 408 struct stat statbuf; 409 410 if (stat(path, &statbuf) != 0) 411 return 0; 412 return statbuf.st_mtime; 413 } 414 415 /* 416 * int set_mtime(const char *path, time_t mtime) 417 * 418 * Set modtime of file 'path' to 'mtime'. Return 0 on success, 419 * or -1 on error, setting errno. 420 */ 421 422 static int set_mtime(const char *path, time_t mtime) 423 { 424 struct utimbuf times; 425 426 times.actime = mtime; 427 times.modtime = mtime; 428 return utime(path, ×); 429 } 430 431 /* 432 * True if two pathnames resolve to same file. 
433 * False if either path can not be stat'd, 434 * or if the two paths resolve to a different file. 435 */ 436 437 static int samefile(const char *path1, const char *path2) 438 { 439 struct stat sb1, sb2; 440 441 if (stat(path1, &sb1) != 0) 442 return 0; 443 if (stat(path2, &sb2) != 0) 444 return 0; 445 return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev; 446 } 447 448 #define slash(c) (*(c) == '/') 449 #define eocomp(c) (slash(c) || !*(c)) 450 #define dot1(c) (*(c) == '.' && eocomp(c+1)) 451 452 /* In place path compression. Remove extra dots and slashes. */ 453 static char *pathcomp(char *p) 454 { 455 char *a = p; 456 char *b = p; 457 458 if (!p || !*p) 459 return p; 460 if (slash(p)) 461 *b++ = *a++; 462 for (;;) { 463 if (slash(a)) 464 while (slash(++a)) 465 continue; 466 if (!*a) { 467 if (b == p) 468 *b++ = '.'; 469 *b = '\0'; 470 return (p); 471 } else if (dot1(a)) { 472 a++; 473 } else { 474 if ((b != p) && !slash(b - 1)) 475 *b++ = '/'; 476 while (!eocomp(a)) 477 *b++ = *a++; 478 } 479 } 480 } 481 482 #undef slash 483 #undef eocomp 484 #undef dot1 485 486 /* 487 * pathcat2(buf, buflen, name1, name2) 488 * 489 * Return buf, of length buflen, with name1/name2 stored in it. 490 */ 491 492 static char *pathcat2(char *buf, int buflen, const char *name1, 493 const char *name2) 494 { 495 (void)snprintf(buf, buflen, "%s/%s", name1, name2); 496 return pathcomp(buf); 497 } 498 499 /* 500 * pathcat3(buf, buflen, name1, name2, name3) 501 * 502 * Return buf, of length buflen, with name1/name2/name3 stored in it. 503 */ 504 505 static char *pathcat3(char *buf, int buflen, const char *name1, 506 const char *name2, const char *name3) 507 { 508 (void)snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3); 509 return pathcomp(buf); 510 } 511 512 /* 513 * fullpath(buf, buflen, name) 514 * 515 * Put full path of cpuset 'name' in buffer 'buf'. 
If name 516 * starts with a slash (``/``) character, then this a path 517 * relative to ``/dev/cpuset``, otherwise it is relative to 518 * the current tasks cpuset. Return 0 on success, else 519 * -1 on error, setting errno. 520 */ 521 522 static int fullpath(char *buf, int buflen, const char *name) 523 { 524 int len; 525 526 /* easy case */ 527 if (*name == '/') { 528 pathcat2(buf, buflen, cpusetmnt, name); 529 pathcomp(buf); 530 return 0; 531 } 532 533 /* hard case */ 534 snprintf(buf, buflen, "%s/", cpusetmnt); 535 len = strlen(buf); 536 if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL) 537 return -1; 538 if (strlen(buf) >= buflen - 1 - strlen(name)) { 539 errno = E2BIG; 540 return -1; 541 } 542 strcat(buf, "/"); 543 strcat(buf, name); 544 pathcomp(buf); 545 return 0; 546 } 547 548 /* 549 * fullpath2(buf, buflen, name1, name2) 550 * 551 * Like fullpath(), only concatenate two pathname components on end. 552 */ 553 554 static int fullpath2(char *buf, int buflen, const char *name1, 555 const char *name2) 556 { 557 if (fullpath(buf, buflen, name1) < 0) 558 return -1; 559 if (strlen(buf) >= buflen - 1 - strlen(name2)) { 560 errno = E2BIG; 561 return -1; 562 } 563 strcat(buf, "/"); 564 strcat(buf, name2); 565 pathcomp(buf); 566 return 0; 567 } 568 569 /* 570 * Convert the string length of an ascii hex mask to the number 571 * of bits represented by that mask. 572 * 573 * The cpumask and nodemask values in /proc/self/status are in an 574 * ascii format that uses 9 characters for each 32 bits of mask. 575 */ 576 static int s2nbits(const char *s) 577 { 578 return strlen(s) * 32 / 9; 579 } 580 581 static void update_mask_sizes() 582 { 583 FILE *fp = NULL; 584 char *buf = NULL; 585 int fsize; 586 587 if ((fp = fopen(mask_size_file, "r")) == NULL) 588 goto done; 589 fsize = filesize(fp); 590 if ((buf = malloc(fsize)) == NULL) 591 goto done; 592 593 /* 594 * Beware: mask sizing arithmetic is fussy. 595 * The trailing newline left by fgets() is required. 
596 */ 597 while (fgets(buf, fsize, fp)) { 598 if (strprefix(buf, cpumask_prefix)) 599 cpumask_sz = s2nbits(buf + strlen(cpumask_prefix)); 600 if (strprefix(buf, nodemask_prefix)) 601 nodemask_sz = s2nbits(buf + strlen(nodemask_prefix)); 602 } 603 done: 604 free(buf); 605 if (fp != NULL) 606 fclose(fp); 607 if (cpumask_sz == 0) 608 cpumask_sz = DEFCPUBITS; 609 if (nodemask_sz == 0) 610 nodemask_sz = DEFNODEBITS; 611 } 612 613 /* Allocate a new struct cpuset */ 614 struct cpuset *cpuset_alloc() 615 { 616 struct cpuset *cp = NULL; 617 int nbits; 618 619 if ((cp = calloc(1, sizeof(struct cpuset))) == NULL) 620 goto err; 621 622 nbits = cpuset_cpus_nbits(); 623 if ((cp->cpus = bitmask_alloc(nbits)) == NULL) 624 goto err; 625 626 nbits = cpuset_mems_nbits(); 627 if ((cp->mems = bitmask_alloc(nbits)) == NULL) 628 goto err; 629 630 return cp; 631 err: 632 if (cp && cp->cpus) 633 bitmask_free(cp->cpus); 634 if (cp && cp->mems) 635 bitmask_free(cp->mems); 636 free(cp); 637 return NULL; 638 } 639 640 /* Free struct cpuset *cp */ 641 void cpuset_free(struct cpuset *cp) 642 { 643 if (!cp) 644 return; 645 if (cp->cpus) 646 bitmask_free(cp->cpus); 647 if (cp->mems) 648 bitmask_free(cp->mems); 649 free(cp); 650 } 651 652 /* Number of bits in a CPU bitmask on current system */ 653 int cpuset_cpus_nbits() 654 { 655 if (cpumask_sz == 0) 656 update_mask_sizes(); 657 return cpumask_sz; 658 } 659 660 /* Number of bits in a Memory bitmask on current system */ 661 int cpuset_mems_nbits() 662 { 663 if (nodemask_sz == 0) 664 update_mask_sizes(); 665 return nodemask_sz; 666 } 667 668 /* Set CPUs in cpuset cp to bitmask cpus */ 669 int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus) 670 { 671 if (cp->cpus) 672 bitmask_free(cp->cpus); 673 cp->cpus = bitmask_alloc(bitmask_nbits(cpus)); 674 if (cp->cpus == NULL) 675 return -1; 676 bitmask_copy(cp->cpus, cpus); 677 cp->cpus_valid = 1; 678 cp->cpus_dirty = 1; 679 return 0; 680 } 681 682 /* Set Memory Nodes in cpuset cp to bitmask 
mems */ 683 int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems) 684 { 685 if (cp->mems) 686 bitmask_free(cp->mems); 687 cp->mems = bitmask_alloc(bitmask_nbits(mems)); 688 if (cp->mems == NULL) 689 return -1; 690 bitmask_copy(cp->mems, mems); 691 cp->mems_valid = 1; 692 cp->mems_dirty = 1; 693 return 0; 694 } 695 696 /* Set integer value optname of cpuset cp */ 697 int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value) 698 { 699 if (streq(optionname, "cpu_exclusive")) { 700 cp->cpu_exclusive = ! !value; 701 cp->cpu_exclusive_valid = 1; 702 cp->cpu_exclusive_dirty = 1; 703 } else if (streq(optionname, "mem_exclusive")) { 704 cp->mem_exclusive = ! !value; 705 cp->mem_exclusive_valid = 1; 706 cp->mem_exclusive_dirty = 1; 707 } else if (streq(optionname, "mem_hardwall")) { 708 cp->mem_hardwall = ! !value; 709 cp->mem_hardwall_valid = 1; 710 cp->mem_hardwall_dirty = 1; 711 } else if (streq(optionname, "notify_on_release")) { 712 cp->notify_on_release = ! !value; 713 cp->notify_on_release_valid = 1; 714 cp->notify_on_release_dirty = 1; 715 } else if (streq(optionname, "memory_pressure_enabled")) { 716 cp->memory_pressure_enabled = ! !value; 717 cp->memory_pressure_enabled_valid = 1; 718 cp->memory_pressure_enabled_dirty = 1; 719 } else if (streq(optionname, "memory_migrate")) { 720 cp->memory_migrate = ! !value; 721 cp->memory_migrate_valid = 1; 722 cp->memory_migrate_dirty = 1; 723 } else if (streq(optionname, "memory_spread_page")) { 724 cp->memory_spread_page = ! !value; 725 cp->memory_spread_page_valid = 1; 726 cp->memory_spread_page_dirty = 1; 727 } else if (streq(optionname, "memory_spread_slab")) { 728 cp->memory_spread_slab = ! !value; 729 cp->memory_spread_slab_valid = 1; 730 cp->memory_spread_slab_dirty = 1; 731 } else if (streq(optionname, "sched_load_balance")) { 732 cp->sched_load_balance = ! 
!value; 733 cp->sched_load_balance_valid = 1; 734 cp->sched_load_balance_dirty = 1; 735 } else if (streq(optionname, "sched_relax_domain_level")) { 736 cp->sched_relax_domain_level = value; 737 cp->sched_relax_domain_level_valid = 1; 738 cp->sched_relax_domain_level_dirty = 1; 739 } else 740 return -2; /* optionname not recognized */ 741 return 0; 742 } 743 744 /* [optional] Set string value optname */ 745 int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname, 746 UNUSED const char *value) 747 { 748 return -2; /* For now, all string options unrecognized */ 749 } 750 751 /* Return handle for reading memory_pressure. */ 752 int cpuset_open_memory_pressure(const char *cpusetpath) 753 { 754 char buf[PATH_MAX]; 755 756 fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure"); 757 return open(buf, O_RDONLY); 758 } 759 760 /* Return current memory_pressure of cpuset. */ 761 int cpuset_read_memory_pressure(int han) 762 { 763 char buf[SMALL_BUFSZ]; 764 765 if (pread(han, buf, sizeof(buf), 0L) < 0) 766 return -1; 767 return atoi(buf); 768 } 769 770 /* Close handle for reading memory pressure. */ 771 void cpuset_close_memory_pressure(int han) 772 { 773 close(han); 774 } 775 776 /* 777 * Resolve cpuset pointer (to that of current task if cp == NULL). 778 * 779 * If cp not NULL, just return it. If cp is NULL, return pointer 780 * to temporary cpuset for current task, and set *cp_tofree to 781 * pointer to that same temporary cpuset, to be freed later. 782 * 783 * Return NULL and set errno on error. Errors can occur when 784 * resolving the current tasks cpuset. 
785 */ 786 static const struct cpuset *resolve_cp(const struct cpuset *cp, 787 struct cpuset **cp_tofree) 788 { 789 const struct cpuset *rcp; 790 791 if (cp) { 792 rcp = cp; 793 } else { 794 struct cpuset *cp1 = cpuset_alloc(); 795 if (cp1 == NULL) 796 goto err; 797 if (cpuset_cpusetofpid(cp1, 0) < 0) { 798 cpuset_free(cp1); 799 goto err; 800 } 801 *cp_tofree = cp1; 802 rcp = cp1; 803 } 804 return rcp; 805 err: 806 return NULL; 807 } 808 809 /* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */ 810 int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus) 811 { 812 struct cpuset *cp_tofree = NULL; 813 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 814 815 if (!cp1) 816 goto err; 817 if (cp1->cpus == NULL) { 818 errno = EINVAL; 819 goto err; 820 } 821 bitmask_copy(cpus, cp1->cpus); 822 cpuset_free(cp_tofree); 823 return 0; 824 err: 825 cpuset_free(cp_tofree); 826 return -1; 827 } 828 829 /* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */ 830 int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems) 831 { 832 struct cpuset *cp_tofree = NULL; 833 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 834 835 if (!cp1) 836 goto err; 837 if (cp1->mems == NULL) { 838 errno = EINVAL; 839 goto err; 840 } 841 bitmask_copy(mems, cp1->mems); 842 cpuset_free(cp_tofree); 843 return 0; 844 err: 845 cpuset_free(cp_tofree); 846 return -1; 847 } 848 849 /* Return number of CPUs in cpuset cp (current task if cp == NULL) */ 850 int cpuset_cpus_weight(const struct cpuset *cp) 851 { 852 struct cpuset *cp_tofree = NULL; 853 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 854 int w = -1; 855 856 if (!cp1) 857 goto err; 858 if (cp1->cpus == NULL) { 859 errno = EINVAL; 860 goto err; 861 } 862 w = bitmask_weight(cp1->cpus); 863 /* fall into ... 
*/ 864 err: 865 cpuset_free(cp_tofree); 866 return w; 867 } 868 869 /* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */ 870 int cpuset_mems_weight(const struct cpuset *cp) 871 { 872 struct cpuset *cp_tofree = NULL; 873 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 874 int w = -1; 875 876 if (!cp1) 877 goto err; 878 if (cp1->mems == NULL) { 879 errno = EINVAL; 880 goto err; 881 } 882 w = bitmask_weight(cp1->mems); 883 /* fall into ... */ 884 err: 885 cpuset_free(cp_tofree); 886 return w; 887 } 888 889 /* Return integer value of option optname in cp */ 890 int cpuset_get_iopt(const struct cpuset *cp, const char *optionname) 891 { 892 if (streq(optionname, "cpu_exclusive")) 893 return cp->cpu_exclusive; 894 else if (streq(optionname, "mem_exclusive")) 895 return cp->mem_exclusive; 896 else if (streq(optionname, "mem_hardwall")) 897 return cp->mem_hardwall; 898 else if (streq(optionname, "notify_on_release")) 899 return cp->notify_on_release; 900 else if (streq(optionname, "memory_pressure_enabled")) 901 return cp->memory_pressure_enabled; 902 else if (streq(optionname, "memory_migrate")) 903 return cp->memory_migrate; 904 else if (streq(optionname, "memory_spread_page")) 905 return cp->memory_spread_page; 906 else if (streq(optionname, "memory_spread_slab")) 907 return cp->memory_spread_slab; 908 else if (streq(optionname, "sched_load_balance")) 909 return cp->sched_load_balance; 910 else if (streq(optionname, "sched_relax_domain_level")) 911 return cp->sched_relax_domain_level; 912 else 913 return -2; /* optionname not recognized */ 914 } 915 916 /* [optional] Return string value of optname */ 917 const char *cpuset_get_sopt(UNUSED const struct cpuset *cp, 918 UNUSED const char *optionname) 919 { 920 return NULL; /* For now, all string options unrecognized */ 921 } 922 923 static int read_flag(const char *filepath, char *flagp) 924 { 925 char buf[SMALL_BUFSZ]; /* buffer a "0" or "1" flag line */ 926 int fd = -1; 927 928 if ((fd = 
open(filepath, O_RDONLY)) < 0) 929 goto err; 930 if (read(fd, buf, sizeof(buf)) < 1) 931 goto err; 932 if (atoi(buf)) 933 *flagp = 1; 934 else 935 *flagp = 0; 936 close(fd); 937 return 0; 938 err: 939 if (fd >= 0) 940 close(fd); 941 return -1; 942 } 943 944 static int load_flag(const char *path, char *flagp, const char *flag) 945 { 946 char buf[PATH_MAX]; 947 948 pathcat2(buf, sizeof(buf), path, flag); 949 return read_flag(buf, flagp); 950 } 951 952 static int read_number(const char *filepath, int *numberp) 953 { 954 char buf[SMALL_BUFSZ]; 955 int fd = -1; 956 957 if ((fd = open(filepath, O_RDONLY)) < 0) 958 goto err; 959 if (read(fd, buf, sizeof(buf)) < 1) 960 goto err; 961 *numberp = atoi(buf); 962 close(fd); 963 return 0; 964 err: 965 if (fd >= 0) 966 close(fd); 967 return -1; 968 } 969 970 static int load_number(const char *path, int *numberp, const char *file) 971 { 972 char buf[PATH_MAX]; 973 974 pathcat2(buf, sizeof(buf), path, file); 975 return read_number(buf, numberp); 976 } 977 978 static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits) 979 { 980 FILE *fp = NULL; 981 char *buf = NULL; 982 int buflen; 983 struct bitmask *bmp = NULL; 984 985 if ((fp = fopen(filepath, "r")) == NULL) 986 goto err; 987 buflen = filesize(fp) + 1; /* + 1 for nul term */ 988 if ((buf = malloc(buflen)) == NULL) 989 goto err; 990 if (flgets(buf, buflen, fp) == NULL) 991 goto err; 992 fclose(fp); 993 fp = NULL; 994 995 if ((bmp = bitmask_alloc(nbits)) == NULL) 996 goto err; 997 if (*buf && bitmask_parselist(buf, bmp) < 0) 998 goto err; 999 if (*bmpp) 1000 bitmask_free(*bmpp); 1001 *bmpp = bmp; 1002 free(buf); 1003 buf = NULL; 1004 return 0; 1005 err: 1006 if (buf != NULL) 1007 free(buf); 1008 if (fp != NULL) 1009 fclose(fp); 1010 if (bmp != NULL) 1011 bitmask_free(bmp); 1012 return -1; 1013 } 1014 1015 static int load_mask(const char *path, struct bitmask **bmpp, 1016 int nbits, const char *mask) 1017 { 1018 char buf[PATH_MAX]; 1019 1020 pathcat2(buf, 
sizeof(buf), path, mask); 1021 return read_mask(buf, bmpp, nbits); 1022 } 1023 1024 /* Write string to file at given filepath. Create or truncate file. */ 1025 static int write_string_file(const char *filepath, const char *str) 1026 { 1027 int fd = -1; 1028 1029 if ((fd = open(filepath, O_WRONLY | O_CREAT, 0644)) < 0) 1030 goto err; 1031 if (write(fd, str, strlen(str)) < 0) 1032 goto err; 1033 close(fd); 1034 return 0; 1035 err: 1036 if (fd >= 0) 1037 close(fd); 1038 return -1; 1039 } 1040 1041 /* Size and allocate buffer. Write bitmask into it. Caller must free */ 1042 static char *sprint_mask_buf(const struct bitmask *bmp) 1043 { 1044 char *buf = NULL; 1045 int buflen; 1046 char c; 1047 1048 /* First bitmask_displaylist() call just to get the length */ 1049 buflen = bitmask_displaylist(&c, 1, bmp) + 1; /* "+ 1" for nul */ 1050 if ((buf = malloc(buflen)) == NULL) 1051 return NULL; 1052 bitmask_displaylist(buf, buflen, bmp); 1053 return buf; 1054 } 1055 1056 static int exists_flag(const char *path, const char *flag) 1057 { 1058 char buf[PATH_MAX]; 1059 struct stat statbuf; 1060 int rc; 1061 1062 pathcat2(buf, sizeof(buf), path, flag); 1063 rc = (stat(buf, &statbuf) == 0); 1064 errno = 0; 1065 return rc; 1066 } 1067 1068 static int store_flag(const char *path, const char *flag, int val) 1069 { 1070 char buf[PATH_MAX]; 1071 1072 pathcat2(buf, sizeof(buf), path, flag); 1073 return write_string_file(buf, val ? 
"1" : "0"); 1074 } 1075 1076 static int store_number(const char *path, const char *file, int val) 1077 { 1078 char buf[PATH_MAX]; 1079 char data[SMALL_BUFSZ]; 1080 1081 memset(data, 0, sizeof(data)); 1082 pathcat2(buf, sizeof(buf), path, file); 1083 snprintf(data, sizeof(data), "%d", val); 1084 return write_string_file(buf, data); 1085 } 1086 1087 static int store_mask(const char *path, const char *mask, 1088 const struct bitmask *bmp) 1089 { 1090 char maskpath[PATH_MAX]; 1091 char *bp = NULL; 1092 int rc; 1093 1094 if (bmp == NULL) 1095 return 0; 1096 pathcat2(maskpath, sizeof(maskpath), path, mask); 1097 if ((bp = sprint_mask_buf(bmp)) == NULL) 1098 return -1; 1099 rc = write_string_file(maskpath, bp); 1100 free(bp); 1101 return rc; 1102 } 1103 1104 /* 1105 * Return 1 if 'cpu' is online, else 0 if offline. Tests the file 1106 * /sys/devices/system/cpu/cpuN/online file for 0 or 1 contents 1107 * were N == cpu number. 1108 */ 1109 1110 char cpu_online(unsigned int cpu) 1111 { 1112 char online; 1113 char cpupath[PATH_MAX]; 1114 1115 (void)snprintf(cpupath, sizeof(cpupath), 1116 "/sys/devices/system/cpu/cpu%d/online", cpu); 1117 if (read_flag(cpupath, &online) < 0) 1118 return 0; /* oops - guess that cpu's not there */ 1119 return online; 1120 } 1121 1122 /* 1123 * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()), 1124 * to the node on which that cpu resides or cpuset_mems_nbits(). 1125 * 1126 * To avoid every user having to recalculate this relation 1127 * from various clues in the sysfs file system (below the 1128 * path /sys/devices/system) a copy of this map is kept at 1129 * /var/run/cpunodemap. 1130 * 1131 * The system automatically cleans out files below 1132 * /var/run on each system reboot (see the init script 1133 * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry 1134 * about stale data in this file across reboots. 
* If the file is missing, let the first process that needs it, and has
 * permission to write in the /var/run directory, rebuild it.
 *
 * If using this cached data, remember the mtime of the mapfile
 * the last time we read it in case something like a hotplug
 * event results in the file being removed and rebuilt, so we
 * can detect if we're using a stale cache, and need to reload.
 *
 * The mtime of this file is set to the time when we did
 * the recalculation of the map, from the clues beneath
 * /sys/devices/system.  This is done so that a program
 * won't see the mapfile it just wrote as being newer than what
 * it just wrote out (store_map) and read the same map back in
 * (load_map).
 */

/*
 * Hold flockfile(stdin) while using cpunodemap for posix thread safety.
 *
 * Note on locking and flockfile(FILE *):
 *
 *  We use flockfile() and funlockfile() instead of directly
 *  calling pthread_mutex_lock and pthread_mutex_unlock on
 *  a pthread_mutex_t, because this avoids forcing the app
 *  to link with libpthread.  The glibc implementation of
 *  flockfile/funlockfile will fall back to no-ops if libpthread
 *  doesn't happen to be linked.
 *
 *  Since flockfile already has the moderately convoluted
 *  combination of weak and strong symbols required to accomplish
 *  this, it is easier to use flockfile() on some handy FILE *
 *  stream as a surrogate for pthread locking than it is to
 *  re-invent that wheel.
 *
 *  Forcing all apps that use cpusets to link with libpthread
 *  would force non-transparent initialization on apps that
 *  might not be prepared to handle it.
 *
 *  The application using libcpuset should never notice this
 *  odd use of flockfile(), because we never return to the
 *  application from any libcpuset call with any such lock held.
1176 * We just use this locking for guarding some non-atomic cached 1177 * data updates and accesses, internal to some libcpuset calls. 1178 * Also, flockfile() allows recursive nesting, so if the app 1179 * calls libcpuset holding such a file lock, we won't deadlock 1180 * if we go to acquire the same lock. We'll just get the lock 1181 * and increment its counter while we hold it. 1182 */ 1183 1184 static struct cpunodemap { 1185 int *map; /* map[cpumask_sz]: maps cpu to its node */ 1186 time_t mtime; /* modtime of mapfile when last read */ 1187 } cpunodemap; 1188 1189 /* 1190 * rebuild_map() - Rebuild cpunodemap[] from scratch. 1191 * 1192 * Situation: 1193 * Neither our in-memory cpunodemap[] array nor the 1194 * cache of it in mapfile is current. 1195 * Action: 1196 * Rebuild it from first principles and the information 1197 * available below /sys/devices/system. 1198 */ 1199 1200 static void rebuild_map() 1201 { 1202 char buf[PATH_MAX]; 1203 DIR *dir1, *dir2; 1204 struct dirent *dent1, *dent2; 1205 int ncpus = cpuset_cpus_nbits(); 1206 int nmems = cpuset_mems_nbits(); 1207 unsigned int cpu, mem; 1208 1209 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) 1210 cpunodemap.map[cpu] = -1; 1211 pathcat2(buf, sizeof(buf), sysdevices, "node"); 1212 if ((dir1 = opendir(buf)) == NULL) 1213 return; 1214 while ((dent1 = readdir(dir1)) != NULL) { 1215 if (sscanf(dent1->d_name, "node%u", &mem) < 1) 1216 continue; 1217 pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name); 1218 if ((dir2 = opendir(buf)) == NULL) 1219 continue; 1220 while ((dent2 = readdir(dir2)) != NULL) { 1221 if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1) 1222 continue; 1223 if (cpu >= (unsigned int)ncpus 1224 || mem >= (unsigned int)nmems) 1225 continue; 1226 cpunodemap.map[cpu] = mem; 1227 } 1228 closedir(dir2); 1229 } 1230 closedir(dir1); 1231 cpunodemap.mtime = time(0); 1232 } 1233 1234 /* 1235 * load_map() - Load cpunodemap[] from mapfile. 
1236 * 1237 * Situation: 1238 * The cpunodemap in mapfile is more recent than 1239 * what we have in the cpunodemap[] array. 1240 * Action: 1241 * Reload the cpunodemap[] array from the file. 1242 */ 1243 1244 static void load_map() 1245 { 1246 char buf[SMALL_BUFSZ]; /* buffer 1 line of mapfile */ 1247 FILE *mapfp; /* File stream on mapfile */ 1248 int ncpus = cpuset_cpus_nbits(); 1249 int nmems = cpuset_mems_nbits(); 1250 unsigned int cpu, mem; 1251 1252 if ((cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL) 1253 return; 1254 cpunodemap.mtime = get_mtime(mapfile); 1255 if ((mapfp = fopen(mapfile, "r")) == NULL) 1256 return; 1257 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) 1258 cpunodemap.map[cpu] = nmems; 1259 while (flgets(buf, sizeof(buf), mapfp) != NULL) { 1260 if (sscanf(buf, "%u %u", &cpu, &mem) < 2) 1261 continue; 1262 if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems) 1263 continue; 1264 cpunodemap.map[cpu] = mem; 1265 } 1266 fclose(mapfp); 1267 } 1268 1269 /* 1270 * store_map() - Write cpunodemap[] out to mapfile. 1271 * 1272 * Situation: 1273 * The cpunodemap in the cpunodemap[] array is 1274 * more recent than the one in mapfile. 1275 * Action: 1276 * Write cpunodemap[] out to mapfile. 
1277 */ 1278 1279 static void store_map() 1280 { 1281 char buf[PATH_MAX]; 1282 int fd = -1; 1283 FILE *mapfp = NULL; 1284 int ncpus = cpuset_cpus_nbits(); 1285 int nmems = cpuset_mems_nbits(); 1286 unsigned int cpu, mem; 1287 1288 snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX"); 1289 if ((fd = mkstemp(buf)) < 0) 1290 goto err; 1291 if ((mapfp = fdopen(fd, "w")) == NULL) 1292 goto err; 1293 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { 1294 mem = cpunodemap.map[cpu]; 1295 if (mem < (unsigned int)nmems) 1296 fprintf(mapfp, "%u %u\n", cpu, mem); 1297 } 1298 fclose(mapfp); 1299 set_mtime(buf, cpunodemap.mtime); 1300 if (rename(buf, mapfile) < 0) 1301 goto err; 1302 /* mkstemp() creates mode 0600 - change to world readable */ 1303 (void)chmod(mapfile, 0444); 1304 return; 1305 err: 1306 if (mapfp != NULL) { 1307 fclose(mapfp); 1308 fd = -1; 1309 } 1310 if (fd >= 0) 1311 close(fd); 1312 (void)unlink(buf); 1313 } 1314 1315 /* 1316 * Load and gain thread safe access to the <cpu, node> map. 1317 * 1318 * Return 0 on success with flockfile(stdin) held. 1319 * Each successful get_map() call must be matched with a 1320 * following put_map() call to release the lock. 1321 * 1322 * On error, return -1 with errno set and no lock held. 
1323 */ 1324 1325 static int get_map() 1326 { 1327 time_t file_mtime; 1328 1329 flockfile(stdin); 1330 1331 if (cpunodemap.map == NULL) { 1332 cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int)); 1333 if (cpunodemap.map == NULL) 1334 goto err; 1335 } 1336 1337 /* If no one has a good cpunodemap, rebuild from scratch */ 1338 file_mtime = get_mtime(mapfile); 1339 if (cpunodemap.mtime == 0 && file_mtime == 0) 1340 rebuild_map(); 1341 1342 /* If either cpunodemap[] or mapfile newer, update other with it */ 1343 file_mtime = get_mtime(mapfile); 1344 if (cpunodemap.mtime < file_mtime) 1345 load_map(); 1346 else if (cpunodemap.mtime > file_mtime) 1347 store_map(); 1348 return 0; 1349 err: 1350 funlockfile(stdin); 1351 return -1; 1352 } 1353 1354 static void put_map() 1355 { 1356 funlockfile(stdin); 1357 } 1358 1359 /* Set cpus to those local to Memory Nodes mems */ 1360 int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus) 1361 { 1362 int ncpus = cpuset_cpus_nbits(); 1363 unsigned int cpu; 1364 1365 if (check() < 0) 1366 return -1; 1367 1368 get_map(); 1369 bitmask_clearall(cpus); 1370 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { 1371 if (bitmask_isbitset(mems, cpunodemap.map[cpu])) 1372 bitmask_setbit(cpus, cpu); 1373 } 1374 put_map(); 1375 return 0; 1376 } 1377 1378 /* Set mems to those local to CPUs cpus */ 1379 int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems) 1380 { 1381 int ncpus = cpuset_cpus_nbits(); 1382 unsigned int cpu; 1383 1384 if (check() < 0) 1385 return -1; 1386 1387 get_map(); 1388 bitmask_clearall(mems); 1389 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { 1390 if (bitmask_isbitset(cpus, cpu)) 1391 bitmask_setbit(mems, cpunodemap.map[cpu]); 1392 } 1393 put_map(); 1394 return 0; 1395 } 1396 1397 /* 1398 * distmap[] 1399 * 1400 * Array of ints of size cpumask_sz by nodemask_sz. 1401 * 1402 * Element distmap[cpu][mem] is the distance between CPU cpu 1403 * and Memory Node mem. 
Distances are weighted to roughly 1404 * approximate the cost of memory references, and scaled so that 1405 * the distance from a CPU to its local Memory Node is ten (10). 1406 * 1407 * The first call to cpuset_cpumemdist() builds this map, from 1408 * whatever means the kernel provides to obtain these distances. 1409 * 1410 * These distances derive from ACPI SLIT table entries, which are 1411 * eight bits in size. 1412 * 1413 * Hold flockfile(stdout) while using distmap for posix thread safety. 1414 */ 1415 1416 typedef unsigned char distmap_entry_t; /* type of distmap[] entries */ 1417 1418 static distmap_entry_t *distmap; /* maps <cpu, mem> to distance */ 1419 1420 #define DISTMAP_MAX UCHAR_MAX /* maximum value in distmap[] */ 1421 1422 #define I(i,j) ((i) * nmems + (j)) /* 2-D array index simulation */ 1423 1424 /* 1425 * Parse arch neutral lines from 'distance' files of form: 1426 * 1427 * 46 66 10 20 1428 * 1429 * The lines contain a space separated list of distances, which is parsed 1430 * into array dists[] of each nodes distance from the specified node. 
1431 * 1432 * Result is placed in distmap[ncpus][nmems]: 1433 * 1434 * For each cpu c on node: 1435 * For each node position n in list of distances: 1436 * distmap[c][n] = dists[n] 1437 */ 1438 1439 static int parse_distmap_line(unsigned int node, char *buf) 1440 { 1441 char *p, *q; 1442 int ncpus = cpuset_cpus_nbits(); 1443 int nmems = cpuset_mems_nbits(); 1444 unsigned int c, n; 1445 distmap_entry_t *dists = NULL; 1446 struct bitmask *cpus = NULL, *mems = NULL; 1447 int ret = -1; 1448 1449 p = buf; 1450 if ((dists = calloc(nmems, sizeof(*dists))) == NULL) 1451 goto err; 1452 for (n = 0; n < (unsigned int)nmems; n++) 1453 dists[n] = DISTMAP_MAX; 1454 1455 for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) { 1456 unsigned int d; 1457 1458 if ((p = strpbrk(p, "0123456789")) == NULL) 1459 break; 1460 d = strtoul(p, &q, 10); 1461 if (p == q) 1462 break; 1463 if (d < DISTMAP_MAX) 1464 dists[n] = (distmap_entry_t) d; 1465 } 1466 1467 if ((mems = bitmask_alloc(nmems)) == NULL) 1468 goto err; 1469 bitmask_setbit(mems, node); 1470 1471 if ((cpus = bitmask_alloc(ncpus)) == NULL) 1472 goto err; 1473 cpuset_localcpus(mems, cpus); 1474 1475 for (c = bitmask_first(cpus); c < (unsigned int)ncpus; 1476 c = bitmask_next(cpus, c + 1)) 1477 for (n = 0; n < (unsigned int)nmems; n++) 1478 distmap[I(c, n)] = dists[n]; 1479 ret = 0; 1480 /* fall into ... 
*/ 1481 err: 1482 bitmask_free(mems); 1483 bitmask_free(cpus); 1484 free(dists); 1485 return ret; 1486 } 1487 1488 static int parse_distance_file(unsigned int node, const char *path) 1489 { 1490 FILE *fp; 1491 char *buf = NULL; 1492 int buflen; 1493 1494 if ((fp = fopen(path, "r")) == NULL) 1495 goto err; 1496 1497 buflen = filesize(fp); 1498 1499 if ((buf = malloc(buflen)) == NULL) 1500 goto err; 1501 1502 if (flgets(buf, buflen, fp) == NULL) 1503 goto err; 1504 1505 if (parse_distmap_line(node, buf) < 0) 1506 goto err; 1507 1508 free(buf); 1509 fclose(fp); 1510 return 0; 1511 err: 1512 free(buf); 1513 if (fp) 1514 fclose(fp); 1515 return -1; 1516 } 1517 1518 static void build_distmap() 1519 { 1520 static int tried_before = 0; 1521 int ncpus = cpuset_cpus_nbits(); 1522 int nmems = cpuset_mems_nbits(); 1523 int c, m; 1524 DIR *dir = NULL; 1525 struct dirent *dent; 1526 1527 if (tried_before) 1528 goto err; 1529 tried_before = 1; 1530 1531 if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL) 1532 goto err; 1533 1534 for (c = 0; c < ncpus; c++) 1535 for (m = 0; m < nmems; m++) 1536 distmap[I(c, m)] = DISTMAP_MAX; 1537 1538 if ((dir = opendir(distance_directory)) == NULL) 1539 goto err; 1540 while ((dent = readdir(dir)) != NULL) { 1541 char buf[PATH_MAX]; 1542 unsigned int node; 1543 1544 if (sscanf(dent->d_name, "node%u", &node) < 1) 1545 continue; 1546 pathcat3(buf, sizeof(buf), distance_directory, dent->d_name, 1547 "distance"); 1548 if (parse_distance_file(node, buf) < 0) 1549 goto err; 1550 } 1551 closedir(dir); 1552 return; 1553 err: 1554 if (dir) 1555 closedir(dir); 1556 free(distmap); 1557 distmap = NULL; 1558 } 1559 1560 #ifdef ALTERNATE_SN_DISTMAP 1561 1562 /* 1563 * Parse SN architecture specific line of form: 1564 * 1565 * node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10 1566 * 1567 * Second field is node number. 
The "dist" field is the colon separated list 1568 * of distances, which is parsed into array dists[] of each nodes distance 1569 * from that node. 1570 * 1571 * Result is placed in distmap[ncpus][nmems]: 1572 * 1573 * For each cpu c on that node: 1574 * For each node position n in list of distances: 1575 * distmap[c][n] = dists[n] 1576 */ 1577 1578 static void parse_distmap_line_sn(char *buf) 1579 { 1580 char *p, *pend, *q; 1581 int ncpus = cpuset_cpus_nbits(); 1582 int nmems = cpuset_mems_nbits(); 1583 unsigned long c, n, node; 1584 distmap_entry_t *dists = NULL; 1585 struct bitmask *cpus = NULL, *mems = NULL; 1586 1587 if ((p = strchr(buf, ' ')) == NULL) 1588 goto err; 1589 if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems) 1590 goto err; 1591 if ((p = strstr(q, " dist ")) == NULL) 1592 goto err; 1593 p += strlen(" dist "); 1594 if ((pend = strchr(p, ' ')) != NULL) 1595 *pend = '\0'; 1596 if ((dists = calloc(nmems, sizeof(*dists))) == NULL) 1597 goto err; 1598 for (n = 0; n < (unsigned int)nmems; n++) 1599 dists[n] = DISTMAP_MAX; 1600 1601 for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) { 1602 unsigned long d; 1603 1604 if ((p = strpbrk(p, "0123456789")) == NULL) 1605 break; 1606 d = strtoul(p, &q, 10); 1607 if (p == q) 1608 break; 1609 if (d < DISTMAP_MAX) 1610 dists[n] = (distmap_entry_t) d; 1611 } 1612 1613 if ((mems = bitmask_alloc(nmems)) == NULL) 1614 goto err; 1615 bitmask_setbit(mems, node); 1616 1617 if ((cpus = bitmask_alloc(ncpus)) == NULL) 1618 goto err; 1619 cpuset_localcpus(mems, cpus); 1620 1621 for (c = bitmask_first(cpus); c < (unsigned int)ncpus; 1622 c = bitmask_next(cpus, c + 1)) 1623 for (n = 0; n < (unsigned int)nmems; n++) 1624 distmap[I(c, n)] = dists[n]; 1625 /* fall into ... 
*/ 1626 err: 1627 bitmask_free(mems); 1628 bitmask_free(cpus); 1629 free(dists); 1630 } 1631 1632 static void build_distmap_sn() 1633 { 1634 int ncpus = cpuset_cpus_nbits(); 1635 int nmems = cpuset_mems_nbits(); 1636 int c, m; 1637 static int tried_before = 0; 1638 FILE *fp = NULL; 1639 char *buf = NULL; 1640 int buflen; 1641 1642 if (tried_before) 1643 goto err; 1644 tried_before = 1; 1645 1646 if ((fp = fopen(sn_topology, "r")) == NULL) 1647 goto err; 1648 1649 if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL) 1650 goto err; 1651 1652 for (c = 0; c < ncpus; c++) 1653 for (m = 0; m < nmems; m++) 1654 distmap[I(c, m)] = DISTMAP_MAX; 1655 1656 buflen = filesize(fp); 1657 if ((buf = malloc(buflen)) == NULL) 1658 goto err; 1659 1660 while (flgets(buf, buflen, fp) != NULL) 1661 if (strprefix(buf, sn_top_node_prefix)) 1662 parse_distmap_line_sn(buf); 1663 1664 free(buf); 1665 fclose(fp); 1666 return; 1667 err: 1668 free(buf); 1669 free(distmap); 1670 distmap = NULL; 1671 if (fp) 1672 fclose(fp); 1673 } 1674 1675 #endif 1676 1677 /* [optional] Hardware distance from CPU to Memory Node */ 1678 unsigned int cpuset_cpumemdist(int cpu, int mem) 1679 { 1680 int ncpus = cpuset_cpus_nbits(); 1681 int nmems = cpuset_mems_nbits(); 1682 distmap_entry_t r = DISTMAP_MAX; 1683 1684 flockfile(stdout); 1685 1686 if (check() < 0) 1687 goto err; 1688 1689 if (distmap == NULL) 1690 build_distmap(); 1691 1692 #ifdef ALTERNATE_SN_DISTMAP 1693 if (distmap == NULL) 1694 build_distmap_sn(); 1695 #endif 1696 1697 if (distmap == NULL) 1698 goto err; 1699 1700 if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems) 1701 goto err; 1702 1703 r = distmap[I(cpu, mem)]; 1704 /* fall into ... 
*/ 1705 err: 1706 funlockfile(stdout); 1707 return r; 1708 } 1709 1710 /* [optional] Return Memory Node closest to cpu */ 1711 int cpuset_cpu2node(int cpu) 1712 { 1713 int ncpus = cpuset_cpus_nbits(); 1714 int nmems = cpuset_mems_nbits(); 1715 struct bitmask *cpus = NULL, *mems = NULL; 1716 int r = -1; 1717 1718 if (check() < 0) 1719 goto err; 1720 1721 if ((cpus = bitmask_alloc(ncpus)) == NULL) 1722 goto err; 1723 bitmask_setbit(cpus, cpu); 1724 1725 if ((mems = bitmask_alloc(nmems)) == NULL) 1726 goto err; 1727 cpuset_localmems(cpus, mems); 1728 r = bitmask_first(mems); 1729 /* fall into ... */ 1730 err: 1731 bitmask_free(cpus); 1732 bitmask_free(mems); 1733 return r; 1734 } 1735 1736 static int apply_cpuset_settings(const char *path, const struct cpuset *cp) 1737 { 1738 if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) { 1739 if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0) 1740 goto err; 1741 } 1742 1743 if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) { 1744 if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0) 1745 goto err; 1746 } 1747 1748 if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) { 1749 if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0) 1750 goto err; 1751 } 1752 1753 if (cp->notify_on_release_valid && cp->notify_on_release_dirty) { 1754 if (store_flag(path, "notify_on_release", cp->notify_on_release) 1755 < 0) 1756 goto err; 1757 } 1758 1759 if (cp->memory_migrate_valid && 1760 cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) { 1761 if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0) 1762 goto err; 1763 } 1764 1765 if (cp->memory_pressure_enabled_valid && 1766 cp->memory_pressure_enabled_dirty && 1767 exists_flag(path, "memory_pressure_enabled")) { 1768 if (store_flag 1769 (path, "memory_pressure_enabled", 1770 cp->memory_pressure_enabled) < 0) 1771 goto err; 1772 } 1773 1774 if (cp->memory_spread_page_valid && 1775 cp->memory_spread_page_dirty && 1776 exists_flag(path, 
"memory_spread_page")) { 1777 if (store_flag 1778 (path, "memory_spread_page", cp->memory_spread_page) < 0) 1779 goto err; 1780 } 1781 1782 if (cp->memory_spread_slab_valid && 1783 cp->memory_spread_slab_dirty && 1784 exists_flag(path, "memory_spread_slab")) { 1785 if (store_flag 1786 (path, "memory_spread_slab", cp->memory_spread_slab) < 0) 1787 goto err; 1788 } 1789 1790 if (cp->sched_load_balance_valid && 1791 cp->sched_load_balance_dirty && 1792 exists_flag(path, "sched_load_balance")) { 1793 if (store_flag 1794 (path, "sched_load_balance", cp->sched_load_balance) < 0) 1795 goto err; 1796 } 1797 1798 if (cp->sched_relax_domain_level_valid && 1799 cp->sched_relax_domain_level_dirty && 1800 exists_flag(path, "sched_relax_domain_level")) { 1801 if (store_number 1802 (path, "sched_relax_domain_level", 1803 cp->sched_relax_domain_level) < 0) 1804 goto err; 1805 } 1806 1807 if (cp->cpus_valid && cp->cpus_dirty) { 1808 if (store_mask(path, "cpus", cp->cpus) < 0) 1809 goto err; 1810 } 1811 1812 if (cp->mems_valid && cp->mems_dirty) { 1813 if (store_mask(path, "mems", cp->mems) < 0) 1814 goto err; 1815 } 1816 return 0; 1817 err: 1818 return -1; 1819 } 1820 1821 /* 1822 * get_siblings() - helper routine for cpuset_would_crash_kernel(), below. 1823 * 1824 * Extract max value of any 'siblings' field in /proc/cpuinfo. 1825 * Cache the result - only need to extract once in lifetime of task. 1826 * 1827 * The siblings field is the number of logical CPUs in a physical 1828 * processor package. It is equal to the product of the number of 1829 * cores in that package, times the number of hyper-threads per core. 1830 * The bug that cpuset_would_crash_kernel() is detecting arises 1831 * when a cpu_exclusive cpuset tries to include just some, not all, 1832 * of the sibling logical CPUs available in a processor package. 
1833 * 1834 * In the improbable case that a system has mixed values of siblings 1835 * (some processor packages have more than others, perhaps due to 1836 * partially enabling Hyper-Threading), we take the worse case value, 1837 * the largest siblings value. This might be overkill. I don't know 1838 * if this kernel bug considers each processor package's siblings 1839 * separately or not. But it sure is easier this way ... 1840 * 1841 * This routine takes about 0.7 msecs on a 4 CPU 2.8 MHz Xeon, from 1842 * open to close, the first time called. 1843 */ 1844 1845 static int get_siblings() 1846 { 1847 static int siblings; 1848 char buf[32]; /* big enough for one 'siblings' line */ 1849 FILE *fp; 1850 1851 if (siblings) 1852 return siblings; 1853 1854 if ((fp = fopen("/proc/cpuinfo", "r")) == NULL) 1855 return 4; /* wing it - /proc not mounted ? */ 1856 while (flgets(buf, sizeof(buf), fp) != NULL) { 1857 int s; 1858 1859 if (sscanf(buf, "siblings : %d", &s) < 1) 1860 continue; 1861 if (s > siblings) 1862 siblings = s; 1863 } 1864 fclose(fp); 1865 if (siblings == 0) 1866 siblings = 1; /* old kernel, no siblings, default to 1 */ 1867 return siblings; 1868 } 1869 1870 /* 1871 * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic 1872 * scheduler domain code invoked for cpu_exclusive cpusets that causes 1873 * the kernel to freeze, requiring a hardware reset. 1874 * 1875 * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive' 1876 * cpuset is defined where that cpusets 'cpus' are not on package 1877 * boundaries then the kernel will freeze, usually as soon as this 1878 * cpuset is created, requiring a hardware reset. 1879 * 1880 * A cpusets 'cpus' are not on package boundaries if the cpuset 1881 * includes a proper non-empty subset (some, but not all) of the 1882 * logical cpus on a processor package. This requires multiple 1883 * logical CPUs per package, available with either Hyper-Thread or 1884 * Multi-Core support. 
Without one of these features, there is only 1885 * one logical CPU per physical package, and it's not possible to 1886 * have a proper, non-empty subset of a set of cardinality one. 1887 * 1888 * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC 1889 * on i386 and x86_64 arch's. 1890 * 1891 * The objective of this routine cpuset_would_crash_kernel() is to 1892 * determine if a proposed cpuset setting would crash the kernel due 1893 * to this bug, so that the caller can avoid the crash. 1894 * 1895 * Ideally we'd check for exactly these conditions here, but computing 1896 * the package (identified by the 'physical id' field of /proc/cpuinfo) 1897 * of each cpu in a cpuset is more effort than it's worth here. 1898 * 1899 * Also there is no obvious way to identify exactly whether the kernel 1900 * one is executing on has this bug, short of trying it, and seeing 1901 * if the kernel just crashed. 1902 * 1903 * So for now, we look for a simpler set of conditions, that meets 1904 * our immediate need - avoid this crash on SUSE SLES10 systems that 1905 * are susceptible to it. We look for the kernel version 2.6.16.*, 1906 * which is the base kernel of SUSE SLES10, and for i386 or x86_64 1907 * processors, which had CONFIG_SCHED_MC enabled. 1908 * 1909 * If these simpler conditions are met, we further simplify the check, 1910 * by presuming that the logical CPUs are numbered on processor 1911 * package boundaries. If each package has S siblings, we assume 1912 * that CPUs numbered N through N + S -1 are on the same package, 1913 * for any CPU N such that N mod S == 0. 1914 * 1915 * Yes, this is a hack, focused on avoiding kernel freezes on 1916 * susceptible SUSE SLES10 systems. 
1917 */ 1918 1919 static int cpuset_would_crash_kernel(const struct cpuset *cp) 1920 { 1921 static int susceptible_system = -1; 1922 1923 if (!cp->cpu_exclusive) 1924 goto ok; 1925 1926 if (susceptible_system == -1) { 1927 struct utsname u; 1928 int rel_2_6_16, arch_i386, arch_x86_64; 1929 1930 if (uname(&u) < 0) 1931 goto fail; 1932 rel_2_6_16 = strprefix(u.release, "2.6.16."); 1933 arch_i386 = streq(u.machine, "i386"); 1934 arch_x86_64 = streq(u.machine, "x86_64"); 1935 susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64); 1936 } 1937 1938 if (susceptible_system) { 1939 int ncpus = cpuset_cpus_nbits(); 1940 int siblings = get_siblings(); 1941 unsigned int cpu; 1942 1943 for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) { 1944 int s, num_set = 0; 1945 1946 for (s = 0; s < siblings; s++) { 1947 if (bitmask_isbitset(cp->cpus, cpu + s)) 1948 num_set++; 1949 } 1950 1951 /* If none or all siblings set, we're still ok */ 1952 if (num_set == 0 || num_set == siblings) 1953 continue; 1954 1955 /* Found one that would crash kernel. Fail. */ 1956 errno = ENXIO; 1957 goto fail; 1958 } 1959 } 1960 /* If not susceptible, or if all ok, fall into "ok" ... 
*/ 1961 ok: 1962 return 0; /* would not crash */ 1963 fail: 1964 return 1; /* would crash */ 1965 } 1966 1967 /* compare two cpuset and mark the dirty variable */ 1968 static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2) 1969 { 1970 if (cp1->cpu_exclusive_valid && 1971 cp1->cpu_exclusive != cp2->cpu_exclusive) 1972 cp1->cpu_exclusive_dirty = 1; 1973 1974 if (cp1->mem_exclusive_valid && 1975 cp1->mem_exclusive != cp2->mem_exclusive) 1976 cp1->mem_exclusive_dirty = 1; 1977 1978 if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall) 1979 cp1->mem_hardwall_dirty = 1; 1980 1981 if (cp1->notify_on_release_valid && 1982 cp1->notify_on_release != cp2->notify_on_release) 1983 cp1->notify_on_release_dirty = 1; 1984 1985 if (cp1->memory_migrate_valid && 1986 cp1->memory_migrate != cp2->memory_migrate) 1987 cp1->memory_migrate_dirty = 1; 1988 1989 if (cp1->memory_pressure_enabled_valid && 1990 cp1->memory_pressure_enabled != cp2->memory_pressure_enabled) 1991 cp1->memory_pressure_enabled_dirty = 1; 1992 1993 if (cp1->memory_spread_page_valid && 1994 cp1->memory_spread_page != cp2->memory_spread_page) 1995 cp1->memory_spread_page_dirty = 1; 1996 1997 if (cp1->memory_spread_slab_valid && 1998 cp1->memory_spread_slab != cp2->memory_spread_slab) 1999 cp1->memory_spread_slab_dirty = 1; 2000 2001 if (cp1->sched_load_balance_valid && 2002 cp1->sched_load_balance != cp2->sched_load_balance) 2003 cp1->sched_load_balance_dirty = 1; 2004 2005 if (cp1->sched_relax_domain_level_valid && 2006 cp1->sched_relax_domain_level != cp2->sched_relax_domain_level) 2007 cp1->sched_relax_domain_level_dirty = 1; 2008 2009 if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus)) 2010 cp1->cpus_dirty = 1; 2011 if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems)) 2012 cp1->mems_dirty = 1; 2013 } 2014 2015 /* Create (if new set) or modify cpuset 'cp' at location 'relpath' */ 2016 static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new) 
2017 { 2018 char buf[PATH_MAX]; 2019 int do_rmdir_on_err = 0; 2020 int do_restore_cp_sav_on_err = 0; 2021 struct cpuset *cp_sav = NULL; 2022 int sav_errno; 2023 2024 if (check() < 0) 2025 goto err; 2026 2027 if (cpuset_would_crash_kernel(cp)) 2028 goto err; 2029 2030 fullpath(buf, sizeof(buf), relpath); 2031 2032 if (new) { 2033 if (mkdir(buf, 0755) < 0) 2034 goto err; 2035 /* we made it, so we should remove it on error */ 2036 do_rmdir_on_err = 1; 2037 } 2038 2039 if ((cp_sav = cpuset_alloc()) == NULL) 2040 goto err; 2041 if (cpuset_query(cp_sav, relpath) < 0) 2042 goto err; 2043 /* we have old settings to restore on error */ 2044 do_restore_cp_sav_on_err = 1; 2045 2046 /* check which variable need to restore on error */ 2047 mark_dirty_variable(cp_sav, cp); 2048 2049 if (apply_cpuset_settings(buf, cp) < 0) 2050 goto err; 2051 2052 cpuset_free(cp_sav); 2053 return 0; 2054 err: 2055 sav_errno = errno; 2056 if (do_restore_cp_sav_on_err) 2057 (void)apply_cpuset_settings(buf, cp_sav); 2058 if (cp_sav) 2059 cpuset_free(cp_sav); 2060 if (do_rmdir_on_err) 2061 (void)rmdir(buf); 2062 errno = sav_errno; 2063 return -1; 2064 } 2065 2066 /* Create cpuset 'cp' at location 'relpath' */ 2067 int cpuset_create(const char *relpath, const struct cpuset *cp) 2068 { 2069 return cr_or_mod(relpath, cp, 1); 2070 } 2071 2072 /* Delete cpuset at location 'path' (if empty) */ 2073 int cpuset_delete(const char *relpath) 2074 { 2075 char buf[PATH_MAX]; 2076 2077 if (check() < 0) 2078 goto err; 2079 2080 fullpath(buf, sizeof(buf), relpath); 2081 if (rmdir(buf) < 0) 2082 goto err; 2083 2084 return 0; 2085 err: 2086 return -1; 2087 } 2088 2089 /* Set cpuset cp to the cpuset at location 'path' */ 2090 int cpuset_query(struct cpuset *cp, const char *relpath) 2091 { 2092 char buf[PATH_MAX]; 2093 2094 if (check() < 0) 2095 goto err; 2096 2097 fullpath(buf, sizeof(buf), relpath); 2098 2099 if (load_flag(buf, &cp->cpu_exclusive, "cpu_exclusive") < 0) 2100 goto err; 2101 cp->cpu_exclusive_valid = 1; 
2102 2103 if (load_flag(buf, &cp->mem_exclusive, "mem_exclusive") < 0) 2104 goto err; 2105 cp->mem_exclusive_valid = 1; 2106 2107 if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0) 2108 goto err; 2109 cp->notify_on_release_valid = 1; 2110 2111 if (exists_flag(buf, "memory_migrate")) { 2112 if (load_flag(buf, &cp->memory_migrate, "memory_migrate") < 0) 2113 goto err; 2114 cp->memory_migrate_valid = 1; 2115 } 2116 2117 if (exists_flag(buf, "mem_hardwall")) { 2118 if (load_flag(buf, &cp->mem_hardwall, "mem_hardwall") < 0) 2119 goto err; 2120 cp->mem_hardwall_valid = 1; 2121 } 2122 2123 if (exists_flag(buf, "memory_pressure_enabled")) { 2124 if (load_flag 2125 (buf, &cp->memory_pressure_enabled, 2126 "memory_pressure_enabled") < 0) 2127 goto err; 2128 cp->memory_pressure_enabled_valid = 1; 2129 } 2130 2131 if (exists_flag(buf, "memory_spread_page")) { 2132 if (load_flag 2133 (buf, &cp->memory_spread_page, "memory_spread_page") < 0) 2134 goto err; 2135 cp->memory_spread_page_valid = 1; 2136 } 2137 2138 if (exists_flag(buf, "memory_spread_slab")) { 2139 if (load_flag 2140 (buf, &cp->memory_spread_slab, "memory_spread_slab") < 0) 2141 goto err; 2142 cp->memory_spread_slab_valid = 1; 2143 } 2144 2145 if (exists_flag(buf, "sched_load_balance")) { 2146 if (load_flag 2147 (buf, &cp->sched_load_balance, "sched_load_balance") < 0) 2148 goto err; 2149 cp->sched_load_balance_valid = 1; 2150 } 2151 2152 if (exists_flag(buf, "sched_relax_domain_level")) { 2153 if (load_number 2154 (buf, &cp->sched_relax_domain_level, 2155 "sched_relax_domain_level") < 0) 2156 goto err; 2157 cp->sched_relax_domain_level_valid = 1; 2158 } 2159 2160 if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpus") < 0) 2161 goto err; 2162 cp->cpus_valid = 1; 2163 2164 if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "mems") < 0) 2165 goto err; 2166 cp->mems_valid = 1; 2167 2168 return 0; 2169 err: 2170 return -1; 2171 } 2172 2173 /* Modify cpuset at location 'relpath' to values of 'cp' */ 
2174 int cpuset_modify(const char *relpath, const struct cpuset *cp) 2175 { 2176 return cr_or_mod(relpath, cp, 0); 2177 } 2178 2179 /* Get cpuset path of pid into buf */ 2180 char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size) 2181 { 2182 int fd; /* dual use: cpuset file for pid and self */ 2183 int rc; /* dual use: snprintf and read return codes */ 2184 2185 if (check() < 0) 2186 return NULL; 2187 2188 /* borrow result buf[] to build cpuset file path */ 2189 if (pid == 0) 2190 rc = snprintf(buf, size, "/proc/self/cpuset"); 2191 else 2192 rc = snprintf(buf, size, "/proc/%d/cpuset", pid); 2193 if (rc >= (int)size) { 2194 errno = E2BIG; 2195 return NULL; 2196 } 2197 if ((fd = open(buf, O_RDONLY)) < 0) { 2198 int e = errno; 2199 if (e == ENOENT) 2200 e = ESRCH; 2201 if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0) 2202 e = ENOSYS; 2203 else 2204 close(fd); 2205 errno = e; 2206 return NULL; 2207 } 2208 rc = read(fd, buf, size); 2209 close(fd); 2210 if (rc < 0) 2211 return NULL; 2212 if (rc >= (int)size) { 2213 errno = E2BIG; 2214 return NULL; 2215 } 2216 buf[rc] = 0; 2217 chomp(buf); 2218 return buf; 2219 2220 } 2221 2222 /* Get cpuset 'cp' of pid */ 2223 int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid) 2224 { 2225 char buf[PATH_MAX]; 2226 2227 if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL) 2228 return -1; 2229 if (cpuset_query(cp, buf) < 0) 2230 return -1; 2231 return 0; 2232 } 2233 2234 /* [optional] Return mountpoint of cpuset filesystem */ 2235 const char *cpuset_mountpoint() 2236 { 2237 if (check() < 0) { 2238 switch (errno) { 2239 case ENODEV: 2240 return "[cpuset filesystem not mounted]"; 2241 default: 2242 return "[cpuset filesystem not supported]"; 2243 } 2244 } 2245 return cpusetmnt; 2246 } 2247 2248 /* Return true if path is a directory. 
*/ 2249 static int isdir(const char *path) 2250 { 2251 struct stat statbuf; 2252 2253 if (stat(path, &statbuf) < 0) 2254 return 0; 2255 return S_ISDIR(statbuf.st_mode); 2256 } 2257 2258 /* 2259 * [optional] cpuset_collides_exclusive() - True if would collide exclusive. 2260 * 2261 * Return true iff the specified cpuset would overlap with any 2262 * sibling cpusets in either cpus or mems, where either this 2263 * cpuset or the sibling is cpu_exclusive or mem_exclusive. 2264 * 2265 * cpuset_create() fails with errno == EINVAL if the requested cpuset 2266 * would overlap with any sibling, where either one is cpu_exclusive or 2267 * mem_exclusive. This is a common, and not obvious error. The 2268 * following routine checks for this particular case, so that code 2269 * creating cpusets can better identify the situation, perhaps to issue 2270 * a more informative error message. 2271 * 2272 * Can also be used to diagnose cpuset_modify failures. This 2273 * routine ignores any existing cpuset with the same path as the 2274 * given 'cpusetpath', and only looks for exclusive collisions with 2275 * sibling cpusets of that path. 2276 * 2277 * In case of any error, returns (0) -- does not collide. Presumably 2278 * any actual attempt to create or modify a cpuset will encounter the 2279 * same error, and report it usefully. 2280 * 2281 * This routine is not particularly efficient; most likely code creating or 2282 * modifying a cpuset will want to try the operation first, and then if that 2283 * fails with errno EINVAL, perhaps call this routine to determine if an 2284 * exclusive cpuset collision caused the error. 
2285 */ 2286 2287 int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1) 2288 { 2289 char parent[PATH_MAX]; 2290 char *p; 2291 char *pathcopy = NULL; 2292 char *base; 2293 DIR *dir = NULL; 2294 struct dirent *dent; 2295 struct cpuset *cp2 = NULL; 2296 struct bitmask *cpus1 = NULL, *cpus2 = NULL; 2297 struct bitmask *mems1 = NULL, *mems2 = NULL; 2298 int ret; 2299 2300 if (check() < 0) 2301 goto err; 2302 2303 fullpath(parent, sizeof(parent), cpusetpath); 2304 if (streq(parent, cpusetmnt)) 2305 goto err; /* only one cpuset root - can't collide */ 2306 pathcopy = strdup(parent); 2307 p = strrchr(parent, '/'); 2308 if (!p) 2309 goto err; /* huh? - impossible - run and hide */ 2310 *p = 0; /* now parent is dirname of fullpath */ 2311 2312 p = strrchr(pathcopy, '/'); 2313 base = p + 1; /* now base is basename of fullpath */ 2314 if (!*base) 2315 goto err; /* this is also impossible - run away */ 2316 2317 if ((dir = opendir(parent)) == NULL) 2318 goto err; 2319 if ((cp2 = cpuset_alloc()) == NULL) 2320 goto err; 2321 if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL) 2322 goto err; 2323 if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL) 2324 goto err; 2325 if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL) 2326 goto err; 2327 if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL) 2328 goto err; 2329 2330 while ((dent = readdir(dir)) != NULL) { 2331 char child[PATH_MAX]; 2332 2333 if (streq(dent->d_name, ".") || streq(dent->d_name, "..")) 2334 continue; 2335 if (streq(dent->d_name, base)) 2336 continue; 2337 pathcat2(child, sizeof(child), parent, dent->d_name); 2338 if (!isdir(child)) 2339 continue; 2340 if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0) 2341 goto err; 2342 if (cp1->cpu_exclusive || cp2->cpu_exclusive) { 2343 cpuset_getcpus(cp1, cpus1); 2344 cpuset_getcpus(cp2, cpus2); 2345 if (bitmask_intersects(cpus1, cpus2)) 2346 goto collides; 2347 } 2348 if (cp1->mem_exclusive || cp2->mem_exclusive) { 2349 
cpuset_getmems(cp1, mems1); 2350 cpuset_getmems(cp2, mems2); 2351 if (bitmask_intersects(mems1, mems2)) 2352 goto collides; 2353 } 2354 } 2355 err: 2356 /* error, or did not collide */ 2357 ret = 0; 2358 goto done; 2359 collides: 2360 /* collides */ 2361 ret = 1; 2362 /* fall into ... */ 2363 done: 2364 if (dir) 2365 closedir(dir); 2366 cpuset_free(cp2); 2367 free(pathcopy); 2368 bitmask_free(cpus1); 2369 bitmask_free(cpus2); 2370 bitmask_free(mems1); 2371 bitmask_free(mems2); 2372 return ret; 2373 } 2374 2375 /* 2376 * [optional] cpuset_nuke() - Remove cpuset anyway possible 2377 * 2378 * Remove a cpuset, including killing tasks in it, and 2379 * removing any descendent cpusets and killing their tasks. 2380 * 2381 * Tasks can take a long time (minutes on some configurations) 2382 * to exit. Loop up to 'seconds' seconds, trying to kill them. 2383 * 2384 * How we do it: 2385 * 1) First, kill all the pids, looping until there are 2386 * no more pids in this cpuset or below, or until the 2387 * 'seconds' timeout limit is exceeded. 2388 * 2) Then depth first recursively rmdir the cpuset directories. 2389 * 3) If by this point the original cpuset is gone, we succeeded. 2390 * 2391 * If the timeout is exceeded, and tasks still exist, fail with 2392 * errno == ETIME. 2393 * 2394 * We sleep a variable amount of time. After the first attempt to 2395 * kill all the tasks in the cpuset or its descendents, we sleep 1 2396 * second, the next time 2 seconds, increasing 1 second each loop 2397 * up to a max of 10 seconds. If more loops past 10 are required 2398 * to kill all the tasks, we sleep 10 seconds each subsequent loop. 2399 * In any case, before the last loop, we sleep however many seconds 2400 * remain of the original timeout 'seconds' requested. The total 2401 * time of all sleeps will be no more than the requested 'seconds'. 
2402 * 2403 * If the cpuset started out empty of any tasks, or if the passed in 2404 * 'seconds' was zero, then this routine will return quickly, having 2405 * not slept at all. Otherwise, this routine will at a minimum send 2406 * a SIGKILL to all the tasks in this cpuset subtree, then sleep one 2407 * second, before looking to see if any tasks remain. If tasks remain 2408 * in the cpuset subtree, and a longer 'seconds' timeout was requested 2409 * (more than one), it will continue to kill remaining tasks and sleep, 2410 * in a loop, for as long as time and tasks remain. 2411 * 2412 * The signal sent for the kill is hardcoded to SIGKILL (9). If some 2413 * other signal should be sent first, use a separate code loop, 2414 * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to 2415 * scan the task pids in a cpuset. If SIGKILL should -not- be sent, 2416 * this cpuset_nuke() routine can still be called to recursively 2417 * remove a cpuset subtree, by specifying a timeout of zero 'seconds'. 2418 * 2419 * On success, returns 0 with errno == 0. 
2420 * 2421 * On failure, returns -1, with errno possibly one of: 2422 * EACCES - search permission denied on intervening directory 2423 * ETIME - timed out - tasks remain after 'seconds' timeout 2424 * EMFILE - too many open files 2425 * ENODEV - /dev/cpuset not mounted 2426 * ENOENT - component of cpuset path doesn't exist 2427 * ENOMEM - out of memory 2428 * ENOSYS - kernel doesn't support cpusets 2429 * ENOTDIR - component of cpuset path is not a directory 2430 * EPERM - lacked permission to kill a task 2431 * EPERM - lacked permission to read cpusets or files therein 2432 */ 2433 2434 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree); 2435 2436 int cpuset_nuke(const char *relpath, unsigned int seconds) 2437 { 2438 unsigned int secs_left = seconds; /* total sleep seconds left */ 2439 unsigned int secs_loop = 1; /* how much sleep next loop */ 2440 unsigned int secs_slept; /* seconds slept in sleep() */ 2441 struct cpuset_pidlist *pl = NULL; /* pids in cpuset subtree */ 2442 struct cpuset_fts_tree *cs_tree; 2443 const struct cpuset_fts_entry *cs_entry; 2444 int ret, sav_errno = 0; 2445 2446 if (check() < 0) 2447 return -1; 2448 2449 if (seconds == 0) 2450 goto rmdir_cpusets; 2451 2452 while (1) { 2453 int plen, j; 2454 2455 if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) { 2456 /* missing cpuset is as good as if already nuked */ 2457 if (errno == ENOENT) { 2458 ret = 0; 2459 goto no_more_cpuset; 2460 } 2461 2462 /* other problems reading cpuset are bad news */ 2463 sav_errno = errno; 2464 goto failed; 2465 } 2466 2467 if ((plen = cpuset_pidlist_length(pl)) == 0) 2468 goto rmdir_cpusets; 2469 2470 for (j = 0; j < plen; j++) { 2471 pid_t pid; 2472 2473 if ((pid = cpuset_get_pidlist(pl, j)) > 1) { 2474 if (kill(pid, SIGKILL) < 0 && errno != ESRCH) { 2475 sav_errno = errno; 2476 goto failed; 2477 } 2478 } 2479 } 2480 2481 if (secs_left == 0) 2482 goto took_too_long; 2483 2484 cpuset_freepidlist(pl); 2485 pl = NULL; 2486 2487 secs_slept = secs_loop - 
sleep(secs_loop); 2488 2489 /* Ensure forward progress */ 2490 if (secs_slept == 0) 2491 secs_slept = 1; 2492 2493 /* Ensure sane sleep() return (unnecessary?) */ 2494 if (secs_slept > secs_loop) 2495 secs_slept = secs_loop; 2496 2497 secs_left -= secs_slept; 2498 2499 if (secs_loop < 10) 2500 secs_loop++; 2501 2502 secs_loop = min(secs_left, secs_loop); 2503 } 2504 2505 took_too_long: 2506 sav_errno = ETIME; 2507 /* fall into ... */ 2508 failed: 2509 cpuset_freepidlist(pl); 2510 errno = sav_errno; 2511 return -1; 2512 2513 rmdir_cpusets: 2514 /* Let's try removing cpuset(s) now. */ 2515 cpuset_freepidlist(pl); 2516 2517 if ((cs_tree = cpuset_fts_open(relpath)) == NULL && errno != ENOENT) 2518 return -1; 2519 ret = 0; 2520 cpuset_fts_reverse(cs_tree); /* rmdir's must be done bottom up */ 2521 while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) { 2522 char buf[PATH_MAX]; 2523 2524 fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry)); 2525 if (rmdir(buf) < 0 && errno != ENOENT) { 2526 sav_errno = errno; 2527 ret = -1; 2528 } 2529 } 2530 cpuset_fts_close(cs_tree); 2531 /* fall into ... */ 2532 no_more_cpuset: 2533 if (ret == 0) 2534 errno = 0; 2535 else 2536 errno = sav_errno; 2537 return ret; 2538 } 2539 2540 /* 2541 * When recursively reading all the tasks files from a subtree, 2542 * chain together the read results, one pidblock per tasks file, 2543 * containing the raw unprocessed ascii as read(2) in. After 2544 * we gather up this raw data, we then go back to count how 2545 * many pid's there are in total, allocate an array of pid_t 2546 * of that size, and transform the raw ascii data into this 2547 * array of pid_t's. 2548 */ 2549 2550 struct pidblock { 2551 char *buf; 2552 int buflen; 2553 struct pidblock *next; 2554 }; 2555 2556 /* 2557 * Chain the raw contents of a file onto the pbhead list. 
2558 * 2559 * We malloc "+ 1" extra byte for a nul-terminator, so that 2560 * the strtoul() loop in pid_transform() won't scan past 2561 * the end of pb->buf[] and accidentally find more pids. 2562 */ 2563 static void add_pidblock(const char *file, struct pidblock **ppbhead) 2564 { 2565 FILE *fp = NULL; 2566 struct pidblock *pb = NULL; 2567 int fsz; 2568 2569 if ((fp = fopen(file, "r")) == NULL) 2570 goto err; 2571 fsz = filesize(fp); 2572 if (fsz == 0) 2573 goto err; 2574 if ((pb = calloc(1, sizeof(*pb))) == NULL) 2575 goto err; 2576 pb->buflen = fsz; 2577 if ((pb->buf = malloc(pb->buflen + 1)) == NULL) 2578 goto err; 2579 if (fread(pb->buf, 1, pb->buflen, fp) > 0) { 2580 pb->buf[pb->buflen] = '\0'; 2581 pb->next = *ppbhead; 2582 *ppbhead = pb; 2583 } 2584 fclose(fp); 2585 return; 2586 err: 2587 if (fp) 2588 fclose(fp); 2589 free(pb); 2590 } 2591 2592 static void read_task_file(const char *relpath, struct pidblock **ppbhead) 2593 { 2594 char buf[PATH_MAX]; 2595 2596 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2597 add_pidblock(buf, ppbhead); 2598 } 2599 2600 struct cpuset_pidlist { 2601 pid_t *pids; 2602 int npids; 2603 }; 2604 2605 /* Count how many pids in buf (one per line - just count newlines) */ 2606 static int pidcount(const char *buf, int buflen) 2607 { 2608 int n = 0; 2609 const char *cp; 2610 2611 for (cp = buf; cp < buf + buflen; cp++) { 2612 if (*cp == '\n') 2613 n++; 2614 } 2615 return n; 2616 } 2617 2618 /* Transform one-per-line ascii pids in pb to pid_t entries in pl */ 2619 static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n) 2620 { 2621 char *a, *b; 2622 2623 for (a = pb->buf; a < pb->buf + pb->buflen; a = b) { 2624 pid_t p = strtoul(a, &b, 10); 2625 if (a == b) 2626 break; 2627 pl->pids[n++] = p; 2628 } 2629 return n; 2630 } 2631 2632 static void free_pidblocks(struct pidblock *pbhead) 2633 { 2634 struct pidblock *pb, *nextpb; 2635 2636 for (pb = pbhead; pb; pb = nextpb) { 2637 nextpb = pb->next; 2638 free(pb->buf); 
2639 free(pb); 2640 } 2641 } 2642 2643 /* numeric comparison routine for qsort */ 2644 static int numericsort(const void *m1, const void *m2) 2645 { 2646 pid_t p1 = *(pid_t *) m1; 2647 pid_t p2 = *(pid_t *) m2; 2648 2649 return p1 - p2; 2650 } 2651 2652 /* Return list pids in cpuset 'path' */ 2653 struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath, 2654 int recursiveflag) 2655 { 2656 struct pidblock *pb = NULL; 2657 struct cpuset_pidlist *pl = NULL; 2658 struct pidblock *pbhead = NULL; 2659 int n; 2660 2661 if (check() < 0) 2662 goto err; 2663 2664 if (recursiveflag) { 2665 struct cpuset_fts_tree *cs_tree; 2666 const struct cpuset_fts_entry *cs_entry; 2667 2668 if ((cs_tree = cpuset_fts_open(relpath)) == NULL) 2669 goto err; 2670 while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) { 2671 if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET) 2672 continue; 2673 read_task_file(cpuset_fts_get_path(cs_entry), &pbhead); 2674 } 2675 cpuset_fts_close(cs_tree); 2676 } else { 2677 read_task_file(relpath, &pbhead); 2678 } 2679 2680 if ((pl = calloc(1, sizeof(*pl))) == NULL) 2681 goto err; 2682 pl->npids = 0; 2683 for (pb = pbhead; pb; pb = pb->next) 2684 pl->npids += pidcount(pb->buf, pb->buflen); 2685 if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL) 2686 goto err; 2687 n = 0; 2688 for (pb = pbhead; pb; pb = pb->next) 2689 n = pid_transform(pb, pl, n); 2690 free_pidblocks(pbhead); 2691 qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort); 2692 return pl; 2693 err: 2694 cpuset_freepidlist(pl); 2695 free_pidblocks(pbhead); 2696 return NULL; 2697 } 2698 2699 /* Return number of elements in pidlist */ 2700 int cpuset_pidlist_length(const struct cpuset_pidlist *pl) 2701 { 2702 if (pl) 2703 return pl->npids; 2704 else 2705 return 0; 2706 } 2707 2708 /* Return i'th element of pidlist */ 2709 pid_t cpuset_get_pidlist(const struct cpuset_pidlist * pl, int i) 2710 { 2711 if (pl && i >= 0 && i < pl->npids) 2712 return pl->pids[i]; 2713 else 2714 return 
(pid_t) - 1; 2715 } 2716 2717 /* Free pidlist */ 2718 void cpuset_freepidlist(struct cpuset_pidlist *pl) 2719 { 2720 if (pl && pl->pids) 2721 free(pl->pids); 2722 free(pl); 2723 } 2724 2725 static int __cpuset_move(pid_t pid, const char *path) 2726 { 2727 char buf[SMALL_BUFSZ]; 2728 2729 snprintf(buf, sizeof(buf), "%u", pid); 2730 return write_string_file(path, buf); 2731 } 2732 2733 /* Move task (pid == 0 for current) to a cpuset */ 2734 int cpuset_move(pid_t pid, const char *relpath) 2735 { 2736 char buf[PATH_MAX]; 2737 2738 if (check() < 0) 2739 return -1; 2740 2741 if (pid == 0) 2742 pid = getpid(); 2743 2744 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2745 return __cpuset_move(pid, buf); 2746 } 2747 2748 /* Move all tasks in pidlist to a cpuset */ 2749 int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath) 2750 { 2751 int i; 2752 char buf[PATH_MAX]; 2753 int ret; 2754 2755 if (check() < 0) 2756 return -1; 2757 2758 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2759 2760 ret = 0; 2761 for (i = 0; i < pl->npids; i++) 2762 if (__cpuset_move(pl->pids[i], buf) < 0) 2763 ret = -1; 2764 return ret; 2765 } 2766 2767 /* 2768 * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a 2769 * cpuset to another cpuset 2770 * 2771 * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may 2772 * race with tasks being added to or forking into fromrelpath. Loop 2773 * repeatedly, reading the tasks file of cpuset fromrelpath and writing 2774 * any task pid's found there to the tasks file of cpuset torelpath, 2775 * up to ten attempts, or until the tasks file of cpuset fromrelpath 2776 * is empty, or until fromrelpath is no longer present. 2777 * 2778 * Returns 0 with errno == 0 if able to empty the tasks file of cpuset 2779 * fromrelpath. 
Of course it is still possible that some independent 2780 * task could add another task to cpuset fromrelpath at the same time 2781 * that such a successful result is being returned, so there can be 2782 * no guarantee that a successful return means that fromrelpath is 2783 * still empty of tasks. 2784 * 2785 * We are careful to allow for the possibility that the cpuset 2786 * fromrelpath might disappear out from under us, perhaps because it 2787 * has notify_on_release set and gets automatically removed as soon 2788 * as we detach its last task from it. Consider a missing fromrelpath 2789 * to be a successful move. 2790 * 2791 * If called with fromrelpath and torelpath pathnames that evaluate to 2792 * the same cpuset, then treat that as if cpuset_reattach() was called, 2793 * rebinding each task in this cpuset one time, and return success or 2794 * failure depending on the return of that cpuset_reattach() call. 2795 * 2796 * On failure, returns -1, with errno possibly one of: 2797 * EACCES - search permission denied on intervening directory 2798 * ENOTEMPTY - tasks remain after multiple attempts to move them 2799 * EMFILE - too many open files 2800 * ENODEV - /dev/cpuset not mounted 2801 * ENOENT - component of cpuset path doesn't exist 2802 * ENOMEM - out of memory 2803 * ENOSYS - kernel doesn't support cpusets 2804 * ENOTDIR - component of cpuset path is not a directory 2805 * EPERM - lacked permission to kill a task 2806 * EPERM - lacked permission to read cpusets or files therein 2807 * 2808 * This is an [optional] function. Use cpuset_function to invoke it. 
2809 */ 2810 2811 #define NUMBER_MOVE_TASK_ATTEMPTS 10 2812 2813 int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath) 2814 { 2815 char fromfullpath[PATH_MAX]; 2816 char tofullpath[PATH_MAX]; 2817 int i; 2818 struct cpuset_pidlist *pl = NULL; 2819 int sav_errno; 2820 2821 fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath); 2822 fullpath(tofullpath, sizeof(tofullpath), torelpath); 2823 2824 if (samefile(fromfullpath, tofullpath)) 2825 return cpuset_reattach(fromrelpath); 2826 2827 for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) { 2828 int plen, j; 2829 2830 if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) { 2831 /* missing cpuset is as good as if all moved */ 2832 if (errno == ENOENT) 2833 goto no_more_cpuset; 2834 2835 /* other problems reading cpuset are bad news */ 2836 sav_errno = errno; 2837 goto failed; 2838 } 2839 2840 if ((plen = cpuset_pidlist_length(pl)) == 0) 2841 goto no_more_pids; 2842 2843 for (j = 0; j < plen; j++) { 2844 pid_t pid; 2845 2846 pid = cpuset_get_pidlist(pl, j); 2847 if (cpuset_move(pid, torelpath) < 0) { 2848 /* missing task is as good as if moved */ 2849 if (errno == ESRCH) 2850 continue; 2851 2852 /* other per-task errors are bad news */ 2853 sav_errno = errno; 2854 goto failed; 2855 } 2856 } 2857 2858 cpuset_freepidlist(pl); 2859 pl = NULL; 2860 } 2861 2862 sav_errno = ENOTEMPTY; 2863 /* fall into ... */ 2864 failed: 2865 cpuset_freepidlist(pl); 2866 errno = sav_errno; 2867 return -1; 2868 2869 no_more_pids: 2870 no_more_cpuset: 2871 /* Success - all tasks (or entire cpuset ;) gone. 
*/ 2872 cpuset_freepidlist(pl); 2873 errno = 0; 2874 return 0; 2875 } 2876 2877 /* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */ 2878 int cpuset_migrate(pid_t pid, const char *relpath) 2879 { 2880 char buf[PATH_MAX]; 2881 char buf2[PATH_MAX]; 2882 char memory_migrate_flag; 2883 int r; 2884 2885 if (check() < 0) 2886 return -1; 2887 2888 if (pid == 0) 2889 pid = getpid(); 2890 2891 fullpath(buf2, sizeof(buf2), relpath); 2892 2893 if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0) 2894 return -1; 2895 if (store_flag(buf2, "memory_migrate", 1) < 0) 2896 return -1; 2897 2898 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2899 2900 r = __cpuset_move(pid, buf); 2901 2902 store_flag(buf2, "memory_migrate", memory_migrate_flag); 2903 return r; 2904 } 2905 2906 /* Migrate all tasks in pidlist to a cpuset (moves task and memory) */ 2907 int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath) 2908 { 2909 int i; 2910 char buf[PATH_MAX]; 2911 char buf2[PATH_MAX]; 2912 char memory_migrate_flag; 2913 int ret; 2914 2915 if (check() < 0) 2916 return -1; 2917 2918 fullpath(buf2, sizeof(buf2), relpath); 2919 2920 if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0) 2921 return -1; 2922 if (store_flag(buf2, "memory_migrate", 1) < 0) 2923 return -1; 2924 2925 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2926 2927 ret = 0; 2928 for (i = 0; i < pl->npids; i++) 2929 if (__cpuset_move(pl->pids[i], buf) < 0) 2930 ret = -1; 2931 2932 if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0) 2933 ret = -1; 2934 return ret; 2935 } 2936 2937 /* Rebind cpus_allowed of each task in cpuset 'path' */ 2938 int cpuset_reattach(const char *relpath) 2939 { 2940 struct cpuset_pidlist *pl; 2941 int rc; 2942 2943 if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL) 2944 return -1; 2945 rc = cpuset_move_all(pl, relpath); 2946 cpuset_freepidlist(pl); 2947 return rc; 2948 } 2949 2950 /* Map cpuset relative cpu number to system wide 
cpu number */ 2951 int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu) 2952 { 2953 struct cpuset *cp_tofree = NULL; 2954 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 2955 int pos = -1; 2956 2957 if (!cp1) 2958 goto err; 2959 pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu); 2960 /* fall into ... */ 2961 err: 2962 cpuset_free(cp_tofree); 2963 return pos; 2964 } 2965 2966 /* Map system wide cpu number to cpuset relative cpu number */ 2967 int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu) 2968 { 2969 struct cpuset *cp_tofree = NULL; 2970 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 2971 int pos = -1; 2972 2973 if (!cp1) 2974 goto err; 2975 pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu); 2976 /* fall into ... */ 2977 err: 2978 cpuset_free(cp_tofree); 2979 return pos; 2980 } 2981 2982 /* Map cpuset relative mem number to system wide mem number */ 2983 int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem) 2984 { 2985 struct cpuset *cp_tofree = NULL; 2986 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 2987 int pos = -1; 2988 2989 if (!cp1) 2990 goto err; 2991 pos = bitmask_rel_to_abs_pos(cp1->mems, mem); 2992 /* fall into ... */ 2993 err: 2994 cpuset_free(cp_tofree); 2995 return pos; 2996 } 2997 2998 /* Map system wide mem number to cpuset relative mem number */ 2999 int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem) 3000 { 3001 struct cpuset *cp_tofree = NULL; 3002 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 3003 int pos = -1; 3004 3005 if (!cp1) 3006 goto err; 3007 pos = bitmask_abs_to_rel_pos(cp1->mems, mem); 3008 /* fall into ... 
*/ 3009 err: 3010 cpuset_free(cp_tofree); 3011 return pos; 3012 } 3013 3014 /* Map pid's cpuset relative cpu number to system wide cpu number */ 3015 int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu) 3016 { 3017 struct cpuset *cp; 3018 int rc = -1; 3019 3020 if ((cp = cpuset_alloc()) == NULL) 3021 goto done; 3022 if (cpuset_cpusetofpid(cp, pid) < 0) 3023 goto done; 3024 rc = cpuset_c_rel_to_sys_cpu(cp, cpu); 3025 done: 3026 cpuset_free(cp); 3027 return rc; 3028 } 3029 3030 /* Map system wide cpu number to pid's cpuset relative cpu number */ 3031 int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu) 3032 { 3033 struct cpuset *cp; 3034 int rc = -1; 3035 3036 if ((cp = cpuset_alloc()) == NULL) 3037 goto done; 3038 if (cpuset_cpusetofpid(cp, pid) < 0) 3039 goto done; 3040 rc = cpuset_c_sys_to_rel_cpu(cp, cpu); 3041 done: 3042 cpuset_free(cp); 3043 return rc; 3044 } 3045 3046 /* Map pid's cpuset relative mem number to system wide mem number */ 3047 int cpuset_p_rel_to_sys_mem(pid_t pid, int mem) 3048 { 3049 struct cpuset *cp; 3050 int rc = -1; 3051 3052 if ((cp = cpuset_alloc()) == NULL) 3053 goto done; 3054 if (cpuset_cpusetofpid(cp, pid) < 0) 3055 goto done; 3056 rc = cpuset_c_rel_to_sys_mem(cp, mem); 3057 done: 3058 cpuset_free(cp); 3059 return rc; 3060 } 3061 3062 /* Map system wide mem number to pid's cpuset relative mem number */ 3063 int cpuset_p_sys_to_rel_mem(pid_t pid, int mem) 3064 { 3065 struct cpuset *cp; 3066 int rc = -1; 3067 3068 if ((cp = cpuset_alloc()) == NULL) 3069 goto done; 3070 if (cpuset_cpusetofpid(cp, pid) < 0) 3071 goto done; 3072 rc = cpuset_c_sys_to_rel_mem(cp, mem); 3073 done: 3074 cpuset_free(cp); 3075 return rc; 3076 } 3077 3078 /* 3079 * Override glibc's calls for get/set affinity - they have 3080 * something using cpu_set_t that will die when NR_CPUS > 1024. 3081 * Go directly to the 'real' system calls. Also override calls 3082 * for get_mempolicy and set_mempolicy. 
None of these 3083 * calls are yet (July 2004) guaranteed to be in all glibc versions 3084 * that we care about. 3085 */ 3086 3087 static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask) 3088 { 3089 return ltp_syscall(__NR_sched_setaffinity, pid, len, mask); 3090 } 3091 3092 #if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE 3093 static int get_mempolicy(int *policy, unsigned long *nmask, 3094 unsigned long maxnode, void *addr, int flags) 3095 { 3096 return ltp_syscall(__NR_get_mempolicy, policy, nmask, maxnode, 3097 addr, flags); 3098 } 3099 #endif 3100 3101 #if HAVE_DECL_MPOL_BIND || HAVE_DECL_MPOL_DEFAULT 3102 static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode) 3103 { 3104 return ltp_syscall(__NR_set_mempolicy, mode, nmask, maxnode); 3105 } 3106 #endif 3107 3108 struct cpuset_placement { 3109 struct bitmask *cpus; 3110 struct bitmask *mems; 3111 char *path; 3112 }; 3113 3114 /* Allocate and fill in a placement struct - cpatures current placement */ 3115 struct cpuset_placement *cpuset_get_placement(pid_t pid) 3116 { 3117 struct cpuset_placement *plc; 3118 struct cpuset *cp = NULL; 3119 char buf[PATH_MAX]; 3120 int nbits; 3121 3122 if ((plc = calloc(1, sizeof(*plc))) == NULL) 3123 goto err; 3124 3125 nbits = cpuset_cpus_nbits(); 3126 if ((plc->cpus = bitmask_alloc(nbits)) == NULL) 3127 goto err; 3128 3129 nbits = cpuset_mems_nbits(); 3130 if ((plc->mems = bitmask_alloc(nbits)) == NULL) 3131 goto err; 3132 3133 if ((cp = cpuset_alloc()) == NULL) 3134 goto err; 3135 if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL) 3136 goto err; 3137 if (cpuset_query(cp, buf) < 0) 3138 goto err; 3139 3140 bitmask_copy(plc->cpus, cp->cpus); 3141 bitmask_copy(plc->mems, cp->mems); 3142 plc->path = strdup(buf); 3143 3144 cpuset_free(cp); 3145 return plc; 3146 err: 3147 cpuset_free(cp); 3148 cpuset_free_placement(plc); 3149 return NULL; 3150 } 3151 3152 /* Compare two placement structs - use to detect changes in placement */ 3153 
int cpuset_equal_placement(const struct cpuset_placement *plc1, 3154 const struct cpuset_placement *plc2) 3155 { 3156 return bitmask_equal(plc1->cpus, plc2->cpus) && 3157 bitmask_equal(plc1->mems, plc2->mems) && 3158 streq(plc1->path, plc2->path); 3159 } 3160 3161 /* Free a placement struct */ 3162 void cpuset_free_placement(struct cpuset_placement *plc) 3163 { 3164 if (!plc) 3165 return; 3166 bitmask_free(plc->cpus); 3167 bitmask_free(plc->mems); 3168 free(plc->path); 3169 free(plc); 3170 } 3171 3172 /* 3173 * A cpuset_fts_open() call constructs a linked list of entries 3174 * called a "cpuset_fts_tree", with one entry per cpuset below 3175 * the specified path. The cpuset_fts_read() routine returns the 3176 * next entry on this list. The various cpuset_fts_get_*() calls 3177 * return attributes of the specified entry. The cpuset_fts_close() 3178 * call frees the linked list and all associated data. All cpuset 3179 * entries and attributes for the cpuset_fts_tree returned from a 3180 * given cpuset_fts_open() call remain allocated and unchanged until 3181 * that cpuset_fts_tree is closed by a cpuset_fts_close() call. Any 3182 * subsequent changes to the cpuset filesystem will go unnoticed 3183 * (not affect open cpuset_fts_tree's.) 3184 */ 3185 3186 struct cpuset_fts_entry; 3187 void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree); 3188 3189 struct cpuset_fts_tree { 3190 struct cpuset_fts_entry *head; /* head of linked entry list */ 3191 struct cpuset_fts_entry *next; /* cpuset_fts_read() offset */ 3192 }; 3193 3194 struct cpuset_fts_entry { 3195 struct cpuset_fts_entry *next; /* linked entry list chain */ 3196 struct cpuset *cpuset; 3197 struct stat *stat; 3198 char *path; 3199 int info; 3200 int err; 3201 }; 3202 3203 /* Open a handle on a cpuset hierarchy. All the real work is done here. 
*/ 3204 struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath) 3205 { 3206 FTS *fts = NULL; 3207 FTSENT *ftsent; 3208 char *path_argv[2]; 3209 char buf[PATH_MAX]; 3210 struct cpuset_fts_tree *cs_tree = NULL; 3211 struct cpuset_fts_entry *ep; /* the latest new list entry */ 3212 struct cpuset_fts_entry **pnlep; /* ptr to next list entry ptr */ 3213 char *relpath; 3214 int fts_flags; 3215 3216 fullpath(buf, sizeof(buf), cpusetpath); 3217 path_argv[0] = buf; 3218 path_argv[1] = NULL; 3219 3220 fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV; 3221 fts = fts_open(path_argv, fts_flags, NULL); 3222 if (fts == NULL) 3223 goto err; 3224 3225 cs_tree = malloc(sizeof(*cs_tree)); 3226 if (cs_tree == NULL) 3227 goto err; 3228 pnlep = &cs_tree->head; 3229 *pnlep = NULL; 3230 3231 while ((ftsent = fts_read(fts)) != NULL) { 3232 if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR) 3233 continue; 3234 3235 /* ftsent is a directory (perhaps unreadable) ==> cpuset */ 3236 ep = calloc(1, sizeof(*ep)); 3237 if (ep == NULL) 3238 goto err; 3239 *pnlep = ep; 3240 pnlep = &ep->next; 3241 3242 /* Set entry's path, and if DNR, error */ 3243 relpath = ftsent->fts_path + strlen(cpusetmnt); 3244 if (strlen(relpath) == 0) 3245 relpath = "/"; 3246 ep->path = strdup(relpath); 3247 if (ep->path == NULL) 3248 goto err; 3249 if (ftsent->fts_info == FTS_DNR) { 3250 ep->info = CPUSET_FTS_ERR_DNR; 3251 ep->err = ftsent->fts_errno; 3252 continue; 3253 } 3254 3255 /* ftsent is a -readable- cpuset: set entry's stat, etc */ 3256 ep->stat = calloc(1, sizeof(struct stat)); 3257 if (ep->stat == NULL) 3258 goto err; 3259 if (stat(ftsent->fts_path, ep->stat) < 0) { 3260 ep->info = CPUSET_FTS_ERR_STAT; 3261 ep->err = ftsent->fts_errno; 3262 continue; 3263 } 3264 3265 ep->cpuset = calloc(1, sizeof(struct cpuset)); 3266 if (ep->cpuset == NULL) 3267 goto err; 3268 if (cpuset_query(ep->cpuset, relpath) < 0) { 3269 ep->info = CPUSET_FTS_ERR_CPUSET; 3270 ep->err = errno; 3271 continue; 
3272 } 3273 ep->info = CPUSET_FTS_CPUSET; 3274 } 3275 3276 (void)fts_close(fts); 3277 cpuset_fts_rewind(cs_tree); 3278 return cs_tree; 3279 3280 err: 3281 if (cs_tree) 3282 cpuset_fts_close(cs_tree); 3283 if (fts) 3284 (void)fts_close(fts); 3285 return NULL; 3286 } 3287 3288 /* Return pointer to next cpuset entry in hierarchy */ 3289 const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree) 3290 { 3291 const struct cpuset_fts_entry *cs_entry = cs_tree->next; 3292 if (cs_tree->next != NULL) /* seek to next entry */ 3293 cs_tree->next = cs_tree->next->next; 3294 return cs_entry; 3295 } 3296 3297 /* Reverse list of cpusets, in place. Simulates pre-order/post-order flip. */ 3298 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree) 3299 { 3300 struct cpuset_fts_entry *cs1, *cs2, *cs3; 3301 3302 /* 3303 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer 3304 * is redirected from cs3 to cs1. 3305 */ 3306 3307 cs1 = cs2 = NULL; 3308 cs3 = cs_tree->head; 3309 while (cs3) { 3310 cs1 = cs2; 3311 cs2 = cs3; 3312 cs3 = cs3->next; 3313 cs2->next = cs1; 3314 } 3315 cs_tree->head = cs2; 3316 cpuset_fts_rewind(cs_tree); 3317 } 3318 3319 /* Rewind cpuset list to beginning */ 3320 void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree) 3321 { 3322 cs_tree->next = cs_tree->head; 3323 } 3324 3325 /* Return pointer to nul-terminated cpuset path of entry in hierarchy */ 3326 const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry) 3327 { 3328 return cs_entry->path; 3329 } 3330 3331 /* Return pointer to stat(2) structure of a cpuset entry's directory */ 3332 const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry) 3333 { 3334 return cs_entry->stat; 3335 } 3336 3337 /* Return pointer to cpuset structure of a cpuset entry */ 3338 const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry 3339 *cs_entry) 3340 { 3341 return cs_entry->cpuset; 3342 } 3343 3344 /* Return value of errno (0 if no error) on 
attempted cpuset operations */ 3345 int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry) 3346 { 3347 return cs_entry->err; 3348 } 3349 3350 /* Return operation identity causing error */ 3351 int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry) 3352 { 3353 return cs_entry->info; 3354 } 3355 3356 /* Close a cpuset hierarchy handle (free's all associated memory) */ 3357 void cpuset_fts_close(struct cpuset_fts_tree *cs_tree) 3358 { 3359 struct cpuset_fts_entry *cs_entry = cs_tree->head; 3360 3361 while (cs_entry) { 3362 struct cpuset_fts_entry *ep = cs_entry; 3363 3364 cs_entry = cs_entry->next; 3365 free(ep->path); 3366 free(ep->stat); 3367 cpuset_free(ep->cpuset); 3368 free(ep); 3369 } 3370 free(cs_tree); 3371 } 3372 3373 /* Bind current task to cpu (uses sched_setaffinity(2)) */ 3374 int cpuset_cpubind(int cpu) 3375 { 3376 struct bitmask *bmp; 3377 int r; 3378 3379 if ((bmp = bitmask_alloc(cpuset_cpus_nbits())) == NULL) 3380 return -1; 3381 bitmask_setbit(bmp, cpu); 3382 r = sched_setaffinity(0, bitmask_nbytes(bmp), bitmask_mask(bmp)); 3383 bitmask_free(bmp); 3384 return r; 3385 } 3386 3387 /* 3388 * int cpuset_latestcpu(pid_t pid) 3389 * 3390 * Return most recent CPU on which task pid executed. If pid == 0, 3391 * examine current task. 3392 * 3393 * The last used CPU is visible for a given pid as field #39 (starting 3394 * with #1) in the file /proc/pid/stat. Currently this file has 41 3395 * fields, in which case this is the 3rd to the last field. 3396 * 3397 * Unfortunately field #2 is a command name and might have embedded 3398 * whitespace. So we can't just count white space separated fields. 3399 * Fortunately, this command name is surrounded by parentheses, as 3400 * for example "(sh)", and that closing parenthesis is the last ')' 3401 * character in the line. No remaining fields can have embedded 3402 * whitespace or parentheses. 
So instead of looking for the 39th 3403 * white space separated field, we can look for the 37th white space 3404 * separated field past the last ')' character on the line. 3405 */ 3406 3407 /* Return most recent CPU on which task pid executed */ 3408 int cpuset_latestcpu(pid_t pid) 3409 { 3410 char buf[PATH_MAX]; 3411 char *bp; 3412 int fd = -1; 3413 int cpu = -1; 3414 3415 if (pid == 0) 3416 snprintf(buf, sizeof(buf), "/proc/self/stat"); 3417 else 3418 snprintf(buf, sizeof(buf), "/proc/%d/stat", pid); 3419 3420 if ((fd = open(buf, O_RDONLY)) < 0) 3421 goto err; 3422 if (read(fd, buf, sizeof(buf)) < 1) 3423 goto err; 3424 close(fd); 3425 3426 bp = strrchr(buf, ')'); 3427 if (bp) 3428 sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %u", /* 37th field past ')' */ 3429 &cpu); 3430 if (cpu < 0) 3431 errno = EINVAL; 3432 return cpu; 3433 err: 3434 if (fd >= 0) 3435 close(fd); 3436 return -1; 3437 } 3438 3439 /* Bind current task to memory (uses set_mempolicy(2)) */ 3440 int cpuset_membind(int mem) 3441 { 3442 struct bitmask *bmp; 3443 int r; 3444 3445 if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL) 3446 return -1; 3447 bitmask_setbit(bmp, mem); 3448 #if HAVE_DECL_MPOL_BIND 3449 r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), bitmask_nbits(bmp) + 1); 3450 #else 3451 r = -1; 3452 errno = ENOSYS; 3453 #endif 3454 bitmask_free(bmp); 3455 return r; 3456 } 3457 3458 /* [optional] Return Memory Node holding page at specified addr */ 3459 int cpuset_addr2node(void *addr) 3460 { 3461 int node = -1; 3462 3463 #if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE 3464 if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR)) { 3465 /* I realize this seems redundant, but I _want_ to make sure 3466 * that this value is -1. 
*/ 3467 node = -1; 3468 } 3469 #endif 3470 return node; 3471 } 3472 3473 /* 3474 * Transform cpuset into Text Format Representation in buffer 'buf', 3475 * of length 'buflen', nul-terminated if space allows. Return number 3476 * of characters that would have been written, if enough space had 3477 * been available, in the same way that snprintf() does. 3478 */ 3479 3480 /* Export cpuset settings to a regular file */ 3481 int cpuset_export(const struct cpuset *cp, char *buf, int buflen) 3482 { 3483 char *tmp = NULL; 3484 int n = 0; 3485 3486 if (cp->cpu_exclusive) 3487 n += snprintf(buf + n, max(buflen - n, 0), "cpu_exclusive\n"); 3488 3489 if (cp->mem_exclusive) 3490 n += snprintf(buf + n, max(buflen - n, 0), "mem_exclusive\n"); 3491 3492 if (cp->notify_on_release) 3493 n += snprintf(buf + n, max(buflen - n, 0), 3494 "notify_on_release\n"); 3495 3496 if (cp->memory_pressure_enabled) 3497 n += snprintf(buf + n, max(buflen - n, 0), 3498 "memory_pressure_enabled\n"); 3499 3500 if (cp->memory_migrate) 3501 n += snprintf(buf + n, max(buflen - n, 0), "memory_migrate\n"); 3502 3503 if (cp->memory_spread_page) 3504 n += snprintf(buf + n, max(buflen - n, 0), 3505 "memory_spread_page\n"); 3506 3507 if (cp->memory_spread_slab) 3508 n += snprintf(buf + n, max(buflen - n, 0), 3509 "memory_spread_slab\n"); 3510 3511 if ((tmp = sprint_mask_buf(cp->cpus)) == NULL) 3512 return -1; 3513 n += snprintf(buf + n, max(buflen - n, 0), "cpus %s\n", tmp); 3514 free(tmp); 3515 tmp = NULL; 3516 3517 if ((tmp = sprint_mask_buf(cp->mems)) == NULL) 3518 return -1; 3519 n += snprintf(buf + n, max(buflen - n, 0), "mems %s\n", tmp); 3520 free(tmp); 3521 tmp = NULL; 3522 3523 return n; 3524 } 3525 3526 static int import_list(UNUSED const char *tok, const char *arg, 3527 struct bitmask *bmp, char *emsg, int elen) 3528 { 3529 if (bitmask_parselist(arg, bmp) < 0) { 3530 if (emsg) 3531 snprintf(emsg, elen, "Invalid list format: %s", arg); 3532 return -1; 3533 } 3534 return 0; 3535 } 3536 3537 static void 
stolower(char *s) 3538 { 3539 while (*s) { 3540 unsigned char c = *s; 3541 *s = tolower(c); 3542 s++; 3543 } 3544 } 3545 3546 /* Import cpuset settings from a regular file */ 3547 int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum, 3548 char *emsg, int elen) 3549 { 3550 char *linebuf = NULL; 3551 int linebuflen; 3552 int linenum = 0; 3553 int offset = 0; 3554 3555 linebuflen = strlen(buf) + 1; 3556 if ((linebuf = malloc(linebuflen)) == NULL) { 3557 if (emsg) 3558 snprintf(emsg, elen, "Insufficient memory"); 3559 goto err; 3560 } 3561 3562 while (slgets(linebuf, linebuflen, buf, &offset)) { 3563 char *tok, *arg; 3564 char *ptr; /* for strtok_r */ 3565 3566 linenum++; 3567 if ((tok = strchr(linebuf, '#')) != NULL) 3568 *tok = 0; 3569 if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL) 3570 continue; 3571 stolower(tok); 3572 3573 arg = strtok_r(0, " \t", &ptr); 3574 3575 if (streq(tok, "cpu_exclusive")) { 3576 cp->cpu_exclusive = 1; 3577 goto eol; 3578 } 3579 if (streq(tok, "mem_exclusive")) { 3580 cp->mem_exclusive = 1; 3581 goto eol; 3582 } 3583 if (streq(tok, "notify_on_release")) { 3584 cp->notify_on_release = 1; 3585 goto eol; 3586 } 3587 if (streq(tok, "memory_pressure_enabled")) { 3588 cp->memory_pressure_enabled = 1; 3589 goto eol; 3590 } 3591 if (streq(tok, "memory_migrate")) { 3592 cp->memory_migrate = 1; 3593 goto eol; 3594 } 3595 if (streq(tok, "memory_spread_page")) { 3596 cp->memory_spread_page = 1; 3597 goto eol; 3598 } 3599 if (streq(tok, "memory_spread_slab")) { 3600 cp->memory_spread_slab = 1; 3601 goto eol; 3602 } 3603 if (streq(tok, "cpu") || streq(tok, "cpus")) { 3604 if (import_list(tok, arg, cp->cpus, emsg, elen) < 0) 3605 goto err; 3606 goto eol; 3607 } 3608 if (streq(tok, "mem") || streq(tok, "mems")) { 3609 if (import_list(tok, arg, cp->mems, emsg, elen) < 0) 3610 goto err; 3611 goto eol; 3612 } 3613 if (emsg) 3614 snprintf(emsg, elen, "Unrecognized token: '%s'", tok); 3615 goto err; 3616 eol: 3617 if ((tok = strtok_r(0, " 
\t", &ptr)) != NULL) { 3618 if (emsg) 3619 snprintf(emsg, elen, "Surplus token: '%s'", 3620 tok); 3621 goto err; 3622 } 3623 continue; 3624 } 3625 3626 free(linebuf); 3627 3628 if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems)) 3629 cpuset_localcpus(cp->mems, cp->cpus); 3630 else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems)) 3631 cpuset_localmems(cp->cpus, cp->mems); 3632 3633 /* 3634 * All cpuset attributes are determined in an import. 3635 * Those that aren't explicitly specified are presumed 3636 * to be unchanged (zero, if it's a freshly allocated 3637 * struct cpuset.) 3638 */ 3639 3640 cp->cpus_valid = 1; 3641 cp->mems_valid = 1; 3642 cp->cpu_exclusive_valid = 1; 3643 cp->mem_exclusive_valid = 1; 3644 cp->notify_on_release_valid = 1; 3645 cp->memory_migrate_valid = 1; 3646 cp->memory_pressure_enabled_valid = 1; 3647 cp->memory_spread_page_valid = 1; 3648 cp->memory_spread_slab_valid = 1; 3649 3650 return 0; 3651 err: 3652 if (elinenum) 3653 *elinenum = linenum; 3654 free(linebuf); 3655 return -1; 3656 } 3657 3658 /* Pin current task CPU (and memory) */ 3659 int cpuset_pin(int relcpu) 3660 { 3661 struct cpuset_placement *plc1 = NULL, *plc2 = NULL; 3662 int cpu, r; 3663 3664 if (check() < 0) 3665 return -1; 3666 3667 do { 3668 cpuset_free_placement(plc1); 3669 plc1 = cpuset_get_placement(0); 3670 3671 r = 0; 3672 if (cpuset_unpin() < 0) 3673 r = -1; 3674 cpu = cpuset_p_rel_to_sys_cpu(0, relcpu); 3675 if (cpuset_cpubind(cpu) < 0) 3676 r = -1; 3677 3678 cpuset_free_placement(plc2); 3679 plc2 = cpuset_get_placement(0); 3680 } while (!cpuset_equal_placement(plc1, plc2)); 3681 3682 cpuset_free_placement(plc1); 3683 cpuset_free_placement(plc2); 3684 return r; 3685 } 3686 3687 /* Return number CPUs in current tasks cpuset */ 3688 int cpuset_size() 3689 { 3690 struct cpuset_placement *plc1 = NULL, *plc2 = NULL; 3691 int r; 3692 3693 if (check() < 0) 3694 return -1; 3695 3696 do { 3697 cpuset_free_placement(plc1); 3698 plc1 = 
cpuset_get_placement(0); 3699 3700 r = cpuset_cpus_weight(0); 3701 3702 cpuset_free_placement(plc2); 3703 plc2 = cpuset_get_placement(0); 3704 } while (!cpuset_equal_placement(plc1, plc2)); 3705 3706 cpuset_free_placement(plc1); 3707 cpuset_free_placement(plc2); 3708 return r; 3709 } 3710 3711 /* Return relative CPU number, within current cpuset, last executed on */ 3712 int cpuset_where() 3713 { 3714 struct cpuset_placement *plc1 = NULL, *plc2 = NULL; 3715 int r; 3716 3717 if (check() < 0) 3718 return -1; 3719 3720 do { 3721 cpuset_free_placement(plc1); 3722 plc1 = cpuset_get_placement(0); 3723 3724 r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0)); 3725 3726 cpuset_free_placement(plc2); 3727 plc2 = cpuset_get_placement(0); 3728 } while (!cpuset_equal_placement(plc1, plc2)); 3729 3730 cpuset_free_placement(plc1); 3731 cpuset_free_placement(plc2); 3732 return r; 3733 } 3734 3735 /* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */ 3736 int cpuset_unpin() 3737 { 3738 struct bitmask *cpus = NULL, *mems = NULL; 3739 int r = -1; 3740 3741 if (check() < 0) 3742 goto err; 3743 3744 /* 3745 * Don't need cpuset_*_placement() guard against concurrent 3746 * cpuset migration, because none of the following depends 3747 * on the tasks cpuset placement. 3748 */ 3749 3750 if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL) 3751 goto err; 3752 bitmask_setall(cpus); 3753 if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0) 3754 goto err; 3755 3756 if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL) 3757 goto err; 3758 #if HAVE_DECL_MPOL_DEFAULT 3759 if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems), 3760 bitmask_nbits(mems) + 1) < 0) 3761 goto err; 3762 r = 0; 3763 #endif 3764 /* fall into ... 
*/ 3765 err: 3766 bitmask_free(cpus); 3767 bitmask_free(mems); 3768 return r; 3769 3770 } 3771 3772 struct cpuset_function_list { 3773 const char *fname; 3774 void *func; 3775 } flist[] = { 3776 { 3777 "cpuset_version", cpuset_version}, { 3778 "cpuset_alloc", cpuset_alloc}, { 3779 "cpuset_free", cpuset_free}, { 3780 "cpuset_cpus_nbits", cpuset_cpus_nbits}, { 3781 "cpuset_mems_nbits", cpuset_mems_nbits}, { 3782 "cpuset_setcpus", cpuset_setcpus}, { 3783 "cpuset_setmems", cpuset_setmems}, { 3784 "cpuset_set_iopt", cpuset_set_iopt}, { 3785 "cpuset_set_sopt", cpuset_set_sopt}, { 3786 "cpuset_getcpus", cpuset_getcpus}, { 3787 "cpuset_getmems", cpuset_getmems}, { 3788 "cpuset_cpus_weight", cpuset_cpus_weight}, { 3789 "cpuset_mems_weight", cpuset_mems_weight}, { 3790 "cpuset_get_iopt", cpuset_get_iopt}, { 3791 "cpuset_get_sopt", cpuset_get_sopt}, { 3792 "cpuset_localcpus", cpuset_localcpus}, { 3793 "cpuset_localmems", cpuset_localmems}, { 3794 "cpuset_cpumemdist", cpuset_cpumemdist}, { 3795 "cpuset_cpu2node", cpuset_cpu2node}, { 3796 "cpuset_addr2node", cpuset_addr2node}, { 3797 "cpuset_create", cpuset_create}, { 3798 "cpuset_delete", cpuset_delete}, { 3799 "cpuset_query", cpuset_query}, { 3800 "cpuset_modify", cpuset_modify}, { 3801 "cpuset_getcpusetpath", cpuset_getcpusetpath}, { 3802 "cpuset_cpusetofpid", cpuset_cpusetofpid}, { 3803 "cpuset_mountpoint", cpuset_mountpoint}, { 3804 "cpuset_collides_exclusive", cpuset_collides_exclusive}, { 3805 "cpuset_nuke", cpuset_nuke}, { 3806 "cpuset_init_pidlist", cpuset_init_pidlist}, { 3807 "cpuset_pidlist_length", cpuset_pidlist_length}, { 3808 "cpuset_get_pidlist", cpuset_get_pidlist}, { 3809 "cpuset_freepidlist", cpuset_freepidlist}, { 3810 "cpuset_move", cpuset_move}, { 3811 "cpuset_move_all", cpuset_move_all}, { 3812 "cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks}, { 3813 "cpuset_migrate", cpuset_migrate}, { 3814 "cpuset_migrate_all", cpuset_migrate_all}, { 3815 "cpuset_reattach", cpuset_reattach}, { 3816 
"cpuset_open_memory_pressure", cpuset_open_memory_pressure}, { 3817 "cpuset_read_memory_pressure", cpuset_read_memory_pressure}, { 3818 "cpuset_close_memory_pressure", cpuset_close_memory_pressure}, { 3819 "cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu}, { 3820 "cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu}, { 3821 "cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem}, { 3822 "cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem}, { 3823 "cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu}, { 3824 "cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu}, { 3825 "cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem}, { 3826 "cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem}, { 3827 "cpuset_get_placement", cpuset_get_placement}, { 3828 "cpuset_equal_placement", cpuset_equal_placement}, { 3829 "cpuset_free_placement", cpuset_free_placement}, { 3830 "cpuset_fts_open", cpuset_fts_open}, { 3831 "cpuset_fts_read", cpuset_fts_read}, { 3832 "cpuset_fts_reverse", cpuset_fts_reverse}, { 3833 "cpuset_fts_rewind", cpuset_fts_rewind}, { 3834 "cpuset_fts_get_path", cpuset_fts_get_path}, { 3835 "cpuset_fts_get_stat", cpuset_fts_get_stat}, { 3836 "cpuset_fts_get_cpuset", cpuset_fts_get_cpuset}, { 3837 "cpuset_fts_get_errno", cpuset_fts_get_errno}, { 3838 "cpuset_fts_get_info", cpuset_fts_get_info}, { 3839 "cpuset_fts_close", cpuset_fts_close}, { 3840 "cpuset_cpubind", cpuset_cpubind}, { 3841 "cpuset_latestcpu", cpuset_latestcpu}, { 3842 "cpuset_membind", cpuset_membind}, { 3843 "cpuset_export", cpuset_export}, { 3844 "cpuset_import", cpuset_import}, { 3845 "cpuset_function", cpuset_function}, { 3846 "cpuset_pin", cpuset_pin}, { 3847 "cpuset_size", cpuset_size}, { 3848 "cpuset_where", cpuset_where}, { 3849 "cpuset_unpin", cpuset_unpin},}; 3850 3851 /* Return pointer to a libcpuset.so function, or NULL */ 3852 void *cpuset_function(const char *function_name) 3853 { 3854 unsigned int i; 3855 3856 for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++) 3857 if (streq(function_name, 
flist[i].fname)) 3858 return flist[i].func; 3859 return NULL; 3860 } 3861 3862 /* Fortran interface to basic cpuset routines */ 3863 int cpuset_pin_(int *ptr_relcpu) 3864 { 3865 return cpuset_pin(*ptr_relcpu); 3866 } 3867 3868 int cpuset_size_(void) 3869 { 3870 return cpuset_size(); 3871 } 3872 3873 int cpuset_where_(void) 3874 { 3875 return cpuset_where(); 3876 } 3877 3878 int cpuset_unpin_(void) 3879 { 3880 return cpuset_unpin(); 3881 } 3882 3883 #endif /* HAVE_LINUX_MEMPOLICY_H */ 3884