1 /* 2 * cpuset user library implementation. 3 * 4 * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved. 5 * 6 * Paul Jackson <pj (at) sgi.com> 7 */ 8 9 /* 10 * This program is free software; you can redistribute it and/or modify 11 * it under the terms of the GNU Lesser General Public License as published by 12 * the Free Software Foundation; either version 2.1 of the License, or 13 * (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License for more details. 19 * 20 * You should have received a copy of the GNU Lesser General Public License 21 * along with this program; if not, write to the Free Software 22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 */ 24 25 #define _XOPEN_SOURCE 500 /* need to see pread() */ 26 #define _BSD_SOURCE 1 /* need to see syscall() */ 27 #include <unistd.h> 28 29 #include <ctype.h> 30 #include <dirent.h> 31 #include <errno.h> 32 #include <fcntl.h> 33 #include <fts.h> 34 #include <limits.h> 35 #include <signal.h> 36 #include <stdint.h> 37 #include <stdio.h> 38 #include <stdlib.h> 39 #include <string.h> 40 #include <sys/stat.h> 41 #include <sys/syscall.h> 42 #include <sys/types.h> 43 #include <time.h> 44 #include <utime.h> 45 #include <sys/utsname.h> /* for cpuset_would_crash_kernel() */ 46 47 #include "bitmask.h" 48 #include "cpuset.h" 49 #include "common.h" 50 #include "test.h" 51 #include "lapi/syscalls.h" 52 #include "config.h" 53 54 #if HAVE_LINUX_MEMPOLICY_H 55 #include <linux/mempolicy.h> 56 57 /* Bump version, and update Change History, when libcpuset API changes */ 58 #define CPUSET_VERSION 3 59 60 /* 61 * For a history of what changed in each version, see the "Change 62 * History" section, at the end of the libcpuset master document. 
 */

int cpuset_version(void)
{
	return CPUSET_VERSION;
}

/*
 * In-memory image of one cpuset's settings.  Values are cached here
 * and only pushed to / pulled from the kernel's cpuset file system
 * by the query/apply routines elsewhere in this file.
 */
struct cpuset {
	struct bitmask *cpus;	/* CPUs allowed in this cpuset */
	struct bitmask *mems;	/* Memory Nodes allowed in this cpuset */
	char cpu_exclusive;
	char mem_exclusive;
	char mem_hardwall;
	char notify_on_release;
	char memory_migrate;
	char memory_pressure_enabled;
	char memory_spread_page;
	char memory_spread_slab;
	char sched_load_balance;
	int sched_relax_domain_level;

	/*
	 * Each field 'x' above gets an 'x_valid' field below.
	 * The apply_cpuset_settings() will only set those fields whose
	 * corresponding *_valid flags are set.  The cpuset_alloc()
	 * routine clears these flags as part of the clear in calloc(),
	 * and the various cpuset_set*() routines set these flags when
	 * setting the corresponding value.
	 *
	 * The purpose of these valid fields is to ensure that when
	 * we create a new cpuset, we don't accidentally overwrite
	 * some non-zero kernel default, such as an inherited
	 * memory_spread_* flag, just because the user application
	 * code didn't override the default zero settings resulting
	 * from the calloc() call in cpuset_alloc().
	 *
	 * The choice of 'char' for the type of the flags above,
	 * but a bitfield for the flags below, is somewhat capricious.
	 */
	unsigned cpus_valid:1;
	unsigned mems_valid:1;
	unsigned cpu_exclusive_valid:1;
	unsigned mem_exclusive_valid:1;
	unsigned mem_hardwall_valid:1;
	unsigned notify_on_release_valid:1;
	unsigned memory_migrate_valid:1;
	unsigned memory_pressure_enabled_valid:1;
	unsigned memory_spread_page_valid:1;
	unsigned memory_spread_slab_valid:1;
	unsigned sched_load_balance_valid:1;
	unsigned sched_relax_domain_level_valid:1;

	/*
	 * if the relative variable was modified, use following flags
	 * to put a mark
	 */
	unsigned cpus_dirty:1;
	unsigned mems_dirty:1;
	unsigned cpu_exclusive_dirty:1;
	unsigned mem_exclusive_dirty:1;
	unsigned mem_hardwall_dirty:1;
	unsigned notify_on_release_dirty:1;
	unsigned memory_migrate_dirty:1;
	unsigned memory_pressure_enabled_dirty:1;
	unsigned memory_spread_page_dirty:1;
	unsigned memory_spread_slab_dirty:1;
	unsigned sched_load_balance_dirty:1;
	unsigned sched_relax_domain_level_dirty:1;
};

/* Presumed cpuset file system mount point */
static const char *cpusetmnt = "/dev/cpuset";

/* Stashed copy of cpunodemap[], mapping each cpu to its node. */
static const char *mapfile = "/var/run/cpunodemap";

/* The primary source for the cpunodemap[] is available below here. */
static const char *sysdevices = "/sys/devices/system";

/* small buffer size - for reading boolean flags or map file (1 or 2 ints) */
#define SMALL_BUFSZ 16

/*
 * The 'mask_size_file' is used to ferret out the kernel cpumask_t
 * and nodemask_t sizes.  The lines in this file that begin with the
 * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask
 * and nodemask string, respectively.
The lengths of these strings
 * reflect the kernel's internal cpumask_t and nodemask_t sizes,
 * which sizes are needed to correctly call the sched_setaffinity
 * and set_mempolicy system calls, and to size user level
 * bitmasks to match the kernels.
 */

static const char *mask_size_file = "/proc/self/status";
static const char *cpumask_prefix = "Cpus_allowed:\t";
static const char *nodemask_prefix = "Mems_allowed:\t";

/*
 * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits.
 *
 * The first time we need these, we parse the Cpus_allowed and
 * Mems_allowed lines from mask_size_file ("/proc/self/status").
 */

static int cpumask_sz;
static int nodemask_sz;

/*
 * These defaults only kick in if we fail to size the kernel
 * cpumask and nodemask by reading the Cpus_allowed and
 * Mems_allowed fields from the /proc/self/status file.
 */

#define DEFCPUBITS (512)
#define DEFNODEBITS (DEFCPUBITS/2)

/*
 * Arch-neutral API for obtaining NUMA distances between CPUs
 * and Memory Nodes, via the files:
 *	/sys/devices/system/node/nodeN/distance
 * which have lines such as:
 *	46 66 10 20
 * which say that for cpu on node N (from the path above), the
 * distance to nodes 0, 1, 2, and 3 are 46, 66, 10, and 20,
 * respectively.
 */

static const char *distance_directory = "/sys/devices/system/node";

/*
 * Someday, we should disable, then later discard, the SN code
 * marked ALTERNATE_SN_DISTMAP.
 */

#define ALTERNATE_SN_DISTMAP 1
#ifdef ALTERNATE_SN_DISTMAP

/*
 * Alternative SN (SGI ia64) architecture specific API for obtaining
 * NUMA distances between CPUs and Memory Nodes is via the file
 * /proc/sgi_sn/sn_topology, which has lines such as:
 *
 *	node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
 *
 * which says that for each CPU on node 2, the distance to nodes
 * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
 *
 * This file has other lines as well, which start with other
 * keywords than "node".  Ignore these other lines.
 */

static const char *sn_topology = "/proc/sgi_sn/sn_topology";
static const char *sn_top_node_prefix = "node ";

#endif

/*
 * Check that cpusets supported, /dev/cpuset mounted.
 * If ok, return 0.
 * If not, return -1 and set errno:
 *	ENOSYS - kernel doesn't support cpusets
 *	ENODEV - /dev/cpuset not mounted
 */

static enum {
	check_notdone,
	check_enosys,
	check_enodev,
	check_ok
} check_state = check_notdone;

/* Memoized probe: the two stat() calls run only on the first call. */
static int check()
{
	if (check_state == check_notdone) {
		struct stat statbuf;

		/* /proc/self/cpuset exists only if the kernel has cpusets */
		if (stat("/proc/self/cpuset", &statbuf) < 0) {
			check_state = check_enosys;
			goto done;
		}

		/* /dev/cpuset/tasks exists only if the cpuset fs is mounted */
		if (stat("/dev/cpuset/tasks", &statbuf) < 0) {
			check_state = check_enodev;
			goto done;
		}

		check_state = check_ok;
	}
done:
	switch (check_state) {
	case check_enosys:
		errno = ENOSYS;
		return -1;
	case check_enodev:
		errno = ENODEV;
		return -1;
	default:
		break;
	}
	return 0;
}

/* Trim trailing newline and carriage-return characters from s, in place. */
static void chomp(char *s)
{
	char *t;

	for (t = s + strlen(s) - 1; t >= s; t--) {
		if (*t == '\n' || *t == '\r')
			*t = '\0';
		else
			break;
	}
}

/*
 * Determine number of bytes in a seekable open file, without
 * assuming that stat(2) on that file has a useful size.
 * Has side effect of leaving the file rewound to the beginning.
 */
static int filesize(FILE * fp)
{
	int sz = 0;
	rewind(fp);
	while (fgetc(fp) != EOF)
		sz++;
	rewind(fp);
	return sz;
}

/* Are strings s1 and s2 equal? */
static int streq(const char *s1, const char *s2)
{
	return strcmp(s1, s2) == 0;
}

/* Is string 'pre' a prefix of string 's'? */
static int strprefix(const char *s, const char *pre)
{
	return strncmp(s, pre, strlen(pre)) == 0;
}

/*
 * char *flgets(char *buf, int buflen, FILE *fp)
 *
 * Obtain one line from input file fp.  Copy up to first
 * buflen-1 chars of line into buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to buffer buf
 * on success, or NULL if nothing more to read or failure.
 */

static char *flgets(char *buf, int buflen, FILE * fp)
{
	int c = -1;
	char *bp;

	bp = buf;
	while ((--buflen > 0) && ((c = getc(fp)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	/* EOF before any character was copied: no line to return */
	if ((c < 0) && (bp == buf))
		return NULL;

	/* Buffer filled before newline seen: discard rest of the line */
	if (c > 0) {
		while ((c = getc(fp)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}

/*
 * sgetc(const char *inputbuf, int *offsetptr)
 *
 * Return next char from nul-terminated input buffer inputbuf,
 * starting at offset *offsetptr.  Increment *offsetptr.
 * If next char would be nul ('\0'), return EOF and don't
 * increment *offsetptr.
 */

static int sgetc(const char *inputbuf, int *offsetptr)
{
	char c;

	if ((c = inputbuf[*offsetptr]) != 0) {
		*offsetptr = *offsetptr + 1;
		return c;
	} else {
		return EOF;
	}
}

/*
 * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
 *
 * Obtain next line from nul-terminated input buffer 'inputbuf',
 * starting at offset *offsetptr.  Copy up to first buflen-1
 * chars of line into output buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to output buffer
 * buf on success, or NULL if nothing more to read.
 */

static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
{
	int c = -1;
	char *bp;

	bp = buf;
	while ((--buflen > 0) && ((c = sgetc(inputbuf, offsetptr)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	/* End of input before any character was copied: nothing to return */
	if ((c < 0) && (bp == buf))
		return NULL;

	/* Buffer filled before newline seen: discard rest of the line */
	if (c > 0) {
		while ((c = sgetc(inputbuf, offsetptr)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}

/*
 * time_t get_mtime(char *path)
 *
 * Return modtime of file at location path, else return 0.
 */

static time_t get_mtime(const char *path)
{
	struct stat statbuf;

	if (stat(path, &statbuf) != 0)
		return 0;
	return statbuf.st_mtime;
}

/*
 * int set_mtime(const char *path, time_t mtime)
 *
 * Set modtime of file 'path' to 'mtime'.  Return 0 on success,
 * or -1 on error, setting errno.
 */

static int set_mtime(const char *path, time_t mtime)
{
	struct utimbuf times;

	/* Set both access and modification time to 'mtime' */
	times.actime = mtime;
	times.modtime = mtime;
	return utime(path, &times);
}

/*
 * True if two pathnames resolve to same file.
 * False if either path can not be stat'd,
 * or if the two paths resolve to a different file.
 */

static int samefile(const char *path1, const char *path2)
{
	struct stat sb1, sb2;

	if (stat(path1, &sb1) != 0)
		return 0;
	if (stat(path2, &sb2) != 0)
		return 0;
	/* Same inode on same device means same file */
	return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev;
}

/* Single-character path predicates used only by pathcomp() below */
#define slash(c) (*(c) == '/')
#define eocomp(c) (slash(c) || !*(c))
#define dot1(c) (*(c) == '.' && eocomp(c+1))

/* In place path compression.  Remove extra dots and slashes. */
static char *pathcomp(char *p)
{
	char *a = p;		/* read cursor */
	char *b = p;		/* write cursor (b <= a always) */

	if (!p || !*p)
		return p;
	if (slash(p))
		*b++ = *a++;
	for (;;) {
		/* Collapse runs of consecutive slashes */
		if (slash(a))
			while (slash(++a))
				continue;
		if (!*a) {
			/* Fully consumed; empty result becomes "." */
			if (b == p)
				*b++ = '.';
			*b = '\0';
			return (p);
		} else if (dot1(a)) {
			/* Skip "." path components */
			a++;
		} else {
			/* Copy one real component, with a separating '/' */
			if ((b != p) && !slash(b - 1))
				*b++ = '/';
			while (!eocomp(a))
				*b++ = *a++;
		}
	}
}

#undef slash
#undef eocomp
#undef dot1

/*
 * pathcat2(buf, buflen, name1, name2)
 *
 * Return buf, of length buflen, with name1/name2 stored in it.
 */

static char *pathcat2(char *buf, int buflen, const char *name1,
		      const char *name2)
{
	(void)snprintf(buf, buflen, "%s/%s", name1, name2);
	return pathcomp(buf);
}

/*
 * pathcat3(buf, buflen, name1, name2, name3)
 *
 * Return buf, of length buflen, with name1/name2/name3 stored in it.
 */

static char *pathcat3(char *buf, int buflen, const char *name1,
		      const char *name2, const char *name3)
{
	(void)snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
	return pathcomp(buf);
}

/*
 * fullpath(buf, buflen, name)
 *
 * Put full path of cpuset 'name' in buffer 'buf'.
If name 514 * starts with a slash (``/``) character, then this a path 515 * relative to ``/dev/cpuset``, otherwise it is relative to 516 * the current tasks cpuset. Return 0 on success, else 517 * -1 on error, setting errno. 518 */ 519 520 static int fullpath(char *buf, int buflen, const char *name) 521 { 522 int len; 523 524 /* easy case */ 525 if (*name == '/') { 526 pathcat2(buf, buflen, cpusetmnt, name); 527 pathcomp(buf); 528 return 0; 529 } 530 531 /* hard case */ 532 snprintf(buf, buflen, "%s/", cpusetmnt); 533 len = strlen(buf); 534 if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL) 535 return -1; 536 if (strlen(buf) >= buflen - 1 - strlen(name)) { 537 errno = E2BIG; 538 return -1; 539 } 540 strcat(buf, "/"); 541 strcat(buf, name); 542 pathcomp(buf); 543 return 0; 544 } 545 546 /* 547 * fullpath2(buf, buflen, name1, name2) 548 * 549 * Like fullpath(), only concatenate two pathname components on end. 550 */ 551 552 static int fullpath2(char *buf, int buflen, const char *name1, 553 const char *name2) 554 { 555 if (fullpath(buf, buflen, name1) < 0) 556 return -1; 557 if (strlen(buf) >= buflen - 1 - strlen(name2)) { 558 errno = E2BIG; 559 return -1; 560 } 561 strcat(buf, "/"); 562 strcat(buf, name2); 563 pathcomp(buf); 564 return 0; 565 } 566 567 /* 568 * Convert the string length of an ascii hex mask to the number 569 * of bits represented by that mask. 570 * 571 * The cpumask and nodemask values in /proc/self/status are in an 572 * ascii format that uses 9 characters for each 32 bits of mask. 573 */ 574 static int s2nbits(const char *s) 575 { 576 return strlen(s) * 32 / 9; 577 } 578 579 static void update_mask_sizes() 580 { 581 FILE *fp = NULL; 582 char *buf = NULL; 583 int fsize; 584 585 if ((fp = fopen(mask_size_file, "r")) == NULL) 586 goto done; 587 fsize = filesize(fp); 588 if ((buf = malloc(fsize)) == NULL) 589 goto done; 590 591 /* 592 * Beware: mask sizing arithmetic is fussy. 593 * The trailing newline left by fgets() is required. 
	 */
	while (fgets(buf, fsize, fp)) {
		if (strprefix(buf, cpumask_prefix))
			cpumask_sz = s2nbits(buf + strlen(cpumask_prefix));
		if (strprefix(buf, nodemask_prefix))
			nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
	}
done:
	free(buf);
	if (fp != NULL)
		fclose(fp);
	/* Fall back to compiled-in defaults if /proc parsing failed */
	if (cpumask_sz == 0)
		cpumask_sz = DEFCPUBITS;
	if (nodemask_sz == 0)
		nodemask_sz = DEFNODEBITS;
}

/* Allocate a new struct cpuset */
struct cpuset *cpuset_alloc()
{
	struct cpuset *cp = NULL;
	int nbits;

	/* calloc() zeroes all the *_valid and *_dirty flags */
	if ((cp = calloc(1, sizeof(struct cpuset))) == NULL)
		goto err;

	nbits = cpuset_cpus_nbits();
	if ((cp->cpus = bitmask_alloc(nbits)) == NULL)
		goto err;

	nbits = cpuset_mems_nbits();
	if ((cp->mems = bitmask_alloc(nbits)) == NULL)
		goto err;

	return cp;
err:
	if (cp && cp->cpus)
		bitmask_free(cp->cpus);
	if (cp && cp->mems)
		bitmask_free(cp->mems);
	free(cp);
	return NULL;
}

/* Free struct cpuset *cp */
void cpuset_free(struct cpuset *cp)
{
	if (!cp)
		return;
	if (cp->cpus)
		bitmask_free(cp->cpus);
	if (cp->mems)
		bitmask_free(cp->mems);
	free(cp);
}

/* Number of bits in a CPU bitmask on current system */
int cpuset_cpus_nbits()
{
	if (cpumask_sz == 0)
		update_mask_sizes();
	return cpumask_sz;
}

/* Number of bits in a Memory bitmask on current system */
int cpuset_mems_nbits()
{
	if (nodemask_sz == 0)
		update_mask_sizes();
	return nodemask_sz;
}

/* Set CPUs in cpuset cp to bitmask cpus */
int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus)
{
	if (cp->cpus)
		bitmask_free(cp->cpus);
	cp->cpus = bitmask_alloc(bitmask_nbits(cpus));
	if (cp->cpus == NULL)
		return -1;
	bitmask_copy(cp->cpus, cpus);
	cp->cpus_valid = 1;
	cp->cpus_dirty = 1;
	return 0;
}

/* Set Memory Nodes in cpuset cp to bitmask
mems */ 681 int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems) 682 { 683 if (cp->mems) 684 bitmask_free(cp->mems); 685 cp->mems = bitmask_alloc(bitmask_nbits(mems)); 686 if (cp->mems == NULL) 687 return -1; 688 bitmask_copy(cp->mems, mems); 689 cp->mems_valid = 1; 690 cp->mems_dirty = 1; 691 return 0; 692 } 693 694 /* Set integer value optname of cpuset cp */ 695 int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value) 696 { 697 if (streq(optionname, "cpu_exclusive")) { 698 cp->cpu_exclusive = ! !value; 699 cp->cpu_exclusive_valid = 1; 700 cp->cpu_exclusive_dirty = 1; 701 } else if (streq(optionname, "mem_exclusive")) { 702 cp->mem_exclusive = ! !value; 703 cp->mem_exclusive_valid = 1; 704 cp->mem_exclusive_dirty = 1; 705 } else if (streq(optionname, "mem_hardwall")) { 706 cp->mem_hardwall = ! !value; 707 cp->mem_hardwall_valid = 1; 708 cp->mem_hardwall_dirty = 1; 709 } else if (streq(optionname, "notify_on_release")) { 710 cp->notify_on_release = ! !value; 711 cp->notify_on_release_valid = 1; 712 cp->notify_on_release_dirty = 1; 713 } else if (streq(optionname, "memory_pressure_enabled")) { 714 cp->memory_pressure_enabled = ! !value; 715 cp->memory_pressure_enabled_valid = 1; 716 cp->memory_pressure_enabled_dirty = 1; 717 } else if (streq(optionname, "memory_migrate")) { 718 cp->memory_migrate = ! !value; 719 cp->memory_migrate_valid = 1; 720 cp->memory_migrate_dirty = 1; 721 } else if (streq(optionname, "memory_spread_page")) { 722 cp->memory_spread_page = ! !value; 723 cp->memory_spread_page_valid = 1; 724 cp->memory_spread_page_dirty = 1; 725 } else if (streq(optionname, "memory_spread_slab")) { 726 cp->memory_spread_slab = ! !value; 727 cp->memory_spread_slab_valid = 1; 728 cp->memory_spread_slab_dirty = 1; 729 } else if (streq(optionname, "sched_load_balance")) { 730 cp->sched_load_balance = ! 
!value; 731 cp->sched_load_balance_valid = 1; 732 cp->sched_load_balance_dirty = 1; 733 } else if (streq(optionname, "sched_relax_domain_level")) { 734 cp->sched_relax_domain_level = value; 735 cp->sched_relax_domain_level_valid = 1; 736 cp->sched_relax_domain_level_dirty = 1; 737 } else 738 return -2; /* optionname not recognized */ 739 return 0; 740 } 741 742 /* [optional] Set string value optname */ 743 int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname, 744 UNUSED const char *value) 745 { 746 return -2; /* For now, all string options unrecognized */ 747 } 748 749 /* Return handle for reading memory_pressure. */ 750 int cpuset_open_memory_pressure(const char *cpusetpath) 751 { 752 char buf[PATH_MAX]; 753 754 fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure"); 755 return open(buf, O_RDONLY); 756 } 757 758 /* Return current memory_pressure of cpuset. */ 759 int cpuset_read_memory_pressure(int han) 760 { 761 char buf[SMALL_BUFSZ]; 762 763 if (pread(han, buf, sizeof(buf), 0L) < 0) 764 return -1; 765 return atoi(buf); 766 } 767 768 /* Close handle for reading memory pressure. */ 769 void cpuset_close_memory_pressure(int han) 770 { 771 close(han); 772 } 773 774 /* 775 * Resolve cpuset pointer (to that of current task if cp == NULL). 776 * 777 * If cp not NULL, just return it. If cp is NULL, return pointer 778 * to temporary cpuset for current task, and set *cp_tofree to 779 * pointer to that same temporary cpuset, to be freed later. 780 * 781 * Return NULL and set errno on error. Errors can occur when 782 * resolving the current tasks cpuset. 
 */
static const struct cpuset *resolve_cp(const struct cpuset *cp,
				       struct cpuset **cp_tofree)
{
	const struct cpuset *rcp;

	if (cp) {
		rcp = cp;
	} else {
		/* Build a temporary cpuset for the calling task (pid 0) */
		struct cpuset *cp1 = cpuset_alloc();
		if (cp1 == NULL)
			goto err;
		if (cpuset_cpusetofpid(cp1, 0) < 0) {
			cpuset_free(cp1);
			goto err;
		}
		*cp_tofree = cp1;
		rcp = cp1;
	}
	return rcp;
err:
	return NULL;
}

/* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */
int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

	if (!cp1)
		goto err;
	if (cp1->cpus == NULL) {
		errno = EINVAL;
		goto err;
	}
	bitmask_copy(cpus, cp1->cpus);
	cpuset_free(cp_tofree);	/* cpuset_free(NULL) is a no-op */
	return 0;
err:
	cpuset_free(cp_tofree);
	return -1;
}

/* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */
int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

	if (!cp1)
		goto err;
	if (cp1->mems == NULL) {
		errno = EINVAL;
		goto err;
	}
	bitmask_copy(mems, cp1->mems);
	cpuset_free(cp_tofree);
	return 0;
err:
	cpuset_free(cp_tofree);
	return -1;
}

/* Return number of CPUs in cpuset cp (current task if cp == NULL) */
int cpuset_cpus_weight(const struct cpuset *cp)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int w = -1;

	if (!cp1)
		goto err;
	if (cp1->cpus == NULL) {
		errno = EINVAL;
		goto err;
	}
	w = bitmask_weight(cp1->cpus);
	/* fall into ...
 */
err:
	cpuset_free(cp_tofree);
	return w;
}

/* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */
int cpuset_mems_weight(const struct cpuset *cp)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int w = -1;

	if (!cp1)
		goto err;
	if (cp1->mems == NULL) {
		errno = EINVAL;
		goto err;
	}
	w = bitmask_weight(cp1->mems);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return w;
}

/* Return integer value of option optname in cp */
int cpuset_get_iopt(const struct cpuset *cp, const char *optionname)
{
	if (streq(optionname, "cpu_exclusive"))
		return cp->cpu_exclusive;
	else if (streq(optionname, "mem_exclusive"))
		return cp->mem_exclusive;
	else if (streq(optionname, "mem_hardwall"))
		return cp->mem_hardwall;
	else if (streq(optionname, "notify_on_release"))
		return cp->notify_on_release;
	else if (streq(optionname, "memory_pressure_enabled"))
		return cp->memory_pressure_enabled;
	else if (streq(optionname, "memory_migrate"))
		return cp->memory_migrate;
	else if (streq(optionname, "memory_spread_page"))
		return cp->memory_spread_page;
	else if (streq(optionname, "memory_spread_slab"))
		return cp->memory_spread_slab;
	else if (streq(optionname, "sched_load_balance"))
		return cp->sched_load_balance;
	else if (streq(optionname, "sched_relax_domain_level"))
		return cp->sched_relax_domain_level;
	else
		return -2;	/* optionname not recognized */
}

/* [optional] Return string value of optname */
const char *cpuset_get_sopt(UNUSED const struct cpuset *cp,
			    UNUSED const char *optionname)
{
	return NULL;	/* For now, all string options unrecognized */
}

/* Read boolean "0"/"1" file into *flagp.  Return 0 on success, -1 on error. */
static int read_flag(const char *filepath, char *flagp)
{
	char buf[SMALL_BUFSZ];	/* buffer a "0" or "1" flag line */
	int fd = -1;

	if ((fd =
open(filepath, O_RDONLY)) < 0) 927 goto err; 928 if (read(fd, buf, sizeof(buf)) < 1) 929 goto err; 930 if (atoi(buf)) 931 *flagp = 1; 932 else 933 *flagp = 0; 934 close(fd); 935 return 0; 936 err: 937 if (fd >= 0) 938 close(fd); 939 return -1; 940 } 941 942 static int load_flag(const char *path, char *flagp, const char *flag) 943 { 944 char buf[PATH_MAX]; 945 946 pathcat2(buf, sizeof(buf), path, flag); 947 return read_flag(buf, flagp); 948 } 949 950 static int read_number(const char *filepath, int *numberp) 951 { 952 char buf[SMALL_BUFSZ]; 953 int fd = -1; 954 955 if ((fd = open(filepath, O_RDONLY)) < 0) 956 goto err; 957 if (read(fd, buf, sizeof(buf)) < 1) 958 goto err; 959 *numberp = atoi(buf); 960 close(fd); 961 return 0; 962 err: 963 if (fd >= 0) 964 close(fd); 965 return -1; 966 } 967 968 static int load_number(const char *path, int *numberp, const char *file) 969 { 970 char buf[PATH_MAX]; 971 972 pathcat2(buf, sizeof(buf), path, file); 973 return read_number(buf, numberp); 974 } 975 976 static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits) 977 { 978 FILE *fp = NULL; 979 char *buf = NULL; 980 int buflen; 981 struct bitmask *bmp = NULL; 982 983 if ((fp = fopen(filepath, "r")) == NULL) 984 goto err; 985 buflen = filesize(fp) + 1; /* + 1 for nul term */ 986 if ((buf = malloc(buflen)) == NULL) 987 goto err; 988 if (flgets(buf, buflen, fp) == NULL) 989 goto err; 990 fclose(fp); 991 fp = NULL; 992 993 if ((bmp = bitmask_alloc(nbits)) == NULL) 994 goto err; 995 if (*buf && bitmask_parselist(buf, bmp) < 0) 996 goto err; 997 if (*bmpp) 998 bitmask_free(*bmpp); 999 *bmpp = bmp; 1000 free(buf); 1001 buf = NULL; 1002 return 0; 1003 err: 1004 if (buf != NULL) 1005 free(buf); 1006 if (fp != NULL) 1007 fclose(fp); 1008 if (bmp != NULL) 1009 bitmask_free(bmp); 1010 return -1; 1011 } 1012 1013 static int load_mask(const char *path, struct bitmask **bmpp, 1014 int nbits, const char *mask) 1015 { 1016 char buf[PATH_MAX]; 1017 1018 pathcat2(buf, 
sizeof(buf), path, mask);
	return read_mask(buf, bmpp, nbits);
}

/* Write string to file at given filepath.  Create or truncate file. */
static int write_string_file(const char *filepath, const char *str)
{
	int fd = -1;

	if ((fd = open(filepath, O_WRONLY | O_CREAT, 0644)) < 0)
		goto err;
	if (write(fd, str, strlen(str)) < 0)
		goto err;
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

/* Size and allocate buffer.  Write bitmask into it.  Caller must free */
static char *sprint_mask_buf(const struct bitmask *bmp)
{
	char *buf = NULL;
	int buflen;
	char c;

	/* First bitmask_displaylist() call just to get the length */
	buflen = bitmask_displaylist(&c, 1, bmp) + 1;	/* "+ 1" for nul */
	if ((buf = malloc(buflen)) == NULL)
		return NULL;
	bitmask_displaylist(buf, buflen, bmp);
	return buf;
}

/* Does flag file 'path/flag' exist?  Clears errno set by the probe. */
static int exists_flag(const char *path, const char *flag)
{
	char buf[PATH_MAX];
	struct stat statbuf;
	int rc;

	pathcat2(buf, sizeof(buf), path, flag);
	rc = (stat(buf, &statbuf) == 0);
	errno = 0;
	return rc;
}

/* Write boolean flag file 'path/flag' as "0" or "1". */
static int store_flag(const char *path, const char *flag, int val)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, flag);
	return write_string_file(buf, val ?
"1" : "0"); 1072 } 1073 1074 static int store_number(const char *path, const char *file, int val) 1075 { 1076 char buf[PATH_MAX]; 1077 char data[SMALL_BUFSZ]; 1078 1079 memset(data, 0, sizeof(data)); 1080 pathcat2(buf, sizeof(buf), path, file); 1081 snprintf(data, sizeof(data), "%d", val); 1082 return write_string_file(buf, data); 1083 } 1084 1085 static int store_mask(const char *path, const char *mask, 1086 const struct bitmask *bmp) 1087 { 1088 char maskpath[PATH_MAX]; 1089 char *bp = NULL; 1090 int rc; 1091 1092 if (bmp == NULL) 1093 return 0; 1094 pathcat2(maskpath, sizeof(maskpath), path, mask); 1095 if ((bp = sprint_mask_buf(bmp)) == NULL) 1096 return -1; 1097 rc = write_string_file(maskpath, bp); 1098 free(bp); 1099 return rc; 1100 } 1101 1102 /* 1103 * Return 1 if 'cpu' is online, else 0 if offline. Tests the file 1104 * /sys/devices/system/cpu/cpuN/online file for 0 or 1 contents 1105 * were N == cpu number. 1106 */ 1107 1108 char cpu_online(unsigned int cpu) 1109 { 1110 char online; 1111 char cpupath[PATH_MAX]; 1112 1113 (void)snprintf(cpupath, sizeof(cpupath), 1114 "/sys/devices/system/cpu/cpu%d/online", cpu); 1115 if (read_flag(cpupath, &online) < 0) 1116 return 0; /* oops - guess that cpu's not there */ 1117 return online; 1118 } 1119 1120 /* 1121 * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()), 1122 * to the node on which that cpu resides or cpuset_mems_nbits(). 1123 * 1124 * To avoid every user having to recalculate this relation 1125 * from various clues in the sysfs file system (below the 1126 * path /sys/devices/system) a copy of this map is kept at 1127 * /var/run/cpunodemap. 1128 * 1129 * The system automatically cleans out files below 1130 * /var/run on each system reboot (see the init script 1131 * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry 1132 * about stale data in this file across reboots. 
If the file 1133 * is missing, let the first process that needs it, and has 1134 * permission to write in the /var/run directory, rebuild it. 1135 * 1136 * If using this cached data, remember the mtime of the mapfile 1137 * the last time we read it in case something like a hotplug 1138 * event results in the file being removed and rebuilt, so we 1139 * can detect if we're using a stale cache, and need to reload. 1140 * 1141 * The mtime of this file is set to the time when we did 1142 * the recalculation of the map, from the clues beneath 1143 * /sys/devices/system. This is done so that a program 1144 * won't see the mapfile it just wrote as being newer than what 1145 * it just wrote out (store_map) and read the same map back in 1146 * (load_file). 1147 */ 1148 1149 /* 1150 * Hold flockfile(stdin) while using cpunodemap for posix thread safety. 1151 * 1152 * Note on locking and flockfile(FILE *): 1153 * 1154 * We use flockfile() and funlockfile() instead of directly 1155 * calling pthread_mutex_lock and pthread_mutex_unlock on 1156 * a pthread_mutex_t, because this avoids forcing the app 1157 * to link with libpthread. The glibc implementation of 1158 * flockfile/funlockfile will fall back to no-ops if libpthread 1159 * doesn't happen to be linked. 1160 * 1161 * Since flockfile already has the moderately convoluted 1162 * combination of weak and strong symbols required to accomplish 1163 * this, it is easier to use flockfile() on some handy FILE * 1164 * stream as a surrogate for pthread locking than it is to so 1165 * re-invent that wheel. 1166 * 1167 * Forcing all apps that use cpusets to link with libpthread 1168 * would force non-transparent initialization on apps that 1169 * might not be prepared to handle it. 1170 * 1171 * The application using libcpuset should never notice this 1172 * odd use of flockfile(), because we never return to the 1173 * application from any libcpuset call with any such lock held. 
1174 * We just use this locking for guarding some non-atomic cached 1175 * data updates and accesses, internal to some libcpuset calls. 1176 * Also, flockfile() allows recursive nesting, so if the app 1177 * calls libcpuset holding such a file lock, we won't deadlock 1178 * if we go to acquire the same lock. We'll just get the lock 1179 * and increment its counter while we hold it. 1180 */ 1181 1182 static struct cpunodemap { 1183 int *map; /* map[cpumask_sz]: maps cpu to its node */ 1184 time_t mtime; /* modtime of mapfile when last read */ 1185 } cpunodemap; 1186 1187 /* 1188 * rebuild_map() - Rebuild cpunodemap[] from scratch. 1189 * 1190 * Situation: 1191 * Neither our in-memory cpunodemap[] array nor the 1192 * cache of it in mapfile is current. 1193 * Action: 1194 * Rebuild it from first principles and the information 1195 * available below /sys/devices/system. 1196 */ 1197 1198 static void rebuild_map() 1199 { 1200 char buf[PATH_MAX]; 1201 DIR *dir1, *dir2; 1202 struct dirent *dent1, *dent2; 1203 int ncpus = cpuset_cpus_nbits(); 1204 int nmems = cpuset_mems_nbits(); 1205 unsigned int cpu, mem; 1206 1207 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) 1208 cpunodemap.map[cpu] = -1; 1209 pathcat2(buf, sizeof(buf), sysdevices, "node"); 1210 if ((dir1 = opendir(buf)) == NULL) 1211 return; 1212 while ((dent1 = readdir(dir1)) != NULL) { 1213 if (sscanf(dent1->d_name, "node%u", &mem) < 1) 1214 continue; 1215 pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name); 1216 if ((dir2 = opendir(buf)) == NULL) 1217 continue; 1218 while ((dent2 = readdir(dir2)) != NULL) { 1219 if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1) 1220 continue; 1221 if (cpu >= (unsigned int)ncpus 1222 || mem >= (unsigned int)nmems) 1223 continue; 1224 cpunodemap.map[cpu] = mem; 1225 } 1226 closedir(dir2); 1227 } 1228 closedir(dir1); 1229 cpunodemap.mtime = time(0); 1230 } 1231 1232 /* 1233 * load_map() - Load cpunodemap[] from mapfile. 
1234 * 1235 * Situation: 1236 * The cpunodemap in mapfile is more recent than 1237 * what we have in the cpunodemap[] array. 1238 * Action: 1239 * Reload the cpunodemap[] array from the file. 1240 */ 1241 1242 static void load_map() 1243 { 1244 char buf[SMALL_BUFSZ]; /* buffer 1 line of mapfile */ 1245 FILE *mapfp; /* File stream on mapfile */ 1246 int ncpus = cpuset_cpus_nbits(); 1247 int nmems = cpuset_mems_nbits(); 1248 unsigned int cpu, mem; 1249 1250 if ((cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL) 1251 return; 1252 cpunodemap.mtime = get_mtime(mapfile); 1253 if ((mapfp = fopen(mapfile, "r")) == NULL) 1254 return; 1255 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) 1256 cpunodemap.map[cpu] = nmems; 1257 while (flgets(buf, sizeof(buf), mapfp) != NULL) { 1258 if (sscanf(buf, "%u %u", &cpu, &mem) < 2) 1259 continue; 1260 if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems) 1261 continue; 1262 cpunodemap.map[cpu] = mem; 1263 } 1264 fclose(mapfp); 1265 } 1266 1267 /* 1268 * store_map() - Write cpunodemap[] out to mapfile. 1269 * 1270 * Situation: 1271 * The cpunodemap in the cpunodemap[] array is 1272 * more recent than the one in mapfile. 1273 * Action: 1274 * Write cpunodemap[] out to mapfile. 
1275 */ 1276 1277 static void store_map() 1278 { 1279 char buf[PATH_MAX]; 1280 int fd = -1; 1281 FILE *mapfp = NULL; 1282 int ncpus = cpuset_cpus_nbits(); 1283 int nmems = cpuset_mems_nbits(); 1284 unsigned int cpu, mem; 1285 1286 snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX"); 1287 if ((fd = mkstemp(buf)) < 0) 1288 goto err; 1289 if ((mapfp = fdopen(fd, "w")) == NULL) 1290 goto err; 1291 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { 1292 mem = cpunodemap.map[cpu]; 1293 if (mem < (unsigned int)nmems) 1294 fprintf(mapfp, "%u %u\n", cpu, mem); 1295 } 1296 fclose(mapfp); 1297 set_mtime(buf, cpunodemap.mtime); 1298 if (rename(buf, mapfile) < 0) 1299 goto err; 1300 /* mkstemp() creates mode 0600 - change to world readable */ 1301 (void)chmod(mapfile, 0444); 1302 return; 1303 err: 1304 if (mapfp != NULL) { 1305 fclose(mapfp); 1306 fd = -1; 1307 } 1308 if (fd >= 0) 1309 close(fd); 1310 (void)unlink(buf); 1311 } 1312 1313 /* 1314 * Load and gain thread safe access to the <cpu, node> map. 1315 * 1316 * Return 0 on success with flockfile(stdin) held. 1317 * Each successful get_map() call must be matched with a 1318 * following put_map() call to release the lock. 1319 * 1320 * On error, return -1 with errno set and no lock held. 
1321 */ 1322 1323 static int get_map() 1324 { 1325 time_t file_mtime; 1326 1327 flockfile(stdin); 1328 1329 if (cpunodemap.map == NULL) { 1330 cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int)); 1331 if (cpunodemap.map == NULL) 1332 goto err; 1333 } 1334 1335 /* If no one has a good cpunodemap, rebuild from scratch */ 1336 file_mtime = get_mtime(mapfile); 1337 if (cpunodemap.mtime == 0 && file_mtime == 0) 1338 rebuild_map(); 1339 1340 /* If either cpunodemap[] or mapfile newer, update other with it */ 1341 file_mtime = get_mtime(mapfile); 1342 if (cpunodemap.mtime < file_mtime) 1343 load_map(); 1344 else if (cpunodemap.mtime > file_mtime) 1345 store_map(); 1346 return 0; 1347 err: 1348 funlockfile(stdin); 1349 return -1; 1350 } 1351 1352 static void put_map() 1353 { 1354 funlockfile(stdin); 1355 } 1356 1357 /* Set cpus to those local to Memory Nodes mems */ 1358 int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus) 1359 { 1360 int ncpus = cpuset_cpus_nbits(); 1361 unsigned int cpu; 1362 1363 if (check() < 0) 1364 return -1; 1365 1366 get_map(); 1367 bitmask_clearall(cpus); 1368 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { 1369 if (bitmask_isbitset(mems, cpunodemap.map[cpu])) 1370 bitmask_setbit(cpus, cpu); 1371 } 1372 put_map(); 1373 return 0; 1374 } 1375 1376 /* Set mems to those local to CPUs cpus */ 1377 int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems) 1378 { 1379 int ncpus = cpuset_cpus_nbits(); 1380 unsigned int cpu; 1381 1382 if (check() < 0) 1383 return -1; 1384 1385 get_map(); 1386 bitmask_clearall(mems); 1387 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) { 1388 if (bitmask_isbitset(cpus, cpu)) 1389 bitmask_setbit(mems, cpunodemap.map[cpu]); 1390 } 1391 put_map(); 1392 return 0; 1393 } 1394 1395 /* 1396 * distmap[] 1397 * 1398 * Array of ints of size cpumask_sz by nodemask_sz. 1399 * 1400 * Element distmap[cpu][mem] is the distance between CPU cpu 1401 * and Memory Node mem. 
Distances are weighted to roughly
 * approximate the cost of memory references, and scaled so that
 * the distance from a CPU to its local Memory Node is ten (10).
 *
 * The first call to cpuset_cpumemdist() builds this map, from
 * whatever means the kernel provides to obtain these distances.
 *
 * These distances derive from ACPI SLIT table entries, which are
 * eight bits in size.
 *
 * Hold flockfile(stdout) while using distmap for posix thread safety.
 */

typedef unsigned char distmap_entry_t;	/* type of distmap[] entries */

static distmap_entry_t *distmap;	/* maps <cpu, mem> to distance */

#define DISTMAP_MAX UCHAR_MAX	/* maximum value in distmap[] */

/* 2-D array index simulation; relies on a local 'nmems' being in scope */
#define I(i,j) ((i) * nmems + (j))

/*
 * Parse arch neutral lines from 'distance' files of form:
 *
 *	46 66 10 20
 *
 * The lines contain a space separated list of distances, which is parsed
 * into array dists[] of each nodes distance from the specified node. 
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 * For each cpu c on node:
 *	For each node position n in list of distances:
 *		distmap[c][n] = dists[n]
 */

static int parse_distmap_line(unsigned int node, char *buf)
{
	char *p, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int c, n;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;
	int ret = -1;

	p = buf;
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	/* Positions not present in the line default to "maximally distant" */
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	/* Walk the space separated distance list, one node per entry */
	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned int d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t) d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	/* Copy dists[] into the distmap[] row of every cpu on 'node' */
	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	ret = 0;
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
	return ret;
}

/*
 * Read the single line of the per-node 'distance' file at 'path' and
 * feed it to parse_distmap_line().  Returns 0 on success, -1 on any
 * open/alloc/read/parse failure.
 */
static int parse_distance_file(unsigned int node, const char *path)
{
	FILE *fp;
	char *buf = NULL;
	int buflen;

	if ((fp = fopen(path, "r")) == NULL)
		goto err;

	buflen = filesize(fp);

	if ((buf = malloc(buflen)) == NULL)
		goto err;

	if (flgets(buf, buflen, fp) == NULL)
		goto err;

	if (parse_distmap_line(node, buf) < 0)
		goto err;

	free(buf);
	fclose(fp);
	return 0;
err:
	free(buf);
	if (fp)
		fclose(fp);
	return -1;
}

/*
 * Build distmap[] from the arch neutral per-node 'distance' files.
 * Only one build attempt is ever made (tried_before); on any failure
 * distmap is freed and left NULL.
 */
static void build_distmap()
{
	static int tried_before = 0;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	DIR *dir = NULL;
	struct dirent *dent;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	if ((dir = opendir(distance_directory)) == NULL)
		goto err;
	while ((dent = readdir(dir)) != NULL) {
		char buf[PATH_MAX];
		unsigned int node;

		if (sscanf(dent->d_name, "node%u", &node) < 1)
			continue;
		pathcat3(buf, sizeof(buf), distance_directory, dent->d_name,
			 "distance");
		if (parse_distance_file(node, buf) < 0)
			goto err;
	}
	closedir(dir);
	return;
err:
	if (dir)
		closedir(dir);
	free(distmap);
	distmap = NULL;
}

#ifdef ALTERNATE_SN_DISTMAP

/*
 * Parse SN architecture specific line of form:
 *
 * node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10
 *
 * Second field is node number. 
The "dist" field is the colon separated list 1566 * of distances, which is parsed into array dists[] of each nodes distance 1567 * from that node. 1568 * 1569 * Result is placed in distmap[ncpus][nmems]: 1570 * 1571 * For each cpu c on that node: 1572 * For each node position n in list of distances: 1573 * distmap[c][n] = dists[n] 1574 */ 1575 1576 static void parse_distmap_line_sn(char *buf) 1577 { 1578 char *p, *pend, *q; 1579 int ncpus = cpuset_cpus_nbits(); 1580 int nmems = cpuset_mems_nbits(); 1581 unsigned long c, n, node; 1582 distmap_entry_t *dists = NULL; 1583 struct bitmask *cpus = NULL, *mems = NULL; 1584 1585 if ((p = strchr(buf, ' ')) == NULL) 1586 goto err; 1587 if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems) 1588 goto err; 1589 if ((p = strstr(q, " dist ")) == NULL) 1590 goto err; 1591 p += strlen(" dist "); 1592 if ((pend = strchr(p, ' ')) != NULL) 1593 *pend = '\0'; 1594 if ((dists = calloc(nmems, sizeof(*dists))) == NULL) 1595 goto err; 1596 for (n = 0; n < (unsigned int)nmems; n++) 1597 dists[n] = DISTMAP_MAX; 1598 1599 for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) { 1600 unsigned long d; 1601 1602 if ((p = strpbrk(p, "0123456789")) == NULL) 1603 break; 1604 d = strtoul(p, &q, 10); 1605 if (p == q) 1606 break; 1607 if (d < DISTMAP_MAX) 1608 dists[n] = (distmap_entry_t) d; 1609 } 1610 1611 if ((mems = bitmask_alloc(nmems)) == NULL) 1612 goto err; 1613 bitmask_setbit(mems, node); 1614 1615 if ((cpus = bitmask_alloc(ncpus)) == NULL) 1616 goto err; 1617 cpuset_localcpus(mems, cpus); 1618 1619 for (c = bitmask_first(cpus); c < (unsigned int)ncpus; 1620 c = bitmask_next(cpus, c + 1)) 1621 for (n = 0; n < (unsigned int)nmems; n++) 1622 distmap[I(c, n)] = dists[n]; 1623 /* fall into ... 
*/ 1624 err: 1625 bitmask_free(mems); 1626 bitmask_free(cpus); 1627 free(dists); 1628 } 1629 1630 static void build_distmap_sn() 1631 { 1632 int ncpus = cpuset_cpus_nbits(); 1633 int nmems = cpuset_mems_nbits(); 1634 int c, m; 1635 static int tried_before = 0; 1636 FILE *fp = NULL; 1637 char *buf = NULL; 1638 int buflen; 1639 1640 if (tried_before) 1641 goto err; 1642 tried_before = 1; 1643 1644 if ((fp = fopen(sn_topology, "r")) == NULL) 1645 goto err; 1646 1647 if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL) 1648 goto err; 1649 1650 for (c = 0; c < ncpus; c++) 1651 for (m = 0; m < nmems; m++) 1652 distmap[I(c, m)] = DISTMAP_MAX; 1653 1654 buflen = filesize(fp); 1655 if ((buf = malloc(buflen)) == NULL) 1656 goto err; 1657 1658 while (flgets(buf, buflen, fp) != NULL) 1659 if (strprefix(buf, sn_top_node_prefix)) 1660 parse_distmap_line_sn(buf); 1661 1662 free(buf); 1663 fclose(fp); 1664 return; 1665 err: 1666 free(buf); 1667 free(distmap); 1668 distmap = NULL; 1669 if (fp) 1670 fclose(fp); 1671 } 1672 1673 #endif 1674 1675 /* [optional] Hardware distance from CPU to Memory Node */ 1676 unsigned int cpuset_cpumemdist(int cpu, int mem) 1677 { 1678 int ncpus = cpuset_cpus_nbits(); 1679 int nmems = cpuset_mems_nbits(); 1680 distmap_entry_t r = DISTMAP_MAX; 1681 1682 flockfile(stdout); 1683 1684 if (check() < 0) 1685 goto err; 1686 1687 if (distmap == NULL) 1688 build_distmap(); 1689 1690 #ifdef ALTERNATE_SN_DISTMAP 1691 if (distmap == NULL) 1692 build_distmap_sn(); 1693 #endif 1694 1695 if (distmap == NULL) 1696 goto err; 1697 1698 if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems) 1699 goto err; 1700 1701 r = distmap[I(cpu, mem)]; 1702 /* fall into ... 
*/ 1703 err: 1704 funlockfile(stdout); 1705 return r; 1706 } 1707 1708 /* [optional] Return Memory Node closest to cpu */ 1709 int cpuset_cpu2node(int cpu) 1710 { 1711 int ncpus = cpuset_cpus_nbits(); 1712 int nmems = cpuset_mems_nbits(); 1713 struct bitmask *cpus = NULL, *mems = NULL; 1714 int r = -1; 1715 1716 if (check() < 0) 1717 goto err; 1718 1719 if ((cpus = bitmask_alloc(ncpus)) == NULL) 1720 goto err; 1721 bitmask_setbit(cpus, cpu); 1722 1723 if ((mems = bitmask_alloc(nmems)) == NULL) 1724 goto err; 1725 cpuset_localmems(cpus, mems); 1726 r = bitmask_first(mems); 1727 /* fall into ... */ 1728 err: 1729 bitmask_free(cpus); 1730 bitmask_free(mems); 1731 return r; 1732 } 1733 1734 static int apply_cpuset_settings(const char *path, const struct cpuset *cp) 1735 { 1736 if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) { 1737 if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0) 1738 goto err; 1739 } 1740 1741 if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) { 1742 if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0) 1743 goto err; 1744 } 1745 1746 if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) { 1747 if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0) 1748 goto err; 1749 } 1750 1751 if (cp->notify_on_release_valid && cp->notify_on_release_dirty) { 1752 if (store_flag(path, "notify_on_release", cp->notify_on_release) 1753 < 0) 1754 goto err; 1755 } 1756 1757 if (cp->memory_migrate_valid && 1758 cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) { 1759 if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0) 1760 goto err; 1761 } 1762 1763 if (cp->memory_pressure_enabled_valid && 1764 cp->memory_pressure_enabled_dirty && 1765 exists_flag(path, "memory_pressure_enabled")) { 1766 if (store_flag 1767 (path, "memory_pressure_enabled", 1768 cp->memory_pressure_enabled) < 0) 1769 goto err; 1770 } 1771 1772 if (cp->memory_spread_page_valid && 1773 cp->memory_spread_page_dirty && 1774 exists_flag(path, 
"memory_spread_page")) { 1775 if (store_flag 1776 (path, "memory_spread_page", cp->memory_spread_page) < 0) 1777 goto err; 1778 } 1779 1780 if (cp->memory_spread_slab_valid && 1781 cp->memory_spread_slab_dirty && 1782 exists_flag(path, "memory_spread_slab")) { 1783 if (store_flag 1784 (path, "memory_spread_slab", cp->memory_spread_slab) < 0) 1785 goto err; 1786 } 1787 1788 if (cp->sched_load_balance_valid && 1789 cp->sched_load_balance_dirty && 1790 exists_flag(path, "sched_load_balance")) { 1791 if (store_flag 1792 (path, "sched_load_balance", cp->sched_load_balance) < 0) 1793 goto err; 1794 } 1795 1796 if (cp->sched_relax_domain_level_valid && 1797 cp->sched_relax_domain_level_dirty && 1798 exists_flag(path, "sched_relax_domain_level")) { 1799 if (store_number 1800 (path, "sched_relax_domain_level", 1801 cp->sched_relax_domain_level) < 0) 1802 goto err; 1803 } 1804 1805 if (cp->cpus_valid && cp->cpus_dirty) { 1806 if (store_mask(path, "cpus", cp->cpus) < 0) 1807 goto err; 1808 } 1809 1810 if (cp->mems_valid && cp->mems_dirty) { 1811 if (store_mask(path, "mems", cp->mems) < 0) 1812 goto err; 1813 } 1814 return 0; 1815 err: 1816 return -1; 1817 } 1818 1819 /* 1820 * get_siblings() - helper routine for cpuset_would_crash_kernel(), below. 1821 * 1822 * Extract max value of any 'siblings' field in /proc/cpuinfo. 1823 * Cache the result - only need to extract once in lifetime of task. 1824 * 1825 * The siblings field is the number of logical CPUs in a physical 1826 * processor package. It is equal to the product of the number of 1827 * cores in that package, times the number of hyper-threads per core. 1828 * The bug that cpuset_would_crash_kernel() is detecting arises 1829 * when a cpu_exclusive cpuset tries to include just some, not all, 1830 * of the sibling logical CPUs available in a processor package. 
1831 * 1832 * In the improbable case that a system has mixed values of siblings 1833 * (some processor packages have more than others, perhaps due to 1834 * partially enabling Hyper-Threading), we take the worse case value, 1835 * the largest siblings value. This might be overkill. I don't know 1836 * if this kernel bug considers each processor package's siblings 1837 * separately or not. But it sure is easier this way ... 1838 * 1839 * This routine takes about 0.7 msecs on a 4 CPU 2.8 MHz Xeon, from 1840 * open to close, the first time called. 1841 */ 1842 1843 static int get_siblings() 1844 { 1845 static int siblings; 1846 char buf[32]; /* big enough for one 'siblings' line */ 1847 FILE *fp; 1848 1849 if (siblings) 1850 return siblings; 1851 1852 if ((fp = fopen("/proc/cpuinfo", "r")) == NULL) 1853 return 4; /* wing it - /proc not mounted ? */ 1854 while (flgets(buf, sizeof(buf), fp) != NULL) { 1855 int s; 1856 1857 if (sscanf(buf, "siblings : %d", &s) < 1) 1858 continue; 1859 if (s > siblings) 1860 siblings = s; 1861 } 1862 fclose(fp); 1863 if (siblings == 0) 1864 siblings = 1; /* old kernel, no siblings, default to 1 */ 1865 return siblings; 1866 } 1867 1868 /* 1869 * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic 1870 * scheduler domain code invoked for cpu_exclusive cpusets that causes 1871 * the kernel to freeze, requiring a hardware reset. 1872 * 1873 * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive' 1874 * cpuset is defined where that cpusets 'cpus' are not on package 1875 * boundaries then the kernel will freeze, usually as soon as this 1876 * cpuset is created, requiring a hardware reset. 1877 * 1878 * A cpusets 'cpus' are not on package boundaries if the cpuset 1879 * includes a proper non-empty subset (some, but not all) of the 1880 * logical cpus on a processor package. This requires multiple 1881 * logical CPUs per package, available with either Hyper-Thread or 1882 * Multi-Core support. 
Without one of these features, there is only 1883 * one logical CPU per physical package, and it's not possible to 1884 * have a proper, non-empty subset of a set of cardinality one. 1885 * 1886 * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC 1887 * on i386 and x86_64 arch's. 1888 * 1889 * The objective of this routine cpuset_would_crash_kernel() is to 1890 * determine if a proposed cpuset setting would crash the kernel due 1891 * to this bug, so that the caller can avoid the crash. 1892 * 1893 * Ideally we'd check for exactly these conditions here, but computing 1894 * the package (identified by the 'physical id' field of /proc/cpuinfo) 1895 * of each cpu in a cpuset is more effort than it's worth here. 1896 * 1897 * Also there is no obvious way to identify exactly whether the kernel 1898 * one is executing on has this bug, short of trying it, and seeing 1899 * if the kernel just crashed. 1900 * 1901 * So for now, we look for a simpler set of conditions, that meets 1902 * our immediate need - avoid this crash on SUSE SLES10 systems that 1903 * are susceptible to it. We look for the kernel version 2.6.16.*, 1904 * which is the base kernel of SUSE SLES10, and for i386 or x86_64 1905 * processors, which had CONFIG_SCHED_MC enabled. 1906 * 1907 * If these simpler conditions are met, we further simplify the check, 1908 * by presuming that the logical CPUs are numbered on processor 1909 * package boundaries. If each package has S siblings, we assume 1910 * that CPUs numbered N through N + S -1 are on the same package, 1911 * for any CPU N such that N mod S == 0. 1912 * 1913 * Yes, this is a hack, focused on avoiding kernel freezes on 1914 * susceptible SUSE SLES10 systems. 
 */

static int cpuset_would_crash_kernel(const struct cpuset *cp)
{
	static int susceptible_system = -1;	/* -1 == not yet determined */

	if (!cp->cpu_exclusive)
		goto ok;

	if (susceptible_system == -1) {
		struct utsname u;
		int rel_2_6_16, arch_i386, arch_x86_64;

		if (uname(&u) < 0)
			goto fail;
		rel_2_6_16 = strprefix(u.release, "2.6.16.");
		arch_i386 = streq(u.machine, "i386");
		arch_x86_64 = streq(u.machine, "x86_64");
		susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64);
	}

	if (susceptible_system) {
		int ncpus = cpuset_cpus_nbits();
		int siblings = get_siblings();
		unsigned int cpu;

		/*
		 * Examine each presumed processor package (a span of
		 * 'siblings' consecutive cpu numbers): a partially
		 * included span in cp->cpus is the crash trigger.
		 */
		for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) {
			int s, num_set = 0;

			for (s = 0; s < siblings; s++) {
				if (bitmask_isbitset(cp->cpus, cpu + s))
					num_set++;
			}

			/* If none or all siblings set, we're still ok */
			if (num_set == 0 || num_set == siblings)
				continue;

			/* Found one that would crash kernel.  Fail.  */
			errno = ENXIO;
			goto fail;
		}
	}
	/* If not susceptible, or if all ok, fall into "ok" ... */
ok:
	return 0;		/* would not crash */
fail:
	return 1;		/* would crash */
}

/* compare two cpuset and mark the dirty variable */
static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2)
{
	if (cp1->cpu_exclusive_valid &&
	    cp1->cpu_exclusive != cp2->cpu_exclusive)
		cp1->cpu_exclusive_dirty = 1;

	if (cp1->mem_exclusive_valid &&
	    cp1->mem_exclusive != cp2->mem_exclusive)
		cp1->mem_exclusive_dirty = 1;

	if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall)
		cp1->mem_hardwall_dirty = 1;

	if (cp1->notify_on_release_valid &&
	    cp1->notify_on_release != cp2->notify_on_release)
		cp1->notify_on_release_dirty = 1;

	if (cp1->memory_migrate_valid &&
	    cp1->memory_migrate != cp2->memory_migrate)
		cp1->memory_migrate_dirty = 1;

	if (cp1->memory_pressure_enabled_valid &&
	    cp1->memory_pressure_enabled != cp2->memory_pressure_enabled)
		cp1->memory_pressure_enabled_dirty = 1;

	if (cp1->memory_spread_page_valid &&
	    cp1->memory_spread_page != cp2->memory_spread_page)
		cp1->memory_spread_page_dirty = 1;

	if (cp1->memory_spread_slab_valid &&
	    cp1->memory_spread_slab != cp2->memory_spread_slab)
		cp1->memory_spread_slab_dirty = 1;

	if (cp1->sched_load_balance_valid &&
	    cp1->sched_load_balance != cp2->sched_load_balance)
		cp1->sched_load_balance_dirty = 1;

	if (cp1->sched_relax_domain_level_valid &&
	    cp1->sched_relax_domain_level != cp2->sched_relax_domain_level)
		cp1->sched_relax_domain_level_dirty = 1;

	if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus))
		cp1->cpus_dirty = 1;
	if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems))
		cp1->mems_dirty = 1;
}

/* Create (if new set) or modify cpuset 'cp' at location 'relpath' */
static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new)
{
	char buf[PATH_MAX];
	int do_rmdir_on_err = 0;
	int do_restore_cp_sav_on_err = 0;
	struct cpuset *cp_sav = NULL;
	int sav_errno;

	if (check() < 0)
		goto err;

	if (cpuset_would_crash_kernel(cp))
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	if (new) {
		if (mkdir(buf, 0755) < 0)
			goto err;
		/* we made it, so we should remove it on error */
		do_rmdir_on_err = 1;
	}

	if ((cp_sav = cpuset_alloc()) == NULL)
		goto err;
	if (cpuset_query(cp_sav, relpath) < 0)
		goto err;
	/* we have old settings to restore on error */
	do_restore_cp_sav_on_err = 1;

	/* check which variable need to restore on error */
	mark_dirty_variable(cp_sav, cp);

	if (apply_cpuset_settings(buf, cp) < 0)
		goto err;

	cpuset_free(cp_sav);
	return 0;
err:
	sav_errno = errno;	/* cleanup below may clobber errno */
	if (do_restore_cp_sav_on_err)
		(void)apply_cpuset_settings(buf, cp_sav);
	if (cp_sav)
		cpuset_free(cp_sav);
	if (do_rmdir_on_err)
		(void)rmdir(buf);
	errno = sav_errno;
	return -1;
}

/* Create cpuset 'cp' at location 'relpath' */
int cpuset_create(const char *relpath, const struct cpuset *cp)
{
	return cr_or_mod(relpath, cp, 1);
}

/* Delete cpuset at location 'path' (if empty) */
int cpuset_delete(const char *relpath)
{
	char buf[PATH_MAX];

	if (check() < 0)
		goto err;

	fullpath(buf, sizeof(buf), relpath);
	if (rmdir(buf) < 0)
		goto err;

	return 0;
err:
	return -1;
}

/*
 * Set cpuset cp to the cpuset at location 'path'.
 * Optional kernel files are loaded (and flagged *_valid) only when
 * present; mandatory files failing to load abort with -1.
 */
int cpuset_query(struct cpuset *cp, const char *relpath)
{
	char buf[PATH_MAX];

	if (check() < 0)
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	if (load_flag(buf, &cp->cpu_exclusive, "cpu_exclusive") < 0)
		goto err;
	cp->cpu_exclusive_valid = 1;

	if (load_flag(buf, &cp->mem_exclusive, "mem_exclusive") < 0)
		goto err;
	cp->mem_exclusive_valid = 1;

	if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0)
		goto err;
	cp->notify_on_release_valid = 1;

	if (exists_flag(buf, "memory_migrate")) {
		if (load_flag(buf, &cp->memory_migrate, "memory_migrate") < 0)
			goto err;
		cp->memory_migrate_valid = 1;
	}

	if (exists_flag(buf, "mem_hardwall")) {
		if (load_flag(buf, &cp->mem_hardwall, "mem_hardwall") < 0)
			goto err;
		cp->mem_hardwall_valid = 1;
	}

	if (exists_flag(buf, "memory_pressure_enabled")) {
		if (load_flag
		    (buf, &cp->memory_pressure_enabled,
		     "memory_pressure_enabled") < 0)
			goto err;
		cp->memory_pressure_enabled_valid = 1;
	}

	if (exists_flag(buf, "memory_spread_page")) {
		if (load_flag
		    (buf, &cp->memory_spread_page, "memory_spread_page") < 0)
			goto err;
		cp->memory_spread_page_valid = 1;
	}

	if (exists_flag(buf, "memory_spread_slab")) {
		if (load_flag
		    (buf, &cp->memory_spread_slab, "memory_spread_slab") < 0)
			goto err;
		cp->memory_spread_slab_valid = 1;
	}

	if (exists_flag(buf, "sched_load_balance")) {
		if (load_flag
		    (buf, &cp->sched_load_balance, "sched_load_balance") < 0)
			goto err;
		cp->sched_load_balance_valid = 1;
	}

	if (exists_flag(buf, "sched_relax_domain_level")) {
		if (load_number
		    (buf, &cp->sched_relax_domain_level,
		     "sched_relax_domain_level") < 0)
			goto err;
		cp->sched_relax_domain_level_valid = 1;
	}

	if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpus") < 0)
		goto err;
	cp->cpus_valid = 1;

	if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "mems") < 0)
		goto err;
	cp->mems_valid = 1;

	return 0;
err:
	return -1;
}

/* Modify cpuset at location 'relpath' to values of 'cp' */
int cpuset_modify(const char *relpath, const struct cpuset *cp)
{
	return cr_or_mod(relpath, cp, 0);
}

/*
 * Get cpuset path of pid into buf.  pid == 0 means the calling task.
 * Returns buf on success, NULL on error with errno set (ESRCH: no
 * such pid; ENOSYS: kernel lacks cpuset support; E2BIG: buf too small).
 */
char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size)
{
	int fd;			/* dual use: cpuset file for pid and self */
	int rc;			/* dual use: snprintf and read return codes */

	if (check() < 0)
		return NULL;

	/* borrow result buf[] to build cpuset file path */
	if (pid == 0)
		rc = snprintf(buf, size, "/proc/self/cpuset");
	else
		rc = snprintf(buf, size, "/proc/%d/cpuset", pid);
	if (rc >= (int)size) {
		errno = E2BIG;
		return NULL;
	}
	if ((fd = open(buf, O_RDONLY)) < 0) {
		int e = errno;
		if (e == ENOENT)
			e = ESRCH;	/* no /proc entry -> no such pid */
		/*
		 * If even our own /proc/self/cpuset can't be opened,
		 * this kernel has no cpuset support at all.
		 */
		if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0)
			e = ENOSYS;
		else
			close(fd);
		errno = e;
		return NULL;
	}
	rc = read(fd, buf, size);
	close(fd);
	if (rc < 0)
		return NULL;
	if (rc >= (int)size) {
		errno = E2BIG;	/* no room left for terminating '\0' */
		return NULL;
	}
	buf[rc] = 0;
	chomp(buf);		/* strip trailing newline */
	return buf;

}

/* Get cpuset 'cp' of pid */
int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid)
{
	char buf[PATH_MAX];

	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
		return -1;
	if (cpuset_query(cp, buf) < 0)
		return -1;
	return 0;
}

/* [optional] Return mountpoint of cpuset filesystem */
const char *cpuset_mountpoint()
{
	if (check() < 0) {
		switch (errno) {
		case ENODEV:
			return "[cpuset filesystem not mounted]";
		default:
			return "[cpuset filesystem not supported]";
		}
	}
	return cpusetmnt;
}

/* Return true if path is a directory. 
*/ 2247 static int isdir(const char *path) 2248 { 2249 struct stat statbuf; 2250 2251 if (stat(path, &statbuf) < 0) 2252 return 0; 2253 return S_ISDIR(statbuf.st_mode); 2254 } 2255 2256 /* 2257 * [optional] cpuset_collides_exclusive() - True if would collide exclusive. 2258 * 2259 * Return true iff the specified cpuset would overlap with any 2260 * sibling cpusets in either cpus or mems, where either this 2261 * cpuset or the sibling is cpu_exclusive or mem_exclusive. 2262 * 2263 * cpuset_create() fails with errno == EINVAL if the requested cpuset 2264 * would overlap with any sibling, where either one is cpu_exclusive or 2265 * mem_exclusive. This is a common, and not obvious error. The 2266 * following routine checks for this particular case, so that code 2267 * creating cpusets can better identify the situation, perhaps to issue 2268 * a more informative error message. 2269 * 2270 * Can also be used to diagnose cpuset_modify failures. This 2271 * routine ignores any existing cpuset with the same path as the 2272 * given 'cpusetpath', and only looks for exclusive collisions with 2273 * sibling cpusets of that path. 2274 * 2275 * In case of any error, returns (0) -- does not collide. Presumably 2276 * any actual attempt to create or modify a cpuset will encounter the 2277 * same error, and report it usefully. 2278 * 2279 * This routine is not particularly efficient; most likely code creating or 2280 * modifying a cpuset will want to try the operation first, and then if that 2281 * fails with errno EINVAL, perhaps call this routine to determine if an 2282 * exclusive cpuset collision caused the error. 
 */

int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1)
{
	char parent[PATH_MAX];
	char *p;
	char *pathcopy = NULL;
	char *base;
	DIR *dir = NULL;
	struct dirent *dent;
	struct cpuset *cp2 = NULL;
	struct bitmask *cpus1 = NULL, *cpus2 = NULL;
	struct bitmask *mems1 = NULL, *mems2 = NULL;
	int ret;

	if (check() < 0)
		goto err;

	fullpath(parent, sizeof(parent), cpusetpath);
	if (streq(parent, cpusetmnt))
		goto err;	/* only one cpuset root - can't collide */
	/*
	 * NOTE(review): strdup() result is not checked; if it returned
	 * NULL, the strrchr(pathcopy, ...) below would crash - confirm.
	 */
	pathcopy = strdup(parent);
	p = strrchr(parent, '/');
	if (!p)
		goto err;	/* huh? - impossible - run and hide */
	*p = 0;			/* now parent is dirname of fullpath */

	p = strrchr(pathcopy, '/');
	base = p + 1;		/* now base is basename of fullpath */
	if (!*base)
		goto err;	/* this is also impossible - run away */

	if ((dir = opendir(parent)) == NULL)
		goto err;
	if ((cp2 = cpuset_alloc()) == NULL)
		goto err;
	if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
		goto err;
	if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
		goto err;
	if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		goto err;
	if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		goto err;

	/* scan each sibling cpuset of 'cpusetpath' for an overlap */
	while ((dent = readdir(dir)) != NULL) {
		char child[PATH_MAX];

		if (streq(dent->d_name, ".") || streq(dent->d_name, ".."))
			continue;
		if (streq(dent->d_name, base))
			continue;	/* skip the cpuset itself */
		pathcat2(child, sizeof(child), parent, dent->d_name);
		if (!isdir(child))
			continue;	/* only directories are cpusets */
		if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0)
			goto err;
		if (cp1->cpu_exclusive || cp2->cpu_exclusive) {
			cpuset_getcpus(cp1, cpus1);
			cpuset_getcpus(cp2, cpus2);
			if (bitmask_intersects(cpus1, cpus2))
				goto collides;
		}
		if (cp1->mem_exclusive || cp2->mem_exclusive) {
			cpuset_getmems(cp1, mems1);
			cpuset_getmems(cp2, mems2);
			if (bitmask_intersects(mems1, mems2))
				goto collides;
		}
	}
err:
	/* error, or did not collide */
	ret = 0;
	goto done;
collides:
	/* collides */
	ret = 1;
	/* fall into ... */
done:
	if (dir)
		closedir(dir);
	cpuset_free(cp2);
	free(pathcopy);
	bitmask_free(cpus1);
	bitmask_free(cpus2);
	bitmask_free(mems1);
	bitmask_free(mems2);
	return ret;
}

/*
 * [optional] cpuset_nuke() - Remove cpuset anyway possible
 *
 * Remove a cpuset, including killing tasks in it, and
 * removing any descendent cpusets and killing their tasks.
 *
 * Tasks can take a long time (minutes on some configurations)
 * to exit.  Loop up to 'seconds' seconds, trying to kill them.
 *
 * How we do it:
 *	1) First, kill all the pids, looping until there are
 *	   no more pids in this cpuset or below, or until the
 *	   'seconds' timeout limit is exceeded.
 *	2) Then depth first recursively rmdir the cpuset directories.
 *	3) If by this point the original cpuset is gone, we succeeded.
 *
 * If the timeout is exceeded, and tasks still exist, fail with
 * errno == ETIME.
 *
 * We sleep a variable amount of time.  After the first attempt to
 * kill all the tasks in the cpuset or its descendents, we sleep 1
 * second, the next time 2 seconds, increasing 1 second each loop
 * up to a max of 10 seconds.  If more loops past 10 are required
 * to kill all the tasks, we sleep 10 seconds each subsequent loop.
 * In any case, before the last loop, we sleep however many seconds
 * remain of the original timeout 'seconds' requested.  The total
 * time of all sleeps will be no more than the requested 'seconds'.
2400 * 2401 * If the cpuset started out empty of any tasks, or if the passed in 2402 * 'seconds' was zero, then this routine will return quickly, having 2403 * not slept at all. Otherwise, this routine will at a minimum send 2404 * a SIGKILL to all the tasks in this cpuset subtree, then sleep one 2405 * second, before looking to see if any tasks remain. If tasks remain 2406 * in the cpuset subtree, and a longer 'seconds' timeout was requested 2407 * (more than one), it will continue to kill remaining tasks and sleep, 2408 * in a loop, for as long as time and tasks remain. 2409 * 2410 * The signal sent for the kill is hardcoded to SIGKILL (9). If some 2411 * other signal should be sent first, use a separate code loop, 2412 * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to 2413 * scan the task pids in a cpuset. If SIGKILL should -not- be sent, 2414 * this cpuset_nuke() routine can still be called to recursively 2415 * remove a cpuset subtree, by specifying a timeout of zero 'seconds'. 2416 * 2417 * On success, returns 0 with errno == 0. 
2418 * 2419 * On failure, returns -1, with errno possibly one of: 2420 * EACCES - search permission denied on intervening directory 2421 * ETIME - timed out - tasks remain after 'seconds' timeout 2422 * EMFILE - too many open files 2423 * ENODEV - /dev/cpuset not mounted 2424 * ENOENT - component of cpuset path doesn't exist 2425 * ENOMEM - out of memory 2426 * ENOSYS - kernel doesn't support cpusets 2427 * ENOTDIR - component of cpuset path is not a directory 2428 * EPERM - lacked permission to kill a task 2429 * EPERM - lacked permission to read cpusets or files therein 2430 */ 2431 2432 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree); 2433 2434 int cpuset_nuke(const char *relpath, unsigned int seconds) 2435 { 2436 unsigned int secs_left = seconds; /* total sleep seconds left */ 2437 unsigned int secs_loop = 1; /* how much sleep next loop */ 2438 unsigned int secs_slept; /* seconds slept in sleep() */ 2439 struct cpuset_pidlist *pl = NULL; /* pids in cpuset subtree */ 2440 struct cpuset_fts_tree *cs_tree; 2441 const struct cpuset_fts_entry *cs_entry; 2442 int ret, sav_errno = 0; 2443 2444 if (check() < 0) 2445 return -1; 2446 2447 if (seconds == 0) 2448 goto rmdir_cpusets; 2449 2450 while (1) { 2451 int plen, j; 2452 2453 if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) { 2454 /* missing cpuset is as good as if already nuked */ 2455 if (errno == ENOENT) { 2456 ret = 0; 2457 goto no_more_cpuset; 2458 } 2459 2460 /* other problems reading cpuset are bad news */ 2461 sav_errno = errno; 2462 goto failed; 2463 } 2464 2465 if ((plen = cpuset_pidlist_length(pl)) == 0) 2466 goto rmdir_cpusets; 2467 2468 for (j = 0; j < plen; j++) { 2469 pid_t pid; 2470 2471 if ((pid = cpuset_get_pidlist(pl, j)) > 1) { 2472 if (kill(pid, SIGKILL) < 0 && errno != ESRCH) { 2473 sav_errno = errno; 2474 goto failed; 2475 } 2476 } 2477 } 2478 2479 if (secs_left == 0) 2480 goto took_too_long; 2481 2482 cpuset_freepidlist(pl); 2483 pl = NULL; 2484 2485 secs_slept = secs_loop - 
sleep(secs_loop); 2486 2487 /* Ensure forward progress */ 2488 if (secs_slept == 0) 2489 secs_slept = 1; 2490 2491 /* Ensure sane sleep() return (unnecessary?) */ 2492 if (secs_slept > secs_loop) 2493 secs_slept = secs_loop; 2494 2495 secs_left -= secs_slept; 2496 2497 if (secs_loop < 10) 2498 secs_loop++; 2499 2500 secs_loop = MIN(secs_left, secs_loop); 2501 } 2502 2503 took_too_long: 2504 sav_errno = ETIME; 2505 /* fall into ... */ 2506 failed: 2507 cpuset_freepidlist(pl); 2508 errno = sav_errno; 2509 return -1; 2510 2511 rmdir_cpusets: 2512 /* Let's try removing cpuset(s) now. */ 2513 cpuset_freepidlist(pl); 2514 2515 if ((cs_tree = cpuset_fts_open(relpath)) == NULL && errno != ENOENT) 2516 return -1; 2517 ret = 0; 2518 cpuset_fts_reverse(cs_tree); /* rmdir's must be done bottom up */ 2519 while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) { 2520 char buf[PATH_MAX]; 2521 2522 fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry)); 2523 if (rmdir(buf) < 0 && errno != ENOENT) { 2524 sav_errno = errno; 2525 ret = -1; 2526 } 2527 } 2528 cpuset_fts_close(cs_tree); 2529 /* fall into ... */ 2530 no_more_cpuset: 2531 if (ret == 0) 2532 errno = 0; 2533 else 2534 errno = sav_errno; 2535 return ret; 2536 } 2537 2538 /* 2539 * When recursively reading all the tasks files from a subtree, 2540 * chain together the read results, one pidblock per tasks file, 2541 * containing the raw unprocessed ascii as read(2) in. After 2542 * we gather up this raw data, we then go back to count how 2543 * many pid's there are in total, allocate an array of pid_t 2544 * of that size, and transform the raw ascii data into this 2545 * array of pid_t's. 2546 */ 2547 2548 struct pidblock { 2549 char *buf; 2550 int buflen; 2551 struct pidblock *next; 2552 }; 2553 2554 /* 2555 * Chain the raw contents of a file onto the pbhead list. 
2556 * 2557 * We malloc "+ 1" extra byte for a nul-terminator, so that 2558 * the strtoul() loop in pid_transform() won't scan past 2559 * the end of pb->buf[] and accidentally find more pids. 2560 */ 2561 static void add_pidblock(const char *file, struct pidblock **ppbhead) 2562 { 2563 FILE *fp = NULL; 2564 struct pidblock *pb = NULL; 2565 int fsz; 2566 2567 if ((fp = fopen(file, "r")) == NULL) 2568 goto err; 2569 fsz = filesize(fp); 2570 if (fsz == 0) 2571 goto err; 2572 if ((pb = calloc(1, sizeof(*pb))) == NULL) 2573 goto err; 2574 pb->buflen = fsz; 2575 if ((pb->buf = malloc(pb->buflen + 1)) == NULL) 2576 goto err; 2577 if (fread(pb->buf, 1, pb->buflen, fp) > 0) { 2578 pb->buf[pb->buflen] = '\0'; 2579 pb->next = *ppbhead; 2580 *ppbhead = pb; 2581 } 2582 fclose(fp); 2583 return; 2584 err: 2585 if (fp) 2586 fclose(fp); 2587 free(pb); 2588 } 2589 2590 static void read_task_file(const char *relpath, struct pidblock **ppbhead) 2591 { 2592 char buf[PATH_MAX]; 2593 2594 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2595 add_pidblock(buf, ppbhead); 2596 } 2597 2598 struct cpuset_pidlist { 2599 pid_t *pids; 2600 int npids; 2601 }; 2602 2603 /* Count how many pids in buf (one per line - just count newlines) */ 2604 static int pidcount(const char *buf, int buflen) 2605 { 2606 int n = 0; 2607 const char *cp; 2608 2609 for (cp = buf; cp < buf + buflen; cp++) { 2610 if (*cp == '\n') 2611 n++; 2612 } 2613 return n; 2614 } 2615 2616 /* Transform one-per-line ascii pids in pb to pid_t entries in pl */ 2617 static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n) 2618 { 2619 char *a, *b; 2620 2621 for (a = pb->buf; a < pb->buf + pb->buflen; a = b) { 2622 pid_t p = strtoul(a, &b, 10); 2623 if (a == b) 2624 break; 2625 pl->pids[n++] = p; 2626 } 2627 return n; 2628 } 2629 2630 static void free_pidblocks(struct pidblock *pbhead) 2631 { 2632 struct pidblock *pb, *nextpb; 2633 2634 for (pb = pbhead; pb; pb = nextpb) { 2635 nextpb = pb->next; 2636 free(pb->buf); 
2637 free(pb); 2638 } 2639 } 2640 2641 /* numeric comparison routine for qsort */ 2642 static int numericsort(const void *m1, const void *m2) 2643 { 2644 pid_t p1 = *(pid_t *) m1; 2645 pid_t p2 = *(pid_t *) m2; 2646 2647 return p1 - p2; 2648 } 2649 2650 /* Return list pids in cpuset 'path' */ 2651 struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath, 2652 int recursiveflag) 2653 { 2654 struct pidblock *pb = NULL; 2655 struct cpuset_pidlist *pl = NULL; 2656 struct pidblock *pbhead = NULL; 2657 int n; 2658 2659 if (check() < 0) 2660 goto err; 2661 2662 if (recursiveflag) { 2663 struct cpuset_fts_tree *cs_tree; 2664 const struct cpuset_fts_entry *cs_entry; 2665 2666 if ((cs_tree = cpuset_fts_open(relpath)) == NULL) 2667 goto err; 2668 while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) { 2669 if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET) 2670 continue; 2671 read_task_file(cpuset_fts_get_path(cs_entry), &pbhead); 2672 } 2673 cpuset_fts_close(cs_tree); 2674 } else { 2675 read_task_file(relpath, &pbhead); 2676 } 2677 2678 if ((pl = calloc(1, sizeof(*pl))) == NULL) 2679 goto err; 2680 pl->npids = 0; 2681 for (pb = pbhead; pb; pb = pb->next) 2682 pl->npids += pidcount(pb->buf, pb->buflen); 2683 if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL) 2684 goto err; 2685 n = 0; 2686 for (pb = pbhead; pb; pb = pb->next) 2687 n = pid_transform(pb, pl, n); 2688 free_pidblocks(pbhead); 2689 qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort); 2690 return pl; 2691 err: 2692 cpuset_freepidlist(pl); 2693 free_pidblocks(pbhead); 2694 return NULL; 2695 } 2696 2697 /* Return number of elements in pidlist */ 2698 int cpuset_pidlist_length(const struct cpuset_pidlist *pl) 2699 { 2700 if (pl) 2701 return pl->npids; 2702 else 2703 return 0; 2704 } 2705 2706 /* Return i'th element of pidlist */ 2707 pid_t cpuset_get_pidlist(const struct cpuset_pidlist * pl, int i) 2708 { 2709 if (pl && i >= 0 && i < pl->npids) 2710 return pl->pids[i]; 2711 else 2712 return 
(pid_t) - 1; 2713 } 2714 2715 /* Free pidlist */ 2716 void cpuset_freepidlist(struct cpuset_pidlist *pl) 2717 { 2718 if (pl && pl->pids) 2719 free(pl->pids); 2720 free(pl); 2721 } 2722 2723 static int __cpuset_move(pid_t pid, const char *path) 2724 { 2725 char buf[SMALL_BUFSZ]; 2726 2727 snprintf(buf, sizeof(buf), "%u", pid); 2728 return write_string_file(path, buf); 2729 } 2730 2731 /* Move task (pid == 0 for current) to a cpuset */ 2732 int cpuset_move(pid_t pid, const char *relpath) 2733 { 2734 char buf[PATH_MAX]; 2735 2736 if (check() < 0) 2737 return -1; 2738 2739 if (pid == 0) 2740 pid = getpid(); 2741 2742 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2743 return __cpuset_move(pid, buf); 2744 } 2745 2746 /* Move all tasks in pidlist to a cpuset */ 2747 int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath) 2748 { 2749 int i; 2750 char buf[PATH_MAX]; 2751 int ret; 2752 2753 if (check() < 0) 2754 return -1; 2755 2756 fullpath2(buf, sizeof(buf), relpath, "tasks"); 2757 2758 ret = 0; 2759 for (i = 0; i < pl->npids; i++) 2760 if (__cpuset_move(pl->pids[i], buf) < 0) 2761 ret = -1; 2762 return ret; 2763 } 2764 2765 /* 2766 * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a 2767 * cpuset to another cpuset 2768 * 2769 * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may 2770 * race with tasks being added to or forking into fromrelpath. Loop 2771 * repeatedly, reading the tasks file of cpuset fromrelpath and writing 2772 * any task pid's found there to the tasks file of cpuset torelpath, 2773 * up to ten attempts, or until the tasks file of cpuset fromrelpath 2774 * is empty, or until fromrelpath is no longer present. 2775 * 2776 * Returns 0 with errno == 0 if able to empty the tasks file of cpuset 2777 * fromrelpath. 
Of course it is still possible that some independent 2778 * task could add another task to cpuset fromrelpath at the same time 2779 * that such a successful result is being returned, so there can be 2780 * no guarantee that a successful return means that fromrelpath is 2781 * still empty of tasks. 2782 * 2783 * We are careful to allow for the possibility that the cpuset 2784 * fromrelpath might disappear out from under us, perhaps because it 2785 * has notify_on_release set and gets automatically removed as soon 2786 * as we detach its last task from it. Consider a missing fromrelpath 2787 * to be a successful move. 2788 * 2789 * If called with fromrelpath and torelpath pathnames that evaluate to 2790 * the same cpuset, then treat that as if cpuset_reattach() was called, 2791 * rebinding each task in this cpuset one time, and return success or 2792 * failure depending on the return of that cpuset_reattach() call. 2793 * 2794 * On failure, returns -1, with errno possibly one of: 2795 * EACCES - search permission denied on intervening directory 2796 * ENOTEMPTY - tasks remain after multiple attempts to move them 2797 * EMFILE - too many open files 2798 * ENODEV - /dev/cpuset not mounted 2799 * ENOENT - component of cpuset path doesn't exist 2800 * ENOMEM - out of memory 2801 * ENOSYS - kernel doesn't support cpusets 2802 * ENOTDIR - component of cpuset path is not a directory 2803 * EPERM - lacked permission to kill a task 2804 * EPERM - lacked permission to read cpusets or files therein 2805 * 2806 * This is an [optional] function. Use cpuset_function to invoke it. 
 */

#define NUMBER_MOVE_TASK_ATTEMPTS 10

int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath)
{
	char fromfullpath[PATH_MAX];
	char tofullpath[PATH_MAX];
	int i;
	struct cpuset_pidlist *pl = NULL;
	int sav_errno;

	fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath);
	fullpath(tofullpath, sizeof(tofullpath), torelpath);

	/* same source and destination ==> just rebind the tasks once */
	if (samefile(fromfullpath, tofullpath))
		return cpuset_reattach(fromrelpath);

	for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) {
		int plen, j;

		if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) {
			/* missing cpuset is as good as if all moved */
			if (errno == ENOENT)
				goto no_more_cpuset;

			/* other problems reading cpuset are bad news */
			sav_errno = errno;
			goto failed;
		}

		if ((plen = cpuset_pidlist_length(pl)) == 0)
			goto no_more_pids;

		for (j = 0; j < plen; j++) {
			pid_t pid;

			pid = cpuset_get_pidlist(pl, j);
			if (cpuset_move(pid, torelpath) < 0) {
				/* missing task is as good as if moved */
				if (errno == ESRCH)
					continue;

				/* other per-task errors are bad news */
				sav_errno = errno;
				goto failed;
			}
		}

		cpuset_freepidlist(pl);
		pl = NULL;
	}

	/* tasks still remain after all attempts */
	sav_errno = ENOTEMPTY;
	/* fall into ... */
failed:
	cpuset_freepidlist(pl);
	errno = sav_errno;
	return -1;

no_more_pids:
no_more_cpuset:
	/* Success - all tasks (or entire cpuset ;) gone. */
	cpuset_freepidlist(pl);
	errno = 0;
	return 0;
}

/* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */
int cpuset_migrate(pid_t pid, const char *relpath)
{
	char buf[PATH_MAX];
	char buf2[PATH_MAX];
	char memory_migrate_flag;
	int r;

	if (check() < 0)
		return -1;

	if (pid == 0)
		pid = getpid();

	fullpath(buf2, sizeof(buf2), relpath);

	/* remember current memory_migrate setting so it can be restored */
	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
		return -1;
	if (store_flag(buf2, "memory_migrate", 1) < 0)
		return -1;

	fullpath2(buf, sizeof(buf), relpath, "tasks");

	r = __cpuset_move(pid, buf);

	/*
	 * NOTE(review): the restore's return value is ignored here,
	 * while cpuset_migrate_all() checks its restore - confirm
	 * whether this asymmetry is intended.
	 */
	store_flag(buf2, "memory_migrate", memory_migrate_flag);
	return r;
}

/* Migrate all tasks in pidlist to a cpuset (moves task and memory) */
int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath)
{
	int i;
	char buf[PATH_MAX];
	char buf2[PATH_MAX];
	char memory_migrate_flag;
	int ret;

	if (check() < 0)
		return -1;

	fullpath(buf2, sizeof(buf2), relpath);

	/* remember current memory_migrate setting so it can be restored */
	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
		return -1;
	if (store_flag(buf2, "memory_migrate", 1) < 0)
		return -1;

	fullpath2(buf, sizeof(buf), relpath, "tasks");

	ret = 0;
	for (i = 0; i < pl->npids; i++)
		if (__cpuset_move(pl->pids[i], buf) < 0)
			ret = -1;

	if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0)
		ret = -1;
	return ret;
}

/* Rebind cpus_allowed of each task in cpuset 'path' */
int cpuset_reattach(const char *relpath)
{
	struct cpuset_pidlist *pl;
	int rc;

	if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL)
		return -1;
	rc = cpuset_move_all(pl, relpath);
	cpuset_freepidlist(pl);
	return rc;
}

/* Map cpuset relative cpu number to system wide
cpu number */ 2949 int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu) 2950 { 2951 struct cpuset *cp_tofree = NULL; 2952 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 2953 int pos = -1; 2954 2955 if (!cp1) 2956 goto err; 2957 pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu); 2958 /* fall into ... */ 2959 err: 2960 cpuset_free(cp_tofree); 2961 return pos; 2962 } 2963 2964 /* Map system wide cpu number to cpuset relative cpu number */ 2965 int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu) 2966 { 2967 struct cpuset *cp_tofree = NULL; 2968 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 2969 int pos = -1; 2970 2971 if (!cp1) 2972 goto err; 2973 pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu); 2974 /* fall into ... */ 2975 err: 2976 cpuset_free(cp_tofree); 2977 return pos; 2978 } 2979 2980 /* Map cpuset relative mem number to system wide mem number */ 2981 int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem) 2982 { 2983 struct cpuset *cp_tofree = NULL; 2984 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 2985 int pos = -1; 2986 2987 if (!cp1) 2988 goto err; 2989 pos = bitmask_rel_to_abs_pos(cp1->mems, mem); 2990 /* fall into ... */ 2991 err: 2992 cpuset_free(cp_tofree); 2993 return pos; 2994 } 2995 2996 /* Map system wide mem number to cpuset relative mem number */ 2997 int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem) 2998 { 2999 struct cpuset *cp_tofree = NULL; 3000 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree); 3001 int pos = -1; 3002 3003 if (!cp1) 3004 goto err; 3005 pos = bitmask_abs_to_rel_pos(cp1->mems, mem); 3006 /* fall into ... 
*/ 3007 err: 3008 cpuset_free(cp_tofree); 3009 return pos; 3010 } 3011 3012 /* Map pid's cpuset relative cpu number to system wide cpu number */ 3013 int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu) 3014 { 3015 struct cpuset *cp; 3016 int rc = -1; 3017 3018 if ((cp = cpuset_alloc()) == NULL) 3019 goto done; 3020 if (cpuset_cpusetofpid(cp, pid) < 0) 3021 goto done; 3022 rc = cpuset_c_rel_to_sys_cpu(cp, cpu); 3023 done: 3024 cpuset_free(cp); 3025 return rc; 3026 } 3027 3028 /* Map system wide cpu number to pid's cpuset relative cpu number */ 3029 int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu) 3030 { 3031 struct cpuset *cp; 3032 int rc = -1; 3033 3034 if ((cp = cpuset_alloc()) == NULL) 3035 goto done; 3036 if (cpuset_cpusetofpid(cp, pid) < 0) 3037 goto done; 3038 rc = cpuset_c_sys_to_rel_cpu(cp, cpu); 3039 done: 3040 cpuset_free(cp); 3041 return rc; 3042 } 3043 3044 /* Map pid's cpuset relative mem number to system wide mem number */ 3045 int cpuset_p_rel_to_sys_mem(pid_t pid, int mem) 3046 { 3047 struct cpuset *cp; 3048 int rc = -1; 3049 3050 if ((cp = cpuset_alloc()) == NULL) 3051 goto done; 3052 if (cpuset_cpusetofpid(cp, pid) < 0) 3053 goto done; 3054 rc = cpuset_c_rel_to_sys_mem(cp, mem); 3055 done: 3056 cpuset_free(cp); 3057 return rc; 3058 } 3059 3060 /* Map system wide mem number to pid's cpuset relative mem number */ 3061 int cpuset_p_sys_to_rel_mem(pid_t pid, int mem) 3062 { 3063 struct cpuset *cp; 3064 int rc = -1; 3065 3066 if ((cp = cpuset_alloc()) == NULL) 3067 goto done; 3068 if (cpuset_cpusetofpid(cp, pid) < 0) 3069 goto done; 3070 rc = cpuset_c_sys_to_rel_mem(cp, mem); 3071 done: 3072 cpuset_free(cp); 3073 return rc; 3074 } 3075 3076 /* 3077 * Override glibc's calls for get/set affinity - they have 3078 * something using cpu_set_t that will die when NR_CPUS > 1024. 3079 * Go directly to the 'real' system calls. Also override calls 3080 * for get_mempolicy and set_mempolicy. 
None of these 3081 * calls are yet (July 2004) guaranteed to be in all glibc versions 3082 * that we care about. 3083 */ 3084 3085 static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask) 3086 { 3087 return ltp_syscall(__NR_sched_setaffinity, pid, len, mask); 3088 } 3089 3090 #if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE 3091 static int get_mempolicy(int *policy, unsigned long *nmask, 3092 unsigned long maxnode, void *addr, int flags) 3093 { 3094 return ltp_syscall(__NR_get_mempolicy, policy, nmask, maxnode, 3095 addr, flags); 3096 } 3097 #endif 3098 3099 #if HAVE_DECL_MPOL_BIND || HAVE_DECL_MPOL_DEFAULT 3100 static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode) 3101 { 3102 return ltp_syscall(__NR_set_mempolicy, mode, nmask, maxnode); 3103 } 3104 #endif 3105 3106 struct cpuset_placement { 3107 struct bitmask *cpus; 3108 struct bitmask *mems; 3109 char *path; 3110 }; 3111 3112 /* Allocate and fill in a placement struct - cpatures current placement */ 3113 struct cpuset_placement *cpuset_get_placement(pid_t pid) 3114 { 3115 struct cpuset_placement *plc; 3116 struct cpuset *cp = NULL; 3117 char buf[PATH_MAX]; 3118 int nbits; 3119 3120 if ((plc = calloc(1, sizeof(*plc))) == NULL) 3121 goto err; 3122 3123 nbits = cpuset_cpus_nbits(); 3124 if ((plc->cpus = bitmask_alloc(nbits)) == NULL) 3125 goto err; 3126 3127 nbits = cpuset_mems_nbits(); 3128 if ((plc->mems = bitmask_alloc(nbits)) == NULL) 3129 goto err; 3130 3131 if ((cp = cpuset_alloc()) == NULL) 3132 goto err; 3133 if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL) 3134 goto err; 3135 if (cpuset_query(cp, buf) < 0) 3136 goto err; 3137 3138 bitmask_copy(plc->cpus, cp->cpus); 3139 bitmask_copy(plc->mems, cp->mems); 3140 plc->path = strdup(buf); 3141 3142 cpuset_free(cp); 3143 return plc; 3144 err: 3145 cpuset_free(cp); 3146 cpuset_free_placement(plc); 3147 return NULL; 3148 } 3149 3150 /* Compare two placement structs - use to detect changes in placement */ 3151 
int cpuset_equal_placement(const struct cpuset_placement *plc1, 3152 const struct cpuset_placement *plc2) 3153 { 3154 return bitmask_equal(plc1->cpus, plc2->cpus) && 3155 bitmask_equal(plc1->mems, plc2->mems) && 3156 streq(plc1->path, plc2->path); 3157 } 3158 3159 /* Free a placement struct */ 3160 void cpuset_free_placement(struct cpuset_placement *plc) 3161 { 3162 if (!plc) 3163 return; 3164 bitmask_free(plc->cpus); 3165 bitmask_free(plc->mems); 3166 free(plc->path); 3167 free(plc); 3168 } 3169 3170 /* 3171 * A cpuset_fts_open() call constructs a linked list of entries 3172 * called a "cpuset_fts_tree", with one entry per cpuset below 3173 * the specified path. The cpuset_fts_read() routine returns the 3174 * next entry on this list. The various cpuset_fts_get_*() calls 3175 * return attributes of the specified entry. The cpuset_fts_close() 3176 * call frees the linked list and all associated data. All cpuset 3177 * entries and attributes for the cpuset_fts_tree returned from a 3178 * given cpuset_fts_open() call remain allocated and unchanged until 3179 * that cpuset_fts_tree is closed by a cpuset_fts_close() call. Any 3180 * subsequent changes to the cpuset filesystem will go unnoticed 3181 * (not affect open cpuset_fts_tree's.) 3182 */ 3183 3184 struct cpuset_fts_entry; 3185 void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree); 3186 3187 struct cpuset_fts_tree { 3188 struct cpuset_fts_entry *head; /* head of linked entry list */ 3189 struct cpuset_fts_entry *next; /* cpuset_fts_read() offset */ 3190 }; 3191 3192 struct cpuset_fts_entry { 3193 struct cpuset_fts_entry *next; /* linked entry list chain */ 3194 struct cpuset *cpuset; 3195 struct stat *stat; 3196 char *path; 3197 int info; 3198 int err; 3199 }; 3200 3201 /* Open a handle on a cpuset hierarchy. All the real work is done here. 
 */
struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
{
	FTS *fts = NULL;
	FTSENT *ftsent;
	char *path_argv[2];
	char buf[PATH_MAX];
	struct cpuset_fts_tree *cs_tree = NULL;
	struct cpuset_fts_entry *ep;	/* the latest new list entry */
	struct cpuset_fts_entry **pnlep;	/* ptr to next list entry ptr */
	char *relpath;
	int fts_flags;

	fullpath(buf, sizeof(buf), cpusetpath);
	path_argv[0] = buf;
	path_argv[1] = NULL;

	/* walk directories only, don't cross filesystem boundaries */
	fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
	fts = fts_open(path_argv, fts_flags, NULL);
	if (fts == NULL)
		goto err;

	cs_tree = malloc(sizeof(*cs_tree));
	if (cs_tree == NULL)
		goto err;
	pnlep = &cs_tree->head;
	*pnlep = NULL;

	while ((ftsent = fts_read(fts)) != NULL) {
		if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
			continue;

		/* ftsent is a directory (perhaps unreadable) ==> cpuset */
		ep = calloc(1, sizeof(*ep));
		if (ep == NULL)
			goto err;
		*pnlep = ep;
		pnlep = &ep->next;

		/* Set entry's path, and if DNR, error */
		relpath = ftsent->fts_path + strlen(cpusetmnt);
		if (strlen(relpath) == 0)
			relpath = "/";
		ep->path = strdup(relpath);
		if (ep->path == NULL)
			goto err;
		if (ftsent->fts_info == FTS_DNR) {
			ep->info = CPUSET_FTS_ERR_DNR;
			ep->err = ftsent->fts_errno;
			continue;
		}

		/* ftsent is a -readable- cpuset: set entry's stat, etc */
		ep->stat = calloc(1, sizeof(struct stat));
		if (ep->stat == NULL)
			goto err;
		if (stat(ftsent->fts_path, ep->stat) < 0) {
			ep->info = CPUSET_FTS_ERR_STAT;
			ep->err = ftsent->fts_errno;
			continue;
		}

		ep->cpuset = calloc(1, sizeof(struct cpuset));
		if (ep->cpuset == NULL)
			goto err;
		if (cpuset_query(ep->cpuset, relpath) < 0) {
			ep->info = CPUSET_FTS_ERR_CPUSET;
			ep->err = errno;
			continue;
		}
		ep->info = CPUSET_FTS_CPUSET;
	}

	(void)fts_close(fts);
	cpuset_fts_rewind(cs_tree);
	return cs_tree;

err:
	/* cs_tree->head is always a valid (possibly empty) chain here */
	if (cs_tree)
		cpuset_fts_close(cs_tree);
	if (fts)
		(void)fts_close(fts);
	return NULL;
}

/* Return pointer to next cpuset entry in hierarchy */
const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
{
	const struct cpuset_fts_entry *cs_entry = cs_tree->next;
	if (cs_tree->next != NULL)	/* seek to next entry */
		cs_tree->next = cs_tree->next->next;
	return cs_entry;
}

/* Reverse list of cpusets, in place.  Simulates pre-order/post-order flip. */
void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
{
	struct cpuset_fts_entry *cs1, *cs2, *cs3;

	/*
	 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
	 * is redirected from cs3 to cs1.
	 */

	cs1 = cs2 = NULL;
	cs3 = cs_tree->head;
	while (cs3) {
		cs1 = cs2;
		cs2 = cs3;
		cs3 = cs3->next;
		cs2->next = cs1;
	}
	cs_tree->head = cs2;
	cpuset_fts_rewind(cs_tree);
}

/* Rewind cpuset list to beginning */
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
{
	cs_tree->next = cs_tree->head;
}

/* Return pointer to nul-terminated cpuset path of entry in hierarchy */
const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->path;
}

/* Return pointer to stat(2) structure of a cpuset entry's directory */
const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->stat;
}

/* Return pointer to cpuset structure of a cpuset entry */
const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry
					   *cs_entry)
{
	return cs_entry->cpuset;
}

/* Return value of errno (0 if no error) on
attempted cpuset operations */ 3343 int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry) 3344 { 3345 return cs_entry->err; 3346 } 3347 3348 /* Return operation identity causing error */ 3349 int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry) 3350 { 3351 return cs_entry->info; 3352 } 3353 3354 /* Close a cpuset hierarchy handle (free's all associated memory) */ 3355 void cpuset_fts_close(struct cpuset_fts_tree *cs_tree) 3356 { 3357 struct cpuset_fts_entry *cs_entry = cs_tree->head; 3358 3359 while (cs_entry) { 3360 struct cpuset_fts_entry *ep = cs_entry; 3361 3362 cs_entry = cs_entry->next; 3363 free(ep->path); 3364 free(ep->stat); 3365 cpuset_free(ep->cpuset); 3366 free(ep); 3367 } 3368 free(cs_tree); 3369 } 3370 3371 /* Bind current task to cpu (uses sched_setaffinity(2)) */ 3372 int cpuset_cpubind(int cpu) 3373 { 3374 struct bitmask *bmp; 3375 int r; 3376 3377 if ((bmp = bitmask_alloc(cpuset_cpus_nbits())) == NULL) 3378 return -1; 3379 bitmask_setbit(bmp, cpu); 3380 r = sched_setaffinity(0, bitmask_nbytes(bmp), bitmask_mask(bmp)); 3381 bitmask_free(bmp); 3382 return r; 3383 } 3384 3385 /* 3386 * int cpuset_latestcpu(pid_t pid) 3387 * 3388 * Return most recent CPU on which task pid executed. If pid == 0, 3389 * examine current task. 3390 * 3391 * The last used CPU is visible for a given pid as field #39 (starting 3392 * with #1) in the file /proc/pid/stat. Currently this file has 41 3393 * fields, in which case this is the 3rd to the last field. 3394 * 3395 * Unfortunately field #2 is a command name and might have embedded 3396 * whitespace. So we can't just count white space separated fields. 3397 * Fortunately, this command name is surrounded by parentheses, as 3398 * for example "(sh)", and that closing parenthesis is the last ')' 3399 * character in the line. No remaining fields can have embedded 3400 * whitespace or parentheses. 
So instead of looking for the 39th 3401 * white space separated field, we can look for the 37th white space 3402 * separated field past the last ')' character on the line. 3403 */ 3404 3405 /* Return most recent CPU on which task pid executed */ 3406 int cpuset_latestcpu(pid_t pid) 3407 { 3408 char buf[PATH_MAX]; 3409 char *bp; 3410 int fd = -1; 3411 int cpu = -1; 3412 3413 if (pid == 0) 3414 snprintf(buf, sizeof(buf), "/proc/self/stat"); 3415 else 3416 snprintf(buf, sizeof(buf), "/proc/%d/stat", pid); 3417 3418 if ((fd = open(buf, O_RDONLY)) < 0) 3419 goto err; 3420 if (read(fd, buf, sizeof(buf)) < 1) 3421 goto err; 3422 close(fd); 3423 3424 bp = strrchr(buf, ')'); 3425 if (bp) 3426 sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %u", /* 37th field past ')' */ 3427 &cpu); 3428 if (cpu < 0) 3429 errno = EINVAL; 3430 return cpu; 3431 err: 3432 if (fd >= 0) 3433 close(fd); 3434 return -1; 3435 } 3436 3437 /* Bind current task to memory (uses set_mempolicy(2)) */ 3438 int cpuset_membind(int mem) 3439 { 3440 struct bitmask *bmp; 3441 int r; 3442 3443 if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL) 3444 return -1; 3445 bitmask_setbit(bmp, mem); 3446 #if HAVE_DECL_MPOL_BIND 3447 r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), bitmask_nbits(bmp) + 1); 3448 #else 3449 r = -1; 3450 errno = ENOSYS; 3451 #endif 3452 bitmask_free(bmp); 3453 return r; 3454 } 3455 3456 /* [optional] Return Memory Node holding page at specified addr */ 3457 int cpuset_addr2node(void *addr) 3458 { 3459 int node = -1; 3460 3461 #if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE 3462 if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR)) { 3463 /* I realize this seems redundant, but I _want_ to make sure 3464 * that this value is -1. 
*/ 3465 node = -1; 3466 } 3467 #endif 3468 return node; 3469 } 3470 3471 /* 3472 * Transform cpuset into Text Format Representation in buffer 'buf', 3473 * of length 'buflen', nul-terminated if space allows. Return number 3474 * of characters that would have been written, if enough space had 3475 * been available, in the same way that snprintf() does. 3476 */ 3477 3478 /* Export cpuset settings to a regular file */ 3479 int cpuset_export(const struct cpuset *cp, char *buf, int buflen) 3480 { 3481 char *tmp = NULL; 3482 int n = 0; 3483 3484 if (cp->cpu_exclusive) 3485 n += snprintf(buf + n, MAX(buflen - n, 0), "cpu_exclusive\n"); 3486 3487 if (cp->mem_exclusive) 3488 n += snprintf(buf + n, MAX(buflen - n, 0), "mem_exclusive\n"); 3489 3490 if (cp->notify_on_release) 3491 n += snprintf(buf + n, MAX(buflen - n, 0), 3492 "notify_on_release\n"); 3493 3494 if (cp->memory_pressure_enabled) 3495 n += snprintf(buf + n, MAX(buflen - n, 0), 3496 "memory_pressure_enabled\n"); 3497 3498 if (cp->memory_migrate) 3499 n += snprintf(buf + n, MAX(buflen - n, 0), "memory_migrate\n"); 3500 3501 if (cp->memory_spread_page) 3502 n += snprintf(buf + n, MAX(buflen - n, 0), 3503 "memory_spread_page\n"); 3504 3505 if (cp->memory_spread_slab) 3506 n += snprintf(buf + n, MAX(buflen - n, 0), 3507 "memory_spread_slab\n"); 3508 3509 if ((tmp = sprint_mask_buf(cp->cpus)) == NULL) 3510 return -1; 3511 n += snprintf(buf + n, MAX(buflen - n, 0), "cpus %s\n", tmp); 3512 free(tmp); 3513 tmp = NULL; 3514 3515 if ((tmp = sprint_mask_buf(cp->mems)) == NULL) 3516 return -1; 3517 n += snprintf(buf + n, MAX(buflen - n, 0), "mems %s\n", tmp); 3518 free(tmp); 3519 tmp = NULL; 3520 3521 return n; 3522 } 3523 3524 static int import_list(UNUSED const char *tok, const char *arg, 3525 struct bitmask *bmp, char *emsg, int elen) 3526 { 3527 if (bitmask_parselist(arg, bmp) < 0) { 3528 if (emsg) 3529 snprintf(emsg, elen, "Invalid list format: %s", arg); 3530 return -1; 3531 } 3532 return 0; 3533 } 3534 3535 static void 
stolower(char *s) 3536 { 3537 while (*s) { 3538 unsigned char c = *s; 3539 *s = tolower(c); 3540 s++; 3541 } 3542 } 3543 3544 /* Import cpuset settings from a regular file */ 3545 int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum, 3546 char *emsg, int elen) 3547 { 3548 char *linebuf = NULL; 3549 int linebuflen; 3550 int linenum = 0; 3551 int offset = 0; 3552 3553 linebuflen = strlen(buf) + 1; 3554 if ((linebuf = malloc(linebuflen)) == NULL) { 3555 if (emsg) 3556 snprintf(emsg, elen, "Insufficient memory"); 3557 goto err; 3558 } 3559 3560 while (slgets(linebuf, linebuflen, buf, &offset)) { 3561 char *tok, *arg; 3562 char *ptr; /* for strtok_r */ 3563 3564 linenum++; 3565 if ((tok = strchr(linebuf, '#')) != NULL) 3566 *tok = 0; 3567 if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL) 3568 continue; 3569 stolower(tok); 3570 3571 arg = strtok_r(0, " \t", &ptr); 3572 3573 if (streq(tok, "cpu_exclusive")) { 3574 cp->cpu_exclusive = 1; 3575 goto eol; 3576 } 3577 if (streq(tok, "mem_exclusive")) { 3578 cp->mem_exclusive = 1; 3579 goto eol; 3580 } 3581 if (streq(tok, "notify_on_release")) { 3582 cp->notify_on_release = 1; 3583 goto eol; 3584 } 3585 if (streq(tok, "memory_pressure_enabled")) { 3586 cp->memory_pressure_enabled = 1; 3587 goto eol; 3588 } 3589 if (streq(tok, "memory_migrate")) { 3590 cp->memory_migrate = 1; 3591 goto eol; 3592 } 3593 if (streq(tok, "memory_spread_page")) { 3594 cp->memory_spread_page = 1; 3595 goto eol; 3596 } 3597 if (streq(tok, "memory_spread_slab")) { 3598 cp->memory_spread_slab = 1; 3599 goto eol; 3600 } 3601 if (streq(tok, "cpu") || streq(tok, "cpus")) { 3602 if (import_list(tok, arg, cp->cpus, emsg, elen) < 0) 3603 goto err; 3604 goto eol; 3605 } 3606 if (streq(tok, "mem") || streq(tok, "mems")) { 3607 if (import_list(tok, arg, cp->mems, emsg, elen) < 0) 3608 goto err; 3609 goto eol; 3610 } 3611 if (emsg) 3612 snprintf(emsg, elen, "Unrecognized token: '%s'", tok); 3613 goto err; 3614 eol: 3615 if ((tok = strtok_r(0, " 
\t", &ptr)) != NULL) { 3616 if (emsg) 3617 snprintf(emsg, elen, "Surplus token: '%s'", 3618 tok); 3619 goto err; 3620 } 3621 continue; 3622 } 3623 3624 free(linebuf); 3625 3626 if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems)) 3627 cpuset_localcpus(cp->mems, cp->cpus); 3628 else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems)) 3629 cpuset_localmems(cp->cpus, cp->mems); 3630 3631 /* 3632 * All cpuset attributes are determined in an import. 3633 * Those that aren't explicitly specified are presumed 3634 * to be unchanged (zero, if it's a freshly allocated 3635 * struct cpuset.) 3636 */ 3637 3638 cp->cpus_valid = 1; 3639 cp->mems_valid = 1; 3640 cp->cpu_exclusive_valid = 1; 3641 cp->mem_exclusive_valid = 1; 3642 cp->notify_on_release_valid = 1; 3643 cp->memory_migrate_valid = 1; 3644 cp->memory_pressure_enabled_valid = 1; 3645 cp->memory_spread_page_valid = 1; 3646 cp->memory_spread_slab_valid = 1; 3647 3648 return 0; 3649 err: 3650 if (elinenum) 3651 *elinenum = linenum; 3652 free(linebuf); 3653 return -1; 3654 } 3655 3656 /* Pin current task CPU (and memory) */ 3657 int cpuset_pin(int relcpu) 3658 { 3659 struct cpuset_placement *plc1 = NULL, *plc2 = NULL; 3660 int cpu, r; 3661 3662 if (check() < 0) 3663 return -1; 3664 3665 do { 3666 cpuset_free_placement(plc1); 3667 plc1 = cpuset_get_placement(0); 3668 3669 r = 0; 3670 if (cpuset_unpin() < 0) 3671 r = -1; 3672 cpu = cpuset_p_rel_to_sys_cpu(0, relcpu); 3673 if (cpuset_cpubind(cpu) < 0) 3674 r = -1; 3675 3676 cpuset_free_placement(plc2); 3677 plc2 = cpuset_get_placement(0); 3678 } while (!cpuset_equal_placement(plc1, plc2)); 3679 3680 cpuset_free_placement(plc1); 3681 cpuset_free_placement(plc2); 3682 return r; 3683 } 3684 3685 /* Return number CPUs in current tasks cpuset */ 3686 int cpuset_size() 3687 { 3688 struct cpuset_placement *plc1 = NULL, *plc2 = NULL; 3689 int r; 3690 3691 if (check() < 0) 3692 return -1; 3693 3694 do { 3695 cpuset_free_placement(plc1); 3696 plc1 = 
cpuset_get_placement(0); 3697 3698 r = cpuset_cpus_weight(0); 3699 3700 cpuset_free_placement(plc2); 3701 plc2 = cpuset_get_placement(0); 3702 } while (!cpuset_equal_placement(plc1, plc2)); 3703 3704 cpuset_free_placement(plc1); 3705 cpuset_free_placement(plc2); 3706 return r; 3707 } 3708 3709 /* Return relative CPU number, within current cpuset, last executed on */ 3710 int cpuset_where() 3711 { 3712 struct cpuset_placement *plc1 = NULL, *plc2 = NULL; 3713 int r; 3714 3715 if (check() < 0) 3716 return -1; 3717 3718 do { 3719 cpuset_free_placement(plc1); 3720 plc1 = cpuset_get_placement(0); 3721 3722 r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0)); 3723 3724 cpuset_free_placement(plc2); 3725 plc2 = cpuset_get_placement(0); 3726 } while (!cpuset_equal_placement(plc1, plc2)); 3727 3728 cpuset_free_placement(plc1); 3729 cpuset_free_placement(plc2); 3730 return r; 3731 } 3732 3733 /* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */ 3734 int cpuset_unpin() 3735 { 3736 struct bitmask *cpus = NULL, *mems = NULL; 3737 int r = -1; 3738 3739 if (check() < 0) 3740 goto err; 3741 3742 /* 3743 * Don't need cpuset_*_placement() guard against concurrent 3744 * cpuset migration, because none of the following depends 3745 * on the tasks cpuset placement. 3746 */ 3747 3748 if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL) 3749 goto err; 3750 bitmask_setall(cpus); 3751 if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0) 3752 goto err; 3753 3754 if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL) 3755 goto err; 3756 #if HAVE_DECL_MPOL_DEFAULT 3757 if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems), 3758 bitmask_nbits(mems) + 1) < 0) 3759 goto err; 3760 r = 0; 3761 #endif 3762 /* fall into ... 
*/ 3763 err: 3764 bitmask_free(cpus); 3765 bitmask_free(mems); 3766 return r; 3767 3768 } 3769 3770 struct cpuset_function_list { 3771 const char *fname; 3772 void *func; 3773 } flist[] = { 3774 { 3775 "cpuset_version", cpuset_version}, { 3776 "cpuset_alloc", cpuset_alloc}, { 3777 "cpuset_free", cpuset_free}, { 3778 "cpuset_cpus_nbits", cpuset_cpus_nbits}, { 3779 "cpuset_mems_nbits", cpuset_mems_nbits}, { 3780 "cpuset_setcpus", cpuset_setcpus}, { 3781 "cpuset_setmems", cpuset_setmems}, { 3782 "cpuset_set_iopt", cpuset_set_iopt}, { 3783 "cpuset_set_sopt", cpuset_set_sopt}, { 3784 "cpuset_getcpus", cpuset_getcpus}, { 3785 "cpuset_getmems", cpuset_getmems}, { 3786 "cpuset_cpus_weight", cpuset_cpus_weight}, { 3787 "cpuset_mems_weight", cpuset_mems_weight}, { 3788 "cpuset_get_iopt", cpuset_get_iopt}, { 3789 "cpuset_get_sopt", cpuset_get_sopt}, { 3790 "cpuset_localcpus", cpuset_localcpus}, { 3791 "cpuset_localmems", cpuset_localmems}, { 3792 "cpuset_cpumemdist", cpuset_cpumemdist}, { 3793 "cpuset_cpu2node", cpuset_cpu2node}, { 3794 "cpuset_addr2node", cpuset_addr2node}, { 3795 "cpuset_create", cpuset_create}, { 3796 "cpuset_delete", cpuset_delete}, { 3797 "cpuset_query", cpuset_query}, { 3798 "cpuset_modify", cpuset_modify}, { 3799 "cpuset_getcpusetpath", cpuset_getcpusetpath}, { 3800 "cpuset_cpusetofpid", cpuset_cpusetofpid}, { 3801 "cpuset_mountpoint", cpuset_mountpoint}, { 3802 "cpuset_collides_exclusive", cpuset_collides_exclusive}, { 3803 "cpuset_nuke", cpuset_nuke}, { 3804 "cpuset_init_pidlist", cpuset_init_pidlist}, { 3805 "cpuset_pidlist_length", cpuset_pidlist_length}, { 3806 "cpuset_get_pidlist", cpuset_get_pidlist}, { 3807 "cpuset_freepidlist", cpuset_freepidlist}, { 3808 "cpuset_move", cpuset_move}, { 3809 "cpuset_move_all", cpuset_move_all}, { 3810 "cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks}, { 3811 "cpuset_migrate", cpuset_migrate}, { 3812 "cpuset_migrate_all", cpuset_migrate_all}, { 3813 "cpuset_reattach", cpuset_reattach}, { 3814 
"cpuset_open_memory_pressure", cpuset_open_memory_pressure}, { 3815 "cpuset_read_memory_pressure", cpuset_read_memory_pressure}, { 3816 "cpuset_close_memory_pressure", cpuset_close_memory_pressure}, { 3817 "cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu}, { 3818 "cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu}, { 3819 "cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem}, { 3820 "cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem}, { 3821 "cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu}, { 3822 "cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu}, { 3823 "cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem}, { 3824 "cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem}, { 3825 "cpuset_get_placement", cpuset_get_placement}, { 3826 "cpuset_equal_placement", cpuset_equal_placement}, { 3827 "cpuset_free_placement", cpuset_free_placement}, { 3828 "cpuset_fts_open", cpuset_fts_open}, { 3829 "cpuset_fts_read", cpuset_fts_read}, { 3830 "cpuset_fts_reverse", cpuset_fts_reverse}, { 3831 "cpuset_fts_rewind", cpuset_fts_rewind}, { 3832 "cpuset_fts_get_path", cpuset_fts_get_path}, { 3833 "cpuset_fts_get_stat", cpuset_fts_get_stat}, { 3834 "cpuset_fts_get_cpuset", cpuset_fts_get_cpuset}, { 3835 "cpuset_fts_get_errno", cpuset_fts_get_errno}, { 3836 "cpuset_fts_get_info", cpuset_fts_get_info}, { 3837 "cpuset_fts_close", cpuset_fts_close}, { 3838 "cpuset_cpubind", cpuset_cpubind}, { 3839 "cpuset_latestcpu", cpuset_latestcpu}, { 3840 "cpuset_membind", cpuset_membind}, { 3841 "cpuset_export", cpuset_export}, { 3842 "cpuset_import", cpuset_import}, { 3843 "cpuset_function", cpuset_function}, { 3844 "cpuset_pin", cpuset_pin}, { 3845 "cpuset_size", cpuset_size}, { 3846 "cpuset_where", cpuset_where}, { 3847 "cpuset_unpin", cpuset_unpin},}; 3848 3849 /* Return pointer to a libcpuset.so function, or NULL */ 3850 void *cpuset_function(const char *function_name) 3851 { 3852 unsigned int i; 3853 3854 for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++) 3855 if (streq(function_name, 
flist[i].fname)) 3856 return flist[i].func; 3857 return NULL; 3858 } 3859 3860 /* Fortran interface to basic cpuset routines */ 3861 int cpuset_pin_(int *ptr_relcpu) 3862 { 3863 return cpuset_pin(*ptr_relcpu); 3864 } 3865 3866 int cpuset_size_(void) 3867 { 3868 return cpuset_size(); 3869 } 3870 3871 int cpuset_where_(void) 3872 { 3873 return cpuset_where(); 3874 } 3875 3876 int cpuset_unpin_(void) 3877 { 3878 return cpuset_unpin(); 3879 } 3880 3881 #endif /* HAVE_LINUX_MEMPOLICY_H */ 3882