1 /* 2 * builtin-stat.c 3 * 4 * Builtin stat command: Give a precise performance counters summary 5 * overview about any workload, CPU or specific PID. 6 * 7 * Sample output: 8 9 $ perf stat ./hackbench 10 10 11 Time: 0.118 12 13 Performance counter stats for './hackbench 10': 14 15 1708.761321 task-clock # 11.037 CPUs utilized 16 41,190 context-switches # 0.024 M/sec 17 6,735 CPU-migrations # 0.004 M/sec 18 17,318 page-faults # 0.010 M/sec 19 5,205,202,243 cycles # 3.046 GHz 20 3,856,436,920 stalled-cycles-frontend # 74.09% frontend cycles idle 21 1,600,790,871 stalled-cycles-backend # 30.75% backend cycles idle 22 2,603,501,247 instructions # 0.50 insns per cycle 23 # 1.48 stalled cycles per insn 24 484,357,498 branches # 283.455 M/sec 25 6,388,934 branch-misses # 1.32% of all branches 26 27 0.154822978 seconds time elapsed 28 29 * 30 * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo (at) redhat.com> 31 * 32 * Improvements and fixes by: 33 * 34 * Arjan van de Ven <arjan (at) linux.intel.com> 35 * Yanmin Zhang <yanmin.zhang (at) intel.com> 36 * Wu Fengguang <fengguang.wu (at) intel.com> 37 * Mike Galbraith <efault (at) gmx.de> 38 * Paul Mackerras <paulus (at) samba.org> 39 * Jaswinder Singh Rajput <jaswinder (at) kernel.org> 40 * 41 * Released under the GPL v2. (and only v2, not any later version) 42 */ 43 44 #include "perf.h" 45 #include "builtin.h" 46 #include "util/util.h" 47 #include "util/parse-options.h" 48 #include "util/parse-events.h" 49 #include "util/event.h" 50 #include "util/evlist.h" 51 #include "util/evsel.h" 52 #include "util/debug.h" 53 #include "util/color.h" 54 #include "util/header.h" 55 #include "util/cpumap.h" 56 #include "util/thread.h" 57 #include "util/thread_map.h" 58 59 /* ANDROID_CHANGE_BEGIN */ 60 #ifndef __APPLE__ 61 #include <sys/prctl.h> 62 #endif 63 /* ANDROID_CHANGE_END */ 64 #include <math.h> 65 #include <locale.h> 66 67 #define DEFAULT_SEPARATOR " " 68 69 static struct perf_event_attr default_attrs[] = { 70 71 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, 72 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, 73 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, 74 { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, 75 76 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, 77 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, 78 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, 79 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, 80 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, 81 { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, 82 83 }; 84 85 /* 86 * Detailed stats (-d), covering the L1 and last level data caches: 87 */ 88 static struct perf_event_attr detailed_attrs[] = { 89 90 { .type = PERF_TYPE_HW_CACHE, 91 .config = 92 PERF_COUNT_HW_CACHE_L1D << 0 | 93 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 94 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 95 96 { .type = PERF_TYPE_HW_CACHE, 97 .config = 98 PERF_COUNT_HW_CACHE_L1D << 0 | 99 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 100 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 101 102 { .type = PERF_TYPE_HW_CACHE, 103 .config = 104 PERF_COUNT_HW_CACHE_LL << 0 | 105 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 106 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 107 108 { .type = PERF_TYPE_HW_CACHE, 109 .config = 110 PERF_COUNT_HW_CACHE_LL << 0 | 111 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 112 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 113 }; 114 115 /* 116 * Very detailed stats (-d -d), covering the instruction cache and the TLB caches: 117 */ 118 static struct perf_event_attr very_detailed_attrs[] = { 119 120 { .type = PERF_TYPE_HW_CACHE, 121 .config = 122 PERF_COUNT_HW_CACHE_L1I << 0 | 123 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 124 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 125 126 { .type = PERF_TYPE_HW_CACHE, 127 .config = 128 PERF_COUNT_HW_CACHE_L1I << 0 | 129 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 130 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 131 132 { .type = PERF_TYPE_HW_CACHE, 133 .config = 134 PERF_COUNT_HW_CACHE_DTLB << 0 | 135 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 136 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 137 138 { .type = PERF_TYPE_HW_CACHE, 139 .config = 140 PERF_COUNT_HW_CACHE_DTLB << 0 | 141 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 142 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 143 144 { .type = PERF_TYPE_HW_CACHE, 145 .config = 146 PERF_COUNT_HW_CACHE_ITLB << 0 | 147 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 148 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 149 150 { .type = PERF_TYPE_HW_CACHE, 151 .config = 152 PERF_COUNT_HW_CACHE_ITLB << 0 | 153 (PERF_COUNT_HW_CACHE_OP_READ << 8) | 154 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 155 156 }; 157 158 /* 159 * Very, very detailed stats (-d -d -d), adding prefetch events: 160 */ 161 static struct perf_event_attr very_very_detailed_attrs[] = { 162 163 { .type = PERF_TYPE_HW_CACHE, 164 .config = 165 PERF_COUNT_HW_CACHE_L1D << 0 | 166 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | 167 (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, 168 169 { .type = PERF_TYPE_HW_CACHE, 170 .config = 171 PERF_COUNT_HW_CACHE_L1D << 0 | 172 (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | 173 (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, 174 }; 175 176 177 178 struct perf_evlist *evsel_list; 179 180 static bool system_wide = false; 181 static int run_idx = 0; 182 183 static int run_count = 1; 184 static bool no_inherit = false; 185 static bool scale = true; 186 static bool no_aggr = false; 187 static pid_t target_pid = -1; 188 static pid_t target_tid = -1; 189 static pid_t child_pid = -1; 190 static bool null_run = false; 191 static int detailed_run = 0; 192 static bool sync_run = false; 193 static bool big_num = true; 194 static int big_num_opt = -1; 195 static const char *cpu_list; 196 static const char *csv_sep = NULL; 197 static bool csv_output = false; 198 199 static volatile int done = 0; 200 201 struct stats 202 { 203 double n, mean, M2; 204 }; 205 206 struct perf_stat { 207 struct stats res_stats[3]; 208 }; 209 210 static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel) 211 { 212 evsel->priv = zalloc(sizeof(struct perf_stat)); 213 return evsel->priv == NULL ? -ENOMEM : 0; 214 } 215 216 static void perf_evsel__free_stat_priv(struct perf_evsel *evsel) 217 { 218 free(evsel->priv); 219 evsel->priv = NULL; 220 } 221 222 static void update_stats(struct stats *stats, u64 val) 223 { 224 double delta; 225 226 stats->n++; 227 delta = val - stats->mean; 228 stats->mean += delta / stats->n; 229 stats->M2 += delta*(val - stats->mean); 230 } 231 232 static double avg_stats(struct stats *stats) 233 { 234 return stats->mean; 235 } 236 237 /* 238 * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance 239 * 240 * (\Sum n_i^2) - ((\Sum n_i)^2)/n 241 * s^2 = ------------------------------- 242 * n - 1 243 * 244 * http://en.wikipedia.org/wiki/Stddev 245 * 246 * The std dev of the mean is related to the std dev by: 247 * 248 * s 249 * s_mean = ------- 250 * sqrt(n) 251 * 252 */ 253 static double stddev_stats(struct stats *stats) 254 { 255 double variance = stats->M2 / (stats->n - 1); 256 double variance_mean = variance / stats->n; 257 258 return sqrt(variance_mean); 259 } 260 261 struct stats runtime_nsecs_stats[MAX_NR_CPUS]; 262 struct stats runtime_cycles_stats[MAX_NR_CPUS]; 263 struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS]; 264 struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS]; 265 struct stats runtime_branches_stats[MAX_NR_CPUS]; 266 struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; 267 struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; 268 struct stats runtime_l1_icache_stats[MAX_NR_CPUS]; 269 struct stats runtime_ll_cache_stats[MAX_NR_CPUS]; 270 struct stats runtime_itlb_cache_stats[MAX_NR_CPUS]; 271 struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS]; 272 struct stats walltime_nsecs_stats; 273 274 static int create_perf_stat_counter(struct perf_evsel *evsel) 275 { 276 struct perf_event_attr *attr = &evsel->attr; 277 278 if (scale) 279 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | 280 PERF_FORMAT_TOTAL_TIME_RUNNING; 281 282 attr->inherit = !no_inherit; 283 284 if (system_wide) 285 return perf_evsel__open_per_cpu(evsel, evsel_list->cpus, false); 286 287 if (target_pid == -1 && target_tid == -1) { 288 attr->disabled = 1; 289 attr->enable_on_exec = 1; 290 } 291 292 return perf_evsel__open_per_thread(evsel, evsel_list->threads, false); 293 } 294 295 /* 296 * Does the counter have nsecs as a unit? 297 */ 298 static inline int nsec_counter(struct perf_evsel *evsel) 299 { 300 if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) || 301 perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) 302 return 1; 303 304 return 0; 305 } 306 307 /* 308 * Update various tracking values we maintain to print 309 * more semantic information such as miss/hit ratios, 310 * instruction rates, etc: 311 */ 312 static void update_shadow_stats(struct perf_evsel *counter, u64 *count) 313 { 314 if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) 315 update_stats(&runtime_nsecs_stats[0], count[0]); 316 else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) 317 update_stats(&runtime_cycles_stats[0], count[0]); 318 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) 319 update_stats(&runtime_stalled_cycles_front_stats[0], count[0]); 320 else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) 321 update_stats(&runtime_stalled_cycles_back_stats[0], count[0]); 322 else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) 323 update_stats(&runtime_branches_stats[0], count[0]); 324 else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) 325 update_stats(&runtime_cacherefs_stats[0], count[0]); 326 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) 327 update_stats(&runtime_l1_dcache_stats[0], count[0]); 328 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) 329 update_stats(&runtime_l1_icache_stats[0], count[0]); 330 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL)) 331 update_stats(&runtime_ll_cache_stats[0], count[0]); 332 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) 333 update_stats(&runtime_dtlb_cache_stats[0], count[0]); 334 else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) 335 update_stats(&runtime_itlb_cache_stats[0], count[0]); 336 } 337 338 /* 339 * Read out the results of a single counter: 340 * aggregate counts across CPUs in system-wide mode 341 */ 342 static int read_counter_aggr(struct perf_evsel *counter) 343 { 344 struct perf_stat *ps = counter->priv; 345 u64 *count = counter->counts->aggr.values; 346 int i; 347 348 if (__perf_evsel__read(counter, evsel_list->cpus->nr, 349 evsel_list->threads->nr, scale) < 0) 350 return -1; 351 352 for (i = 0; i < 3; i++) 353 update_stats(&ps->res_stats[i], count[i]); 354 355 if (verbose) { 356 fprintf(stderr, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", 357 event_name(counter), count[0], count[1], count[2]); 358 } 359 360 /* 361 * Save the full runtime - to allow normalization during printout: 362 */ 363 update_shadow_stats(counter, count); 364 365 return 0; 366 } 367 368 /* 369 * Read out the results of a single counter: 370 * do not aggregate counts across CPUs in system-wide mode 371 */ 372 static int read_counter(struct perf_evsel *counter) 373 { 374 u64 *count; 375 int cpu; 376 377 for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) { 378 if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0) 379 return -1; 380 381 count = counter->counts->cpu[cpu].values; 382 383 update_shadow_stats(counter, count); 384 } 385 386 return 0; 387 } 388 389 static int run_perf_stat(int argc __used, const char **argv) 390 { 391 unsigned long long t0, t1; 392 struct perf_evsel *counter; 393 int status = 0; 394 int child_ready_pipe[2], go_pipe[2]; 395 const bool forks = (argc > 0); 396 char buf; 397 398 if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) { 399 perror("failed to create pipes"); 400 exit(1); 401 } 402 403 if (forks) { 404 if ((child_pid = fork()) < 0) 405 perror("failed to fork"); 406 407 if (!child_pid) { 408 close(child_ready_pipe[0]); 409 close(go_pipe[1]); 410 fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC); 411 412 /* 413 * Do a dummy execvp to get the PLT entry resolved, 414 * so we avoid the resolver overhead on the real 415 * execvp call. 416 */ 417 execvp("", (char **)argv); 418 419 /* 420 * Tell the parent we're ready to go 421 */ 422 close(child_ready_pipe[1]); 423 424 /* 425 * Wait until the parent tells us to go. 426 */ 427 if (read(go_pipe[0], &buf, 1) == -1) 428 perror("unable to read pipe"); 429 430 execvp(argv[0], (char **)argv); 431 432 perror(argv[0]); 433 exit(-1); 434 } 435 436 if (target_tid == -1 && target_pid == -1 && !system_wide) 437 evsel_list->threads->map[0] = child_pid; 438 439 /* 440 * Wait for the child to be ready to exec. 441 */ 442 close(child_ready_pipe[1]); 443 close(go_pipe[0]); 444 if (read(child_ready_pipe[0], &buf, 1) == -1) 445 perror("unable to read pipe"); 446 close(child_ready_pipe[0]); 447 } 448 449 list_for_each_entry(counter, &evsel_list->entries, node) { 450 if (create_perf_stat_counter(counter) < 0) { 451 if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) { 452 if (verbose) 453 ui__warning("%s event is not supported by the kernel.\n", 454 event_name(counter)); 455 continue; 456 } 457 458 if (errno == EPERM || errno == EACCES) { 459 error("You may not have permission to collect %sstats.\n" 460 "\t Consider tweaking" 461 " /proc/sys/kernel/perf_event_paranoid or running as root.", 462 system_wide ? "system-wide " : ""); 463 } else { 464 error("open_counter returned with %d (%s). " 465 "/bin/dmesg may provide additional information.\n", 466 errno, strerror(errno)); 467 } 468 if (child_pid != -1) 469 kill(child_pid, SIGTERM); 470 die("Not all events could be opened.\n"); 471 return -1; 472 } 473 } 474 475 if (perf_evlist__set_filters(evsel_list)) { 476 error("failed to set filter with %d (%s)\n", errno, 477 strerror(errno)); 478 return -1; 479 } 480 481 /* 482 * Enable counters and exec the command: 483 */ 484 t0 = rdclock(); 485 486 if (forks) { 487 close(go_pipe[1]); 488 wait(&status); 489 } else { 490 while(!done) sleep(1); 491 } 492 493 t1 = rdclock(); 494 495 update_stats(&walltime_nsecs_stats, t1 - t0); 496 497 if (no_aggr) { 498 list_for_each_entry(counter, &evsel_list->entries, node) { 499 read_counter(counter); 500 perf_evsel__close_fd(counter, evsel_list->cpus->nr, 1); 501 } 502 } else { 503 list_for_each_entry(counter, &evsel_list->entries, node) { 504 read_counter_aggr(counter); 505 perf_evsel__close_fd(counter, evsel_list->cpus->nr, 506 evsel_list->threads->nr); 507 } 508 } 509 510 return WEXITSTATUS(status); 511 } 512 513 static void print_noise_pct(double total, double avg) 514 { 515 double pct = 0.0; 516 517 if (avg) 518 pct = 100.0*total/avg; 519 520 fprintf(stderr, " ( +-%6.2f%% )", pct); 521 } 522 523 static void print_noise(struct perf_evsel *evsel, double avg) 524 { 525 struct perf_stat *ps; 526 527 if (run_count == 1) 528 return; 529 530 ps = evsel->priv; 531 print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); 532 } 533 534 static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) 535 { 536 double msecs = avg / 1e6; 537 char cpustr[16] = { '\0', }; 538 const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s"; 539 540 if (no_aggr) 541 sprintf(cpustr, "CPU%*d%s", 542 csv_output ? 0 : -4, 543 evsel_list->cpus->map[cpu], csv_sep); 544 545 fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel)); 546 547 if (evsel->cgrp) 548 fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name); 549 550 if (csv_output) 551 return; 552 553 if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) 554 fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats)); 555 } 556 557 static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg) 558 { 559 double total, ratio = 0.0; 560 const char *color; 561 562 total = avg_stats(&runtime_cycles_stats[cpu]); 563 564 if (total) 565 ratio = avg / total * 100.0; 566 567 color = PERF_COLOR_NORMAL; 568 if (ratio > 50.0) 569 color = PERF_COLOR_RED; 570 else if (ratio > 30.0) 571 color = PERF_COLOR_MAGENTA; 572 else if (ratio > 10.0) 573 color = PERF_COLOR_YELLOW; 574 575 fprintf(stderr, " # "); 576 color_fprintf(stderr, color, "%6.2f%%", ratio); 577 fprintf(stderr, " frontend cycles idle "); 578 } 579 580 static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __used, double avg) 581 { 582 double total, ratio = 0.0; 583 const char *color; 584 585 total = avg_stats(&runtime_cycles_stats[cpu]); 586 587 if (total) 588 ratio = avg / total * 100.0; 589 590 color = PERF_COLOR_NORMAL; 591 if (ratio > 75.0) 592 color = PERF_COLOR_RED; 593 else if (ratio > 50.0) 594 color = PERF_COLOR_MAGENTA; 595 else if (ratio > 20.0) 596 color = PERF_COLOR_YELLOW; 597 598 fprintf(stderr, " # "); 599 color_fprintf(stderr, color, "%6.2f%%", ratio); 600 fprintf(stderr, " backend cycles idle "); 601 } 602 603 static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg) 604 { 605 double total, ratio = 0.0; 606 const char *color; 607 608 total = avg_stats(&runtime_branches_stats[cpu]); 609 610 if (total) 611 ratio = avg / total * 100.0; 612 613 color = PERF_COLOR_NORMAL; 614 if (ratio > 20.0) 615 color = PERF_COLOR_RED; 616 else if (ratio > 10.0) 617 color = PERF_COLOR_MAGENTA; 618 else if (ratio > 5.0) 619 color = PERF_COLOR_YELLOW; 620 621 fprintf(stderr, " # "); 622 color_fprintf(stderr, color, "%6.2f%%", ratio); 623 fprintf(stderr, " of all branches "); 624 } 625 626 static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, double avg) 627 { 628 double total, ratio = 0.0; 629 const char *color; 630 631 total = avg_stats(&runtime_l1_dcache_stats[cpu]); 632 633 if (total) 634 ratio = avg / total * 100.0; 635 636 color = PERF_COLOR_NORMAL; 637 if (ratio > 20.0) 638 color = PERF_COLOR_RED; 639 else if (ratio > 10.0) 640 color = PERF_COLOR_MAGENTA; 641 else if (ratio > 5.0) 642 color = PERF_COLOR_YELLOW; 643 644 fprintf(stderr, " # "); 645 color_fprintf(stderr, color, "%6.2f%%", ratio); 646 fprintf(stderr, " of all L1-dcache hits "); 647 } 648 649 static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __used, double avg) 650 { 651 double total, ratio = 0.0; 652 const char *color; 653 654 total = avg_stats(&runtime_l1_icache_stats[cpu]); 655 656 if (total) 657 ratio = avg / total * 100.0; 658 659 color = PERF_COLOR_NORMAL; 660 if (ratio > 20.0) 661 color = PERF_COLOR_RED; 662 else if (ratio > 10.0) 663 color = PERF_COLOR_MAGENTA; 664 else if (ratio > 5.0) 665 color = PERF_COLOR_YELLOW; 666 667 fprintf(stderr, " # "); 668 color_fprintf(stderr, color, "%6.2f%%", ratio); 669 fprintf(stderr, " of all L1-icache hits "); 670 } 671 672 static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg) 673 { 674 double total, ratio = 0.0; 675 const char *color; 676 677 total = avg_stats(&runtime_dtlb_cache_stats[cpu]); 678 679 if (total) 680 ratio = avg / total * 100.0; 681 682 color = PERF_COLOR_NORMAL; 683 if (ratio > 20.0) 684 color = PERF_COLOR_RED; 685 else if (ratio > 10.0) 686 color = PERF_COLOR_MAGENTA; 687 else if (ratio > 5.0) 688 color = PERF_COLOR_YELLOW; 689 690 fprintf(stderr, " # "); 691 color_fprintf(stderr, color, "%6.2f%%", ratio); 692 fprintf(stderr, " of all dTLB cache hits "); 693 } 694 695 static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg) 696 { 697 double total, ratio = 0.0; 698 const char *color; 699 700 total = avg_stats(&runtime_itlb_cache_stats[cpu]); 701 702 if (total) 703 ratio = avg / total * 100.0; 704 705 color = PERF_COLOR_NORMAL; 706 if (ratio > 20.0) 707 color = PERF_COLOR_RED; 708 else if (ratio > 10.0) 709 color = PERF_COLOR_MAGENTA; 710 else if (ratio > 5.0) 711 color = PERF_COLOR_YELLOW; 712 713 fprintf(stderr, " # "); 714 color_fprintf(stderr, color, "%6.2f%%", ratio); 715 fprintf(stderr, " of all iTLB cache hits "); 716 } 717 718 static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg) 719 { 720 double total, ratio = 0.0; 721 const char *color; 722 723 total = avg_stats(&runtime_ll_cache_stats[cpu]); 724 725 if (total) 726 ratio = avg / total * 100.0; 727 728 color = PERF_COLOR_NORMAL; 729 if (ratio > 20.0) 730 color = PERF_COLOR_RED; 731 else if (ratio > 10.0) 732 color = PERF_COLOR_MAGENTA; 733 else if (ratio > 5.0) 734 color = PERF_COLOR_YELLOW; 735 736 fprintf(stderr, " # "); 737 color_fprintf(stderr, color, "%6.2f%%", ratio); 738 fprintf(stderr, " of all LL-cache hits "); 739 } 740 741 static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) 742 { 743 double total, ratio = 0.0; 744 char cpustr[16] = { '\0', }; 745 const char *fmt; 746 747 if (csv_output) 748 fmt = "%s%.0f%s%s"; 749 else if (big_num) 750 /* ANDROID_CHANGE_BEGIN */ 751 #ifdef __BIONIC__ 752 /* bionic doesn't like "'" */ 753 fmt = "%s%18.0f%s%-25s"; 754 #else 755 fmt = "%s%'18.0f%s%-25s"; 756 #endif 757 /* ANDROID_CHANGE_END */ 758 else 759 fmt = "%s%18.0f%s%-25s"; 760 761 if (no_aggr) 762 sprintf(cpustr, "CPU%*d%s", 763 csv_output ? 0 : -4, 764 evsel_list->cpus->map[cpu], csv_sep); 765 else 766 cpu = 0; 767 768 fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel)); 769 770 if (evsel->cgrp) 771 fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name); 772 773 if (csv_output) 774 return; 775 776 if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { 777 total = avg_stats(&runtime_cycles_stats[cpu]); 778 779 if (total) 780 ratio = avg / total; 781 782 fprintf(stderr, " # %5.2f insns per cycle ", ratio); 783 784 total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]); 785 total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu])); 786 787 if (total && avg) { 788 ratio = total / avg; 789 fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio); 790 } 791 792 } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && 793 runtime_branches_stats[cpu].n != 0) { 794 print_branch_misses(cpu, evsel, avg); 795 } else if ( 796 evsel->attr.type == PERF_TYPE_HW_CACHE && 797 evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | 798 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 799 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 800 runtime_l1_dcache_stats[cpu].n != 0) { 801 print_l1_dcache_misses(cpu, evsel, avg); 802 } else if ( 803 evsel->attr.type == PERF_TYPE_HW_CACHE && 804 evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I | 805 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 806 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 807 runtime_l1_icache_stats[cpu].n != 0) { 808 print_l1_icache_misses(cpu, evsel, avg); 809 } else if ( 810 evsel->attr.type == PERF_TYPE_HW_CACHE && 811 evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB | 812 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 813 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 814 runtime_dtlb_cache_stats[cpu].n != 0) { 815 print_dtlb_cache_misses(cpu, evsel, avg); 816 } else if ( 817 evsel->attr.type == PERF_TYPE_HW_CACHE && 818 evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB | 819 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 820 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 821 runtime_itlb_cache_stats[cpu].n != 0) { 822 print_itlb_cache_misses(cpu, evsel, avg); 823 } else if ( 824 evsel->attr.type == PERF_TYPE_HW_CACHE && 825 evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL | 826 ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | 827 ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && 828 runtime_ll_cache_stats[cpu].n != 0) { 829 print_ll_cache_misses(cpu, evsel, avg); 830 } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && 831 runtime_cacherefs_stats[cpu].n != 0) { 832 total = avg_stats(&runtime_cacherefs_stats[cpu]); 833 834 if (total) 835 ratio = avg * 100 / total; 836 837 fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); 838 839 } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { 840 print_stalled_cycles_frontend(cpu, evsel, avg); 841 } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { 842 print_stalled_cycles_backend(cpu, evsel, avg); 843 } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { 844 total = avg_stats(&runtime_nsecs_stats[cpu]); 845 846 if (total) 847 ratio = 1.0 * avg / total; 848 849 fprintf(stderr, " # %8.3f GHz ", ratio); 850 } else if (runtime_nsecs_stats[cpu].n != 0) { 851 total = avg_stats(&runtime_nsecs_stats[cpu]); 852 853 if (total) 854 ratio = 1000.0 * avg / total; 855 856 fprintf(stderr, " # %8.3f M/sec ", ratio); 857 } else { 858 fprintf(stderr, " "); 859 } 860 } 861 862 /* 863 * Print out the results of a single counter: 864 * aggregated counts in system-wide mode 865 */ 866 static void print_counter_aggr(struct perf_evsel *counter) 867 { 868 struct perf_stat *ps = counter->priv; 869 double avg = avg_stats(&ps->res_stats[0]); 870 int scaled = counter->counts->scaled; 871 872 if (scaled == -1) { 873 fprintf(stderr, "%*s%s%*s", 874 csv_output ? 0 : 18, 875 "<not counted>", 876 csv_sep, 877 csv_output ? 0 : -24, 878 event_name(counter)); 879 880 if (counter->cgrp) 881 fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name); 882 883 fputc('\n', stderr); 884 return; 885 } 886 887 if (nsec_counter(counter)) 888 nsec_printout(-1, counter, avg); 889 else 890 abs_printout(-1, counter, avg); 891 892 if (csv_output) { 893 fputc('\n', stderr); 894 return; 895 } 896 897 print_noise(counter, avg); 898 899 if (scaled) { 900 double avg_enabled, avg_running; 901 902 avg_enabled = avg_stats(&ps->res_stats[1]); 903 avg_running = avg_stats(&ps->res_stats[2]); 904 905 fprintf(stderr, " [%5.2f%%]", 100 * avg_running / avg_enabled); 906 } 907 fprintf(stderr, "\n"); 908 } 909 910 /* 911 * Print out the results of a single counter: 912 * does not use aggregated count in system-wide 913 */ 914 static void print_counter(struct perf_evsel *counter) 915 { 916 u64 ena, run, val; 917 int cpu; 918 919 for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) { 920 val = counter->counts->cpu[cpu].val; 921 ena = counter->counts->cpu[cpu].ena; 922 run = counter->counts->cpu[cpu].run; 923 if (run == 0 || ena == 0) { 924 fprintf(stderr, "CPU%*d%s%*s%s%*s", 925 csv_output ? 0 : -4, 926 evsel_list->cpus->map[cpu], csv_sep, 927 csv_output ? 0 : 18, 928 "<not counted>", csv_sep, 929 csv_output ? 0 : -24, 930 event_name(counter)); 931 932 if (counter->cgrp) 933 fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name); 934 935 fputc('\n', stderr); 936 continue; 937 } 938 939 if (nsec_counter(counter)) 940 nsec_printout(cpu, counter, val); 941 else 942 abs_printout(cpu, counter, val); 943 944 if (!csv_output) { 945 print_noise(counter, 1.0); 946 947 if (run != ena) 948 fprintf(stderr, " (%.2f%%)", 100.0 * run / ena); 949 } 950 fputc('\n', stderr); 951 } 952 } 953 954 static void print_stat(int argc, const char **argv) 955 { 956 struct perf_evsel *counter; 957 int i; 958 959 fflush(stdout); 960 961 if (!csv_output) { 962 fprintf(stderr, "\n"); 963 fprintf(stderr, " Performance counter stats for "); 964 if(target_pid == -1 && target_tid == -1) { 965 fprintf(stderr, "\'%s", argv[0]); 966 for (i = 1; i < argc; i++) 967 fprintf(stderr, " %s", argv[i]); 968 } else if (target_pid != -1) 969 fprintf(stderr, "process id \'%d", target_pid); 970 else 971 fprintf(stderr, "thread id \'%d", target_tid); 972 973 fprintf(stderr, "\'"); 974 if (run_count > 1) 975 fprintf(stderr, " (%d runs)", run_count); 976 fprintf(stderr, ":\n\n"); 977 } 978 979 if (no_aggr) { 980 list_for_each_entry(counter, &evsel_list->entries, node) 981 print_counter(counter); 982 } else { 983 list_for_each_entry(counter, &evsel_list->entries, node) 984 print_counter_aggr(counter); 985 } 986 987 if (!csv_output) { 988 if (!null_run) 989 fprintf(stderr, "\n"); 990 fprintf(stderr, " %17.9f seconds time elapsed", 991 avg_stats(&walltime_nsecs_stats)/1e9); 992 if (run_count > 1) { 993 fprintf(stderr, " "); 994 print_noise_pct(stddev_stats(&walltime_nsecs_stats), 995 avg_stats(&walltime_nsecs_stats)); 996 } 997 fprintf(stderr, "\n\n"); 998 } 999 } 1000 1001 static volatile int signr = -1; 1002 1003 static void skip_signal(int signo) 1004 { 1005 if(child_pid == -1) 1006 done = 1; 1007 1008 signr = signo; 1009 } 1010 1011 static void sig_atexit(void) 1012 { 1013 if (child_pid != -1) 1014 kill(child_pid, SIGTERM); 1015 1016 if (signr == -1) 1017 return; 1018 1019 signal(signr, SIG_DFL); 1020 kill(getpid(), signr); 1021 } 1022 1023 static const char * const stat_usage[] = { 1024 "perf stat [<options>] [<command>]", 1025 NULL 1026 }; 1027 1028 static int stat__set_big_num(const struct option *opt __used, 1029 const char *s __used, int unset) 1030 { 1031 big_num_opt = unset ? 0 : 1; 1032 return 0; 1033 } 1034 1035 static const struct option options[] = { 1036 OPT_CALLBACK('e', "event", &evsel_list, "event", 1037 "event selector. use 'perf list' to list available events", 1038 parse_events), 1039 OPT_CALLBACK(0, "filter", &evsel_list, "filter", 1040 "event filter", parse_filter), 1041 OPT_BOOLEAN('i', "no-inherit", &no_inherit, 1042 "child tasks do not inherit counters"), 1043 OPT_INTEGER('p', "pid", &target_pid, 1044 "stat events on existing process id"), 1045 OPT_INTEGER('t', "tid", &target_tid, 1046 "stat events on existing thread id"), 1047 OPT_BOOLEAN('a', "all-cpus", &system_wide, 1048 "system-wide collection from all CPUs"), 1049 OPT_BOOLEAN('c', "scale", &scale, 1050 "scale/normalize counters"), 1051 OPT_INCR('v', "verbose", &verbose, 1052 "be more verbose (show counter open errors, etc)"), 1053 OPT_INTEGER('r', "repeat", &run_count, 1054 "repeat command and print average + stddev (max: 100)"), 1055 OPT_BOOLEAN('n', "null", &null_run, 1056 "null run - dont start any counters"), 1057 OPT_INCR('d', "detailed", &detailed_run, 1058 "detailed run - start a lot of events"), 1059 OPT_BOOLEAN('S', "sync", &sync_run, 1060 "call sync() before starting a run"), 1061 OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, 1062 "print large numbers with thousands\' separators", 1063 stat__set_big_num), 1064 OPT_STRING('C', "cpu", &cpu_list, "cpu", 1065 "list of cpus to monitor in system-wide"), 1066 OPT_BOOLEAN('A', "no-aggr", &no_aggr, 1067 "disable CPU count aggregation"), 1068 OPT_STRING('x', "field-separator", &csv_sep, "separator", 1069 "print counts with custom separator"), 1070 OPT_CALLBACK('G', "cgroup", &evsel_list, "name", 1071 "monitor event in cgroup name only", 1072 parse_cgroups), 1073 OPT_END() 1074 }; 1075 1076 /* 1077 * Add default attributes, if there were no attributes specified or 1078 * if -d/--detailed, -d -d or -d -d -d is used: 1079 */ 1080 static int add_default_attributes(void) 1081 { 1082 struct perf_evsel *pos; 1083 size_t attr_nr = 0; 1084 size_t c; 1085 1086 /* Set attrs if no event is selected and !null_run: */ 1087 if (null_run) 1088 return 0; 1089 1090 if (!evsel_list->nr_entries) { 1091 for (c = 0; c < ARRAY_SIZE(default_attrs); c++) { 1092 pos = perf_evsel__new(default_attrs + c, c + attr_nr); 1093 if (pos == NULL) 1094 return -1; 1095 perf_evlist__add(evsel_list, pos); 1096 } 1097 attr_nr += c; 1098 } 1099 1100 /* Detailed events get appended to the event list: */ 1101 1102 if (detailed_run < 1) 1103 return 0; 1104 1105 /* Append detailed run extra attributes: */ 1106 for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) { 1107 pos = perf_evsel__new(detailed_attrs + c, c + attr_nr); 1108 if (pos == NULL) 1109 return -1; 1110 perf_evlist__add(evsel_list, pos); 1111 } 1112 attr_nr += c; 1113 1114 if (detailed_run < 2) 1115 return 0; 1116 1117 /* Append very detailed run extra attributes: */ 1118 for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) { 1119 pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr); 1120 if (pos == NULL) 1121 return -1; 1122 perf_evlist__add(evsel_list, pos); 1123 } 1124 1125 if (detailed_run < 3) 1126 return 0; 1127 1128 /* Append very, very detailed run extra attributes: */ 1129 for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) { 1130 pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr); 1131 if (pos == NULL) 1132 return -1; 1133 perf_evlist__add(evsel_list, pos); 1134 } 1135 1136 1137 return 0; 1138 } 1139 1140 int cmd_stat(int argc, const char **argv, const char *prefix __used) 1141 { 1142 /* ANDROID_CHANGE_BEGIN */ 1143 #ifndef __APPLE__ 1144 struct perf_evsel *pos; 1145 int status = -ENOMEM; 1146 1147 setlocale(LC_ALL, ""); 1148 1149 evsel_list = perf_evlist__new(NULL, NULL); 1150 if (evsel_list == NULL) 1151 return -ENOMEM; 1152 1153 argc = parse_options(argc, argv, options, stat_usage, 1154 PARSE_OPT_STOP_AT_NON_OPTION); 1155 1156 if (csv_sep) 1157 csv_output = true; 1158 else 1159 csv_sep = DEFAULT_SEPARATOR; 1160 1161 /* 1162 * let the spreadsheet do the pretty-printing 1163 */ 1164 if (csv_output) { 1165 /* User explicitely passed -B? */ 1166 if (big_num_opt == 1) { 1167 fprintf(stderr, "-B option not supported with -x\n"); 1168 usage_with_options(stat_usage, options); 1169 } else /* Nope, so disable big number formatting */ 1170 big_num = false; 1171 } else if (big_num_opt == 0) /* User passed --no-big-num */ 1172 big_num = false; 1173 1174 if (!argc && target_pid == -1 && target_tid == -1) 1175 usage_with_options(stat_usage, options); 1176 if (run_count <= 0) 1177 usage_with_options(stat_usage, options); 1178 1179 /* no_aggr, cgroup are for system-wide only */ 1180 if ((no_aggr || nr_cgroups) && !system_wide) { 1181 fprintf(stderr, "both cgroup and no-aggregation " 1182 "modes only available in system-wide mode\n"); 1183 1184 usage_with_options(stat_usage, options); 1185 } 1186 1187 if (add_default_attributes()) 1188 goto out; 1189 1190 if (target_pid != -1) 1191 target_tid = target_pid; 1192 1193 evsel_list->threads = thread_map__new(target_pid, target_tid); 1194 if (evsel_list->threads == NULL) { 1195 pr_err("Problems finding threads of monitor\n"); 1196 usage_with_options(stat_usage, options); 1197 } 1198 1199 if (system_wide) 1200 evsel_list->cpus = cpu_map__new(cpu_list); 1201 else 1202 evsel_list->cpus = cpu_map__dummy_new(); 1203 1204 if (evsel_list->cpus == NULL) { 1205 perror("failed to parse CPUs map"); 1206 usage_with_options(stat_usage, options); 1207 return -1; 1208 } 1209 1210 list_for_each_entry(pos, &evsel_list->entries, node) { 1211 if (perf_evsel__alloc_stat_priv(pos) < 0 || 1212 perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0 || 1213 perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, evsel_list->threads->nr) < 0) 1214 goto out_free_fd; 1215 } 1216 1217 /* 1218 * We dont want to block the signals - that would cause 1219 * child tasks to inherit that and Ctrl-C would not work. 1220 * What we want is for Ctrl-C to work in the exec()-ed 1221 * task, but being ignored by perf stat itself: 1222 */ 1223 atexit(sig_atexit); 1224 signal(SIGINT, skip_signal); 1225 signal(SIGALRM, skip_signal); 1226 signal(SIGABRT, skip_signal); 1227 1228 status = 0; 1229 for (run_idx = 0; run_idx < run_count; run_idx++) { 1230 if (run_count != 1 && verbose) 1231 fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1); 1232 1233 if (sync_run) 1234 sync(); 1235 1236 status = run_perf_stat(argc, argv); 1237 } 1238 1239 if (status != -1) 1240 print_stat(argc, argv); 1241 out_free_fd: 1242 list_for_each_entry(pos, &evsel_list->entries, node) 1243 perf_evsel__free_stat_priv(pos); 1244 perf_evlist__delete_maps(evsel_list); 1245 out: 1246 perf_evlist__delete(evsel_list); 1247 return status; 1248 #else 1249 return -1; 1250 #endif 1251 /* ANDROID_CHANGE_END */ 1252 } 1253