/*
 * builtin-stat.c
 *
 * Builtin stat command: Give a precise performance counters summary
 * overview about any workload, CPU or specific PID.
 *
 * Sample output:

   $ perf stat ./hackbench 10

  Time: 0.118

  Performance counter stats for './hackbench 10':

       1708.761321 task-clock                #   11.037 CPUs utilized
            41,190 context-switches          #    0.024 M/sec
             6,735 CPU-migrations            #    0.004 M/sec
            17,318 page-faults               #    0.010 M/sec
     5,205,202,243 cycles                    #    3.046 GHz
     3,856,436,920 stalled-cycles-frontend   #   74.09% frontend cycles idle
     1,600,790,871 stalled-cycles-backend    #   30.75% backend cycles idle
     2,603,501,247 instructions              #    0.50  insns per cycle
                                             #    1.48  stalled cycles per insn
       484,357,498 branches                  #  283.455 M/sec
         6,388,934 branch-misses             #    1.32% of all branches

        0.154822978  seconds time elapsed

 *
 * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
 *
 * Improvements and fixes by:
 *
 *   Arjan van de Ven <arjan@linux.intel.com>
 *   Yanmin Zhang <yanmin.zhang@intel.com>
 *   Wu Fengguang <fengguang.wu@intel.com>
 *   Mike Galbraith <efault@gmx.de>
 *   Paul Mackerras <paulus@samba.org>
 *   Jaswinder Singh Rajput <jaswinder@kernel.org>
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include "perf.h"
#include "builtin.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/color.h"
#include "util/stat.h"
#include "util/header.h"
#include "util/cpumap.h"
#include "util/thread.h"
#include "util/thread_map.h"

#include <stdlib.h>
#include <sys/prctl.h>
#include <locale.h>

#define DEFAULT_SEPARATOR	" "
#define CNTR_NOT_SUPPORTED	"<not supported>"
#define CNTR_NOT_COUNTED	"<not counted>"

static void print_stat(int argc, const char **argv);
static void print_counter_aggr(struct perf_evsel *counter, char *prefix);
static void print_counter(struct perf_evsel *counter, char *prefix);
static void print_aggr(char *prefix);

static struct perf_evlist *evsel_list;

static struct perf_target target = {
	.uid = UINT_MAX,
};

enum aggr_mode {
	AGGR_NONE,
	AGGR_GLOBAL,
	AGGR_SOCKET,
	AGGR_CORE,
};

static int run_count = 1;
static bool no_inherit = false;
static bool scale = true;
static enum aggr_mode aggr_mode = AGGR_GLOBAL;
static volatile pid_t child_pid = -1;
static bool null_run = false;
static int detailed_run = 0;
static bool big_num = true;
static int big_num_opt = -1;
static const char *csv_sep = NULL;
static bool csv_output = false;
static bool group = false;
static FILE *output = NULL;
static const char *pre_cmd = NULL;
static const char *post_cmd = NULL;
static bool sync_run = false;
static unsigned int interval = 0;
static unsigned int initial_delay = 0;
static bool forever = false;
static struct timespec ref_time;
static struct cpu_map *aggr_map;
static int (*aggr_get_id)(struct cpu_map *m, int cpu);

static volatile int done = 0;

struct perf_stat {
	struct stats res_stats[3];
};

static inline void diff_timespec(struct timespec *r, struct timespec *a,
				 struct timespec *b)
{
	r->tv_sec = a->tv_sec - b->tv_sec;
	if (a->tv_nsec < b->tv_nsec) {
		r->tv_nsec = a->tv_nsec + 1000000000L - b->tv_nsec;
		r->tv_sec--;
	} else {
		r->tv_nsec = a->tv_nsec - b->tv_nsec;
	}
}
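
/*
 * Illustrative example of the borrow above: with a = { .tv_sec = 5,
 * .tv_nsec = 100 } and b = { .tv_sec = 3, .tv_nsec = 900000000 },
 * a->tv_nsec < b->tv_nsec, so one second is borrowed:
 *
 *   r->tv_nsec = 100 + 1000000000 - 900000000 = 100000100
 *   r->tv_sec  = (5 - 3) - 1                  = 1
 *
 * i.e. 5.000000100s - 3.900000000s = 1.100000100s.
 */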

static inline struct cpu_map *perf_evsel__cpus(struct perf_evsel *evsel)
{
	return (evsel->cpus && !target.cpu_list) ? evsel->cpus : evsel_list->cpus;
}

static inline int perf_evsel__nr_cpus(struct perf_evsel *evsel)
{
	return perf_evsel__cpus(evsel)->nr;
}

static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
{
	memset(evsel->priv, 0, sizeof(struct perf_stat));
}

static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
{
	evsel->priv = zalloc(sizeof(struct perf_stat));
	return evsel->priv == NULL ? -ENOMEM : 0;
}

static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
{
	free(evsel->priv);
	evsel->priv = NULL;
}

static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel)
{
	void *addr;
	size_t sz;

	sz = sizeof(*evsel->counts) +
	     (perf_evsel__nr_cpus(evsel) * sizeof(struct perf_counts_values));

	addr = zalloc(sz);
	if (!addr)
		return -ENOMEM;

	evsel->prev_raw_counts = addr;

	return 0;
}

static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel)
{
	free(evsel->prev_raw_counts);
	evsel->prev_raw_counts = NULL;
}

static void perf_evlist__free_stats(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	list_for_each_entry(evsel, &evlist->entries, node) {
		perf_evsel__free_stat_priv(evsel);
		perf_evsel__free_counts(evsel);
		perf_evsel__free_prev_raw_counts(evsel);
	}
}

static int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw)
{
	struct perf_evsel *evsel;

	list_for_each_entry(evsel, &evlist->entries, node) {
		if (perf_evsel__alloc_stat_priv(evsel) < 0 ||
		    perf_evsel__alloc_counts(evsel, perf_evsel__nr_cpus(evsel)) < 0 ||
		    (alloc_raw && perf_evsel__alloc_prev_raw_counts(evsel) < 0))
			goto out_free;
	}

	return 0;

out_free:
	perf_evlist__free_stats(evlist);
	return -1;
}

static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
static struct stats runtime_cycles_stats[MAX_NR_CPUS];
static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
static struct stats runtime_branches_stats[MAX_NR_CPUS];
static struct stats runtime_cacherefs_stats[MAX_NR_CPUS];
static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS];
static struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
static struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
static struct stats walltime_nsecs_stats;

static void perf_stat__reset_stats(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	list_for_each_entry(evsel, &evlist->entries, node) {
		perf_evsel__reset_stat_priv(evsel);
		perf_evsel__reset_counts(evsel, perf_evsel__nr_cpus(evsel));
	}

	memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats));
	memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats));
	memset(runtime_stalled_cycles_front_stats, 0,
	       sizeof(runtime_stalled_cycles_front_stats));
	memset(runtime_stalled_cycles_back_stats, 0,
	       sizeof(runtime_stalled_cycles_back_stats));
	memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats));
	memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats));
	memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats));
	memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats));
	memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
	memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
	memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
	memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
}
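
/*
 * Descriptive note (assuming standard perf_event semantics): the
 * PERF_FORMAT_TOTAL_TIME_ENABLED/RUNNING flags requested below make the
 * kernel report, next to the raw count, how long the event was enabled
 * and how long it was actually scheduled on the PMU. When events are
 * multiplexed, counts are later scaled as count * enabled / running to
 * estimate the full-run value.
 */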

static int create_perf_stat_counter(struct perf_evsel *evsel)
{
	struct perf_event_attr *attr = &evsel->attr;

	if (scale)
		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
				    PERF_FORMAT_TOTAL_TIME_RUNNING;

	attr->inherit = !no_inherit;

	if (perf_target__has_cpu(&target))
		return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel));

	if (!perf_target__has_task(&target) &&
	    perf_evsel__is_group_leader(evsel)) {
		attr->disabled = 1;
		if (!initial_delay)
			attr->enable_on_exec = 1;
	}

	return perf_evsel__open_per_thread(evsel, evsel_list->threads);
}

/*
 * Does the counter have nsecs as a unit?
 */
static inline int nsec_counter(struct perf_evsel *evsel)
{
	if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
	    perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
		return 1;

	return 0;
}

/*
 * Update various tracking values we maintain to print
 * more semantic information such as miss/hit ratios,
 * instruction rates, etc:
 */
static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
{
	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
		update_stats(&runtime_nsecs_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
		update_stats(&runtime_cycles_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
		update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
		update_stats(&runtime_stalled_cycles_back_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
		update_stats(&runtime_branches_stats[0], count[0]);
	else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
		update_stats(&runtime_cacherefs_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
		update_stats(&runtime_l1_dcache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
		update_stats(&runtime_l1_icache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
		update_stats(&runtime_ll_cache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
		update_stats(&runtime_dtlb_cache_stats[0], count[0]);
	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
		update_stats(&runtime_itlb_cache_stats[0], count[0]);
}

/*
 * Read out the results of a single counter:
 * aggregate counts across CPUs in system-wide mode
 */
static int read_counter_aggr(struct perf_evsel *counter)
{
	struct perf_stat *ps = counter->priv;
	u64 *count = counter->counts->aggr.values;
	int i;

	if (__perf_evsel__read(counter, perf_evsel__nr_cpus(counter),
			       thread_map__nr(evsel_list->threads), scale) < 0)
		return -1;

	for (i = 0; i < 3; i++)
		update_stats(&ps->res_stats[i], count[i]);

	if (verbose) {
		fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
			perf_evsel__name(counter), count[0], count[1], count[2]);
	}

	/*
	 * Save the full runtime - to allow normalization during printout:
	 */
	update_shadow_stats(counter, count);

	return 0;
}
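
/*
 * Descriptive note: aggr.values[] above carries three numbers per read -
 * values[0] is the (possibly scaled) count, values[1] the time the event
 * was enabled and values[2] the time it was running, matching the
 * PERF_FORMAT_TOTAL_TIME_* layout requested in create_perf_stat_counter().
 * res_stats[1]/res_stats[2] therefore feed the "[xx.xx%]" multiplexing
 * indicator printed by print_counter_aggr().
 */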

/*
 * Read out the results of a single counter:
 * do not aggregate counts across CPUs in system-wide mode
 */
static int read_counter(struct perf_evsel *counter)
{
	u64 *count;
	int cpu;

	for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
		if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
			return -1;

		count = counter->counts->cpu[cpu].values;

		update_shadow_stats(counter, count);
	}

	return 0;
}

static void print_interval(void)
{
#ifndef __APPLE__
	static int num_print_interval;
	struct perf_evsel *counter;
	struct perf_stat *ps;
	struct timespec ts, rs;
	char prefix[64];

	if (aggr_mode == AGGR_GLOBAL) {
		list_for_each_entry(counter, &evsel_list->entries, node) {
			ps = counter->priv;
			memset(ps->res_stats, 0, sizeof(ps->res_stats));
			read_counter_aggr(counter);
		}
	} else {
		list_for_each_entry(counter, &evsel_list->entries, node) {
			ps = counter->priv;
			memset(ps->res_stats, 0, sizeof(ps->res_stats));
			read_counter(counter);
		}
	}

	clock_gettime(CLOCK_MONOTONIC, &ts);
	diff_timespec(&rs, &ts, &ref_time);
	sprintf(prefix, "%6lu.%09lu%s", rs.tv_sec, rs.tv_nsec, csv_sep);

	if (num_print_interval == 0 && !csv_output) {
		switch (aggr_mode) {
		case AGGR_SOCKET:
			fprintf(output, "#           time socket cpus             counts events\n");
			break;
		case AGGR_CORE:
			fprintf(output, "#           time core         cpus             counts events\n");
			break;
		case AGGR_NONE:
			fprintf(output, "#           time CPU                counts events\n");
			break;
		case AGGR_GLOBAL:
		default:
			fprintf(output, "#           time             counts events\n");
		}
	}

	if (++num_print_interval == 25)
		num_print_interval = 0;

	switch (aggr_mode) {
	case AGGR_CORE:
	case AGGR_SOCKET:
		print_aggr(prefix);
		break;
	case AGGR_NONE:
		list_for_each_entry(counter, &evsel_list->entries, node)
			print_counter(counter, prefix);
		break;
	case AGGR_GLOBAL:
	default:
		list_for_each_entry(counter, &evsel_list->entries, node)
			print_counter_aggr(counter, prefix);
	}

	fflush(output);
#else
	perror("print_interval not supported on MacOS");
#endif
}

static void handle_initial_delay(void)
{
	struct perf_evsel *counter;

	if (initial_delay) {
		const int ncpus = cpu_map__nr(evsel_list->cpus),
			nthreads = thread_map__nr(evsel_list->threads);

		usleep(initial_delay * 1000);
		list_for_each_entry(counter, &evsel_list->entries, node)
			perf_evsel__enable(counter, ncpus, nthreads);
	}
}
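
/*
 * Descriptive note: handle_initial_delay() above pairs with
 * create_perf_stat_counter(). With --delay, group leaders are opened
 * disabled and *without* enable_on_exec, so counting only starts here,
 * initial_delay ms after the workload was launched - early startup work
 * is deliberately excluded from the measurement.
 */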
prepare workload"); 463 return -1; 464 } 465 child_pid = evsel_list->workload.pid; 466 } 467 468 if (group) 469 perf_evlist__set_leader(evsel_list); 470 471 list_for_each_entry(counter, &evsel_list->entries, node) { 472 if (create_perf_stat_counter(counter) < 0) { 473 /* 474 * PPC returns ENXIO for HW counters until 2.6.37 475 * (behavior changed with commit b0a873e). 476 */ 477 if (errno == EINVAL || errno == ENOSYS || 478 errno == ENOENT || errno == EOPNOTSUPP || 479 errno == ENXIO) { 480 if (verbose) 481 ui__warning("%s event is not supported by the kernel.\n", 482 perf_evsel__name(counter)); 483 counter->supported = false; 484 continue; 485 } 486 487 perf_evsel__open_strerror(counter, &target, 488 errno, msg, sizeof(msg)); 489 ui__error("%s\n", msg); 490 491 if (child_pid != -1) 492 kill(child_pid, SIGTERM); 493 494 return -1; 495 } 496 counter->supported = true; 497 } 498 499 if (perf_evlist__apply_filters(evsel_list)) { 500 error("failed to set filter with %d (%s)\n", errno, 501 strerror(errno)); 502 return -1; 503 } 504 505 /* 506 * Enable counters and exec the command: 507 */ 508 t0 = rdclock(); 509 clock_gettime(CLOCK_MONOTONIC, &ref_time); 510 511 if (forks) { 512 perf_evlist__start_workload(evsel_list); 513 handle_initial_delay(); 514 515 if (interval) { 516 while (!waitpid(child_pid, &status, WNOHANG)) { 517 nanosleep(&ts, NULL); 518 print_interval(); 519 } 520 } 521 wait(&status); 522 if (WIFSIGNALED(status)) 523 psignal(WTERMSIG(status), argv[0]); 524 } else { 525 handle_initial_delay(); 526 while (!done) { 527 nanosleep(&ts, NULL); 528 if (interval) 529 print_interval(); 530 } 531 } 532 533 t1 = rdclock(); 534 535 update_stats(&walltime_nsecs_stats, t1 - t0); 536 537 if (aggr_mode == AGGR_GLOBAL) { 538 list_for_each_entry(counter, &evsel_list->entries, node) { 539 read_counter_aggr(counter); 540 perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 541 thread_map__nr(evsel_list->threads)); 542 } 543 } else { 544 list_for_each_entry(counter, &evsel_list->entries, node) { 545 read_counter(counter); 546 perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1); 547 } 548 } 549 550 return WEXITSTATUS(status); 551 #else 552 return -1; 553 #endif 554 } 555 556 static int run_perf_stat(int argc __maybe_unused, const char **argv) 557 { 558 int ret; 559 560 if (pre_cmd) { 561 ret = system(pre_cmd); 562 if (ret) 563 return ret; 564 } 565 566 if (sync_run) 567 sync(); 568 569 ret = __run_perf_stat(argc, argv); 570 if (ret) 571 return ret; 572 573 if (post_cmd) { 574 ret = system(post_cmd); 575 if (ret) 576 return ret; 577 } 578 579 return ret; 580 } 581 582 static void print_noise_pct(double total, double avg) 583 { 584 double pct = rel_stddev_stats(total, avg); 585 586 if (csv_output) 587 fprintf(output, "%s%.2f%%", csv_sep, pct); 588 else if (pct) 589 fprintf(output, " ( +-%6.2f%% )", pct); 590 } 591 592 static void print_noise(struct perf_evsel *evsel, double avg) 593 { 594 struct perf_stat *ps; 595 596 if (run_count == 1) 597 return; 598 599 ps = evsel->priv; 600 print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); 601 } 602 603 static void aggr_printout(struct perf_evsel *evsel, int id, int nr) 604 { 605 switch (aggr_mode) { 606 case AGGR_CORE: 607 fprintf(output, "S%d-C%*d%s%*d%s", 608 cpu_map__id_to_socket(id), 609 csv_output ? 0 : -8, 610 cpu_map__id_to_cpu(id), 611 csv_sep, 612 csv_output ? 0 : 4, 613 nr, 614 csv_sep); 615 break; 616 case AGGR_SOCKET: 617 fprintf(output, "S%*d%s%*d%s", 618 csv_output ? 0 : -5, 619 id, 620 csv_sep, 621 csv_output ? 

static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
{
	double msecs = avg / 1e6;
	const char *fmt = csv_output ? "%.6f%s%s" : "%18.6f%s%-25s";

	aggr_printout(evsel, cpu, nr);

	fprintf(output, fmt, msecs, csv_sep, perf_evsel__name(evsel));

	if (evsel->cgrp)
		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);

	if (csv_output || interval)
		return;

	if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
		fprintf(output, " # %8.3f CPUs utilized          ",
			avg / avg_stats(&walltime_nsecs_stats));
	else
		fprintf(output, "                                   ");
}

/* used for get_ratio_color() */
enum grc_type {
	GRC_STALLED_CYCLES_FE,
	GRC_STALLED_CYCLES_BE,
	GRC_CACHE_MISSES,
	GRC_MAX_NR
};

static const char *get_ratio_color(enum grc_type type, double ratio)
{
	static const double grc_table[GRC_MAX_NR][3] = {
		[GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
		[GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
		[GRC_CACHE_MISSES]	= { 20.0, 10.0,  5.0 },
	};
	const char *color = PERF_COLOR_NORMAL;

	if (ratio > grc_table[type][0])
		color = PERF_COLOR_RED;
	else if (ratio > grc_table[type][1])
		color = PERF_COLOR_MAGENTA;
	else if (ratio > grc_table[type][2])
		color = PERF_COLOR_YELLOW;

	return color;
}

static void print_stalled_cycles_frontend(int cpu,
					  struct perf_evsel *evsel
					  __maybe_unused, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_cycles_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " frontend cycles idle   ");
}

static void print_stalled_cycles_backend(int cpu,
					 struct perf_evsel *evsel
					 __maybe_unused, double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_cycles_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " backend  cycles idle   ");
}

static void print_branch_misses(int cpu,
				struct perf_evsel *evsel __maybe_unused,
				double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_branches_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all branches        ");
}

static void print_l1_dcache_misses(int cpu,
				   struct perf_evsel *evsel __maybe_unused,
				   double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_l1_dcache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all L1-dcache hits  ");
}
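
/*
 * Descriptive note: the remaining print_*_misses() helpers below all
 * follow the same pattern as print_l1_dcache_misses() above - compute
 * misses / accesses * 100 against the matching runtime_*_stats[] shadow
 * counter and colorize via GRC_CACHE_MISSES, i.e. red above 20%, magenta
 * above 10%, yellow above 5%.
 */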

static void print_l1_icache_misses(int cpu,
				   struct perf_evsel *evsel __maybe_unused,
				   double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_l1_icache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all L1-icache hits  ");
}

static void print_dtlb_cache_misses(int cpu,
				    struct perf_evsel *evsel __maybe_unused,
				    double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_dtlb_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all dTLB cache hits ");
}

static void print_itlb_cache_misses(int cpu,
				    struct perf_evsel *evsel __maybe_unused,
				    double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_itlb_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all iTLB cache hits ");
}

static void print_ll_cache_misses(int cpu,
				  struct perf_evsel *evsel __maybe_unused,
				  double avg)
{
	double total, ratio = 0.0;
	const char *color;

	total = avg_stats(&runtime_ll_cache_stats[cpu]);

	if (total)
		ratio = avg / total * 100.0;

	color = get_ratio_color(GRC_CACHE_MISSES, ratio);

	fprintf(output, " #  ");
	color_fprintf(output, color, "%6.2f%%", ratio);
	fprintf(output, " of all LL-cache hits   ");
}
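
/*
 * Descriptive note on the comparisons in abs_printout() below: a
 * PERF_TYPE_HW_CACHE config packs three fields, per the perf_event ABI:
 *
 *   config = (cache id) | (op << 8) | (result << 16)
 *
 * e.g. L1-dcache read misses are PERF_COUNT_HW_CACHE_L1D |
 * (PERF_COUNT_HW_CACHE_OP_READ << 8) |
 * (PERF_COUNT_HW_CACHE_RESULT_MISS << 16).
 */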

static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
{
	double total, ratio = 0.0;
	const char *fmt;

	if (csv_output)
		fmt = "%.0f%s%s";
	else if (big_num)
		fmt = "%'18.0f%s%-25s";
	else
		fmt = "%18.0f%s%-25s";

	aggr_printout(evsel, cpu, nr);

	if (aggr_mode == AGGR_GLOBAL)
		cpu = 0;

	fprintf(output, fmt, avg, csv_sep, perf_evsel__name(evsel));

	if (evsel->cgrp)
		fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);

	if (csv_output || interval)
		return;

	if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
		total = avg_stats(&runtime_cycles_stats[cpu]);
		if (total)
			ratio = avg / total;

		fprintf(output, " #   %5.2f  insns per cycle        ", ratio);

		total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
		total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));

		if (total && avg) {
			ratio = total / avg;
			fprintf(output, "\n                                             #   %5.2f  stalled cycles per insn", ratio);
		}

	} else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
		   runtime_branches_stats[cpu].n != 0) {
		print_branch_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_L1D |
				       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
				       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
		runtime_l1_dcache_stats[cpu].n != 0) {
		print_l1_dcache_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_L1I |
				       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
				       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
		runtime_l1_icache_stats[cpu].n != 0) {
		print_l1_icache_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_DTLB |
				       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
				       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
		runtime_dtlb_cache_stats[cpu].n != 0) {
		print_dtlb_cache_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_ITLB |
				       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
				       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
		runtime_itlb_cache_stats[cpu].n != 0) {
		print_itlb_cache_misses(cpu, evsel, avg);
	} else if (
		evsel->attr.type == PERF_TYPE_HW_CACHE &&
		evsel->attr.config == (PERF_COUNT_HW_CACHE_LL |
				       ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
				       ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
		runtime_ll_cache_stats[cpu].n != 0) {
		print_ll_cache_misses(cpu, evsel, avg);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
		   runtime_cacherefs_stats[cpu].n != 0) {
		total = avg_stats(&runtime_cacherefs_stats[cpu]);

		if (total)
			ratio = avg * 100 / total;

		fprintf(output, " # %8.3f %% of all cache refs    ", ratio);

	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
		print_stalled_cycles_frontend(cpu, evsel, avg);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
		print_stalled_cycles_backend(cpu, evsel, avg);
	} else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total)
			ratio = 1.0 * avg / total;

		fprintf(output, " # %8.3f GHz                    ", ratio);
	} else if (runtime_nsecs_stats[cpu].n != 0) {
		char unit = 'M';

		total = avg_stats(&runtime_nsecs_stats[cpu]);

		if (total)
			ratio = 1000.0 * avg / total;
		if (ratio < 0.001) {
			ratio *= 1000;
			unit = 'K';
		}

		fprintf(output, " # %8.3f %c/sec                  ", ratio, unit);
	} else {
		fprintf(output, "                                   ");
	}
}
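
/*
 * Illustrative example of the GHz branch above, using the sample output
 * from the header comment: cycles divided by task-clock nanoseconds is
 * cycles-per-ns, i.e. GHz: 5,205,202,243 cycles / 1,708,761,321 ns
 * ~= 3.046 GHz. Likewise the generic fallback branch scales events per
 * nanosecond by 1000 to get M/sec: 484,357,498 branches / 1,708,761,321 ns
 * * 1000 ~= 283.455 M/sec.
 */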

static void print_aggr(char *prefix)
{
	struct perf_evsel *counter;
	int cpu, cpu2, s, s2, id, nr;
	u64 ena, run, val;

	if (!(aggr_map || aggr_get_id))
		return;

	for (s = 0; s < aggr_map->nr; s++) {
		id = aggr_map->map[s];
		list_for_each_entry(counter, &evsel_list->entries, node) {
			val = ena = run = 0;
			nr = 0;
			for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
				cpu2 = perf_evsel__cpus(counter)->map[cpu];
				s2 = aggr_get_id(evsel_list->cpus, cpu2);
				if (s2 != id)
					continue;
				val += counter->counts->cpu[cpu].val;
				ena += counter->counts->cpu[cpu].ena;
				run += counter->counts->cpu[cpu].run;
				nr++;
			}
			if (prefix)
				fprintf(output, "%s", prefix);

			if (run == 0 || ena == 0) {
				aggr_printout(counter, id, nr);

				fprintf(output, "%*s%s%*s",
					csv_output ? 0 : 18,
					counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
					csv_sep,
					csv_output ? 0 : -24,
					perf_evsel__name(counter));

				if (counter->cgrp)
					fprintf(output, "%s%s",
						csv_sep, counter->cgrp->name);

				fputc('\n', output);
				continue;
			}

			if (nsec_counter(counter))
				nsec_printout(id, nr, counter, val);
			else
				abs_printout(id, nr, counter, val);

			if (!csv_output) {
				print_noise(counter, 1.0);

				if (run != ena)
					fprintf(output, "  (%.2f%%)",
						100.0 * run / ena);
			}
			fputc('\n', output);
		}
	}
}

/*
 * Print out the results of a single counter:
 * aggregated counts in system-wide mode
 */
static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
{
	struct perf_stat *ps = counter->priv;
	double avg = avg_stats(&ps->res_stats[0]);
	int scaled = counter->counts->scaled;

	if (prefix)
		fprintf(output, "%s", prefix);

	if (scaled == -1) {
		fprintf(output, "%*s%s%*s",
			csv_output ? 0 : 18,
			counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
			csv_sep,
			csv_output ? 0 : -24,
			perf_evsel__name(counter));

		if (counter->cgrp)
			fprintf(output, "%s%s", csv_sep, counter->cgrp->name);

		fputc('\n', output);
		return;
	}

	if (nsec_counter(counter))
		nsec_printout(-1, 0, counter, avg);
	else
		abs_printout(-1, 0, counter, avg);

	print_noise(counter, avg);

	if (csv_output) {
		fputc('\n', output);
		return;
	}

	if (scaled) {
		double avg_enabled, avg_running;

		avg_enabled = avg_stats(&ps->res_stats[1]);
		avg_running = avg_stats(&ps->res_stats[2]);

		fprintf(output, " [%5.2f%%]", 100 * avg_running / avg_enabled);
	}
	fprintf(output, "\n");
}
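
/*
 * Descriptive note: the "[xx.xx%]" suffix above is the multiplexing
 * indicator - the average fraction of its enabled time the counter
 * actually spent on the PMU. Anything below 100% means the kernel
 * time-shared the counter and the printed count is a scaled estimate,
 * not an exact tally.
 */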

/*
 * Print out the results of a single counter:
 * does not use aggregated count in system-wide
 */
static void print_counter(struct perf_evsel *counter, char *prefix)
{
	u64 ena, run, val;
	int cpu;

	for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
		val = counter->counts->cpu[cpu].val;
		ena = counter->counts->cpu[cpu].ena;
		run = counter->counts->cpu[cpu].run;

		if (prefix)
			fprintf(output, "%s", prefix);

		if (run == 0 || ena == 0) {
			fprintf(output, "CPU%*d%s%*s%s%*s",
				csv_output ? 0 : -4,
				perf_evsel__cpus(counter)->map[cpu], csv_sep,
				csv_output ? 0 : 18,
				counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
				csv_sep,
				csv_output ? 0 : -24,
				perf_evsel__name(counter));

			if (counter->cgrp)
				fprintf(output, "%s%s",
					csv_sep, counter->cgrp->name);

			fputc('\n', output);
			continue;
		}

		if (nsec_counter(counter))
			nsec_printout(cpu, 0, counter, val);
		else
			abs_printout(cpu, 0, counter, val);

		if (!csv_output) {
			print_noise(counter, 1.0);

			if (run != ena)
				fprintf(output, "  (%.2f%%)",
					100.0 * run / ena);
		}
		fputc('\n', output);
	}
}

static void print_stat(int argc, const char **argv)
{
	struct perf_evsel *counter;
	int i;

	fflush(stdout);

	if (!csv_output) {
		fprintf(output, "\n");
		fprintf(output, " Performance counter stats for ");
		if (!perf_target__has_task(&target)) {
			fprintf(output, "\'%s", argv[0]);
			for (i = 1; i < argc; i++)
				fprintf(output, " %s", argv[i]);
		} else if (target.pid)
			fprintf(output, "process id \'%s", target.pid);
		else
			fprintf(output, "thread id \'%s", target.tid);

		fprintf(output, "\'");
		if (run_count > 1)
			fprintf(output, " (%d runs)", run_count);
		fprintf(output, ":\n\n");
	}

	switch (aggr_mode) {
	case AGGR_CORE:
	case AGGR_SOCKET:
		print_aggr(NULL);
		break;
	case AGGR_GLOBAL:
		list_for_each_entry(counter, &evsel_list->entries, node)
			print_counter_aggr(counter, NULL);
		break;
	case AGGR_NONE:
		list_for_each_entry(counter, &evsel_list->entries, node)
			print_counter(counter, NULL);
		break;
	default:
		break;
	}

	if (!csv_output) {
		if (!null_run)
			fprintf(output, "\n");
		fprintf(output, " %17.9f seconds time elapsed",
			avg_stats(&walltime_nsecs_stats) / 1e9);
		if (run_count > 1) {
			fprintf(output, "                                        ");
			print_noise_pct(stddev_stats(&walltime_nsecs_stats),
					avg_stats(&walltime_nsecs_stats));
		}
		fprintf(output, "\n\n");
	}
}

static volatile int signr = -1;

static void skip_signal(int signo)
{
	if ((child_pid == -1) || interval)
		done = 1;

	signr = signo;
	/*
	 * Render child_pid harmless: don't send SIGTERM to a random
	 * process in case of a race condition and fast PID recycling.
	 */
	child_pid = -1;
}

static void sig_atexit(void)
{
	sigset_t set, oset;

	/*
	 * Avoid a race condition with the SIGCHLD handler in skip_signal(),
	 * which modifies child_pid; the goal is to avoid sending SIGTERM
	 * to a random process.
	 */
	sigemptyset(&set);
	sigaddset(&set, SIGCHLD);
	sigprocmask(SIG_BLOCK, &set, &oset);

	if (child_pid != -1)
		kill(child_pid, SIGTERM);

	sigprocmask(SIG_SETMASK, &oset, NULL);

	if (signr == -1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

static int stat__set_big_num(const struct option *opt __maybe_unused,
			     const char *s __maybe_unused, int unset)
{
	big_num_opt = unset ? 0 : 1;
	return 0;
}
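
/*
 * Descriptive note: big_num_opt is a tri-state - -1 when the user said
 * nothing, 1 for -B/--big-num, 0 for --no-big-num. cmd_stat() later uses
 * it to decide whether thousands' separators survive csv (-x) output.
 */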

static int perf_stat_init_aggr_mode(void)
{
	switch (aggr_mode) {
	case AGGR_SOCKET:
		if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) {
			perror("cannot build socket map");
			return -1;
		}
		aggr_get_id = cpu_map__get_socket;
		break;
	case AGGR_CORE:
		if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) {
			perror("cannot build core map");
			return -1;
		}
		aggr_get_id = cpu_map__get_core;
		break;
	case AGGR_NONE:
	case AGGR_GLOBAL:
	default:
		break;
	}
	return 0;
}

/*
 * Add default attributes, if there were no attributes specified or
 * if -d/--detailed, -d -d or -d -d -d is used:
 */
static int add_default_attributes(void)
{
	struct perf_event_attr default_attrs[] = {

		{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
		{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES },
		{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
		{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },

		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },

	};

	/*
	 * Detailed stats (-d), covering the L1 and last level data caches:
	 */
	struct perf_event_attr detailed_attrs[] = {

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			PERF_COUNT_HW_CACHE_L1D << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			PERF_COUNT_HW_CACHE_L1D << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			PERF_COUNT_HW_CACHE_LL << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			PERF_COUNT_HW_CACHE_LL << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
	};

	/*
	 * Very detailed stats (-d -d), covering the instruction cache and the
	 * TLB caches:
	 */
	struct perf_event_attr very_detailed_attrs[] = {

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			PERF_COUNT_HW_CACHE_L1I << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			PERF_COUNT_HW_CACHE_L1I << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			PERF_COUNT_HW_CACHE_DTLB << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			PERF_COUNT_HW_CACHE_DTLB << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			PERF_COUNT_HW_CACHE_ITLB << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			PERF_COUNT_HW_CACHE_ITLB << 0 |
			(PERF_COUNT_HW_CACHE_OP_READ << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },

	};

	/*
	 * Very, very detailed stats (-d -d -d), adding prefetch events:
	 */
	struct perf_event_attr very_very_detailed_attrs[] = {

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			PERF_COUNT_HW_CACHE_L1D << 0 |
			(PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },

		{ .type = PERF_TYPE_HW_CACHE,
		  .config =
			PERF_COUNT_HW_CACHE_L1D << 0 |
			(PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
			(PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
	};

	/* Set attrs if no event is selected and !null_run: */
	if (null_run)
		return 0;

	if (!evsel_list->nr_entries) {
		if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0)
			return -1;
	}

	/* Detailed events get appended to the event list: */

	if (detailed_run < 1)
		return 0;

	/* Append detailed run extra attributes: */
	if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0)
		return -1;

	if (detailed_run < 2)
		return 0;

	/* Append very detailed run extra attributes: */
	if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0)
		return -1;

	if (detailed_run < 3)
		return 0;

	/* Append very, very detailed run extra attributes: */
	return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs);
}
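
/*
 * Usage sketch (illustrative): "perf stat ./cmd" counts only the default
 * attrs above; "perf stat -d ./cmd" appends detailed_attrs, "-d -d" also
 * appends very_detailed_attrs, and "-d -d -d" adds the prefetch events.
 * Explicit -e events suppress the defaults, but -d levels still append.
 */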

int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
{
#ifndef __APPLE__
	bool append_file = false;
	int output_fd = 0;
	const char *output_name = NULL;
	const struct option options[] = {
	OPT_CALLBACK('e', "event", &evsel_list, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &evsel_list, "filter",
		     "event filter", parse_filter),
	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
		    "child tasks do not inherit counters"),
	OPT_STRING('p', "pid", &target.pid, "pid",
		   "stat events on existing process id"),
	OPT_STRING('t', "tid", &target.tid, "tid",
		   "stat events on existing thread id"),
	OPT_BOOLEAN('a', "all-cpus", &target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('g', "group", &group,
		    "put the counters into a counter group"),
	OPT_BOOLEAN('c', "scale", &scale, "scale/normalize counters"),
	OPT_INCR('v', "verbose", &verbose,
		 "be more verbose (show counter open errors, etc)"),
	OPT_INTEGER('r', "repeat", &run_count,
		    "repeat command and print average + stddev (max: 100, forever: 0)"),
	OPT_BOOLEAN('n', "null", &null_run,
		    "null run - don't start any counters"),
	OPT_INCR('d', "detailed", &detailed_run,
		 "detailed run - start a lot of events"),
	OPT_BOOLEAN('S', "sync", &sync_run,
		    "call sync() before starting a run"),
	OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL,
			   "print large numbers with thousands' separators",
			   stat__set_big_num),
	OPT_STRING('C', "cpu", &target.cpu_list, "cpu",
		   "list of cpus to monitor in system-wide"),
	OPT_SET_UINT('A', "no-aggr", &aggr_mode,
		     "disable CPU count aggregation", AGGR_NONE),
	OPT_STRING('x', "field-separator", &csv_sep, "separator",
		   "print counts with custom separator"),
	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
		     "monitor event in cgroup name only", parse_cgroups),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_BOOLEAN(0, "append", &append_file, "append to the output file"),
	OPT_INTEGER(0, "log-fd", &output_fd,
		    "log output to fd, instead of stderr"),
	OPT_STRING(0, "pre", &pre_cmd, "command",
		   "command to run prior to the measured command"),
	OPT_STRING(0, "post", &post_cmd, "command",
		   "command to run after the measured command"),
	OPT_UINTEGER('I', "interval-print", &interval,
		     "print counts at regular interval in ms (>= 100)"),
	OPT_SET_UINT(0, "per-socket", &aggr_mode,
		     "aggregate counts per processor socket", AGGR_SOCKET),
	OPT_SET_UINT(0, "per-core", &aggr_mode,
		     "aggregate counts per physical processor core", AGGR_CORE),
	OPT_UINTEGER('D', "delay", &initial_delay,
		     "ms to wait before starting measurement after program start"),
	OPT_END()
	};
	const char * const stat_usage[] = {
		"perf stat [<options>] [<command>]",
		NULL
	};
	int status = -ENOMEM, run_idx;
	const char *mode;

	setlocale(LC_ALL, "");

	evsel_list = perf_evlist__new();
	if (evsel_list == NULL)
		return -ENOMEM;

	argc = parse_options(argc, argv, options, stat_usage,
			     PARSE_OPT_STOP_AT_NON_OPTION);

	output = stderr;
	if (output_name && strcmp(output_name, "-"))
		output = NULL;

	if (output_name && output_fd) {
		fprintf(stderr, "cannot use both --output and --log-fd\n");
		usage_with_options(stat_usage, options);
	}

	if (output_fd < 0) {
		fprintf(stderr, "argument to --log-fd must be > 0\n");
		usage_with_options(stat_usage, options);
	}

	if (!output) {
		struct timespec tm;
		mode = append_file ? "a" : "w";

		output = fopen(output_name, mode);
		if (!output) {
			perror("failed to create output file");
			return -1;
		}
		clock_gettime(CLOCK_REALTIME, &tm);
		fprintf(output, "# started on %s\n", ctime(&tm.tv_sec));
	} else if (output_fd > 0) {
		mode = append_file ? "a" : "w";
		output = fdopen(output_fd, mode);
		if (!output) {
			perror("Failed opening logfd");
			return -errno;
		}
	}
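
	/*
	 * Descriptive note: at this point output is stderr by default,
	 * a freshly opened file for --output <file> ("--output -" keeps
	 * stderr), or an fdopen()ed stream for --log-fd; --output and
	 * --log-fd are mutually exclusive and rejected above.
	 */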
"a" : "w"; 1489 1490 output = fopen(output_name, mode); 1491 if (!output) { 1492 perror("failed to create output file"); 1493 return -1; 1494 } 1495 clock_gettime(CLOCK_REALTIME, &tm); 1496 fprintf(output, "# started on %s\n", ctime(&tm.tv_sec)); 1497 } else if (output_fd > 0) { 1498 mode = append_file ? "a" : "w"; 1499 output = fdopen(output_fd, mode); 1500 if (!output) { 1501 perror("Failed opening logfd"); 1502 return -errno; 1503 } 1504 } 1505 1506 if (csv_sep) { 1507 csv_output = true; 1508 if (!strcmp(csv_sep, "\\t")) 1509 csv_sep = "\t"; 1510 } else 1511 csv_sep = DEFAULT_SEPARATOR; 1512 1513 /* 1514 * let the spreadsheet do the pretty-printing 1515 */ 1516 if (csv_output) { 1517 /* User explicitly passed -B? */ 1518 if (big_num_opt == 1) { 1519 fprintf(stderr, "-B option not supported with -x\n"); 1520 usage_with_options(stat_usage, options); 1521 } else /* Nope, so disable big number formatting */ 1522 big_num = false; 1523 } else if (big_num_opt == 0) /* User passed --no-big-num */ 1524 big_num = false; 1525 1526 if (!argc && !perf_target__has_task(&target)) 1527 usage_with_options(stat_usage, options); 1528 if (run_count < 0) { 1529 usage_with_options(stat_usage, options); 1530 } else if (run_count == 0) { 1531 forever = true; 1532 run_count = 1; 1533 } 1534 1535 /* no_aggr, cgroup are for system-wide only */ 1536 if ((aggr_mode != AGGR_GLOBAL || nr_cgroups) 1537 && !perf_target__has_cpu(&target)) { 1538 fprintf(stderr, "both cgroup and no-aggregation " 1539 "modes only available in system-wide mode\n"); 1540 1541 usage_with_options(stat_usage, options); 1542 return -1; 1543 } 1544 1545 if (add_default_attributes()) 1546 goto out; 1547 1548 perf_target__validate(&target); 1549 1550 if (perf_evlist__create_maps(evsel_list, &target) < 0) { 1551 if (perf_target__has_task(&target)) 1552 pr_err("Problems finding threads of monitor\n"); 1553 if (perf_target__has_cpu(&target)) 1554 perror("failed to parse CPUs map"); 1555 1556 usage_with_options(stat_usage, options); 1557 return -1; 1558 } 1559 if (interval && interval < 100) { 1560 pr_err("print interval must be >= 100ms\n"); 1561 usage_with_options(stat_usage, options); 1562 return -1; 1563 } 1564 1565 if (perf_evlist__alloc_stats(evsel_list, interval)) 1566 goto out_free_maps; 1567 1568 if (perf_stat_init_aggr_mode()) 1569 goto out; 1570 1571 /* 1572 * We dont want to block the signals - that would cause 1573 * child tasks to inherit that and Ctrl-C would not work. 1574 * What we want is for Ctrl-C to work in the exec()-ed 1575 * task, but being ignored by perf stat itself: 1576 */ 1577 atexit(sig_atexit); 1578 if (!forever) 1579 signal(SIGINT, skip_signal); 1580 signal(SIGCHLD, skip_signal); 1581 signal(SIGALRM, skip_signal); 1582 signal(SIGABRT, skip_signal); 1583 1584 status = 0; 1585 for (run_idx = 0; forever || run_idx < run_count; run_idx++) { 1586 if (run_count != 1 && verbose) 1587 fprintf(output, "[ perf stat: executing run #%d ... ]\n", 1588 run_idx + 1); 1589 1590 status = run_perf_stat(argc, argv); 1591 if (forever && status != -1) { 1592 print_stat(argc, argv); 1593 perf_stat__reset_stats(evsel_list); 1594 } 1595 } 1596 1597 if (!forever && status != -1 && !interval) 1598 print_stat(argc, argv); 1599 1600 perf_evlist__free_stats(evsel_list); 1601 out_free_maps: 1602 perf_evlist__delete_maps(evsel_list); 1603 out: 1604 perf_evlist__delete(evsel_list); 1605 return status; 1606 #else 1607 perror("cmd_stat not supported on MacOS"); 1608 return -1; 1609 #endif 1610 } 1611