1 /* 2 * block queue tracing parse application 3 * 4 * Copyright (C) 2005 Jens Axboe <axboe (at) suse.de> 5 * Copyright (C) 2006 Jens Axboe <axboe (at) kernel.dk> 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or 10 * (at your option) any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this program; if not, write to the Free Software 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 20 * 21 */ 22 #include <sys/types.h> 23 #include <sys/stat.h> 24 #include <unistd.h> 25 #include <stdio.h> 26 #include <fcntl.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <getopt.h> 30 #include <errno.h> 31 #include <signal.h> 32 #include <locale.h> 33 #include <libgen.h> 34 35 #include "blktrace.h" 36 #include "rbtree.h" 37 #include "jhash.h" 38 39 static char blkparse_version[] = "1.2.0"; 40 41 struct skip_info { 42 unsigned long start, end; 43 struct skip_info *prev, *next; 44 }; 45 46 struct per_dev_info { 47 dev_t dev; 48 char *name; 49 50 int backwards; 51 unsigned long long events; 52 unsigned long long first_reported_time; 53 unsigned long long last_reported_time; 54 unsigned long long last_read_time; 55 struct io_stats io_stats; 56 unsigned long skips; 57 unsigned long long seq_skips; 58 unsigned int max_depth[2]; 59 unsigned int cur_depth[2]; 60 61 struct rb_root rb_track; 62 63 int nfiles; 64 int ncpus; 65 66 unsigned long *cpu_map; 67 unsigned int cpu_map_max; 68 69 struct per_cpu_info *cpus; 70 }; 71 72 /* 73 * some duplicated effort here, we can unify this hash and the ppi hash later 74 */ 75 struct process_pid_map { 76 pid_t pid; 77 char comm[16]; 78 struct process_pid_map *hash_next, *list_next; 79 }; 80 81 #define PPM_HASH_SHIFT (8) 82 #define PPM_HASH_SIZE (1 << PPM_HASH_SHIFT) 83 #define PPM_HASH_MASK (PPM_HASH_SIZE - 1) 84 static struct process_pid_map *ppm_hash_table[PPM_HASH_SIZE]; 85 86 struct per_process_info { 87 struct process_pid_map *ppm; 88 struct io_stats io_stats; 89 struct per_process_info *hash_next, *list_next; 90 int more_than_one; 91 92 /* 93 * individual io stats 94 */ 95 unsigned long long longest_allocation_wait[2]; 96 unsigned long long longest_dispatch_wait[2]; 97 unsigned long long longest_completion_wait[2]; 98 }; 99 100 #define PPI_HASH_SHIFT (8) 101 #define PPI_HASH_SIZE (1 << PPI_HASH_SHIFT) 102 #define PPI_HASH_MASK (PPI_HASH_SIZE - 1) 103 static struct per_process_info *ppi_hash_table[PPI_HASH_SIZE]; 104 static struct per_process_info *ppi_list; 105 static int ppi_list_entries; 106 107 static struct option l_opts[] = { 108 { 109 .name = "act-mask", 110 .has_arg = required_argument, 111 .flag = NULL, 112 .val = 'a' 113 }, 114 { 115 .name = "set-mask", 116 .has_arg = required_argument, 117 .flag = NULL, 118 .val = 'A' 119 }, 120 { 121 .name = "batch", 122 .has_arg = required_argument, 123 .flag = NULL, 124 .val = 'b' 125 }, 126 { 127 .name = "input-directory", 128 .has_arg = required_argument, 129 .flag = NULL, 130 .val = 'D' 131 }, 132 { 133 .name = "dump-binary", 134 .has_arg = required_argument, 135 .flag = NULL, 136 .val = 'd' 137 }, 138 { 139 .name = "format", 140 .has_arg = required_argument, 141 .flag = NULL, 142 .val = 'f' 143 }, 144 { 145 .name = "format-spec", 146 .has_arg = required_argument, 147 .flag = NULL, 148 .val = 'F' 149 }, 150 { 151 .name = "hash-by-name", 152 .has_arg = no_argument, 153 .flag = NULL, 154 .val = 'h' 155 }, 156 { 157 .name = "input", 158 .has_arg = required_argument, 159 .flag = NULL, 160 .val = 'i' 161 }, 162 { 163 .name = "no-msgs", 164 .has_arg = no_argument, 165 .flag = NULL, 166 .val = 'M' 167 }, 168 { 169 .name = "output", 170 .has_arg = required_argument, 171 .flag = NULL, 172 .val = 'o' 173 }, 174 { 175 .name = "no-text-output", 176 .has_arg = no_argument, 177 .flag = NULL, 178 .val = 'O' 179 }, 180 { 181 .name = "quiet", 182 .has_arg = no_argument, 183 .flag = NULL, 184 .val = 'q' 185 }, 186 { 187 .name = "per-program-stats", 188 .has_arg = no_argument, 189 .flag = NULL, 190 .val = 's' 191 }, 192 { 193 .name = "track-ios", 194 .has_arg = no_argument, 195 .flag = NULL, 196 .val = 't' 197 }, 198 { 199 .name = "stopwatch", 200 .has_arg = required_argument, 201 .flag = NULL, 202 .val = 'w' 203 }, 204 { 205 .name = "verbose", 206 .has_arg = no_argument, 207 .flag = NULL, 208 .val = 'v' 209 }, 210 { 211 .name = "version", 212 .has_arg = no_argument, 213 .flag = NULL, 214 .val = 'V' 215 }, 216 { 217 .name = NULL, 218 } 219 }; 220 221 /* 222 * for sorting the displayed output 223 */ 224 struct trace { 225 struct blk_io_trace *bit; 226 struct rb_node rb_node; 227 struct trace *next; 228 unsigned long read_sequence; 229 }; 230 231 static struct rb_root rb_sort_root; 232 static unsigned long rb_sort_entries; 233 234 static struct trace *trace_list; 235 236 /* 237 * allocation cache 238 */ 239 static struct blk_io_trace *bit_alloc_list; 240 static struct trace *t_alloc_list; 241 242 /* 243 * for tracking individual ios 244 */ 245 struct io_track { 246 struct rb_node rb_node; 247 248 struct process_pid_map *ppm; 249 __u64 sector; 250 unsigned long long allocation_time; 251 unsigned long long queue_time; 252 unsigned long long dispatch_time; 253 unsigned long long completion_time; 254 }; 255 256 static int ndevices; 257 static struct per_dev_info *devices; 258 static char *get_dev_name(struct per_dev_info *, char *, int); 259 static int trace_rb_insert_last(struct per_dev_info *, struct trace *); 260 261 FILE *ofp = NULL; 262 static char *output_name; 263 static char *input_dir; 264 265 static unsigned long long genesis_time; 266 static unsigned long long last_allowed_time; 267 static unsigned long long stopwatch_start; /* start from zero by default */ 268 static unsigned long long stopwatch_end = -1ULL; /* "infinity" */ 269 static unsigned long read_sequence; 270 271 static int per_process_stats; 272 static int per_device_and_cpu_stats = 1; 273 static int track_ios; 274 static int ppi_hash_by_pid = 1; 275 static int verbose; 276 static unsigned int act_mask = -1U; 277 static int stats_printed; 278 static int bin_output_msgs = 1; 279 int data_is_native = -1; 280 281 static FILE *dump_fp; 282 static char *dump_binary; 283 284 static unsigned int t_alloc_cache; 285 static unsigned int bit_alloc_cache; 286 287 #define RB_BATCH_DEFAULT (512) 288 static unsigned int rb_batch = RB_BATCH_DEFAULT; 289 290 static int pipeline; 291 static char *pipename; 292 293 static int text_output = 1; 294 295 #define is_done() (*(volatile int *)(&done)) 296 static volatile int done; 297 298 struct timespec abs_start_time; 299 static unsigned long long start_timestamp; 300 301 static int have_drv_data = 0; 302 303 #define JHASH_RANDOM (0x3af5f2ee) 304 305 #define CPUS_PER_LONG (8 * sizeof(unsigned long)) 306 #define CPU_IDX(cpu) ((cpu) / CPUS_PER_LONG) 307 #define CPU_BIT(cpu) ((cpu) & (CPUS_PER_LONG - 1)) 308 309 static void output_binary(void *buf, int len) 310 { 311 if (dump_binary) { 312 size_t n = fwrite(buf, len, 1, dump_fp); 313 if (n != 1) { 314 perror(dump_binary); 315 fclose(dump_fp); 316 dump_binary = NULL; 317 } 318 } 319 } 320 321 static void resize_cpu_info(struct per_dev_info *pdi, int cpu) 322 { 323 struct per_cpu_info *cpus = pdi->cpus; 324 int ncpus = pdi->ncpus; 325 int new_count = cpu + 1; 326 int new_space, size; 327 char *new_start; 328 329 size = new_count * sizeof(struct per_cpu_info); 330 cpus = realloc(cpus, size); 331 if (!cpus) { 332 char name[20]; 333 fprintf(stderr, "Out of memory, CPU info for device %s (%d)\n", 334 get_dev_name(pdi, name, sizeof(name)), size); 335 exit(1); 336 } 337 338 new_start = (char *)cpus + (ncpus * sizeof(struct per_cpu_info)); 339 new_space = (new_count - ncpus) * sizeof(struct per_cpu_info); 340 memset(new_start, 0, new_space); 341 342 pdi->ncpus = new_count; 343 pdi->cpus = cpus; 344 345 for (new_count = 0; new_count < pdi->ncpus; new_count++) { 346 struct per_cpu_info *pci = &pdi->cpus[new_count]; 347 348 if (!pci->fd) { 349 pci->fd = -1; 350 memset(&pci->rb_last, 0, sizeof(pci->rb_last)); 351 pci->rb_last_entries = 0; 352 pci->last_sequence = -1; 353 } 354 } 355 } 356 357 static struct per_cpu_info *get_cpu_info(struct per_dev_info *pdi, int cpu) 358 { 359 struct per_cpu_info *pci; 360 361 if (cpu >= pdi->ncpus) 362 resize_cpu_info(pdi, cpu); 363 364 pci = &pdi->cpus[cpu]; 365 pci->cpu = cpu; 366 return pci; 367 } 368 369 370 static int resize_devices(char *name) 371 { 372 int size = (ndevices + 1) * sizeof(struct per_dev_info); 373 374 devices = realloc(devices, size); 375 if (!devices) { 376 fprintf(stderr, "Out of memory, device %s (%d)\n", name, size); 377 return 1; 378 } 379 memset(&devices[ndevices], 0, sizeof(struct per_dev_info)); 380 devices[ndevices].name = name; 381 ndevices++; 382 return 0; 383 } 384 385 static struct per_dev_info *get_dev_info(dev_t dev) 386 { 387 struct per_dev_info *pdi; 388 int i; 389 390 for (i = 0; i < ndevices; i++) { 391 if (!devices[i].dev) 392 devices[i].dev = dev; 393 if (devices[i].dev == dev) 394 return &devices[i]; 395 } 396 397 if (resize_devices(NULL)) 398 return NULL; 399 400 pdi = &devices[ndevices - 1]; 401 pdi->dev = dev; 402 pdi->first_reported_time = 0; 403 pdi->last_read_time = 0; 404 405 return pdi; 406 } 407 408 static void insert_skip(struct per_cpu_info *pci, unsigned long start, 409 unsigned long end) 410 { 411 struct skip_info *sip; 412 413 for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) { 414 if (end == (sip->start - 1)) { 415 sip->start = start; 416 return; 417 } else if (start == (sip->end + 1)) { 418 sip->end = end; 419 return; 420 } 421 } 422 423 sip = malloc(sizeof(struct skip_info)); 424 sip->start = start; 425 sip->end = end; 426 sip->prev = sip->next = NULL; 427 if (pci->skips_tail == NULL) 428 pci->skips_head = pci->skips_tail = sip; 429 else { 430 sip->prev = pci->skips_tail; 431 pci->skips_tail->next = sip; 432 pci->skips_tail = sip; 433 } 434 } 435 436 static void remove_sip(struct per_cpu_info *pci, struct skip_info *sip) 437 { 438 if (sip->prev == NULL) { 439 if (sip->next == NULL) 440 pci->skips_head = pci->skips_tail = NULL; 441 else { 442 pci->skips_head = sip->next; 443 sip->next->prev = NULL; 444 } 445 } else if (sip->next == NULL) { 446 pci->skips_tail = sip->prev; 447 sip->prev->next = NULL; 448 } else { 449 sip->prev->next = sip->next; 450 sip->next->prev = sip->prev; 451 } 452 453 sip->prev = sip->next = NULL; 454 free(sip); 455 } 456 457 #define IN_SKIP(sip,seq) (((sip)->start <= (seq)) && ((seq) <= sip->end)) 458 static int check_current_skips(struct per_cpu_info *pci, unsigned long seq) 459 { 460 struct skip_info *sip; 461 462 for (sip = pci->skips_tail; sip != NULL; sip = sip->prev) { 463 if (IN_SKIP(sip, seq)) { 464 if (sip->start == seq) { 465 if (sip->end == seq) 466 remove_sip(pci, sip); 467 else 468 sip->start += 1; 469 } else if (sip->end == seq) 470 sip->end -= 1; 471 else { 472 sip->end = seq - 1; 473 insert_skip(pci, seq + 1, sip->end); 474 } 475 return 1; 476 } 477 } 478 479 return 0; 480 } 481 482 static void collect_pdi_skips(struct per_dev_info *pdi) 483 { 484 struct skip_info *sip; 485 int cpu; 486 487 pdi->skips = 0; 488 pdi->seq_skips = 0; 489 490 for (cpu = 0; cpu < pdi->ncpus; cpu++) { 491 struct per_cpu_info *pci = &pdi->cpus[cpu]; 492 493 for (sip = pci->skips_head; sip != NULL; sip = sip->next) { 494 pdi->skips++; 495 pdi->seq_skips += (sip->end - sip->start + 1); 496 if (verbose) 497 fprintf(stderr,"(%d,%d): skipping %lu -> %lu\n", 498 MAJOR(pdi->dev), MINOR(pdi->dev), 499 sip->start, sip->end); 500 } 501 } 502 } 503 504 static void cpu_mark_online(struct per_dev_info *pdi, unsigned int cpu) 505 { 506 if (cpu >= pdi->cpu_map_max || !pdi->cpu_map) { 507 int new_max = (cpu + CPUS_PER_LONG) & ~(CPUS_PER_LONG - 1); 508 unsigned long *map = malloc(new_max / sizeof(long)); 509 510 memset(map, 0, new_max / sizeof(long)); 511 512 if (pdi->cpu_map) { 513 memcpy(map, pdi->cpu_map, pdi->cpu_map_max / sizeof(long)); 514 free(pdi->cpu_map); 515 } 516 517 pdi->cpu_map = map; 518 pdi->cpu_map_max = new_max; 519 } 520 521 pdi->cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu)); 522 } 523 524 static inline void cpu_mark_offline(struct per_dev_info *pdi, int cpu) 525 { 526 pdi->cpu_map[CPU_IDX(cpu)] &= ~(1UL << CPU_BIT(cpu)); 527 } 528 529 static inline int cpu_is_online(struct per_dev_info *pdi, int cpu) 530 { 531 return (pdi->cpu_map[CPU_IDX(cpu)] & (1UL << CPU_BIT(cpu))) != 0; 532 } 533 534 static inline int ppm_hash_pid(pid_t pid) 535 { 536 return jhash_1word(pid, JHASH_RANDOM) & PPM_HASH_MASK; 537 } 538 539 static struct process_pid_map *find_ppm(pid_t pid) 540 { 541 const int hash_idx = ppm_hash_pid(pid); 542 struct process_pid_map *ppm; 543 544 ppm = ppm_hash_table[hash_idx]; 545 while (ppm) { 546 if (ppm->pid == pid) 547 return ppm; 548 549 ppm = ppm->hash_next; 550 } 551 552 return NULL; 553 } 554 555 static struct process_pid_map *add_ppm_hash(pid_t pid, const char *name) 556 { 557 const int hash_idx = ppm_hash_pid(pid); 558 struct process_pid_map *ppm; 559 560 ppm = find_ppm(pid); 561 if (!ppm) { 562 ppm = malloc(sizeof(*ppm)); 563 memset(ppm, 0, sizeof(*ppm)); 564 ppm->pid = pid; 565 memset(ppm->comm, 0, sizeof(ppm->comm)); 566 strncpy(ppm->comm, name, sizeof(ppm->comm)); 567 ppm->comm[sizeof(ppm->comm) - 1] = '\0'; 568 ppm->hash_next = ppm_hash_table[hash_idx]; 569 ppm_hash_table[hash_idx] = ppm; 570 } 571 572 return ppm; 573 } 574 575 static void handle_notify(struct blk_io_trace *bit) 576 { 577 void *payload = (caddr_t) bit + sizeof(*bit); 578 __u32 two32[2]; 579 580 switch (bit->action) { 581 case BLK_TN_PROCESS: 582 add_ppm_hash(bit->pid, payload); 583 break; 584 585 case BLK_TN_TIMESTAMP: 586 if (bit->pdu_len != sizeof(two32)) 587 return; 588 memcpy(two32, payload, sizeof(two32)); 589 if (!data_is_native) { 590 two32[0] = be32_to_cpu(two32[0]); 591 two32[1] = be32_to_cpu(two32[1]); 592 } 593 start_timestamp = bit->time; 594 abs_start_time.tv_sec = two32[0]; 595 abs_start_time.tv_nsec = two32[1]; 596 if (abs_start_time.tv_nsec < 0) { 597 abs_start_time.tv_sec--; 598 abs_start_time.tv_nsec += 1000000000; 599 } 600 601 break; 602 603 case BLK_TN_MESSAGE: 604 if (bit->pdu_len > 0) { 605 char msg[bit->pdu_len+1]; 606 607 memcpy(msg, (char *)payload, bit->pdu_len); 608 msg[bit->pdu_len] = '\0'; 609 610 fprintf(ofp, 611 "%3d,%-3d %2d %8s %5d.%09lu %5u %2s %3s %s\n", 612 MAJOR(bit->device), MINOR(bit->device), 613 bit->cpu, "0", (int) SECONDS(bit->time), 614 (unsigned long) NANO_SECONDS(bit->time), 615 0, "m", "N", msg); 616 } 617 break; 618 619 default: 620 /* Ignore unknown notify events */ 621 ; 622 } 623 } 624 625 char *find_process_name(pid_t pid) 626 { 627 struct process_pid_map *ppm = find_ppm(pid); 628 629 if (ppm) 630 return ppm->comm; 631 632 return NULL; 633 } 634 635 static inline int ppi_hash_pid(pid_t pid) 636 { 637 return jhash_1word(pid, JHASH_RANDOM) & PPI_HASH_MASK; 638 } 639 640 static inline int ppi_hash_name(const char *name) 641 { 642 return jhash(name, 16, JHASH_RANDOM) & PPI_HASH_MASK; 643 } 644 645 static inline int ppi_hash(struct per_process_info *ppi) 646 { 647 struct process_pid_map *ppm = ppi->ppm; 648 649 if (ppi_hash_by_pid) 650 return ppi_hash_pid(ppm->pid); 651 652 return ppi_hash_name(ppm->comm); 653 } 654 655 static inline void add_ppi_to_hash(struct per_process_info *ppi) 656 { 657 const int hash_idx = ppi_hash(ppi); 658 659 ppi->hash_next = ppi_hash_table[hash_idx]; 660 ppi_hash_table[hash_idx] = ppi; 661 } 662 663 static inline void add_ppi_to_list(struct per_process_info *ppi) 664 { 665 ppi->list_next = ppi_list; 666 ppi_list = ppi; 667 ppi_list_entries++; 668 } 669 670 static struct per_process_info *find_ppi_by_name(char *name) 671 { 672 const int hash_idx = ppi_hash_name(name); 673 struct per_process_info *ppi; 674 675 ppi = ppi_hash_table[hash_idx]; 676 while (ppi) { 677 struct process_pid_map *ppm = ppi->ppm; 678 679 if (!strcmp(ppm->comm, name)) 680 return ppi; 681 682 ppi = ppi->hash_next; 683 } 684 685 return NULL; 686 } 687 688 static struct per_process_info *find_ppi_by_pid(pid_t pid) 689 { 690 const int hash_idx = ppi_hash_pid(pid); 691 struct per_process_info *ppi; 692 693 ppi = ppi_hash_table[hash_idx]; 694 while (ppi) { 695 struct process_pid_map *ppm = ppi->ppm; 696 697 if (ppm->pid == pid) 698 return ppi; 699 700 ppi = ppi->hash_next; 701 } 702 703 return NULL; 704 } 705 706 static struct per_process_info *find_ppi(pid_t pid) 707 { 708 struct per_process_info *ppi; 709 char *name; 710 711 if (ppi_hash_by_pid) 712 return find_ppi_by_pid(pid); 713 714 name = find_process_name(pid); 715 if (!name) 716 return NULL; 717 718 ppi = find_ppi_by_name(name); 719 if (ppi && ppi->ppm->pid != pid) 720 ppi->more_than_one = 1; 721 722 return ppi; 723 } 724 725 /* 726 * struct trace and blktrace allocation cache, we do potentially 727 * millions of mallocs for these structures while only using at most 728 * a few thousand at the time 729 */ 730 static inline void t_free(struct trace *t) 731 { 732 if (t_alloc_cache < 1024) { 733 t->next = t_alloc_list; 734 t_alloc_list = t; 735 t_alloc_cache++; 736 } else 737 free(t); 738 } 739 740 static inline struct trace *t_alloc(void) 741 { 742 struct trace *t = t_alloc_list; 743 744 if (t) { 745 t_alloc_list = t->next; 746 t_alloc_cache--; 747 return t; 748 } 749 750 return malloc(sizeof(*t)); 751 } 752 753 static inline void bit_free(struct blk_io_trace *bit) 754 { 755 if (bit_alloc_cache < 1024 && !bit->pdu_len) { 756 /* 757 * abuse a 64-bit field for a next pointer for the free item 758 */ 759 bit->time = (__u64) (unsigned long) bit_alloc_list; 760 bit_alloc_list = (struct blk_io_trace *) bit; 761 bit_alloc_cache++; 762 } else 763 free(bit); 764 } 765 766 static inline struct blk_io_trace *bit_alloc(void) 767 { 768 struct blk_io_trace *bit = bit_alloc_list; 769 770 if (bit) { 771 bit_alloc_list = (struct blk_io_trace *) (unsigned long) \ 772 bit->time; 773 bit_alloc_cache--; 774 return bit; 775 } 776 777 return malloc(sizeof(*bit)); 778 } 779 780 static inline void __put_trace_last(struct per_dev_info *pdi, struct trace *t) 781 { 782 struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu); 783 784 rb_erase(&t->rb_node, &pci->rb_last); 785 pci->rb_last_entries--; 786 787 bit_free(t->bit); 788 t_free(t); 789 } 790 791 static void put_trace(struct per_dev_info *pdi, struct trace *t) 792 { 793 rb_erase(&t->rb_node, &rb_sort_root); 794 rb_sort_entries--; 795 796 trace_rb_insert_last(pdi, t); 797 } 798 799 static inline int trace_rb_insert(struct trace *t, struct rb_root *root) 800 { 801 struct rb_node **p = &root->rb_node; 802 struct rb_node *parent = NULL; 803 struct trace *__t; 804 805 while (*p) { 806 parent = *p; 807 808 __t = rb_entry(parent, struct trace, rb_node); 809 810 if (t->bit->time < __t->bit->time) 811 p = &(*p)->rb_left; 812 else if (t->bit->time > __t->bit->time) 813 p = &(*p)->rb_right; 814 else if (t->bit->device < __t->bit->device) 815 p = &(*p)->rb_left; 816 else if (t->bit->device > __t->bit->device) 817 p = &(*p)->rb_right; 818 else if (t->bit->sequence < __t->bit->sequence) 819 p = &(*p)->rb_left; 820 else /* >= sequence */ 821 p = &(*p)->rb_right; 822 } 823 824 rb_link_node(&t->rb_node, parent, p); 825 rb_insert_color(&t->rb_node, root); 826 return 0; 827 } 828 829 static inline int trace_rb_insert_sort(struct trace *t) 830 { 831 if (!trace_rb_insert(t, &rb_sort_root)) { 832 rb_sort_entries++; 833 return 0; 834 } 835 836 return 1; 837 } 838 839 static int trace_rb_insert_last(struct per_dev_info *pdi, struct trace *t) 840 { 841 struct per_cpu_info *pci = get_cpu_info(pdi, t->bit->cpu); 842 843 if (trace_rb_insert(t, &pci->rb_last)) 844 return 1; 845 846 pci->rb_last_entries++; 847 848 if (pci->rb_last_entries > rb_batch * pdi->nfiles) { 849 struct rb_node *n = rb_first(&pci->rb_last); 850 851 t = rb_entry(n, struct trace, rb_node); 852 __put_trace_last(pdi, t); 853 } 854 855 return 0; 856 } 857 858 static struct trace *trace_rb_find(dev_t device, unsigned long sequence, 859 struct rb_root *root, int order) 860 { 861 struct rb_node *n = root->rb_node; 862 struct rb_node *prev = NULL; 863 struct trace *__t; 864 865 while (n) { 866 __t = rb_entry(n, struct trace, rb_node); 867 prev = n; 868 869 if (device < __t->bit->device) 870 n = n->rb_left; 871 else if (device > __t->bit->device) 872 n = n->rb_right; 873 else if (sequence < __t->bit->sequence) 874 n = n->rb_left; 875 else if (sequence > __t->bit->sequence) 876 n = n->rb_right; 877 else 878 return __t; 879 } 880 881 /* 882 * hack - the list may not be sequence ordered because some 883 * events don't have sequence and time matched. so we end up 884 * being a little off in the rb lookup here, because we don't 885 * know the time we are looking for. compensate by browsing 886 * a little ahead from the last entry to find the match 887 */ 888 if (order && prev) { 889 int max = 5; 890 891 while (((n = rb_next(prev)) != NULL) && max--) { 892 __t = rb_entry(n, struct trace, rb_node); 893 894 if (__t->bit->device == device && 895 __t->bit->sequence == sequence) 896 return __t; 897 898 prev = n; 899 } 900 } 901 902 return NULL; 903 } 904 905 static inline struct trace *trace_rb_find_last(struct per_dev_info *pdi, 906 struct per_cpu_info *pci, 907 unsigned long seq) 908 { 909 return trace_rb_find(pdi->dev, seq, &pci->rb_last, 0); 910 } 911 912 static inline int track_rb_insert(struct per_dev_info *pdi,struct io_track *iot) 913 { 914 struct rb_node **p = &pdi->rb_track.rb_node; 915 struct rb_node *parent = NULL; 916 struct io_track *__iot; 917 918 while (*p) { 919 parent = *p; 920 __iot = rb_entry(parent, struct io_track, rb_node); 921 922 if (iot->sector < __iot->sector) 923 p = &(*p)->rb_left; 924 else if (iot->sector > __iot->sector) 925 p = &(*p)->rb_right; 926 else { 927 fprintf(stderr, 928 "sector alias (%Lu) on device %d,%d!\n", 929 (unsigned long long) iot->sector, 930 MAJOR(pdi->dev), MINOR(pdi->dev)); 931 return 1; 932 } 933 } 934 935 rb_link_node(&iot->rb_node, parent, p); 936 rb_insert_color(&iot->rb_node, &pdi->rb_track); 937 return 0; 938 } 939 940 static struct io_track *__find_track(struct per_dev_info *pdi, __u64 sector) 941 { 942 struct rb_node *n = pdi->rb_track.rb_node; 943 struct io_track *__iot; 944 945 while (n) { 946 __iot = rb_entry(n, struct io_track, rb_node); 947 948 if (sector < __iot->sector) 949 n = n->rb_left; 950 else if (sector > __iot->sector) 951 n = n->rb_right; 952 else 953 return __iot; 954 } 955 956 return NULL; 957 } 958 959 static struct io_track *find_track(struct per_dev_info *pdi, pid_t pid, 960 __u64 sector) 961 { 962 struct io_track *iot; 963 964 iot = __find_track(pdi, sector); 965 if (!iot) { 966 iot = malloc(sizeof(*iot)); 967 iot->ppm = find_ppm(pid); 968 if (!iot->ppm) 969 iot->ppm = add_ppm_hash(pid, "unknown"); 970 iot->sector = sector; 971 track_rb_insert(pdi, iot); 972 } 973 974 return iot; 975 } 976 977 static void log_track_frontmerge(struct per_dev_info *pdi, 978 struct blk_io_trace *t) 979 { 980 struct io_track *iot; 981 982 if (!track_ios) 983 return; 984 985 iot = __find_track(pdi, t->sector + t_sec(t)); 986 if (!iot) { 987 if (verbose) 988 fprintf(stderr, "merge not found for (%d,%d): %llu\n", 989 MAJOR(pdi->dev), MINOR(pdi->dev), 990 (unsigned long long) t->sector + t_sec(t)); 991 return; 992 } 993 994 rb_erase(&iot->rb_node, &pdi->rb_track); 995 iot->sector -= t_sec(t); 996 track_rb_insert(pdi, iot); 997 } 998 999 static void log_track_getrq(struct per_dev_info *pdi, struct blk_io_trace *t) 1000 { 1001 struct io_track *iot; 1002 1003 if (!track_ios) 1004 return; 1005 1006 iot = find_track(pdi, t->pid, t->sector); 1007 iot->allocation_time = t->time; 1008 } 1009 1010 static inline int is_remapper(struct per_dev_info *pdi) 1011 { 1012 int major = MAJOR(pdi->dev); 1013 1014 return (major == 253 || major == 9); 1015 } 1016 1017 /* 1018 * for md/dm setups, the interesting cycle is Q -> C. So track queueing 1019 * time here, as dispatch time 1020 */ 1021 static void log_track_queue(struct per_dev_info *pdi, struct blk_io_trace *t) 1022 { 1023 struct io_track *iot; 1024 1025 if (!track_ios) 1026 return; 1027 if (!is_remapper(pdi)) 1028 return; 1029 1030 iot = find_track(pdi, t->pid, t->sector); 1031 iot->dispatch_time = t->time; 1032 } 1033 1034 /* 1035 * return time between rq allocation and insertion 1036 */ 1037 static unsigned long long log_track_insert(struct per_dev_info *pdi, 1038 struct blk_io_trace *t) 1039 { 1040 unsigned long long elapsed; 1041 struct io_track *iot; 1042 1043 if (!track_ios) 1044 return -1; 1045 1046 iot = find_track(pdi, t->pid, t->sector); 1047 iot->queue_time = t->time; 1048 1049 if (!iot->allocation_time) 1050 return -1; 1051 1052 elapsed = iot->queue_time - iot->allocation_time; 1053 1054 if (per_process_stats) { 1055 struct per_process_info *ppi = find_ppi(iot->ppm->pid); 1056 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; 1057 1058 if (ppi && elapsed > ppi->longest_allocation_wait[w]) 1059 ppi->longest_allocation_wait[w] = elapsed; 1060 } 1061 1062 return elapsed; 1063 } 1064 1065 /* 1066 * return time between queue and issue 1067 */ 1068 static unsigned long long log_track_issue(struct per_dev_info *pdi, 1069 struct blk_io_trace *t) 1070 { 1071 unsigned long long elapsed; 1072 struct io_track *iot; 1073 1074 if (!track_ios) 1075 return -1; 1076 if ((t->action & BLK_TC_ACT(BLK_TC_FS)) == 0) 1077 return -1; 1078 1079 iot = __find_track(pdi, t->sector); 1080 if (!iot) { 1081 if (verbose) 1082 fprintf(stderr, "issue not found for (%d,%d): %llu\n", 1083 MAJOR(pdi->dev), MINOR(pdi->dev), 1084 (unsigned long long) t->sector); 1085 return -1; 1086 } 1087 1088 iot->dispatch_time = t->time; 1089 elapsed = iot->dispatch_time - iot->queue_time; 1090 1091 if (per_process_stats) { 1092 struct per_process_info *ppi = find_ppi(iot->ppm->pid); 1093 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; 1094 1095 if (ppi && elapsed > ppi->longest_dispatch_wait[w]) 1096 ppi->longest_dispatch_wait[w] = elapsed; 1097 } 1098 1099 return elapsed; 1100 } 1101 1102 /* 1103 * return time between dispatch and complete 1104 */ 1105 static unsigned long long log_track_complete(struct per_dev_info *pdi, 1106 struct blk_io_trace *t) 1107 { 1108 unsigned long long elapsed; 1109 struct io_track *iot; 1110 1111 if (!track_ios) 1112 return -1; 1113 1114 iot = __find_track(pdi, t->sector); 1115 if (!iot) { 1116 if (verbose) 1117 fprintf(stderr,"complete not found for (%d,%d): %llu\n", 1118 MAJOR(pdi->dev), MINOR(pdi->dev), 1119 (unsigned long long) t->sector); 1120 return -1; 1121 } 1122 1123 iot->completion_time = t->time; 1124 elapsed = iot->completion_time - iot->dispatch_time; 1125 1126 if (per_process_stats) { 1127 struct per_process_info *ppi = find_ppi(iot->ppm->pid); 1128 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; 1129 1130 if (ppi && elapsed > ppi->longest_completion_wait[w]) 1131 ppi->longest_completion_wait[w] = elapsed; 1132 } 1133 1134 /* 1135 * kill the trace, we don't need it after completion 1136 */ 1137 rb_erase(&iot->rb_node, &pdi->rb_track); 1138 free(iot); 1139 1140 return elapsed; 1141 } 1142 1143 1144 static struct io_stats *find_process_io_stats(pid_t pid) 1145 { 1146 struct per_process_info *ppi = find_ppi(pid); 1147 1148 if (!ppi) { 1149 ppi = malloc(sizeof(*ppi)); 1150 memset(ppi, 0, sizeof(*ppi)); 1151 ppi->ppm = find_ppm(pid); 1152 if (!ppi->ppm) 1153 ppi->ppm = add_ppm_hash(pid, "unknown"); 1154 add_ppi_to_hash(ppi); 1155 add_ppi_to_list(ppi); 1156 } 1157 1158 return &ppi->io_stats; 1159 } 1160 1161 static char *get_dev_name(struct per_dev_info *pdi, char *buffer, int size) 1162 { 1163 if (pdi->name) 1164 snprintf(buffer, size, "%s", pdi->name); 1165 else 1166 snprintf(buffer, size, "%d,%d",MAJOR(pdi->dev),MINOR(pdi->dev)); 1167 return buffer; 1168 } 1169 1170 static void check_time(struct per_dev_info *pdi, struct blk_io_trace *bit) 1171 { 1172 unsigned long long this = bit->time; 1173 unsigned long long last = pdi->last_reported_time; 1174 1175 pdi->backwards = (this < last) ? 'B' : ' '; 1176 pdi->last_reported_time = this; 1177 } 1178 1179 static inline void __account_m(struct io_stats *ios, struct blk_io_trace *t, 1180 int rw) 1181 { 1182 if (rw) { 1183 ios->mwrites++; 1184 ios->mwrite_kb += t_kb(t); 1185 ios->mwrite_b += t_b(t); 1186 } else { 1187 ios->mreads++; 1188 ios->mread_kb += t_kb(t); 1189 ios->mread_b += t_b(t); 1190 } 1191 } 1192 1193 static inline void account_m(struct blk_io_trace *t, struct per_cpu_info *pci, 1194 int rw) 1195 { 1196 __account_m(&pci->io_stats, t, rw); 1197 1198 if (per_process_stats) { 1199 struct io_stats *ios = find_process_io_stats(t->pid); 1200 1201 __account_m(ios, t, rw); 1202 } 1203 } 1204 1205 static inline void __account_pc_queue(struct io_stats *ios, 1206 struct blk_io_trace *t, int rw) 1207 { 1208 if (rw) { 1209 ios->qwrites_pc++; 1210 ios->qwrite_kb_pc += t_kb(t); 1211 ios->qwrite_b_pc += t_b(t); 1212 } else { 1213 ios->qreads_pc++; 1214 ios->qread_kb += t_kb(t); 1215 ios->qread_b_pc += t_b(t); 1216 } 1217 } 1218 1219 static inline void account_pc_queue(struct blk_io_trace *t, 1220 struct per_cpu_info *pci, int rw) 1221 { 1222 __account_pc_queue(&pci->io_stats, t, rw); 1223 1224 if (per_process_stats) { 1225 struct io_stats *ios = find_process_io_stats(t->pid); 1226 1227 __account_pc_queue(ios, t, rw); 1228 } 1229 } 1230 1231 static inline void __account_pc_issue(struct io_stats *ios, int rw, 1232 unsigned int bytes) 1233 { 1234 if (rw) { 1235 ios->iwrites_pc++; 1236 ios->iwrite_kb_pc += bytes >> 10; 1237 ios->iwrite_b_pc += bytes & 1023; 1238 } else { 1239 ios->ireads_pc++; 1240 ios->iread_kb_pc += bytes >> 10; 1241 ios->iread_b_pc += bytes & 1023; 1242 } 1243 } 1244 1245 static inline void account_pc_issue(struct blk_io_trace *t, 1246 struct per_cpu_info *pci, int rw) 1247 { 1248 __account_pc_issue(&pci->io_stats, rw, t->bytes); 1249 1250 if (per_process_stats) { 1251 struct io_stats *ios = find_process_io_stats(t->pid); 1252 1253 __account_pc_issue(ios, rw, t->bytes); 1254 } 1255 } 1256 1257 static inline void __account_pc_requeue(struct io_stats *ios, 1258 struct blk_io_trace *t, int rw) 1259 { 1260 if (rw) { 1261 ios->wrqueue_pc++; 1262 ios->iwrite_kb_pc -= t_kb(t); 1263 ios->iwrite_b_pc -= t_b(t); 1264 } else { 1265 ios->rrqueue_pc++; 1266 ios->iread_kb_pc -= t_kb(t); 1267 ios->iread_b_pc -= t_b(t); 1268 } 1269 } 1270 1271 static inline void account_pc_requeue(struct blk_io_trace *t, 1272 struct per_cpu_info *pci, int rw) 1273 { 1274 __account_pc_requeue(&pci->io_stats, t, rw); 1275 1276 if (per_process_stats) { 1277 struct io_stats *ios = find_process_io_stats(t->pid); 1278 1279 __account_pc_requeue(ios, t, rw); 1280 } 1281 } 1282 1283 static inline void __account_pc_c(struct io_stats *ios, int rw) 1284 { 1285 if (rw) 1286 ios->cwrites_pc++; 1287 else 1288 ios->creads_pc++; 1289 } 1290 1291 static inline void account_pc_c(struct blk_io_trace *t, 1292 struct per_cpu_info *pci, int rw) 1293 { 1294 __account_pc_c(&pci->io_stats, rw); 1295 1296 if (per_process_stats) { 1297 struct io_stats *ios = find_process_io_stats(t->pid); 1298 1299 __account_pc_c(ios, rw); 1300 } 1301 } 1302 1303 static inline void __account_queue(struct io_stats *ios, struct blk_io_trace *t, 1304 int rw) 1305 { 1306 if (rw) { 1307 ios->qwrites++; 1308 ios->qwrite_kb += t_kb(t); 1309 ios->qwrite_b += t_b(t); 1310 } else { 1311 ios->qreads++; 1312 ios->qread_kb += t_kb(t); 1313 ios->qread_b += t_b(t); 1314 } 1315 } 1316 1317 static inline void account_queue(struct blk_io_trace *t, 1318 struct per_cpu_info *pci, int rw) 1319 { 1320 __account_queue(&pci->io_stats, t, rw); 1321 1322 if (per_process_stats) { 1323 struct io_stats *ios = find_process_io_stats(t->pid); 1324 1325 __account_queue(ios, t, rw); 1326 } 1327 } 1328 1329 static inline void __account_c(struct io_stats *ios, int rw, int bytes) 1330 { 1331 if (rw) { 1332 ios->cwrites++; 1333 ios->cwrite_kb += bytes >> 10; 1334 ios->cwrite_b += bytes & 1023; 1335 } else { 1336 ios->creads++; 1337 ios->cread_kb += bytes >> 10; 1338 ios->cread_b += bytes & 1023; 1339 } 1340 } 1341 1342 static inline void account_c(struct blk_io_trace *t, struct per_cpu_info *pci, 1343 int rw, int bytes) 1344 { 1345 __account_c(&pci->io_stats, rw, bytes); 1346 1347 if (per_process_stats) { 1348 struct io_stats *ios = find_process_io_stats(t->pid); 1349 1350 __account_c(ios, rw, bytes); 1351 } 1352 } 1353 1354 static inline void __account_issue(struct io_stats *ios, int rw, 1355 unsigned int bytes) 1356 { 1357 if (rw) { 1358 ios->iwrites++; 1359 ios->iwrite_kb += bytes >> 10; 1360 ios->iwrite_b += bytes & 1023; 1361 } else { 1362 ios->ireads++; 1363 ios->iread_kb += bytes >> 10; 1364 ios->iread_b += bytes & 1023; 1365 } 1366 } 1367 1368 static inline void account_issue(struct blk_io_trace *t, 1369 struct per_cpu_info *pci, int rw) 1370 { 1371 __account_issue(&pci->io_stats, rw, t->bytes); 1372 1373 if (per_process_stats) { 1374 struct io_stats *ios = find_process_io_stats(t->pid); 1375 1376 __account_issue(ios, rw, t->bytes); 1377 } 1378 } 1379 1380 static inline void __account_unplug(struct io_stats *ios, int timer) 1381 { 1382 if (timer) 1383 ios->timer_unplugs++; 1384 else 1385 ios->io_unplugs++; 1386 } 1387 1388 static inline void account_unplug(struct blk_io_trace *t, 1389 struct per_cpu_info *pci, int timer) 1390 { 1391 __account_unplug(&pci->io_stats, timer); 1392 1393 if (per_process_stats) { 1394 struct io_stats *ios = find_process_io_stats(t->pid); 1395 1396 __account_unplug(ios, timer); 1397 } 1398 } 1399 1400 static inline void __account_requeue(struct io_stats *ios, 1401 struct blk_io_trace *t, int rw) 1402 { 1403 if (rw) { 1404 ios->wrqueue++; 1405 ios->iwrite_kb -= t_kb(t); 1406 ios->iwrite_b -= t_b(t); 1407 } else { 1408 ios->rrqueue++; 1409 ios->iread_kb -= t_kb(t); 1410 ios->iread_b -= t_b(t); 1411 } 1412 } 1413 1414 static inline void account_requeue(struct blk_io_trace *t, 1415 struct per_cpu_info *pci, int rw) 1416 { 1417 __account_requeue(&pci->io_stats, t, rw); 1418 1419 if (per_process_stats) { 1420 struct io_stats *ios = find_process_io_stats(t->pid); 1421 1422 __account_requeue(ios, t, rw); 1423 } 1424 } 1425 1426 static void log_complete(struct per_dev_info *pdi, struct per_cpu_info *pci, 1427 struct blk_io_trace *t, char *act) 1428 { 1429 process_fmt(act, pci, t, log_track_complete(pdi, t), 0, NULL); 1430 } 1431 1432 static void log_insert(struct per_dev_info *pdi, struct per_cpu_info *pci, 1433 struct blk_io_trace *t, char *act) 1434 { 1435 process_fmt(act, pci, t, log_track_insert(pdi, t), 0, NULL); 1436 } 1437 1438 static void log_queue(struct per_cpu_info *pci, struct blk_io_trace *t, 1439 char *act) 1440 { 1441 process_fmt(act, pci, t, -1, 0, NULL); 1442 } 1443 1444 static void log_issue(struct per_dev_info *pdi, struct per_cpu_info *pci, 1445 struct blk_io_trace *t, char *act) 1446 { 1447 process_fmt(act, pci, t, log_track_issue(pdi, t), 0, NULL); 1448 } 1449 1450 static void log_merge(struct per_dev_info *pdi, struct per_cpu_info *pci, 1451 struct blk_io_trace *t, char *act) 1452 { 1453 if (act[0] == 'F') 1454 log_track_frontmerge(pdi, t); 1455 1456 process_fmt(act, pci, t, -1ULL, 0, NULL); 1457 } 1458 1459 static void log_action(struct per_cpu_info *pci, struct blk_io_trace *t, 1460 char *act) 1461 { 1462 process_fmt(act, pci, t, -1ULL, 0, NULL); 1463 } 1464 1465 static void log_generic(struct per_cpu_info *pci, struct blk_io_trace *t, 1466 char *act) 1467 { 1468 process_fmt(act, pci, t, -1ULL, 0, NULL); 1469 } 1470 1471 static void log_unplug(struct per_cpu_info *pci, struct blk_io_trace *t, 1472 char *act) 1473 { 1474 process_fmt(act, pci, t, -1ULL, 0, NULL); 1475 } 1476 1477 static void log_split(struct per_cpu_info *pci, struct blk_io_trace *t, 1478 char *act) 1479 { 1480 process_fmt(act, pci, t, -1ULL, 0, NULL); 1481 } 1482 1483 static void log_pc(struct per_cpu_info *pci, struct blk_io_trace *t, char *act) 1484 { 1485 unsigned char *buf = (unsigned char *) t + sizeof(*t); 1486 1487 process_fmt(act, pci, t, -1ULL, t->pdu_len, buf); 1488 } 1489 1490 static void dump_trace_pc(struct blk_io_trace *t, struct per_dev_info *pdi, 1491 struct per_cpu_info *pci) 1492 { 1493 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; 1494 int act = t->action & 0xffff; 1495 1496 switch (act) { 1497 case __BLK_TA_QUEUE: 1498 log_generic(pci, t, "Q"); 1499 account_pc_queue(t, pci, w); 1500 break; 1501 case __BLK_TA_GETRQ: 1502 log_generic(pci, t, "G"); 1503 break; 1504 case __BLK_TA_SLEEPRQ: 1505 log_generic(pci, t, "S"); 1506 break; 1507 case __BLK_TA_REQUEUE: 1508 /* 1509 * can happen if we miss traces, don't let it go 1510 * below zero 1511 */ 1512 if (pdi->cur_depth[w]) 1513 pdi->cur_depth[w]--; 1514 account_pc_requeue(t, pci, w); 1515 log_generic(pci, t, "R"); 1516 break; 1517 case __BLK_TA_ISSUE: 1518 account_pc_issue(t, pci, w); 1519 pdi->cur_depth[w]++; 1520 if (pdi->cur_depth[w] > pdi->max_depth[w]) 1521 pdi->max_depth[w] = pdi->cur_depth[w]; 1522 log_pc(pci, t, "D"); 1523 break; 1524 case __BLK_TA_COMPLETE: 1525 if (pdi->cur_depth[w]) 1526 pdi->cur_depth[w]--; 1527 log_pc(pci, t, "C"); 1528 account_pc_c(t, pci, w); 1529 break; 1530 case __BLK_TA_INSERT: 1531 log_pc(pci, t, "I"); 1532 break; 1533 default: 1534 fprintf(stderr, "Bad pc action %x\n", act); 1535 break; 1536 } 1537 } 1538 1539 static void dump_trace_fs(struct blk_io_trace *t, struct per_dev_info *pdi, 1540 struct per_cpu_info *pci) 1541 { 1542 int w = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0; 1543 int act = t->action & 0xffff; 1544 1545 switch (act) { 1546 case __BLK_TA_QUEUE: 1547 log_track_queue(pdi, t); 1548 account_queue(t, pci, w); 1549 log_queue(pci, t, "Q"); 1550 break; 1551 case __BLK_TA_INSERT: 1552 log_insert(pdi, pci, t, "I"); 1553 break; 1554 case __BLK_TA_BACKMERGE: 1555 account_m(t, pci, w); 1556 log_merge(pdi, pci, t, "M"); 1557 break; 1558 case __BLK_TA_FRONTMERGE: 1559 account_m(t, pci, w); 1560 log_merge(pdi, pci, t, "F"); 1561 break; 1562 case __BLK_TA_GETRQ: 1563 log_track_getrq(pdi, t); 1564 log_generic(pci, t, "G"); 1565 break; 1566 case __BLK_TA_SLEEPRQ: 1567 log_generic(pci, t, "S"); 1568 break; 1569 case __BLK_TA_REQUEUE: 1570 /* 1571 * can happen if we miss traces, don't let it go 1572 * below zero 1573 */ 1574 if (pdi->cur_depth[w]) 1575 pdi->cur_depth[w]--; 1576 account_requeue(t, pci, w); 1577 log_queue(pci, t, "R"); 1578 break; 1579 case __BLK_TA_ISSUE: 1580 account_issue(t, pci, w); 1581 pdi->cur_depth[w]++; 1582 if (pdi->cur_depth[w] > pdi->max_depth[w]) 1583 pdi->max_depth[w] = pdi->cur_depth[w]; 1584 log_issue(pdi, pci, t, "D"); 1585 break; 1586 case __BLK_TA_COMPLETE: 1587 if (pdi->cur_depth[w]) 1588 pdi->cur_depth[w]--; 1589 account_c(t, pci, w, t->bytes); 1590 log_complete(pdi, pci, t, "C"); 1591 break; 1592 case __BLK_TA_PLUG: 1593 log_action(pci, t, "P"); 1594 break; 1595 case __BLK_TA_UNPLUG_IO: 1596 account_unplug(t, pci, 0); 1597 log_unplug(pci, t, "U"); 1598 break; 1599 case __BLK_TA_UNPLUG_TIMER: 1600 account_unplug(t, pci, 1); 1601 log_unplug(pci, t, "UT"); 1602 break; 1603 case __BLK_TA_SPLIT: 1604 log_split(pci, t, "X"); 1605 break; 1606 case __BLK_TA_BOUNCE: 1607 log_generic(pci, t, "B"); 1608 break; 1609 case __BLK_TA_REMAP: 1610 log_generic(pci, t, "A"); 1611 break; 1612 case __BLK_TA_DRV_DATA: 1613 have_drv_data = 1; 1614 /* dump to binary file only */ 1615 break; 1616 default: 1617 fprintf(stderr, "Bad fs action %x\n", t->action); 1618 break; 1619 } 1620 } 1621 1622 static void dump_trace(struct blk_io_trace *t, struct per_cpu_info *pci, 1623 struct per_dev_info *pdi) 1624 { 1625 if (text_output) { 1626 if (t->action == BLK_TN_MESSAGE) 1627 handle_notify(t); 1628 else if (t->action & BLK_TC_ACT(BLK_TC_PC)) 1629 dump_trace_pc(t, pdi, pci); 1630 else 1631 dump_trace_fs(t, pdi, pci); 1632 } 1633 1634 if (!pdi->events) 1635 pdi->first_reported_time = t->time; 1636 1637 pdi->events++; 1638 1639 if (bin_output_msgs || 1640 !(t->action & BLK_TC_ACT(BLK_TC_NOTIFY) && 1641 t->action == BLK_TN_MESSAGE)) 1642 output_binary(t, sizeof(*t) + t->pdu_len); 1643 } 1644 1645 /* 1646 * print in a proper way, not too small and not too big. if more than 1647 * 1000,000K, turn into M and so on 1648 */ 1649 static char *size_cnv(char *dst, unsigned long long num, int in_kb) 1650 { 1651 char suff[] = { '\0', 'K', 'M', 'G', 'P' }; 1652 unsigned int i = 0; 1653 1654 if (in_kb) 1655 i++; 1656 1657 while (num > 1000 * 1000ULL && (i < sizeof(suff) - 1)) { 1658 i++; 1659 num /= 1000; 1660 } 1661 1662 sprintf(dst, "%'8Lu%c", num, suff[i]); 1663 return dst; 1664 } 1665 1666 static void dump_io_stats(struct per_dev_info *pdi, struct io_stats *ios, 1667 char *msg) 1668 { 1669 static char x[256], y[256]; 1670 1671 fprintf(ofp, "%s\n", msg); 1672 1673 fprintf(ofp, " Reads Queued: %s, %siB\t", 1674 size_cnv(x, ios->qreads, 0), 1675 size_cnv(y, ios->qread_kb + (ios->qread_b>>10), 1)); 1676 fprintf(ofp, " Writes Queued: %s, %siB\n", 1677 size_cnv(x, ios->qwrites, 0), 1678 size_cnv(y, ios->qwrite_kb + (ios->qwrite_b>>10), 1)); 1679 fprintf(ofp, " Read Dispatches: %s, %siB\t", 1680 size_cnv(x, ios->ireads, 0), 1681 size_cnv(y, ios->iread_kb + (ios->iread_b>>10), 1)); 1682 fprintf(ofp, " Write Dispatches: %s, %siB\n", 1683 size_cnv(x, ios->iwrites, 0), 1684 size_cnv(y, ios->iwrite_kb + (ios->iwrite_b>>10), 1)); 1685 fprintf(ofp, " Reads Requeued: %s\t\t", size_cnv(x, ios->rrqueue, 0)); 1686 fprintf(ofp, " Writes Requeued: %s\n", size_cnv(x, ios->wrqueue, 0)); 1687 fprintf(ofp, " Reads Completed: %s, %siB\t", 1688 size_cnv(x, ios->creads, 0), 1689 size_cnv(y, ios->cread_kb + (ios->cread_b>>10), 1)); 1690 fprintf(ofp, " Writes Completed: %s, %siB\n", 1691 size_cnv(x, ios->cwrites, 0), 1692 size_cnv(y, ios->cwrite_kb + (ios->cwrite_b>>10), 1)); 1693 fprintf(ofp, " Read Merges: %s, %siB\t", 1694 size_cnv(x, ios->mreads, 0), 1695 size_cnv(y, ios->mread_kb + (ios->mread_b>>10), 1)); 1696 fprintf(ofp, " Write Merges: %s, %siB\n", 1697 size_cnv(x, ios->mwrites, 0), 1698 size_cnv(y, ios->mwrite_kb + (ios->mwrite_b>>10), 1)); 1699 if (pdi) { 1700 fprintf(ofp, " Read depth: %'8u%8c\t", pdi->max_depth[0], ' '); 1701 fprintf(ofp, " Write depth: %'8u\n", pdi->max_depth[1]); 1702 } 1703 if (ios->qreads_pc || ios->qwrites_pc || ios->ireads_pc || ios->iwrites_pc || 1704 ios->rrqueue_pc || ios->wrqueue_pc || ios->creads_pc || ios->cwrites_pc) { 1705 fprintf(ofp, " PC Reads Queued: %s, %siB\t", 1706 size_cnv(x, ios->qreads_pc, 0), 1707 size_cnv(y, 1708 ios->qread_kb_pc + (ios->qread_b_pc>>10), 1)); 1709 fprintf(ofp, " PC Writes Queued: %s, %siB\n", 1710 size_cnv(x, ios->qwrites_pc, 0), 1711 size_cnv(y, 1712 ios->qwrite_kb_pc + (ios->qwrite_b_pc>>10), 1)); 1713 fprintf(ofp, " PC Read Disp.: %s, %siB\t", 1714 size_cnv(x, ios->ireads_pc, 0), 1715 size_cnv(y, 1716 ios->iread_kb_pc + (ios->iread_b_pc>>10), 1)); 1717 fprintf(ofp, " PC Write Disp.: %s, %siB\n", 1718 size_cnv(x, ios->iwrites_pc, 0), 1719 size_cnv(y, 1720 ios->iwrite_kb_pc + (ios->iwrite_b_pc>>10), 1721 1)); 1722 fprintf(ofp, " PC Reads Req.: %s\t\t", size_cnv(x, ios->rrqueue_pc, 0)); 1723 fprintf(ofp, " PC Writes Req.: %s\n", size_cnv(x, ios->wrqueue_pc, 0)); 1724 fprintf(ofp, " PC Reads Compl.: %s\t\t", size_cnv(x, ios->creads_pc, 0)); 1725 fprintf(ofp, " PC Writes Compl.: %s\n", size_cnv(x, ios->cwrites_pc, 0)); 1726 } 1727 fprintf(ofp, " IO unplugs: %'8lu%8c\t", ios->io_unplugs, ' '); 1728 fprintf(ofp, " Timer unplugs: %'8lu\n", ios->timer_unplugs); 1729 } 1730 1731 static void dump_wait_stats(struct per_process_info *ppi) 1732 { 1733 unsigned long rawait = ppi->longest_allocation_wait[0] / 1000; 1734 unsigned long rdwait = ppi->longest_dispatch_wait[0] / 1000; 1735 unsigned long rcwait = ppi->longest_completion_wait[0] / 1000; 1736 unsigned long wawait = ppi->longest_allocation_wait[1] / 1000; 1737 unsigned long wdwait = ppi->longest_dispatch_wait[1] / 1000; 1738 unsigned long wcwait = ppi->longest_completion_wait[1] / 1000; 1739 1740 fprintf(ofp, " Allocation wait: %'8lu%8c\t", rawait, ' '); 1741 fprintf(ofp, " Allocation wait: %'8lu\n", wawait); 1742 fprintf(ofp, " Dispatch wait: %'8lu%8c\t", rdwait, ' '); 1743 fprintf(ofp, " Dispatch wait: %'8lu\n", wdwait); 1744 fprintf(ofp, " Completion wait: %'8lu%8c\t", rcwait, ' '); 1745 fprintf(ofp, " Completion wait: %'8lu\n", wcwait); 1746 } 1747 1748 static int ppi_name_compare(const void *p1, const void *p2) 1749 { 1750 struct per_process_info *ppi1 = *((struct per_process_info **) p1); 1751 struct per_process_info *ppi2 = *((struct per_process_info **) p2); 1752 int res; 1753 1754 res = strverscmp(ppi1->ppm->comm, ppi2->ppm->comm); 1755 if (!res) 1756 res = ppi1->ppm->pid > ppi2->ppm->pid; 1757 1758 return res; 1759 } 1760 1761 static void sort_process_list(void) 1762 { 1763 struct per_process_info **ppis; 1764 struct per_process_info *ppi; 1765 int i = 0; 1766 1767 ppis = malloc(ppi_list_entries * sizeof(struct per_process_info *)); 1768 1769 ppi = ppi_list; 1770 while (ppi) { 1771 ppis[i++] = ppi; 1772 ppi = ppi->list_next; 1773 } 1774 1775 qsort(ppis, ppi_list_entries, sizeof(ppi), ppi_name_compare); 1776 1777 i = ppi_list_entries - 1; 1778 ppi_list = NULL; 1779 while (i >= 0) { 1780 ppi = ppis[i]; 1781 1782 ppi->list_next = ppi_list; 1783 ppi_list = ppi; 1784 i--; 1785 } 1786 1787 free(ppis); 1788 } 1789 1790 static void show_process_stats(void) 1791 { 1792 struct per_process_info *ppi; 1793 1794 sort_process_list(); 1795 1796 ppi = ppi_list; 1797 while (ppi) { 1798 struct process_pid_map *ppm = ppi->ppm; 1799 char name[64]; 1800 1801 if (ppi->more_than_one) 1802 sprintf(name, "%s (%u, ...)", ppm->comm, ppm->pid); 1803 else 1804 sprintf(name, "%s (%u)", ppm->comm, ppm->pid); 1805 1806 dump_io_stats(NULL, &ppi->io_stats, name); 1807 dump_wait_stats(ppi); 1808 ppi = ppi->list_next; 1809 } 1810 1811 fprintf(ofp, "\n"); 1812 } 1813 1814 static void show_device_and_cpu_stats(void) 1815 { 1816 struct per_dev_info *pdi; 1817 struct per_cpu_info *pci; 1818 struct io_stats total, *ios; 1819 unsigned long long rrate, wrate, msec; 1820 int i, j, pci_events; 1821 char line[3 + 8/*cpu*/ + 2 + 32/*dev*/ + 3]; 1822 char name[32]; 1823 double ratio; 1824 1825 for (pdi = devices, i = 0; i < ndevices; i++, pdi++) { 1826 1827 memset(&total, 0, sizeof(total)); 1828 pci_events = 0; 1829 1830 if (i > 0) 1831 fprintf(ofp, "\n"); 1832 1833 for (pci = pdi->cpus, j = 0; j < pdi->ncpus; j++, pci++) { 1834 if (!pci->nelems) 1835 continue; 1836 1837 ios = &pci->io_stats; 1838 total.qreads += ios->qreads; 1839 total.qwrites += ios->qwrites; 1840 total.creads += ios->creads; 1841 total.cwrites += ios->cwrites; 1842 total.mreads += ios->mreads; 1843 total.mwrites += ios->mwrites; 1844 total.ireads += ios->ireads; 1845 total.iwrites += ios->iwrites; 1846 total.rrqueue += ios->rrqueue; 1847 total.wrqueue += ios->wrqueue; 1848 total.qread_kb += ios->qread_kb; 1849 total.qwrite_kb += ios->qwrite_kb; 1850 total.cread_kb += ios->cread_kb; 1851 total.cwrite_kb += ios->cwrite_kb; 1852 total.iread_kb += ios->iread_kb; 1853 total.iwrite_kb += ios->iwrite_kb; 1854 total.mread_kb += ios->mread_kb; 1855 total.mwrite_kb += ios->mwrite_kb; 1856 total.qread_b += ios->qread_b; 1857 total.qwrite_b += ios->qwrite_b; 1858 total.cread_b += ios->cread_b; 1859 total.cwrite_b += ios->cwrite_b; 1860 total.iread_b += ios->iread_b; 1861 total.iwrite_b += ios->iwrite_b; 1862 total.mread_b += ios->mread_b; 1863 total.mwrite_b += ios->mwrite_b; 1864 1865 total.qreads_pc += ios->qreads_pc; 1866 total.qwrites_pc += ios->qwrites_pc; 1867 total.creads_pc += ios->creads_pc; 1868 total.cwrites_pc += ios->cwrites_pc; 1869 total.ireads_pc += ios->ireads_pc; 1870 total.iwrites_pc += ios->iwrites_pc; 1871 total.rrqueue_pc += ios->rrqueue_pc; 1872 total.wrqueue_pc += ios->wrqueue_pc; 1873 total.qread_kb_pc += ios->qread_kb_pc; 1874 total.qwrite_kb_pc += ios->qwrite_kb_pc; 1875 total.iread_kb_pc += ios->iread_kb_pc; 1876 total.iwrite_kb_pc += ios->iwrite_kb_pc; 1877 total.qread_b_pc += ios->qread_b_pc; 1878 total.qwrite_b_pc += ios->qwrite_b_pc; 1879 total.iread_b_pc += ios->iread_b_pc; 1880 total.iwrite_b_pc += ios->iwrite_b_pc; 1881 1882 total.timer_unplugs += ios->timer_unplugs; 1883 total.io_unplugs += ios->io_unplugs; 1884 1885 snprintf(line, sizeof(line) - 1, "CPU%d (%s):", 1886 j, get_dev_name(pdi, name, sizeof(name))); 1887 dump_io_stats(pdi, ios, line); 1888 pci_events++; 1889 } 1890 1891 if (pci_events > 1) { 1892 fprintf(ofp, "\n"); 1893 snprintf(line, sizeof(line) - 1, "Total (%s):", 1894 get_dev_name(pdi, name, sizeof(name))); 1895 dump_io_stats(NULL, &total, line); 1896 } 1897 1898 wrate = rrate = 0; 1899 msec = (pdi->last_reported_time - pdi->first_reported_time) / 1000000; 1900 if (msec) { 1901 rrate = ((1000 * total.cread_kb) + total.cread_b) / 1902 msec; 1903 wrate = ((1000 * total.cwrite_kb) + total.cwrite_b) / 1904 msec; 1905 } 1906 1907 fprintf(ofp, "\nThroughput (R/W): %'LuKiB/s / %'LuKiB/s\n", 1908 rrate, wrate); 1909 fprintf(ofp, "Events (%s): %'Lu entries\n", 1910 get_dev_name(pdi, line, sizeof(line)), pdi->events); 1911 1912 collect_pdi_skips(pdi); 1913 if (!pdi->skips && !pdi->events) 1914 ratio = 0.0; 1915 else 1916 ratio = 100.0 * ((double)pdi->seq_skips / 1917 (double)(pdi->events + pdi->seq_skips)); 1918 fprintf(ofp, "Skips: %'lu forward (%'llu - %5.1lf%%)\n", 1919 pdi->skips, pdi->seq_skips, ratio); 1920 } 1921 } 1922 1923 static void find_genesis(void) 1924 { 1925 struct trace *t = trace_list; 1926 1927 genesis_time = -1ULL; 1928 while (t != NULL) { 1929 if (t->bit->time < genesis_time) 1930 genesis_time = t->bit->time; 1931 1932 t = t->next; 1933 } 1934 1935 /* The time stamp record will usually be the first 1936 * record in the trace, but not always. 1937 */ 1938 if (start_timestamp 1939 && start_timestamp != genesis_time) { 1940 long delta = genesis_time - start_timestamp; 1941 1942 abs_start_time.tv_sec += SECONDS(delta); 1943 abs_start_time.tv_nsec += NANO_SECONDS(delta); 1944 if (abs_start_time.tv_nsec < 0) { 1945 abs_start_time.tv_nsec += 1000000000; 1946 abs_start_time.tv_sec -= 1; 1947 } else 1948 if (abs_start_time.tv_nsec > 1000000000) { 1949 abs_start_time.tv_nsec -= 1000000000; 1950 abs_start_time.tv_sec += 1; 1951 } 1952 } 1953 } 1954 1955 static inline int check_stopwatch(struct blk_io_trace *bit) 1956 { 1957 if (bit->time < stopwatch_end && 1958 bit->time >= stopwatch_start) 1959 return 0; 1960 1961 return 1; 1962 } 1963 1964 /* 1965 * return youngest entry read 1966 */ 1967 static int sort_entries(unsigned long long *youngest) 1968 { 1969 struct per_dev_info *pdi = NULL; 1970 struct per_cpu_info *pci = NULL; 1971 struct trace *t; 1972 1973 if (!genesis_time) 1974 find_genesis(); 1975 1976 *youngest = 0; 1977 while ((t = trace_list) != NULL) { 1978 struct blk_io_trace *bit = t->bit; 1979 1980 trace_list = t->next; 1981 1982 bit->time -= genesis_time; 1983 1984 if (bit->time < *youngest || !*youngest) 1985 *youngest = bit->time; 1986 1987 if (!pdi || pdi->dev != bit->device) { 1988 pdi = get_dev_info(bit->device); 1989 pci = NULL; 1990 } 1991 1992 if (!pci || pci->cpu != bit->cpu) 1993 pci = get_cpu_info(pdi, bit->cpu); 1994 1995 if (bit->sequence < pci->smallest_seq_read) 1996 pci->smallest_seq_read = bit->sequence; 1997 1998 if (check_stopwatch(bit)) { 1999 bit_free(bit); 2000 t_free(t); 2001 continue; 2002 } 2003 2004 if (trace_rb_insert_sort(t)) 2005 return -1; 2006 } 2007 2008 return 0; 2009 } 2010 2011 /* 2012 * to continue, we must have traces from all online cpus in the tree 2013 */ 2014 static int check_cpu_map(struct per_dev_info *pdi) 2015 { 2016 unsigned long *cpu_map; 2017 struct rb_node *n; 2018 struct trace *__t; 2019 unsigned int i; 2020 int ret, cpu; 2021 2022 /* 2023 * create a map of the cpus we have traces for 2024 */ 2025 cpu_map = malloc(pdi->cpu_map_max / sizeof(long)); 2026 memset(cpu_map, 0, sizeof(*cpu_map)); 2027 n = rb_first(&rb_sort_root); 2028 while (n) { 2029 __t = rb_entry(n, struct trace, rb_node); 2030 cpu = __t->bit->cpu; 2031 2032 cpu_map[CPU_IDX(cpu)] |= (1UL << CPU_BIT(cpu)); 2033 n = rb_next(n); 2034 } 2035 2036 /* 2037 * we can't continue if pdi->cpu_map has entries set that we don't 2038 * have in the sort rbtree. the opposite is not a problem, though 2039 */ 2040 ret = 0; 2041 for (i = 0; i < pdi->cpu_map_max / CPUS_PER_LONG; i++) { 2042 if (pdi->cpu_map[i] & ~(cpu_map[i])) { 2043 ret = 1; 2044 break; 2045 } 2046 } 2047 2048 free(cpu_map); 2049 return ret; 2050 } 2051 2052 static int check_sequence(struct per_dev_info *pdi, struct trace *t, int force) 2053 { 2054 struct blk_io_trace *bit = t->bit; 2055 unsigned long expected_sequence; 2056 struct per_cpu_info *pci; 2057 struct trace *__t; 2058 2059 pci = get_cpu_info(pdi, bit->cpu); 2060 expected_sequence = pci->last_sequence + 1; 2061 2062 if (!expected_sequence) { 2063 /* 2064 * 1 should be the first entry, just allow it 2065 */ 2066 if (bit->sequence == 1) 2067 return 0; 2068 if (bit->sequence == pci->smallest_seq_read) 2069 return 0; 2070 2071 return check_cpu_map(pdi); 2072 } 2073 2074 if (bit->sequence == expected_sequence) 2075 return 0; 2076 2077 /* 2078 * we may not have seen that sequence yet. if we are not doing 2079 * the final run, break and wait for more entries. 2080 */ 2081 if (expected_sequence < pci->smallest_seq_read) { 2082 __t = trace_rb_find_last(pdi, pci, expected_sequence); 2083 if (!__t) 2084 goto skip; 2085 2086 __put_trace_last(pdi, __t); 2087 return 0; 2088 } else if (!force) { 2089 return 1; 2090 } else { 2091 skip: 2092 if (check_current_skips(pci, bit->sequence)) 2093 return 0; 2094 2095 if (expected_sequence < bit->sequence) 2096 insert_skip(pci, expected_sequence, bit->sequence - 1); 2097 return 0; 2098 } 2099 } 2100 2101 static void show_entries_rb(int force) 2102 { 2103 struct per_dev_info *pdi = NULL; 2104 struct per_cpu_info *pci = NULL; 2105 struct blk_io_trace *bit; 2106 struct rb_node *n; 2107 struct trace *t; 2108 2109 while ((n = rb_first(&rb_sort_root)) != NULL) { 2110 if (is_done() && !force && !pipeline) 2111 break; 2112 2113 t = rb_entry(n, struct trace, rb_node); 2114 bit = t->bit; 2115 2116 if (read_sequence - t->read_sequence < 1 && !force) 2117 break; 2118 2119 if (!pdi || pdi->dev != bit->device) { 2120 pdi = get_dev_info(bit->device); 2121 pci = NULL; 2122 } 2123 2124 if (!pdi) { 2125 fprintf(stderr, "Unknown device ID? (%d,%d)\n", 2126 MAJOR(bit->device), MINOR(bit->device)); 2127 break; 2128 } 2129 2130 if (!(bit->action == BLK_TN_MESSAGE) && 2131 check_sequence(pdi, t, force)) 2132 break; 2133 2134 if (!force && bit->time > last_allowed_time) 2135 break; 2136 2137 check_time(pdi, bit); 2138 2139 if (!pci || pci->cpu != bit->cpu) 2140 pci = get_cpu_info(pdi, bit->cpu); 2141 2142 if (!(bit->action == BLK_TN_MESSAGE)) 2143 pci->last_sequence = bit->sequence; 2144 2145 pci->nelems++; 2146 2147 if (bit->action & (act_mask << BLK_TC_SHIFT)) 2148 dump_trace(bit, pci, pdi); 2149 2150 put_trace(pdi, t); 2151 } 2152 } 2153 2154 static int read_data(int fd, void *buffer, int bytes, int block, int *fdblock) 2155 { 2156 int ret, bytes_left, fl; 2157 void *p; 2158 2159 if (block != *fdblock) { 2160 fl = fcntl(fd, F_GETFL); 2161 2162 if (!block) { 2163 *fdblock = 0; 2164 fcntl(fd, F_SETFL, fl | O_NONBLOCK); 2165 } else { 2166 *fdblock = 1; 2167 fcntl(fd, F_SETFL, fl & ~O_NONBLOCK); 2168 } 2169 } 2170 2171 bytes_left = bytes; 2172 p = buffer; 2173 while (bytes_left > 0) { 2174 ret = read(fd, p, bytes_left); 2175 if (!ret) 2176 return 1; 2177 else if (ret < 0) { 2178 if (errno != EAGAIN) { 2179 perror("read"); 2180 return -1; 2181 } 2182 2183 /* 2184 * never do partial reads. we can return if we 2185 * didn't read anything and we should not block, 2186 * otherwise wait for data 2187 */ 2188 if ((bytes_left == bytes) && !block) 2189 return 1; 2190 2191 usleep(10); 2192 continue; 2193 } else { 2194 p += ret; 2195 bytes_left -= ret; 2196 } 2197 } 2198 2199 return 0; 2200 } 2201 2202 static inline __u16 get_pdulen(struct blk_io_trace *bit) 2203 { 2204 if (data_is_native) 2205 return bit->pdu_len; 2206 2207 return __bswap_16(bit->pdu_len); 2208 } 2209 2210 static inline __u32 get_magic(struct blk_io_trace *bit) 2211 { 2212 if (data_is_native) 2213 return bit->magic; 2214 2215 return __bswap_32(bit->magic); 2216 } 2217 2218 static int read_events(int fd, int always_block, int *fdblock) 2219 { 2220 struct per_dev_info *pdi = NULL; 2221 unsigned int events = 0; 2222 2223 while (!is_done() && events < rb_batch) { 2224 struct blk_io_trace *bit; 2225 struct trace *t; 2226 int pdu_len, should_block, ret; 2227 __u32 magic; 2228 2229 bit = bit_alloc(); 2230 2231 should_block = !events || always_block; 2232 2233 ret = read_data(fd, bit, sizeof(*bit), should_block, fdblock); 2234 if (ret) { 2235 bit_free(bit); 2236 if (!events && ret < 0) 2237 events = ret; 2238 break; 2239 } 2240 2241 /* 2242 * look at first trace to check whether we need to convert 2243 * data in the future 2244 */ 2245 if (data_is_native == -1 && check_data_endianness(bit->magic)) 2246 break; 2247 2248 magic = get_magic(bit); 2249 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) { 2250 fprintf(stderr, "Bad magic %x\n", magic); 2251 break; 2252 } 2253 2254 pdu_len = get_pdulen(bit); 2255 if (pdu_len) { 2256 void *ptr = realloc(bit, sizeof(*bit) + pdu_len); 2257 2258 if (read_data(fd, ptr + sizeof(*bit), pdu_len, 1, fdblock)) { 2259 bit_free(ptr); 2260 break; 2261 } 2262 2263 bit = ptr; 2264 } 2265 2266 trace_to_cpu(bit); 2267 2268 if (verify_trace(bit)) { 2269 bit_free(bit); 2270 continue; 2271 } 2272 2273 /* 2274 * not a real trace, so grab and handle it here 2275 */ 2276 if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) { 2277 handle_notify(bit); 2278 output_binary(bit, sizeof(*bit) + bit->pdu_len); 2279 continue; 2280 } 2281 2282 t = t_alloc(); 2283 memset(t, 0, sizeof(*t)); 2284 t->bit = bit; 2285 t->read_sequence = read_sequence; 2286 2287 t->next = trace_list; 2288 trace_list = t; 2289 2290 if (!pdi || pdi->dev != bit->device) 2291 pdi = get_dev_info(bit->device); 2292 2293 if (bit->time > pdi->last_read_time) 2294 pdi->last_read_time = bit->time; 2295 2296 events++; 2297 } 2298 2299 return events; 2300 } 2301 2302 /* 2303 * Managing input streams 2304 */ 2305 2306 struct ms_stream { 2307 struct ms_stream *next; 2308 struct trace *first, *last; 2309 struct per_dev_info *pdi; 2310 unsigned int cpu; 2311 }; 2312 2313 #define MS_HASH(d, c) ((MAJOR(d) & 0xff) ^ (MINOR(d) & 0xff) ^ (cpu & 0xff)) 2314 2315 struct ms_stream *ms_head; 2316 struct ms_stream *ms_hash[256]; 2317 2318 static void ms_sort(struct ms_stream *msp); 2319 static int ms_prime(struct ms_stream *msp); 2320 2321 static inline struct trace *ms_peek(struct ms_stream *msp) 2322 { 2323 return (msp == NULL) ? NULL : msp->first; 2324 } 2325 2326 static inline __u64 ms_peek_time(struct ms_stream *msp) 2327 { 2328 return ms_peek(msp)->bit->time; 2329 } 2330 2331 static inline void ms_resort(struct ms_stream *msp) 2332 { 2333 if (msp->next && ms_peek_time(msp) > ms_peek_time(msp->next)) { 2334 ms_head = msp->next; 2335 msp->next = NULL; 2336 ms_sort(msp); 2337 } 2338 } 2339 2340 static inline void ms_deq(struct ms_stream *msp) 2341 { 2342 msp->first = msp->first->next; 2343 if (!msp->first) { 2344 msp->last = NULL; 2345 if (!ms_prime(msp)) { 2346 ms_head = msp->next; 2347 msp->next = NULL; 2348 return; 2349 } 2350 } 2351 2352 ms_resort(msp); 2353 } 2354 2355 static void ms_sort(struct ms_stream *msp) 2356 { 2357 __u64 msp_t = ms_peek_time(msp); 2358 struct ms_stream *this_msp = ms_head; 2359 2360 if (this_msp == NULL) 2361 ms_head = msp; 2362 else if (msp_t < ms_peek_time(this_msp)) { 2363 msp->next = this_msp; 2364 ms_head = msp; 2365 } 2366 else { 2367 while (this_msp->next && ms_peek_time(this_msp->next) < msp_t) 2368 this_msp = this_msp->next; 2369 2370 msp->next = this_msp->next; 2371 this_msp->next = msp; 2372 } 2373 } 2374 2375 static int ms_prime(struct ms_stream *msp) 2376 { 2377 __u32 magic; 2378 unsigned int i; 2379 struct trace *t; 2380 struct per_dev_info *pdi = msp->pdi; 2381 struct per_cpu_info *pci = get_cpu_info(pdi, msp->cpu); 2382 struct blk_io_trace *bit = NULL; 2383 int ret, pdu_len, ndone = 0; 2384 2385 for (i = 0; !is_done() && pci->fd >= 0 && i < rb_batch; i++) { 2386 bit = bit_alloc(); 2387 ret = read_data(pci->fd, bit, sizeof(*bit), 1, &pci->fdblock); 2388 if (ret) 2389 goto err; 2390 2391 if (data_is_native == -1 && check_data_endianness(bit->magic)) 2392 goto err; 2393 2394 magic = get_magic(bit); 2395 if ((magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) { 2396 fprintf(stderr, "Bad magic %x\n", magic); 2397 goto err; 2398 2399 } 2400 2401 pdu_len = get_pdulen(bit); 2402 if (pdu_len) { 2403 void *ptr = realloc(bit, sizeof(*bit) + pdu_len); 2404 ret = read_data(pci->fd, ptr + sizeof(*bit), pdu_len, 2405 1, &pci->fdblock); 2406 if (ret) { 2407 free(ptr); 2408 bit = NULL; 2409 goto err; 2410 } 2411 2412 bit = ptr; 2413 } 2414 2415 trace_to_cpu(bit); 2416 if (verify_trace(bit)) 2417 goto err; 2418 2419 if (bit->cpu != pci->cpu) { 2420 fprintf(stderr, "cpu %d trace info has error cpu %d\n", 2421 pci->cpu, bit->cpu); 2422 continue; 2423 } 2424 2425 if (bit->action & BLK_TC_ACT(BLK_TC_NOTIFY) && bit->action != BLK_TN_MESSAGE) { 2426 handle_notify(bit); 2427 output_binary(bit, sizeof(*bit) + bit->pdu_len); 2428 bit_free(bit); 2429 2430 i -= 1; 2431 continue; 2432 } 2433 2434 if (bit->time > pdi->last_read_time) 2435 pdi->last_read_time = bit->time; 2436 2437 t = t_alloc(); 2438 memset(t, 0, sizeof(*t)); 2439 t->bit = bit; 2440 2441 if (msp->first == NULL) 2442 msp->first = msp->last = t; 2443 else { 2444 msp->last->next = t; 2445 msp->last = t; 2446 } 2447 2448 ndone++; 2449 } 2450 2451 return ndone; 2452 2453 err: 2454 if (bit) bit_free(bit); 2455 2456 cpu_mark_offline(pdi, pci->cpu); 2457 close(pci->fd); 2458 pci->fd = -1; 2459 2460 return ndone; 2461 } 2462 2463 static struct ms_stream *ms_alloc(struct per_dev_info *pdi, int cpu) 2464 { 2465 struct ms_stream *msp = malloc(sizeof(*msp)); 2466 2467 msp->next = NULL; 2468 msp->first = msp->last = NULL; 2469 msp->pdi = pdi; 2470 msp->cpu = cpu; 2471 2472 if (ms_prime(msp)) 2473 ms_sort(msp); 2474 2475 return msp; 2476 } 2477 2478 static int setup_file(struct per_dev_info *pdi, int cpu) 2479 { 2480 int len = 0; 2481 struct stat st; 2482 char *p, *dname; 2483 struct per_cpu_info *pci = get_cpu_info(pdi, cpu); 2484 2485 pci->cpu = cpu; 2486 pci->fdblock = -1; 2487 2488 p = strdup(pdi->name); 2489 dname = dirname(p); 2490 if (strcmp(dname, ".")) { 2491 input_dir = dname; 2492 p = strdup(pdi->name); 2493 strcpy(pdi->name, basename(p)); 2494 } 2495 free(p); 2496 2497 if (input_dir) 2498 len = sprintf(pci->fname, "%s/", input_dir); 2499 2500 snprintf(pci->fname + len, sizeof(pci->fname)-1-len, 2501 "%s.blktrace.%d", pdi->name, pci->cpu); 2502 if (stat(pci->fname, &st) < 0) 2503 return 0; 2504 if (!st.st_size) 2505 return 1; 2506 2507 pci->fd = open(pci->fname, O_RDONLY); 2508 if (pci->fd < 0) { 2509 perror(pci->fname); 2510 return 0; 2511 } 2512 2513 printf("Input file %s added\n", pci->fname); 2514 cpu_mark_online(pdi, pci->cpu); 2515 2516 pdi->nfiles++; 2517 ms_alloc(pdi, pci->cpu); 2518 2519 return 1; 2520 } 2521 2522 static int handle(struct ms_stream *msp) 2523 { 2524 struct trace *t; 2525 struct per_dev_info *pdi; 2526 struct per_cpu_info *pci; 2527 struct blk_io_trace *bit; 2528 2529 t = ms_peek(msp); 2530 2531 bit = t->bit; 2532 pdi = msp->pdi; 2533 pci = get_cpu_info(pdi, msp->cpu); 2534 pci->nelems++; 2535 bit->time -= genesis_time; 2536 2537 if (t->bit->time > stopwatch_end) 2538 return 0; 2539 2540 pdi->last_reported_time = bit->time; 2541 if ((bit->action & (act_mask << BLK_TC_SHIFT))&& 2542 t->bit->time >= stopwatch_start) 2543 dump_trace(bit, pci, pdi); 2544 2545 ms_deq(msp); 2546 2547 if (text_output) 2548 trace_rb_insert_last(pdi, t); 2549 else { 2550 bit_free(t->bit); 2551 t_free(t); 2552 } 2553 2554 return 1; 2555 } 2556 2557 /* 2558 * Check if we need to sanitize the name. We allow 'foo', or if foo.blktrace.X 2559 * is given, then strip back down to 'foo' to avoid missing files. 2560 */ 2561 static int name_fixup(char *name) 2562 { 2563 char *b; 2564 2565 if (!name) 2566 return 1; 2567 2568 b = strstr(name, ".blktrace."); 2569 if (b) 2570 *b = '\0'; 2571 2572 return 0; 2573 } 2574 2575 static int do_file(void) 2576 { 2577 int i, cpu, ret; 2578 struct per_dev_info *pdi; 2579 2580 /* 2581 * first prepare all files for reading 2582 */ 2583 for (i = 0; i < ndevices; i++) { 2584 pdi = &devices[i]; 2585 ret = name_fixup(pdi->name); 2586 if (ret) 2587 return ret; 2588 2589 for (cpu = 0; setup_file(pdi, cpu); cpu++) 2590 ; 2591 2592 if (!cpu) { 2593 fprintf(stderr,"No input files found for %s\n", 2594 pdi->name); 2595 return 1; 2596 } 2597 } 2598 2599 /* 2600 * Get the initial time stamp 2601 */ 2602 if (ms_head) 2603 genesis_time = ms_peek_time(ms_head); 2604 2605 /* 2606 * Keep processing traces while any are left 2607 */ 2608 while (!is_done() && ms_head && handle(ms_head)) 2609 ; 2610 2611 return 0; 2612 } 2613 2614 static void do_pipe(int fd) 2615 { 2616 unsigned long long youngest; 2617 int events, fdblock; 2618 2619 last_allowed_time = -1ULL; 2620 fdblock = -1; 2621 while ((events = read_events(fd, 0, &fdblock)) > 0) { 2622 read_sequence++; 2623 2624 #if 0 2625 smallest_seq_read = -1U; 2626 #endif 2627 2628 if (sort_entries(&youngest)) 2629 break; 2630 2631 if (youngest > stopwatch_end) 2632 break; 2633 2634 show_entries_rb(0); 2635 } 2636 2637 if (rb_sort_entries) 2638 show_entries_rb(1); 2639 } 2640 2641 static int do_fifo(void) 2642 { 2643 int fd; 2644 2645 if (!strcmp(pipename, "-")) 2646 fd = dup(STDIN_FILENO); 2647 else 2648 fd = open(pipename, O_RDONLY); 2649 2650 if (fd == -1) { 2651 perror("dup stdin"); 2652 return -1; 2653 } 2654 2655 do_pipe(fd); 2656 close(fd); 2657 return 0; 2658 } 2659 2660 static void show_stats(void) 2661 { 2662 if (!ofp) 2663 return; 2664 if (stats_printed) 2665 return; 2666 2667 stats_printed = 1; 2668 2669 if (per_process_stats) 2670 show_process_stats(); 2671 2672 if (per_device_and_cpu_stats) 2673 show_device_and_cpu_stats(); 2674 2675 fflush(ofp); 2676 } 2677 2678 static void handle_sigint(__attribute__((__unused__)) int sig) 2679 { 2680 done = 1; 2681 } 2682 2683 /* 2684 * Extract start and duration times from a string, allowing 2685 * us to specify a time interval of interest within a trace. 2686 * Format: "duration" (start is zero) or "start:duration". 2687 */ 2688 static int find_stopwatch_interval(char *string) 2689 { 2690 double value; 2691 char *sp; 2692 2693 value = strtod(string, &sp); 2694 if (sp == string) { 2695 fprintf(stderr,"Invalid stopwatch timer: %s\n", string); 2696 return 1; 2697 } 2698 if (*sp == ':') { 2699 stopwatch_start = DOUBLE_TO_NANO_ULL(value); 2700 string = sp + 1; 2701 value = strtod(string, &sp); 2702 if (sp == string || *sp != '\0') { 2703 fprintf(stderr,"Invalid stopwatch duration time: %s\n", 2704 string); 2705 return 1; 2706 } 2707 } else if (*sp != '\0') { 2708 fprintf(stderr,"Invalid stopwatch start timer: %s\n", string); 2709 return 1; 2710 } 2711 stopwatch_end = DOUBLE_TO_NANO_ULL(value); 2712 if (stopwatch_end <= stopwatch_start) { 2713 fprintf(stderr, "Invalid stopwatch interval: %Lu -> %Lu\n", 2714 stopwatch_start, stopwatch_end); 2715 return 1; 2716 } 2717 2718 return 0; 2719 } 2720 2721 static int is_pipe(const char *str) 2722 { 2723 struct stat st; 2724 2725 if (!strcmp(str, "-")) 2726 return 1; 2727 if (!stat(str, &st) && S_ISFIFO(st.st_mode)) 2728 return 1; 2729 2730 return 0; 2731 } 2732 2733 #define S_OPTS "a:A:b:D:d:f:F:hi:o:Oqstw:vVM" 2734 static char usage_str[] = "\n\n" \ 2735 "-i <file> | --input=<file>\n" \ 2736 "[ -a <action field> | --act-mask=<action field> ]\n" \ 2737 "[ -A <action mask> | --set-mask=<action mask> ]\n" \ 2738 "[ -b <traces> | --batch=<traces> ]\n" \ 2739 "[ -d <file> | --dump-binary=<file> ]\n" \ 2740 "[ -D <dir> | --input-directory=<dir> ]\n" \ 2741 "[ -f <format> | --format=<format> ]\n" \ 2742 "[ -F <spec> | --format-spec=<spec> ]\n" \ 2743 "[ -h | --hash-by-name ]\n" \ 2744 "[ -o <file> | --output=<file> ]\n" \ 2745 "[ -O | --no-text-output ]\n" \ 2746 "[ -q | --quiet ]\n" \ 2747 "[ -s | --per-program-stats ]\n" \ 2748 "[ -t | --track-ios ]\n" \ 2749 "[ -w <time> | --stopwatch=<time> ]\n" \ 2750 "[ -M | --no-msgs\n" \ 2751 "[ -v | --verbose ]\n" \ 2752 "[ -V | --version ]\n\n" \ 2753 "\t-a Only trace specified actions. See documentation\n" \ 2754 "\t-A Give trace mask as a single value. See documentation\n" \ 2755 "\t-b stdin read batching\n" \ 2756 "\t-d Output file. If specified, binary data is written to file\n" \ 2757 "\t-D Directory to prepend to input file names\n" \ 2758 "\t-f Output format. Customize the output format. The format field\n" \ 2759 "\t identifies can be found in the documentation\n" \ 2760 "\t-F Format specification. Can be found in the documentation\n" \ 2761 "\t-h Hash processes by name, not pid\n" \ 2762 "\t-i Input file containing trace data, or '-' for stdin\n" \ 2763 "\t-o Output file. If not given, output is stdout\n" \ 2764 "\t-O Do NOT output text data\n" \ 2765 "\t-q Quiet. Don't display any stats at the end of the trace\n" \ 2766 "\t-s Show per-program io statistics\n" \ 2767 "\t-t Track individual ios. Will tell you the time a request took\n" \ 2768 "\t to get queued, to get dispatched, and to get completed\n" \ 2769 "\t-w Only parse data between the given time interval in seconds.\n" \ 2770 "\t If 'start' isn't given, blkparse defaults the start time to 0\n" \ 2771 "\t-M Do not output messages to binary file\n" \ 2772 "\t-v More verbose for marginal errors\n" \ 2773 "\t-V Print program version info\n\n"; 2774 2775 static void usage(char *prog) 2776 { 2777 fprintf(stderr, "Usage: %s %s", prog, usage_str); 2778 } 2779 2780 int main(int argc, char *argv[]) 2781 { 2782 int i, c, ret, mode; 2783 int act_mask_tmp = 0; 2784 char *ofp_buffer = NULL; 2785 char *bin_ofp_buffer = NULL; 2786 2787 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) { 2788 switch (c) { 2789 case 'a': 2790 i = find_mask_map(optarg); 2791 if (i < 0) { 2792 fprintf(stderr,"Invalid action mask %s\n", 2793 optarg); 2794 return 1; 2795 } 2796 act_mask_tmp |= i; 2797 break; 2798 2799 case 'A': 2800 if ((sscanf(optarg, "%x", &i) != 1) || 2801 !valid_act_opt(i)) { 2802 fprintf(stderr, 2803 "Invalid set action mask %s/0x%x\n", 2804 optarg, i); 2805 return 1; 2806 } 2807 act_mask_tmp = i; 2808 break; 2809 case 'i': 2810 if (is_pipe(optarg) && !pipeline) { 2811 pipeline = 1; 2812 pipename = strdup(optarg); 2813 } else if (resize_devices(optarg) != 0) 2814 return 1; 2815 break; 2816 case 'D': 2817 input_dir = optarg; 2818 break; 2819 case 'o': 2820 output_name = optarg; 2821 break; 2822 case 'O': 2823 text_output = 0; 2824 break; 2825 case 'b': 2826 rb_batch = atoi(optarg); 2827 if (rb_batch <= 0) 2828 rb_batch = RB_BATCH_DEFAULT; 2829 break; 2830 case 's': 2831 per_process_stats = 1; 2832 break; 2833 case 't': 2834 track_ios = 1; 2835 break; 2836 case 'q': 2837 per_device_and_cpu_stats = 0; 2838 break; 2839 case 'w': 2840 if (find_stopwatch_interval(optarg) != 0) 2841 return 1; 2842 break; 2843 case 'f': 2844 set_all_format_specs(optarg); 2845 break; 2846 case 'F': 2847 if (add_format_spec(optarg) != 0) 2848 return 1; 2849 break; 2850 case 'h': 2851 ppi_hash_by_pid = 0; 2852 break; 2853 case 'v': 2854 verbose++; 2855 break; 2856 case 'V': 2857 printf("%s version %s\n", argv[0], blkparse_version); 2858 return 0; 2859 case 'd': 2860 dump_binary = optarg; 2861 break; 2862 case 'M': 2863 bin_output_msgs = 0; 2864 break; 2865 default: 2866 usage(argv[0]); 2867 return 1; 2868 } 2869 } 2870 2871 while (optind < argc) { 2872 if (is_pipe(argv[optind]) && !pipeline) { 2873 pipeline = 1; 2874 pipename = strdup(argv[optind]); 2875 } else if (resize_devices(argv[optind]) != 0) 2876 return 1; 2877 optind++; 2878 } 2879 2880 if (!pipeline && !ndevices) { 2881 usage(argv[0]); 2882 return 1; 2883 } 2884 2885 if (act_mask_tmp != 0) 2886 act_mask = act_mask_tmp; 2887 2888 memset(&rb_sort_root, 0, sizeof(rb_sort_root)); 2889 2890 signal(SIGINT, handle_sigint); 2891 signal(SIGHUP, handle_sigint); 2892 signal(SIGTERM, handle_sigint); 2893 2894 setlocale(LC_NUMERIC, "en_US"); 2895 2896 if (text_output) { 2897 if (!output_name) { 2898 ofp = fdopen(STDOUT_FILENO, "w"); 2899 mode = _IOLBF; 2900 } else { 2901 char ofname[PATH_MAX]; 2902 2903 snprintf(ofname, sizeof(ofname) - 1, "%s", output_name); 2904 ofp = fopen(ofname, "w"); 2905 mode = _IOFBF; 2906 } 2907 2908 if (!ofp) { 2909 perror("fopen"); 2910 return 1; 2911 } 2912 2913 ofp_buffer = malloc(4096); 2914 if (setvbuf(ofp, ofp_buffer, mode, 4096)) { 2915 perror("setvbuf"); 2916 return 1; 2917 } 2918 } 2919 2920 if (dump_binary) { 2921 if (!strcmp(dump_binary, "-")) 2922 dump_fp = stdout; 2923 else { 2924 dump_fp = fopen(dump_binary, "w"); 2925 if (!dump_fp) { 2926 perror(dump_binary); 2927 dump_binary = NULL; 2928 return 1; 2929 } 2930 } 2931 bin_ofp_buffer = malloc(128 * 1024); 2932 if (setvbuf(dump_fp, bin_ofp_buffer, _IOFBF, 128 * 1024)) { 2933 perror("setvbuf binary"); 2934 return 1; 2935 } 2936 } 2937 2938 if (pipeline) 2939 ret = do_fifo(); 2940 else 2941 ret = do_file(); 2942 2943 if (!ret) 2944 show_stats(); 2945 2946 if (have_drv_data && !dump_binary) 2947 printf("\ndiscarded traces containing low-level device driver " 2948 "specific data (only available in binary output)\n"); 2949 2950 if (ofp_buffer) { 2951 fflush(ofp); 2952 free(ofp_buffer); 2953 } 2954 if (bin_ofp_buffer) { 2955 fflush(dump_fp); 2956 free(bin_ofp_buffer); 2957 } 2958 return ret; 2959 } 2960