1 /* 2 * I/O monitor based on block queue trace data 3 * 4 * Copyright IBM Corp. 2008 5 * 6 * Author(s): Martin Peschke <mp3 (at) de.ibm.com> 7 * 8 * This program is free software; you can redistribute it and/or modify 9 * it under the terms of the GNU General Public License as published by 10 * the Free Software Foundation; either version 2 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 * You should have received a copy of the GNU General Public License 19 * along with this program; if not, write to the Free Software 20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 21 */ 22 23 #include <sys/types.h> 24 #include <sys/stat.h> 25 #include <fcntl.h> 26 #include <unistd.h> 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <string.h> 30 #include <signal.h> 31 #include <getopt.h> 32 #include <errno.h> 33 #include <locale.h> 34 #include <libgen.h> 35 #include <sys/msg.h> 36 #include <pthread.h> 37 #include <time.h> 38 39 #include "blktrace.h" 40 #include "rbtree.h" 41 #include "jhash.h" 42 #include "blkiomon.h" 43 44 struct trace { 45 struct blk_io_trace bit; 46 struct rb_node node; 47 struct trace *next; 48 long sequence; 49 }; 50 51 struct rb_search { 52 struct rb_node **node_ptr; 53 struct rb_node *parent; 54 }; 55 56 struct dstat_msg { 57 long mtype; 58 struct blkiomon_stat stat; 59 }; 60 61 struct dstat { 62 struct dstat_msg msg; 63 struct rb_node node; 64 struct dstat *next; 65 }; 66 67 struct output { 68 char *fn; 69 FILE *fp; 70 char *buf; 71 int pipe; 72 }; 73 74 static char blkiomon_version[] = "0.3"; 75 76 static FILE *ifp; 77 static int interval = -1; 78 79 static struct trace *vacant_traces_list = NULL; 80 static int vacant_traces = 0; 81 82 #define TRACE_HASH_SIZE 128 83 struct trace *thash[TRACE_HASH_SIZE] = {}; 84 85 static struct dstat *vacant_dstats_list = NULL; 86 static struct rb_root dstat_tree[2] = { RB_ROOT, RB_ROOT }; 87 static struct dstat *dstat_list[2] = {}; 88 static int dstat_curr = 0; 89 90 static struct output drvdata, human, binary, debug; 91 92 static char *msg_q_name = NULL; 93 static int msg_q_id = -1, msg_q = -1; 94 static long msg_id = -1; 95 96 static pthread_t interval_thread; 97 static pthread_mutex_t dstat_mutex = PTHREAD_MUTEX_INITIALIZER; 98 99 int data_is_native = -1; 100 101 static int up = 1; 102 103 /* debugging */ 104 static long leftover = 0, driverdata = 0, match = 0, mismatch = 0, sequence = 0; 105 106 static void dump_bit(struct trace *t, const char *descr) 107 { 108 struct blk_io_trace *bit = &t->bit; 109 110 if (!debug.fn) 111 return; 112 113 fprintf(debug.fp, "--- %s ---\n", descr); 114 fprintf(debug.fp, "magic %16d\n", bit->magic); 115 fprintf(debug.fp, "sequence %16d\n", bit->sequence); 116 fprintf(debug.fp, "time %16ld\n", (unsigned long)bit->time); 117 fprintf(debug.fp, "sector %16ld\n", (unsigned long)bit->sector); 118 fprintf(debug.fp, "bytes %16d\n", bit->bytes); 119 fprintf(debug.fp, "action %16x\n", bit->action); 120 fprintf(debug.fp, "pid %16d\n", bit->pid); 121 fprintf(debug.fp, "device %16d\n", bit->device); 122 fprintf(debug.fp, "cpu %16d\n", bit->cpu); 123 fprintf(debug.fp, "error %16d\n", bit->error); 124 fprintf(debug.fp, "pdu_len %16d\n", bit->pdu_len); 125 126 fprintf(debug.fp, "order %16ld\n", t->sequence); 127 } 128 129 static void dump_bits(struct trace *t1, struct trace *t2, const char *descr) 130 { 131 struct blk_io_trace *bit1 = &t1->bit; 132 struct blk_io_trace *bit2 = &t2->bit; 133 134 if (!debug.fn) 135 return; 136 137 fprintf(debug.fp, "--- %s ---\n", descr); 138 fprintf(debug.fp, "magic %16d %16d\n", bit1->magic, bit2->magic); 139 fprintf(debug.fp, "sequence %16d %16d\n", 140 bit1->sequence, bit2->sequence); 141 fprintf(debug.fp, "time %16ld %16ld\n", 142 (unsigned long)bit1->time, (unsigned long)bit2->time); 143 fprintf(debug.fp, "sector %16ld %16ld\n", 144 (unsigned long)bit1->sector, (unsigned long)bit2->sector); 145 fprintf(debug.fp, "bytes %16d %16d\n", bit1->bytes, bit2->bytes); 146 fprintf(debug.fp, "action %16x %16x\n", bit1->action, bit2->action); 147 fprintf(debug.fp, "pid %16d %16d\n", bit1->pid, bit2->pid); 148 fprintf(debug.fp, "device %16d %16d\n", bit1->device, bit2->device); 149 fprintf(debug.fp, "cpu %16d %16d\n", bit1->cpu, bit2->cpu); 150 fprintf(debug.fp, "error %16d %16d\n", bit1->error, bit2->error); 151 fprintf(debug.fp, "pdu_len %16d %16d\n", bit1->pdu_len, bit2->pdu_len); 152 153 fprintf(debug.fp, "order %16ld %16ld\n", t1->sequence, t2->sequence); 154 } 155 156 static struct dstat *blkiomon_alloc_dstat(void) 157 { 158 struct dstat *dstat; 159 160 if (vacant_dstats_list) { 161 dstat = vacant_dstats_list; 162 vacant_dstats_list = dstat->next; 163 } else 164 dstat = malloc(sizeof(*dstat)); 165 if (!dstat) { 166 fprintf(stderr, 167 "blkiomon: could not allocate device statistic"); 168 return NULL; 169 } 170 171 blkiomon_stat_init(&dstat->msg.stat); 172 return dstat; 173 } 174 175 static struct dstat *blkiomon_find_dstat(struct rb_search *search, __u32 device) 176 { 177 struct rb_node **p = &(dstat_tree[dstat_curr].rb_node); 178 struct rb_node *parent = NULL; 179 struct dstat *dstat; 180 181 while (*p) { 182 parent = *p; 183 184 dstat = rb_entry(parent, struct dstat, node); 185 186 if (dstat->msg.stat.device < device) 187 p = &(*p)->rb_left; 188 else if (dstat->msg.stat.device > device) 189 p = &(*p)->rb_right; 190 else 191 return dstat; 192 } 193 search->node_ptr = p; 194 search->parent = parent; 195 return NULL; 196 } 197 198 static struct dstat *blkiomon_get_dstat(__u32 device) 199 { 200 struct dstat *dstat; 201 struct rb_search search; 202 203 pthread_mutex_lock(&dstat_mutex); 204 205 dstat = blkiomon_find_dstat(&search, device); 206 if (dstat) 207 goto out; 208 209 dstat = blkiomon_alloc_dstat(); 210 if (!dstat) 211 goto out; 212 213 dstat->msg.stat.device = device; 214 215 rb_link_node(&dstat->node, search.parent, search.node_ptr); 216 rb_insert_color(&dstat->node, &dstat_tree[dstat_curr]); 217 218 dstat->next = dstat_list[dstat_curr]; 219 dstat_list[dstat_curr] = dstat; 220 221 out: 222 pthread_mutex_unlock(&dstat_mutex); 223 return dstat; 224 } 225 226 static int blkiomon_output_msg_q(struct dstat *dstat) 227 { 228 if (!msg_q_name) 229 return 0; 230 231 dstat->msg.mtype = msg_id; 232 return msgsnd(msg_q, &dstat->msg, sizeof(struct blkiomon_stat), 0); 233 } 234 235 static int blkiomon_output_binary(struct dstat *dstat) 236 { 237 struct blkiomon_stat *p = &dstat->msg.stat; 238 239 if (!binary.fn) 240 return 0; 241 242 if (fwrite(p, sizeof(*p), 1, binary.fp) != 1) 243 goto failed; 244 if (binary.pipe && fflush(binary.fp)) 245 goto failed; 246 return 0; 247 248 failed: 249 fprintf(stderr, "blkiomon: could not write to %s\n", binary.fn); 250 fclose(binary.fp); 251 binary.fn = NULL; 252 return 1; 253 } 254 255 static struct dstat *blkiomon_output(struct dstat *head, struct timespec *ts) 256 { 257 struct dstat *dstat, *tail = NULL; 258 259 for (dstat = head; dstat; dstat = dstat->next) { 260 dstat->msg.stat.time = ts->tv_sec; 261 blkiomon_stat_print(human.fp, &dstat->msg.stat); 262 blkiomon_stat_to_be(&dstat->msg.stat); 263 blkiomon_output_binary(dstat); 264 blkiomon_output_msg_q(dstat); 265 tail = dstat; 266 } 267 return tail; 268 } 269 270 static void *blkiomon_interval(void *data) 271 { 272 struct timespec wake, r; 273 struct dstat *head, *tail; 274 int finished; 275 276 clock_gettime(CLOCK_REALTIME, &wake); 277 278 while (1) { 279 wake.tv_sec += interval; 280 if (clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &wake, &r)) { 281 fprintf(stderr, "blkiomon: interrupted sleep"); 282 continue; 283 } 284 285 /* grab tree and make data gatherer build up another tree */ 286 pthread_mutex_lock(&dstat_mutex); 287 finished = dstat_curr; 288 dstat_curr = dstat_curr ? 0 : 1; 289 pthread_mutex_unlock(&dstat_mutex); 290 291 head = dstat_list[finished]; 292 if (!head) 293 continue; 294 dstat_list[finished] = NULL; 295 dstat_tree[finished] = RB_ROOT; 296 tail = blkiomon_output(head, &wake); 297 298 pthread_mutex_lock(&dstat_mutex); 299 tail->next = vacant_dstats_list; 300 vacant_dstats_list = head; 301 pthread_mutex_unlock(&dstat_mutex); 302 } 303 return data; 304 } 305 306 #define BLK_DATADIR(a) (((a) >> BLK_TC_SHIFT) & (BLK_TC_READ | BLK_TC_WRITE)) 307 308 static int blkiomon_account(struct blk_io_trace *bit_d, 309 struct blk_io_trace *bit_c) 310 { 311 struct dstat *dstat; 312 struct blkiomon_stat *p; 313 __u64 d2c = (bit_c->time - bit_d->time) / 1000; /* ns -> us */ 314 __u32 size = bit_d->bytes; 315 __u64 thrput = size * 1000 / d2c; 316 317 dstat = blkiomon_get_dstat(bit_d->device); 318 if (!dstat) 319 return 1; 320 p = &dstat->msg.stat; 321 322 if (BLK_DATADIR(bit_c->action) & BLK_TC_READ) { 323 minmax_account(&p->thrput_r, thrput); 324 minmax_account(&p->size_r, size); 325 minmax_account(&p->d2c_r, d2c); 326 } else if (BLK_DATADIR(bit_c->action) & BLK_TC_WRITE) { 327 minmax_account(&p->thrput_w, thrput); 328 minmax_account(&p->size_w, size); 329 minmax_account(&p->d2c_w, d2c); 330 } else 331 p->bidir++; 332 333 histlog2_account(p->size_hist, size, &size_hist); 334 histlog2_account(p->d2c_hist, d2c, &d2c_hist); 335 return 0; 336 } 337 338 static struct trace *blkiomon_alloc_trace(void) 339 { 340 struct trace *t = vacant_traces_list; 341 if (t) { 342 vacant_traces_list = t->next; 343 vacant_traces--; 344 } else 345 t = malloc(sizeof(*t)); 346 memset(t, 0, sizeof(*t)); 347 return t; 348 } 349 350 static void blkiomon_free_trace(struct trace *t) 351 { 352 if (vacant_traces < 256) { 353 t->next = vacant_traces_list; 354 vacant_traces_list = t; 355 vacant_traces++; 356 } else 357 free(t); 358 } 359 360 static int action(int a) 361 { 362 int bits = BLK_TC_WRITE | BLK_TC_READ | BLK_TC_FS | BLK_TC_PC; 363 return a & (BLK_TC_ACT(bits)); 364 } 365 366 static void blkiomon_store_trace(struct trace *t) 367 { 368 int i = t->bit.sector % TRACE_HASH_SIZE; 369 370 t->next = thash[i]; 371 thash[i] = t; 372 } 373 374 static struct trace *blkiomon_fetch_trace(struct blk_io_trace *bit) 375 { 376 int i = bit->sector % TRACE_HASH_SIZE; 377 struct trace *t, *prev = NULL; 378 379 for (t = thash[i]; t; t = t->next) { 380 if (t->bit.device == bit->device && 381 t->bit.sector == bit->sector && 382 action(t->bit.action) == action(bit->action)) { 383 if (prev) 384 prev->next = t->next; 385 else 386 thash[i] = t->next; 387 return t; 388 } 389 prev = t; 390 } 391 return NULL; 392 } 393 394 static struct trace *blkiomon_do_trace(struct trace *t) 395 { 396 struct trace *t_stored, *t_old, *t_young; 397 398 /* store trace if there is no match yet */ 399 t_stored = blkiomon_fetch_trace(&t->bit); 400 if (!t_stored) { 401 blkiomon_store_trace(t); 402 return blkiomon_alloc_trace(); 403 } 404 405 /* figure out older trace and younger trace */ 406 if (t_stored->bit.time < t->bit.time) { 407 t_old = t_stored; 408 t_young = t; 409 } else { 410 t_old = t; 411 t_young = t_stored; 412 } 413 414 /* we need an older D trace and a younger C trace */ 415 if (t_old->bit.action & BLK_TC_ACT(BLK_TC_ISSUE) && 416 t_young->bit.action & BLK_TC_ACT(BLK_TC_COMPLETE)) { 417 /* matching D and C traces - update statistics */ 418 match++; 419 blkiomon_account(&t_old->bit, &t_young->bit); 420 blkiomon_free_trace(t_stored); 421 return t; 422 } 423 424 /* no matching D and C traces - keep more recent trace */ 425 dump_bits(t_old, t_young, "mismatch"); 426 mismatch++; 427 blkiomon_store_trace(t_young); 428 return t_old; 429 } 430 431 static int blkiomon_dump_drvdata(struct blk_io_trace *bit, void *pdu_buf) 432 { 433 if (!drvdata.fn) 434 return 0; 435 436 if (fwrite(bit, sizeof(*bit), 1, drvdata.fp) != 1) 437 goto failed; 438 if (fwrite(pdu_buf, bit->pdu_len, 1, drvdata.fp) != 1) 439 goto failed; 440 if (drvdata.pipe && fflush(drvdata.fp)) 441 goto failed; 442 return 0; 443 444 failed: 445 fprintf(stderr, "blkiomon: could not write to %s\n", drvdata.fn); 446 fclose(drvdata.fp); 447 drvdata.fn = NULL; 448 return 1; 449 } 450 451 static int blkiomon_do_fifo(void) 452 { 453 struct trace *t; 454 struct blk_io_trace *bit; 455 void *pdu_buf = NULL; 456 457 t = blkiomon_alloc_trace(); 458 if (!t) 459 return 1; 460 bit = &t->bit; 461 462 while (up) { 463 if (fread(bit, sizeof(*bit), 1, ifp) != 1) { 464 if (!feof(ifp)) 465 fprintf(stderr, 466 "blkiomon: could not read trace"); 467 break; 468 } 469 if (ferror(ifp)) { 470 clearerr(ifp); 471 fprintf(stderr, "blkiomon: error while reading trace"); 472 break; 473 } 474 475 if (data_is_native == -1 && check_data_endianness(bit->magic)) { 476 fprintf(stderr, "blkiomon: endianess problem\n"); 477 break; 478 } 479 480 /* endianess */ 481 trace_to_cpu(bit); 482 if (verify_trace(bit)) { 483 fprintf(stderr, "blkiomon: bad trace\n"); 484 break; 485 } 486 487 /* read additional trace payload */ 488 if (bit->pdu_len) { 489 pdu_buf = realloc(pdu_buf, bit->pdu_len); 490 if (fread(pdu_buf, bit->pdu_len, 1, ifp) != 1) { 491 clearerr(ifp); 492 fprintf(stderr, "blkiomon: could not read payload\n"); 493 break; 494 } 495 } 496 497 t->sequence = sequence++; 498 499 /* forward low-level device driver trace to other tool */ 500 if (bit->action & BLK_TC_ACT(BLK_TC_DRV_DATA)) { 501 driverdata++; 502 if (blkiomon_dump_drvdata(bit, pdu_buf)) { 503 fprintf(stderr, "blkiomon: could not send trace\n"); 504 break; 505 } 506 continue; 507 } 508 509 if (!(bit->action & BLK_TC_ACT(BLK_TC_ISSUE | BLK_TC_COMPLETE))) 510 continue; 511 512 /* try to find matching trace and update statistics */ 513 t = blkiomon_do_trace(t); 514 if (!t) { 515 fprintf(stderr, "blkiomon: could not alloc trace\n"); 516 break; 517 } 518 bit = &t->bit; 519 /* t and bit will be recycled for next incoming trace */ 520 } 521 blkiomon_free_trace(t); 522 free(pdu_buf); 523 return 0; 524 } 525 526 static int blkiomon_open_output(struct output *out) 527 { 528 int mode, vbuf_size; 529 530 if (!out->fn) 531 return 0; 532 533 if (!strcmp(out->fn, "-")) { 534 out->fp = fdopen(STDOUT_FILENO, "w"); 535 mode = _IOLBF; 536 vbuf_size = 4096; 537 out->pipe = 1; 538 } else { 539 out->fp = fopen(out->fn, "w"); 540 mode = _IOFBF; 541 vbuf_size = 128 * 1024; 542 out->pipe = 0; 543 } 544 if (!out->fp) 545 goto failed; 546 out->buf = malloc(128 * 1024); 547 if (setvbuf(out->fp, out->buf, mode, vbuf_size)) 548 goto failed; 549 return 0; 550 551 failed: 552 fprintf(stderr, "blkiomon: could not write to %s\n", out->fn); 553 out->fn = NULL; 554 free(out->buf); 555 return 1; 556 } 557 558 static int blkiomon_open_msg_q(void) 559 { 560 key_t key; 561 562 if (!msg_q_name) 563 return 0; 564 if (!msg_q_id || msg_id <= 0) 565 return 1; 566 key = ftok(msg_q_name, msg_q_id); 567 if (key == -1) 568 return 1; 569 while (up) { 570 msg_q = msgget(key, S_IRWXU); 571 if (msg_q >= 0) 572 break; 573 } 574 return (msg_q >= 0 ? 0 : -1); 575 } 576 577 static void blkiomon_debug(void) 578 { 579 int i; 580 struct trace *t; 581 582 if (!debug.fn) 583 return; 584 585 for (i = 0; i < TRACE_HASH_SIZE; i++) 586 for (t = thash[i]; t; t = t->next) { 587 dump_bit(t, "leftover"); 588 leftover++; 589 } 590 591 fprintf(debug.fp, "%ld leftover, %ld match, %ld mismatch, " 592 "%ld driverdata, %ld overall\n", 593 leftover, match, mismatch, driverdata, sequence); 594 } 595 596 #define S_OPTS "b:d:D:h:I:Q:q:m:V" 597 598 static char usage_str[] = "\n\nblkiomon " \ 599 "-I <interval> | --interval=<interval>\n" \ 600 "[ -h <file> | --human-readable=<file> ]\n" \ 601 "[ -b <file> | --binary=<file> ]\n" \ 602 "[ -D <file> | --debug=<file> ]\n" \ 603 "[ -Q <path name> | --msg-queue=<path name>]\n" \ 604 "[ -q <msg queue id> | --msg-queue-id=<msg queue id>]\n" \ 605 "[ -m <msg id> | --msg-id=<msg id>]\n" \ 606 "[ -V | --version ]\n\n" \ 607 "\t-I Sample interval.\n" \ 608 "\t-h Human-readable output file.\n" \ 609 "\t-b Binary output file.\n" \ 610 "\t-d Output file for data emitted by low level device driver.\n" \ 611 "\t-D Output file for debugging data.\n" \ 612 "\t-Qqm Output to message queue using given ID for messages.\n" \ 613 "\t-V Print program version.\n\n"; 614 615 static struct option l_opts[] = { 616 { 617 .name = "human-readable", 618 .has_arg = required_argument, 619 .flag = NULL, 620 .val = 'h' 621 }, 622 { 623 .name = "binary", 624 .has_arg = required_argument, 625 .flag = NULL, 626 .val = 'b' 627 }, 628 { 629 .name = "dump-lldd", 630 .has_arg = required_argument, 631 .flag = NULL, 632 .val = 'd' 633 }, 634 { 635 .name = "debug", 636 .has_arg = required_argument, 637 .flag = NULL, 638 .val = 'D' 639 }, 640 { 641 .name = "interval", 642 .has_arg = required_argument, 643 .flag = NULL, 644 .val = 'I' 645 }, 646 { 647 .name = "msg-queue", 648 .has_arg = required_argument, 649 .flag = NULL, 650 .val = 'Q' 651 }, 652 { 653 .name = "msg-queue-id", 654 .has_arg = required_argument, 655 .flag = NULL, 656 .val = 'q' 657 }, 658 { 659 .name = "msg-id", 660 .has_arg = required_argument, 661 .flag = NULL, 662 .val = 'm' 663 }, 664 { 665 .name = "version", 666 .has_arg = no_argument, 667 .flag = NULL, 668 .val = 'V' 669 }, 670 { 671 .name = NULL, 672 } 673 }; 674 675 static void blkiomon_signal(int signal) 676 { 677 fprintf(stderr, "blkiomon: terminated by signal\n"); 678 up = signal & 0; 679 } 680 681 int main(int argc, char *argv[]) 682 { 683 int c; 684 685 signal(SIGALRM, blkiomon_signal); 686 signal(SIGINT, blkiomon_signal); 687 signal(SIGTERM, blkiomon_signal); 688 signal(SIGQUIT, blkiomon_signal); 689 690 while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) { 691 switch (c) { 692 case 'h': 693 human.fn = optarg; 694 break; 695 case 'b': 696 binary.fn = optarg; 697 break; 698 case 'd': 699 drvdata.fn = optarg; 700 break; 701 case 'D': 702 debug.fn = optarg; 703 break; 704 case 'I': 705 interval = atoi(optarg); 706 break; 707 case 'Q': 708 msg_q_name = optarg; 709 break; 710 case 'q': 711 msg_q_id = atoi(optarg); 712 break; 713 case 'm': 714 msg_id = atoi(optarg); 715 break; 716 case 'V': 717 printf("%s version %s\n", argv[0], blkiomon_version); 718 return 0; 719 default: 720 fprintf(stderr, "Usage: %s", usage_str); 721 return 1; 722 } 723 } 724 725 if (interval <= 0) { 726 fprintf(stderr, "Usage: %s", usage_str); 727 return 1; 728 } 729 730 ifp = fdopen(STDIN_FILENO, "r"); 731 if (!ifp) { 732 perror("blkiomon: could not open stdin for reading"); 733 return 1; 734 } 735 736 if (blkiomon_open_output(&human)) 737 return 1; 738 if (blkiomon_open_output(&binary)) 739 return 1; 740 if (blkiomon_open_output(&drvdata)) 741 return 1; 742 if (blkiomon_open_output(&debug)) 743 return 1; 744 if (blkiomon_open_msg_q()) 745 return 1; 746 747 if (pthread_create(&interval_thread, NULL, blkiomon_interval, NULL)) { 748 fprintf(stderr, "blkiomon: could not create thread"); 749 return 1; 750 } 751 752 blkiomon_do_fifo(); 753 754 blkiomon_debug(); 755 return 0; 756 } 757