Home | History | Annotate | Download | only in iowatcher
      1 /*
      2  * Copyright (C) 2012 Fusion-io
      3  *
      4  *  This program is free software; you can redistribute it and/or
      5  *  modify it under the terms of the GNU General Public
      6  *  License v2 as published by the Free Software Foundation.
      7  *
      8  *  This program is distributed in the hope that it will be useful,
      9  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     10  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     11  *  GNU General Public License for more details.
     12  *
     13  *  You should have received a copy of the GNU General Public License
     14  *  along with this program; if not, write to the Free Software
     15  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     16  *
     17  *  Parts of this file were imported from Jens Axboe's blktrace sources (also GPL)
     18  */
     19 #include <sys/types.h>
     20 #include <sys/stat.h>
     21 #include <fcntl.h>
     22 #include <unistd.h>
     23 #include <stdlib.h>
     24 #include <stdio.h>
     25 #include <math.h>
     26 #include <inttypes.h>
     27 #include <string.h>
     28 #include <asm/types.h>
     29 #include <errno.h>
     30 #include <sys/mman.h>
     31 #include <time.h>
     32 #include <math.h>
     33 #include <dirent.h>
     34 
     35 #include "plot.h"
     36 #include "blkparse.h"
     37 #include "list.h"
     38 #include "tracers.h"
     39 
/* hash table of pending IOs, keyed on sector (see hash_sector()) */
#define IO_HASH_TABLE_BITS  11
#define IO_HASH_TABLE_SIZE (1 << IO_HASH_TABLE_BITS)
static struct list_head io_hash_table[IO_HASH_TABLE_SIZE];
/* NOTE(review): not referenced in this chunk; presumably a running
 * count of dispatched-but-not-completed IOs — confirm in later code */
static u64 ios_in_flight = 0;

/* hash table of pid -> name/plot-index mappings (struct pid_map) */
#define PROCESS_HASH_TABLE_BITS 7
#define PROCESS_HASH_TABLE_SIZE (1 << PROCESS_HASH_TABLE_BITS)
static struct list_head process_hash_table[PROCESS_HASH_TABLE_SIZE];

/* command-line driven settings defined in the main program */
extern int plot_io_action;
extern int io_per_process;
     51 
/*
 * Trace categories
 *
 * These mirror the kernel blktrace category bits.  A record's 32-bit
 * action word carries a mask of these in its upper 16 bits (see
 * BLK_TC_SHIFT / BLK_TC_ACT below).
 */
enum {
	BLK_TC_READ	= 1 << 0,	/* reads */
	BLK_TC_WRITE	= 1 << 1,	/* writes */
	BLK_TC_FLUSH	= 1 << 2,	/* flush */
	BLK_TC_SYNC	= 1 << 3,	/* sync */
	BLK_TC_QUEUE	= 1 << 4,	/* queueing/merging */
	BLK_TC_REQUEUE	= 1 << 5,	/* requeueing */
	BLK_TC_ISSUE	= 1 << 6,	/* issue */
	BLK_TC_COMPLETE	= 1 << 7,	/* completions */
	BLK_TC_FS	= 1 << 8,	/* fs requests */
	BLK_TC_PC	= 1 << 9,	/* pc requests */
	BLK_TC_NOTIFY	= 1 << 10,	/* special message */
	BLK_TC_AHEAD	= 1 << 11,	/* readahead */
	BLK_TC_META	= 1 << 12,	/* metadata */
	BLK_TC_DISCARD	= 1 << 13,	/* discard requests */
	BLK_TC_DRV_DATA	= 1 << 14,	/* binary driver data */
	BLK_TC_FUA	= 1 << 15,	/* fua requests */

	BLK_TC_END	= 1 << 15,	/* we've run out of bits! */
};

/* category bits occupy the upper half of the 32-bit action word */
#define BLK_TC_SHIFT		(16)
/* shift a category mask into its position within an action word */
#define BLK_TC_ACT(act)		((act) << BLK_TC_SHIFT)
/* extract only the READ/WRITE category bits from an action word */
#define BLK_DATADIR(a) (((a) >> BLK_TC_SHIFT) & (BLK_TC_READ | BLK_TC_WRITE))
     79 
/*
 * Basic trace actions
 *
 * The low 16 bits of a record's action word hold exactly one of these
 * event codes; mask with BLK_TA_MASK to extract it.
 */
enum {
	__BLK_TA_QUEUE = 1,		/* queued */
	__BLK_TA_BACKMERGE,		/* back merged to existing rq */
	__BLK_TA_FRONTMERGE,		/* front merge to existing rq */
	__BLK_TA_GETRQ,			/* allocated new request */
	__BLK_TA_SLEEPRQ,		/* sleeping on rq allocation */
	__BLK_TA_REQUEUE,		/* request requeued */
	__BLK_TA_ISSUE,			/* sent to driver */
	__BLK_TA_COMPLETE,		/* completed by driver */
	__BLK_TA_PLUG,			/* queue was plugged */
	__BLK_TA_UNPLUG_IO,		/* queue was unplugged by io */
	__BLK_TA_UNPLUG_TIMER,		/* queue was unplugged by timer */
	__BLK_TA_INSERT,		/* insert request */
	__BLK_TA_SPLIT,			/* bio was split */
	__BLK_TA_BOUNCE,		/* bio was bounced */
	__BLK_TA_REMAP,			/* bio was remapped */
	__BLK_TA_ABORT,			/* request aborted */
	__BLK_TA_DRV_DATA,		/* binary driver data */
};

/* mask selecting the event code from the low half of an action word */
#define BLK_TA_MASK ((1 << BLK_TC_SHIFT) - 1)
    104 
/*
 * Notify events.  These occupy the event-code slot of records whose
 * category is BLK_TC_NOTIFY.
 */
enum blktrace_notify {
	__BLK_TN_PROCESS = 0,		/* establish pid/name mapping */
	__BLK_TN_TIMESTAMP,		/* include system clock */
	__BLK_TN_MESSAGE,               /* Character string message */
};

/*
 * Trace actions in full. Additionally, read or write is masked
 */
#define BLK_TA_QUEUE		(__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_BACKMERGE	(__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_FRONTMERGE	(__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
#define	BLK_TA_GETRQ		(__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
#define	BLK_TA_SLEEPRQ		(__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
#define	BLK_TA_REQUEUE		(__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE))
#define BLK_TA_ISSUE		(__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
#define BLK_TA_COMPLETE		(__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE))
#define BLK_TA_PLUG		(__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_UNPLUG_IO	(__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_UNPLUG_TIMER	(__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_INSERT		(__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_SPLIT		(__BLK_TA_SPLIT)
#define BLK_TA_BOUNCE		(__BLK_TA_BOUNCE)
#define BLK_TA_REMAP		(__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_ABORT		(__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_DRV_DATA		(__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA))

#define BLK_TN_PROCESS		(__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
#define BLK_TN_TIMESTAMP	(__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
#define BLK_TN_MESSAGE		(__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY))

/* stored in blk_io_trace->magic as (MAGIC << 8) | version */
#define BLK_IO_TRACE_MAGIC	0x65617400
#define BLK_IO_TRACE_VERSION	0x07
    141 /*
    142  * The trace itself
    143  */
/*
 * On-disk/in-memory record header.  Records are accessed by casting
 * mmap()ed dump-file data directly to this struct (see open_trace()),
 * so the layout must match what blkparse wrote — do not reorder.
 * pdu_len bytes of payload immediately follow each header.
 */
struct blk_io_trace {
	__u32 magic;		/* MAGIC << 8 | version */
	__u32 sequence;		/* event number */
	__u64 time;		/* in nanoseconds */
	__u64 sector;		/* disk offset */
	__u32 bytes;		/* transfer length */
	__u32 action;		/* what happened */
	__u32 pid;		/* who did it */
	__u32 device;		/* device identifier (dev_t) */
	__u32 cpu;		/* on what cpu did it happen */
	__u16 error;		/* completion error */
	__u16 pdu_len;		/* length of data after this trace */
};
    157 
/*
 * One IO being tracked from queue/dispatch until completion; lives in
 * io_hash_table, keyed on (sector, device).
 */
struct pending_io {
	/* sector offset of this IO */
	u64 sector;

	/* dev_t for this IO */
	u32 device;

	/* time this IO was dispatched */
	u64 dispatch_time;
	/* time this IO was finished */
	u64 completion_time;
	/* bucket linkage in io_hash_table */
	struct list_head hash_list;
	/* process which queued this IO */
	u32 pid;
};
    173 
    174 struct pid_map {
    175 	struct list_head hash_list;
    176 	u32 pid;
    177 	int index;
    178 	char name[0];
    179 };
    180 
    181 u64 get_record_time(struct trace *trace)
    182 {
    183 	return trace->io->time;
    184 }
    185 
    186 void init_io_hash_table(void)
    187 {
    188 	int i;
    189 	struct list_head *head;
    190 
    191 	for (i = 0; i < IO_HASH_TABLE_SIZE; i++) {
    192 		head = io_hash_table + i;
    193 		INIT_LIST_HEAD(head);
    194 	}
    195 }
    196 
/* taken from the kernel hash.h */
/*
 * Hash a sector number into an IO_HASH_TABLE_BITS-wide bucket index.
 * The shift/add chain is the kernel's hash_64(): it multiplies by the
 * 64-bit golden-ratio prime without using a hardware multiply.
 */
static inline u64 hash_sector(u64 val)
{
	u64 hash = val;

	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
	u64 n = hash;
	n <<= 18;
	hash -= n;
	n <<= 33;
	hash -= n;
	n <<= 3;
	hash += n;
	n <<= 3;
	hash -= n;
	n <<= 4;
	hash += n;
	n <<= 2;
	hash += n;

	/* High bits are more random, so use them. */
	return hash >> (64 - IO_HASH_TABLE_BITS);
}
    220 
    221 static int io_hash_table_insert(struct pending_io *ins_pio)
    222 {
    223 	u64 sector = ins_pio->sector;
    224 	u32 dev = ins_pio->device;
    225 	int slot = hash_sector(sector);
    226 	struct list_head *head;
    227 	struct pending_io *pio;
    228 
    229 	head = io_hash_table + slot;
    230 	list_for_each_entry(pio, head, hash_list) {
    231 		if (pio->sector == sector && pio->device == dev)
    232 			return -EEXIST;
    233 	}
    234 	list_add_tail(&ins_pio->hash_list, head);
    235 	return 0;
    236 }
    237 
    238 static struct pending_io *io_hash_table_search(u64 sector, u32 dev)
    239 {
    240 	int slot = hash_sector(sector);
    241 	struct list_head *head;
    242 	struct pending_io *pio;
    243 
    244 	head = io_hash_table + slot;
    245 	list_for_each_entry(pio, head, hash_list) {
    246 		if (pio->sector == sector && pio->device == dev)
    247 			return pio;
    248 	}
    249 	return NULL;
    250 }
    251 
    252 static struct pending_io *hash_queued_io(struct blk_io_trace *io)
    253 {
    254 	struct pending_io *pio;
    255 	int ret;
    256 
    257 	pio = calloc(1, sizeof(*pio));
    258 	pio->sector = io->sector;
    259 	pio->device = io->device;
    260 	pio->pid = io->pid;
    261 
    262 	ret = io_hash_table_insert(pio);
    263 	if (ret < 0) {
    264 		/* crud, the IO is there already */
    265 		free(pio);
    266 		return NULL;
    267 	}
    268 	return pio;
    269 }
    270 
    271 static struct pending_io *hash_dispatched_io(struct blk_io_trace *io)
    272 {
    273 	struct pending_io *pio;
    274 
    275 	pio = io_hash_table_search(io->sector, io->device);
    276 	if (!pio) {
    277 		pio = hash_queued_io(io);
    278 		if (!pio)
    279 			return NULL;
    280 	}
    281 	pio->dispatch_time = io->time;
    282 	return pio;
    283 }
    284 
    285 static struct pending_io *hash_completed_io(struct blk_io_trace *io)
    286 {
    287 	struct pending_io *pio;
    288 
    289 	pio = io_hash_table_search(io->sector, io->device);
    290 
    291 	if (!pio)
    292 		return NULL;
    293 	return pio;
    294 }
    295 
    296 void init_process_hash_table(void)
    297 {
    298 	int i;
    299 	struct list_head *head;
    300 
    301 	for (i = 0; i < PROCESS_HASH_TABLE_SIZE; i++) {
    302 		head = process_hash_table + i;
    303 		INIT_LIST_HEAD(head);
    304 	}
    305 }
    306 
    307 static u32 hash_pid(u32 pid)
    308 {
    309 	u32 hash = pid;
    310 
    311 	hash ^= pid >> 3;
    312 	hash ^= pid >> 3;
    313 	hash ^= pid >> 4;
    314 	hash ^= pid >> 6;
    315 	return (hash & (PROCESS_HASH_TABLE_SIZE - 1));
    316 }
    317 
    318 static struct pid_map *process_hash_search(u32 pid)
    319 {
    320 	int slot = hash_pid(pid);
    321 	struct list_head *head;
    322 	struct pid_map *pm;
    323 
    324 	head = process_hash_table + slot;
    325 	list_for_each_entry(pm, head, hash_list) {
    326 		if (pm->pid == pid)
    327 			return pm;
    328 	}
    329 	return NULL;
    330 }
    331 
    332 static struct pid_map *process_hash_insert(u32 pid, char *name)
    333 {
    334 	int slot = hash_pid(pid);
    335 	struct pid_map *pm;
    336 	int old_index = 0;
    337 	char buf[16];
    338 
    339 	pm = process_hash_search(pid);
    340 	if (pm) {
    341 		/* Entry exists and name shouldn't be changed? */
    342 		if (!name || !strcmp(name, pm->name))
    343 			return pm;
    344 		list_del(&pm->hash_list);
    345 		old_index = pm->index;
    346 		free(pm);
    347 	}
    348 	if (!name) {
    349 		sprintf(buf, "[%u]", pid);
    350 		name = buf;
    351 	}
    352 	pm = malloc(sizeof(struct pid_map) + strlen(name) + 1);
    353 	pm->pid = pid;
    354 	pm->index = old_index;
    355 	strcpy(pm->name, name);
    356 	list_add_tail(&pm->hash_list, process_hash_table + slot);
    357 
    358 	return pm;
    359 }
    360 
    361 static void handle_notify(struct trace *trace)
    362 {
    363 	struct blk_io_trace *io = trace->io;
    364 	void *payload = (char *)io + sizeof(*io);
    365 	u32 two32[2];
    366 
    367 	if (io->action == BLK_TN_PROCESS) {
    368 		if (io_per_process)
    369 			process_hash_insert(io->pid, payload);
    370 		return;
    371 	}
    372 
    373 	if (io->action != BLK_TN_TIMESTAMP)
    374 		return;
    375 
    376 	if (io->pdu_len != sizeof(two32))
    377 		return;
    378 
    379 	memcpy(two32, payload, sizeof(two32));
    380 	trace->start_timestamp = io->time;
    381 	trace->abs_start_time.tv_sec = two32[0];
    382 	trace->abs_start_time.tv_nsec = two32[1];
    383 	if (trace->abs_start_time.tv_nsec < 0) {
    384 		trace->abs_start_time.tv_sec--;
    385 		trace->abs_start_time.tv_nsec += 1000000000;
    386 	}
    387 }
    388 
    389 int next_record(struct trace *trace)
    390 {
    391 	int skip = trace->io->pdu_len;
    392 	u64 offset;
    393 
    394 	trace->cur += sizeof(*trace->io) + skip;
    395 	offset = trace->cur - trace->start;
    396 	if (offset >= trace->len)
    397 		return 1;
    398 
    399 	trace->io = (struct blk_io_trace *)trace->cur;
    400 	return 0;
    401 }
    402 
    403 void first_record(struct trace *trace)
    404 {
    405 	trace->cur = trace->start;
    406 	trace->io = (struct blk_io_trace *)trace->cur;
    407 }
    408 
    409 static int is_io_event(struct blk_io_trace *test)
    410 {
    411 	char *message;
    412 	if (!(test->action & BLK_TC_ACT(BLK_TC_NOTIFY)))
    413 		return 1;
    414 	if (test->action == BLK_TN_MESSAGE) {
    415 		int len = test->pdu_len;
    416 		if (len < 3)
    417 			return 0;
    418 		message = (char *)(test + 1);
    419 		if (strncmp(message, "fio ", 4) == 0) {
    420 			return 1;
    421 		}
    422 	}
    423 	return 0;
    424 }
    425 
/*
 * Return the timestamp of the last IO event in the trace.
 *
 * Fast path: slide backwards one byte at a time from EOF until a
 * header is found whose magic checks out and whose payload ends
 * exactly at the end of the file.  If nothing matches within 8192
 * bytes (e.g. trailing junk), fall back to a full forward scan,
 * remembering the time of the last IO event seen.
 */
u64 find_last_time(struct trace *trace)
{
	char *p = trace->start + trace->len;
	struct blk_io_trace *test;
	int search_len = 0;
	u64 found = 0;

	if (trace->len < sizeof(*trace->io))
		return 0;
	p -= sizeof(*trace->io);
	while (p >= trace->start) {
		test = (struct blk_io_trace *)p;
		if (CHECK_MAGIC(test) && is_io_event(test)) {
			u64 offset = p - trace->start;
			/* the last record must end exactly at EOF */
			if (offset + sizeof(*test) + test->pdu_len == trace->len) {
				return test->time;
			}
		}
		p--;
		search_len++;
		if (search_len > 8192) {
			break;
		}
	}

	/* searching backwards didn't work out, we'll have to scan the file */
	first_record(trace);
	while (1) {
		if (is_io_event(trace->io))
			found = trace->io->time;
		if (next_record(trace))
			break;
	}
	first_record(trace);
	return found;
}
    462 
    463 static int parse_fio_bank_message(struct trace *trace, u64 *bank_ret, u64 *offset_ret,
    464 			   u64 *num_banks_ret)
    465 {
    466 	char *s;
    467 	char *next;
    468 	char *message;
    469 	struct blk_io_trace *test = trace->io;
    470 	int len = test->pdu_len;
    471 	u64 bank;
    472 	u64 offset;
    473 	u64 num_banks;
    474 
    475 	if (!(test->action & BLK_TC_ACT(BLK_TC_NOTIFY)))
    476 		return -1;
    477 	if (test->action != BLK_TN_MESSAGE)
    478 		return -1;
    479 
    480 	/* the message is fio rw bank offset num_banks */
    481 	if (len < 3)
    482 		return -1;
    483 	message = (char *)(test + 1);
    484 	if (strncmp(message, "fio r ", 6) != 0)
    485 		return -1;
    486 
    487 	message = strndup(message, len);
    488 	s = strchr(message, ' ');
    489 	if (!s)
    490 		goto out;
    491 	s++;
    492 	s = strchr(s, ' ');
    493 	if (!s)
    494 		goto out;
    495 
    496 	bank = strtoll(s, &next, 10);
    497 	if (s == next)
    498 		goto out;
    499 	s = next;
    500 
    501 	offset = strtoll(s, &next, 10);
    502 	if (s == next)
    503 		goto out;
    504 	s = next;
    505 
    506 	num_banks = strtoll(s, &next, 10);
    507 	if (s == next)
    508 		goto out;
    509 
    510 	*bank_ret = bank;
    511 	*offset_ret = offset;
    512 	*num_banks_ret = num_banks;
    513 
    514 	return 0;
    515 out:
    516 	free(message);
    517 	return -1;
    518 }
    519 
    520 static struct dev_info *lookup_dev(struct trace *trace, struct blk_io_trace *io)
    521 {
    522 	u32 dev = io->device;
    523 	int i;
    524 	struct dev_info *di = NULL;
    525 
    526 	for (i = 0; i < trace->num_devices; i++) {
    527 		if (trace->devices[i].device == dev) {
    528 			di = trace->devices + i;
    529 			goto found;
    530 		}
    531 	}
    532 	i = trace->num_devices++;
    533 	if (i >= MAX_DEVICES_PER_TRACE) {
    534 		fprintf(stderr, "Trace contains too many devices (%d)\n", i);
    535 		exit(1);
    536 	}
    537 	di = trace->devices + i;
    538 	di->device = dev;
    539 found:
    540 	return di;
    541 }
    542 
    543 static void map_devices(struct trace *trace)
    544 {
    545 	struct dev_info *di;
    546 	u64 found;
    547 	u64 map_start = 0;
    548 	int i;
    549 
    550 	first_record(trace);
    551 	while (1) {
    552 		if (!(trace->io->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
    553 			di = lookup_dev(trace, trace->io);
    554 			found = trace->io->sector << 9;
    555 			if (found < di->min)
    556 				di->min = found;
    557 
    558 			found += trace->io->bytes;
    559 			if (di->max < found)
    560 				di->max = found;
    561 		}
    562 		if (next_record(trace))
    563 			break;
    564 	}
    565 	first_record(trace);
    566 	for (i = 0; i < trace->num_devices; i++) {
    567 		di = trace->devices + i;
    568 		di->map = map_start;
    569 		map_start += di->max - di->min;
    570 	}
    571 }
    572 
    573 static u64 map_io(struct trace *trace, struct blk_io_trace *io)
    574 {
    575 	struct dev_info *di = lookup_dev(trace, io);
    576 	u64 val = trace->io->sector << 9;
    577 	return di->map + val - di->min;
    578 }
    579 
/*
 * Single pass over the trace computing the global [min, max) byte
 * range of all IO (in the combined multi-device address space built
 * by map_devices()), plus the largest bank index and bank offset
 * reported by fio bank messages.
 */
void find_extreme_offsets(struct trace *trace, u64 *min_ret, u64 *max_ret, u64 *max_bank_ret,
			  u64 *max_offset_ret)
{
	u64 found = 0;
	u64 max = 0, min = ~(u64)0;
	u64 max_bank = 0;
	u64 max_bank_offset = 0;
	u64 num_banks = 0;

	map_devices(trace);

	first_record(trace);
	while (1) {
		if (!(trace->io->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
			found = map_io(trace, trace->io);
			if (found < min)
				min = found;

			/* an IO spans [start, start + bytes); track its end */
			found += trace->io->bytes;
			if (max < found)
				max = found;
		} else {
			u64 bank;
			u64 offset;
			if (!parse_fio_bank_message(trace, &bank,
						    &offset, &num_banks)) {
				if (bank > max_bank)
					max_bank = bank;
				if (offset > max_bank_offset)
					max_bank_offset = offset;
			}
		}
		if (next_record(trace))
			break;
	}
	first_record(trace);
	*min_ret = min;
	*max_ret = max;
	*max_bank_ret = max_bank;
	*max_offset_ret = max_bank_offset;
}
    621 
    622 static void check_io_types(struct trace *trace)
    623 {
    624 	struct blk_io_trace *io = trace->io;
    625 	int action = io->action & BLK_TA_MASK;
    626 
    627 	if (!(io->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
    628 		switch (action) {
    629 		case __BLK_TA_COMPLETE:
    630 			trace->found_completion = 1;
    631 			break;
    632 		case __BLK_TA_ISSUE:
    633 			trace->found_issue = 1;
    634 			break;
    635 		case __BLK_TA_QUEUE:
    636 			trace->found_queue = 1;
    637 			break;
    638 		};
    639 	}
    640 }
    641 
    642 
    643 int filter_outliers(struct trace *trace, u64 min_offset, u64 max_offset,
    644 		    u64 *yzoom_min, u64 *yzoom_max)
    645 {
    646 	int hits[11];
    647 	u64 max_per_bucket[11];
    648 	u64 min_per_bucket[11];
    649 	u64 bytes_per_bucket = (max_offset - min_offset + 1) / 10;
    650 	int slot;
    651 	int fat_count = 0;
    652 
    653 	memset(hits, 0, sizeof(int) * 11);
    654 	memset(max_per_bucket, 0, sizeof(u64) * 11);
    655 	memset(min_per_bucket, 0xff, sizeof(u64) * 11);
    656 	first_record(trace);
    657 	while (1) {
    658 		check_io_types(trace);
    659 		if (!(trace->io->action & BLK_TC_ACT(BLK_TC_NOTIFY)) &&
    660 		    (trace->io->action & BLK_TA_MASK) == __BLK_TA_QUEUE) {
    661 			u64 off = map_io(trace, trace->io) - min_offset;
    662 
    663 			slot = (int)(off / bytes_per_bucket);
    664 			hits[slot]++;
    665 			if (off < min_per_bucket[slot])
    666 				min_per_bucket[slot] = off;
    667 
    668 			off += trace->io->bytes;
    669 			slot = (int)(off / bytes_per_bucket);
    670 			hits[slot]++;
    671 			if (off > max_per_bucket[slot])
    672 				max_per_bucket[slot] = off;
    673 		}
    674 		if (next_record(trace))
    675 			break;
    676 	}
    677 	first_record(trace);
    678 	for (slot = 0; slot < 11; slot++) {
    679 		if (hits[slot] > fat_count) {
    680 			fat_count = hits[slot];
    681 		}
    682 	}
    683 
    684 	*yzoom_max = max_offset;
    685 	for (slot = 10; slot >= 0; slot--) {
    686 		double d = hits[slot];
    687 
    688 		if (d >= (double)fat_count * .05) {
    689 			*yzoom_max = max_per_bucket[slot] + min_offset;
    690 			break;
    691 		}
    692 	}
    693 
    694 	*yzoom_min = min_offset;
    695 	for (slot = 0; slot < 10; slot++) {
    696 		double d = hits[slot];
    697 
    698 		if (d >= (double)fat_count * .05) {
    699 			*yzoom_min = min_per_bucket[slot] + min_offset;
    700 			break;
    701 		}
    702 	}
    703 	return 0;
    704 }
    705 
    706 static char footer[] = ".blktrace.0";
    707 static int footer_len = sizeof(footer) - 1;
    708 
    709 static int match_trace(char *name, int *len)
    710 {
    711 	int match_len;
    712 	int footer_start;
    713 
    714 	match_len = strlen(name);
    715 	if (match_len <= footer_len)
    716 		return 0;
    717 
    718 	footer_start = match_len - footer_len;
    719 	if (strcmp(name + footer_start, footer) != 0)
    720 		return 0;
    721 
    722 	if (len)
    723 		*len = match_len;
    724 	return 1;
    725 }
    726 
    727 struct tracelist {
    728 	struct tracelist *next;
    729 	char *name;
    730 };
    731 
    732 static struct tracelist *traces_list(char *dir_name, int *len)
    733 {
    734 	int count = 0;
    735 	struct tracelist *traces = NULL;
    736 	int dlen = strlen(dir_name);
    737 	DIR *dir = opendir(dir_name);
    738 	if (!dir)
    739 		return NULL;
    740 
    741 	while (1) {
    742 		int n = 0;
    743 		struct tracelist *tl;
    744 		struct dirent *d = readdir(dir);
    745 		if (!d)
    746 			break;
    747 
    748 		if (!match_trace(d->d_name, &n))
    749 			continue;
    750 
    751 		n += dlen + 1; /* dir + '/' + file */
    752 		/* Allocate space for tracelist + filename */
    753 		tl = calloc(1, sizeof(struct tracelist) + (sizeof(char) * (n + 1)));
    754 		if (!tl) {
    755 			closedir(dir);
    756 			return NULL;
    757 		}
    758 		tl->next = traces;
    759 		tl->name = (char *)(tl + 1);
    760 		snprintf(tl->name, n, "%s/%s", dir_name, d->d_name);
    761 		traces = tl;
    762 		count++;
    763 	}
    764 
    765 	closedir(dir);
    766 
    767 	if (len)
    768 		*len = count;
    769 
    770 	return traces;
    771 }
    772 
    773 static void traces_free(struct tracelist *traces)
    774 {
    775 	while (traces) {
    776 		struct tracelist *tl = traces;
    777 		traces = traces->next;
    778 		free(tl);
    779 	}
    780 }
    781 
    782 static int dump_traces(struct tracelist *traces, int count, char *dumpfile)
    783 {
    784 	struct tracelist *tl;
    785 	char **argv = NULL;
    786 	int argc = 0;
    787 	int i;
    788 	int err = 0;
    789 
    790 	argc = count * 2; /* {"-i", trace } */
    791 	argc += 4; /* See below */
    792 	argv = calloc(argc + 1, sizeof(char *));
    793 	if (!argv)
    794 		return -errno;
    795 
    796 	i = 0;
    797 	argv[i++] = "blkparse";
    798 	argv[i++] = "-O";
    799 	argv[i++] = "-d";
    800 	argv[i++] = dumpfile;
    801 	for (tl = traces; tl != NULL; tl = tl->next) {
    802 		argv[i++] = "-i";
    803 		argv[i++] = tl->name;
    804 	}
    805 
    806 	err = run_program(argc, argv, 1, NULL, NULL);
    807 	if (err)
    808 		fprintf(stderr, "%s exited with %d, expected 0\n", argv[0], err);
    809 	free(argv);
    810 	return err;
    811 }
    812 
    813 static char *find_trace_file(char *filename)
    814 {
    815 	int ret;
    816 	struct stat st;
    817 	char *dot;
    818 	int found_dir = 0;
    819 	char *dumpfile;
    820 	int len = strlen(filename);
    821 
    822 	/* look for an exact match of whatever they pass in.
    823 	 * If it is a file, assume it is the dump file.
    824 	 * If a directory, remember that it existed so we
    825 	 * can combine traces in that directory later
    826 	 */
    827 	ret = stat(filename, &st);
    828 	if (ret == 0) {
    829 		if (S_ISREG(st.st_mode))
    830 			return strdup(filename);
    831 
    832 		if (S_ISDIR(st.st_mode))
    833 			found_dir = 1;
    834 	}
    835 
    836 	if (found_dir) {
    837 		int i;
    838 		/* Eat up trailing '/'s */
    839 		for (i = len - 1; filename[i] == '/'; i--)
    840 			filename[i] = '\0';
    841 	}
    842 
    843 	/*
    844 	 * try tacking .dump onto the end and see if that already
    845 	 * has been generated
    846 	 */
    847 	ret = asprintf(&dumpfile, "%s.dump", filename);
    848 	if (ret == -1) {
    849 		perror("Error building dump file name");
    850 		return NULL;
    851 	}
    852 	ret = stat(dumpfile, &st);
    853 	if (ret == 0)
    854 		return dumpfile;
    855 
    856 	/*
    857 	 * try to generate the .dump from all the traces in
    858 	 * a single dir.
    859 	 */
    860 	if (found_dir) {
    861 		int count;
    862 		struct tracelist *traces = traces_list(filename, &count);
    863 		if (traces) {
    864 			ret = dump_traces(traces, count, dumpfile);
    865 			traces_free(traces);
    866 			if (ret == 0)
    867 				return dumpfile;
    868 		}
    869 	}
    870 	free(dumpfile);
    871 
    872 	/*
    873 	 * try to generate the .dump from all the blktrace
    874 	 * files for a named trace
    875 	 */
    876 	dot = strrchr(filename, '.');
    877 	if (!dot || strcmp(".dump", dot) != 0) {
    878 		struct tracelist trace = {0 ,NULL};
    879 		if (dot && dot != filename)
    880 			len = dot - filename;
    881 
    882 		ret = asprintf(&trace.name, "%*s.blktrace.0", len, filename);
    883 		if (ret == -1)
    884 			return NULL;
    885 		ret = asprintf(&dumpfile, "%*s.dump", len, filename);
    886 		if (ret == -1) {
    887 			free(trace.name);
    888 			return NULL;
    889 		}
    890 
    891 		ret = dump_traces(&trace, 1, dumpfile);
    892 		if (ret == 0) {
    893 			free(trace.name);
    894 			return dumpfile;
    895 		}
    896 		free(trace.name);
    897 		free(dumpfile);
    898 	}
    899 	return NULL;
    900 }
    901 struct trace *open_trace(char *filename)
    902 {
    903 	int fd;
    904 	char *p;
    905 	struct stat st;
    906 	int ret;
    907 	struct trace *trace;
    908 	char *found_filename;
    909 
    910 	trace = calloc(1, sizeof(*trace));
    911 	if (!trace) {
    912 		fprintf(stderr, "unable to allocate memory for trace\n");
    913 		return NULL;
    914 	}
    915 
    916 	found_filename = find_trace_file(filename);
    917 	if (!found_filename) {
    918 		fprintf(stderr, "Unable to find trace file %s\n", filename);
    919 		goto fail;
    920 	}
    921 	filename = found_filename;
    922 
    923 	fd = open(filename, O_RDONLY);
    924 	if (fd < 0) {
    925 		fprintf(stderr, "Unable to open trace file %s err %s\n", filename, strerror(errno));
    926 		goto fail;
    927 	}
    928 	ret = fstat(fd, &st);
    929 	if (ret < 0) {
    930 		fprintf(stderr, "stat failed on %s err %s\n", filename, strerror(errno));
    931 		goto fail_fd;
    932 	}
    933 	p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    934 	if (p == MAP_FAILED) {
    935 		fprintf(stderr, "Unable to mmap trace file %s, err %s\n", filename, strerror(errno));
    936 		goto fail_fd;
    937 	}
    938 	trace->fd = fd;
    939 	trace->len = st.st_size;
    940 	trace->start = p;
    941 	trace->cur = p;
    942 	trace->io = (struct blk_io_trace *)p;
    943 	return trace;
    944 
    945 fail_fd:
    946 	close(fd);
    947 fail:
    948 	free(trace);
    949 	return NULL;
    950 }
    951 static inline int tput_event(struct trace *trace)
    952 {
    953 	if (trace->found_completion)
    954 		return __BLK_TA_COMPLETE;
    955 	if (trace->found_issue)
    956 		return __BLK_TA_ISSUE;
    957 	if (trace->found_queue)
    958 		return __BLK_TA_QUEUE;
    959 
    960 	return __BLK_TA_COMPLETE;
    961 }
    962 
    963 int action_char_to_num(char action)
    964 {
    965 	switch (action) {
    966 	case 'Q':
    967 		return __BLK_TA_QUEUE;
    968 	case 'D':
    969 		return __BLK_TA_ISSUE;
    970 	case 'C':
    971 		return __BLK_TA_COMPLETE;
    972 	}
    973 	return -1;
    974 }
    975 
    976 static inline int io_event(struct trace *trace)
    977 {
    978 	if (plot_io_action)
    979 		return plot_io_action;
    980 	if (trace->found_queue)
    981 		return __BLK_TA_QUEUE;
    982 	if (trace->found_issue)
    983 		return __BLK_TA_ISSUE;
    984 	if (trace->found_completion)
    985 		return __BLK_TA_COMPLETE;
    986 
    987 	return __BLK_TA_COMPLETE;
    988 }
    989 
    990 void add_tput(struct trace *trace, struct graph_line_data *writes_gld,
    991 	      struct graph_line_data *reads_gld)
    992 {
    993 	struct blk_io_trace *io = trace->io;
    994 	struct graph_line_data *gld;
    995 	int action = io->action & BLK_TA_MASK;
    996 	int seconds;
    997 
    998 	if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
    999 		return;
   1000 
   1001 	if (action != tput_event(trace))
   1002 		return;
   1003 
   1004 	if (BLK_DATADIR(io->action) & BLK_TC_READ)
   1005 		gld = reads_gld;
   1006 	else
   1007 		gld = writes_gld;
   1008 
   1009 	seconds = SECONDS(io->time);
   1010 	gld->data[seconds].sum += io->bytes;
   1011 
   1012 	gld->data[seconds].count = 1;
   1013 	if (gld->data[seconds].sum > gld->max)
   1014 		gld->max = gld->data[seconds].sum;
   1015 }
   1016 
/* grow the per-process plot pointer arrays in chunks of this many */
#define GDD_PTR_ALLOC_STEP 16

/*
 * Return the pid_map (and plot slot) for pid, growing the trace
 * file's gdd_reads/gdd_writes pointer arrays when a new slot is
 * handed out.  Returns NULL when per-process plotting is disabled;
 * callers then fall back to slot 0.
 *
 * NOTE(review): a "new entry" is detected by pm->index == 0, which is
 * ambiguous with a real entry that was assigned slot 0 — presumably
 * io_plots never starts at 0 on this path; confirm against the
 * trace_file initialization in the caller.
 */
static struct pid_map *get_pid_map(struct trace_file *tf, u32 pid)
{
	struct pid_map *pm;

	if (!io_per_process) {
		if (!tf->io_plots)
			tf->io_plots = 1;
		return NULL;
	}

	pm = process_hash_insert(pid, NULL);
	/* New entry? */
	if (!pm->index) {
		if (tf->io_plots == tf->io_plots_allocated) {
			tf->io_plots_allocated += GDD_PTR_ALLOC_STEP;
			tf->gdd_reads = realloc(tf->gdd_reads, tf->io_plots_allocated * sizeof(struct graph_dot_data *));
			if (!tf->gdd_reads)
				abort();
			tf->gdd_writes = realloc(tf->gdd_writes, tf->io_plots_allocated * sizeof(struct graph_dot_data *));
			if (!tf->gdd_writes)
				abort();
			/* zero only the freshly added tail of each array */
			memset(tf->gdd_reads + tf->io_plots_allocated - GDD_PTR_ALLOC_STEP,
			       0, GDD_PTR_ALLOC_STEP * sizeof(struct graph_dot_data *));
			memset(tf->gdd_writes + tf->io_plots_allocated - GDD_PTR_ALLOC_STEP,
			       0, GDD_PTR_ALLOC_STEP * sizeof(struct graph_dot_data *));
		}
		pm->index = tf->io_plots++;

		return pm;
	}
	return pm;
}
   1051 
   1052 void add_io(struct trace *trace, struct trace_file *tf)
   1053 {
   1054 	struct blk_io_trace *io = trace->io;
   1055 	int action = io->action & BLK_TA_MASK;
   1056 	u64 offset;
   1057 	int index;
   1058 	char *label;
   1059 	struct pid_map *pm;
   1060 
   1061 	if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
   1062 		return;
   1063 
   1064 	if (action != io_event(trace))
   1065 		return;
   1066 
   1067 	offset = map_io(trace, io);
   1068 
   1069 	pm = get_pid_map(tf, io->pid);
   1070 	if (!pm) {
   1071 		index = 0;
   1072 		label = "";
   1073 	} else {
   1074 		index = pm->index;
   1075 		label = pm->name;
   1076 	}
   1077 	if (BLK_DATADIR(io->action) & BLK_TC_READ) {
   1078 		if (!tf->gdd_reads[index])
   1079 			tf->gdd_reads[index] = alloc_dot_data(tf->min_seconds, tf->max_seconds, tf->min_offset, tf->max_offset, tf->stop_seconds, pick_color(), strdup(label));
   1080 		set_gdd_bit(tf->gdd_reads[index], offset, io->bytes, io->time);
   1081 	} else if (BLK_DATADIR(io->action) & BLK_TC_WRITE) {
   1082 		if (!tf->gdd_writes[index])
   1083 			tf->gdd_writes[index] = alloc_dot_data(tf->min_seconds, tf->max_seconds, tf->min_offset, tf->max_offset, tf->stop_seconds, pick_color(), strdup(label));
   1084 		set_gdd_bit(tf->gdd_writes[index], offset, io->bytes, io->time);
   1085 	}
   1086 }
   1087 
/*
 * Track the number of IOs in flight and record the per-second average
 * into gld.  Queue events insert the IO into the in-flight hash; issue
 * events mark it dispatched.  When the trace has no issue events at all,
 * depth/latency are accounted from queue events instead (the goto below).
 */
void add_pending_io(struct trace *trace, struct graph_line_data *gld)
{
	unsigned int seconds;
	struct blk_io_trace *io = trace->io;
	int action = io->action & BLK_TA_MASK;
	double avg;
	struct pending_io *pio;

	if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
		return;

	if (action == __BLK_TA_QUEUE) {
		/* sector 0 queue events are skipped entirely */
		if (io->sector == 0)
			return;
		if (trace->found_issue || trace->found_completion) {
			pio = hash_queued_io(trace->io);
			/*
			 * When there are no ISSUE events count depth and
			 * latency at least from queue events
			 */
			if (pio && !trace->found_issue) {
				pio->dispatch_time = io->time;
				goto account_io;
			}
		}
		return;
	}
	if (action == __BLK_TA_REQUEUE) {
		/* requeued IO is no longer in flight; clamp at zero */
		if (ios_in_flight > 0)
			ios_in_flight--;
		return;
	}
	if (action != __BLK_TA_ISSUE)
		return;

	pio = hash_dispatched_io(trace->io);
	if (!pio)
		return;

	/*
	 * Without completion events nothing will ever remove this entry,
	 * so drop it now to keep the hash table from growing unbounded.
	 */
	if (!trace->found_completion) {
		list_del(&pio->hash_list);
		free(pio);
	}

account_io:
	ios_in_flight++;

	seconds = SECONDS(io->time);
	gld->data[seconds].sum += ios_in_flight;
	gld->data[seconds].count++;

	/* graph max tracks the highest per-second average depth */
	avg = (double)gld->data[seconds].sum / gld->data[seconds].count;
	if (gld->max < (u64)avg) {
		gld->max = avg;
	}
}
   1144 
   1145 void add_completed_io(struct trace *trace,
   1146 		      struct graph_line_data *latency_gld)
   1147 {
   1148 	struct blk_io_trace *io = trace->io;
   1149 	int seconds;
   1150 	int action = io->action & BLK_TA_MASK;
   1151 	struct pending_io *pio;
   1152 	double avg;
   1153 	u64 latency;
   1154 
   1155 	if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
   1156 		return;
   1157 
   1158 	if (action != __BLK_TA_COMPLETE)
   1159 		return;
   1160 
   1161 	seconds = SECONDS(io->time);
   1162 
   1163 	pio = hash_completed_io(trace->io);
   1164 	if (!pio)
   1165 		return;
   1166 
   1167 	if (ios_in_flight > 0)
   1168 		ios_in_flight--;
   1169 	if (io->time >= pio->dispatch_time) {
   1170 		latency = io->time - pio->dispatch_time;
   1171 		latency_gld->data[seconds].sum += latency;
   1172 		latency_gld->data[seconds].count++;
   1173 	}
   1174 
   1175 	list_del(&pio->hash_list);
   1176 	free(pio);
   1177 
   1178 	avg = (double)latency_gld->data[seconds].sum /
   1179 		latency_gld->data[seconds].count;
   1180 	if (latency_gld->max < (u64)avg) {
   1181 		latency_gld->max = avg;
   1182 	}
   1183 }
   1184 
   1185 void add_iop(struct trace *trace, struct graph_line_data *gld)
   1186 {
   1187 	struct blk_io_trace *io = trace->io;
   1188 	int action = io->action & BLK_TA_MASK;
   1189 	int seconds;
   1190 
   1191 	if (io->action & BLK_TC_ACT(BLK_TC_NOTIFY))
   1192 		return;
   1193 
   1194 	/* iops and tput use the same events */
   1195 	if (action != tput_event(trace))
   1196 		return;
   1197 
   1198 	seconds = SECONDS(io->time);
   1199 	gld->data[seconds].sum += 1;
   1200 	gld->data[seconds].count = 1;
   1201 	if (gld->data[seconds].sum > gld->max)
   1202 		gld->max = gld->data[seconds].sum;
   1203 }
   1204 
/*
 * Process a record that may be a notify message (e.g. process name
 * announcements); forwards straight to handle_notify.
 */
void check_record(struct trace *trace)
{
	handle_notify(trace);
}
   1209