/*
 * blktrace support code for fio
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <dirent.h>

#include "flist.h"
#include "fio.h"
#include "blktrace_api.h"

#define TRACE_FIFO_SIZE	8192

/*
 * fifo refill frontend, to avoid reading data in trace sized bites
 */
static int refill_fifo(struct thread_data *td, struct fifo *fifo, int fd)
{
	char buf[TRACE_FIFO_SIZE];
	unsigned int total;
	int ret;

	total = sizeof(buf);
	if (total > fifo_room(fifo))
		total = fifo_room(fifo);

	ret = read(fd, buf, total);
	if (ret < 0) {
		td_verror(td, errno, "read blktrace file");
		return -1;
	}

	if (ret > 0)
		ret = fifo_put(fifo, buf, ret);

	dprint(FD_BLKTRACE, "refill: filled %d bytes\n", ret);
	return ret;
}

/*
 * Retrieve 'len' bytes from the fifo, refilling if necessary.
 */
static int trace_fifo_get(struct thread_data *td, struct fifo *fifo, int fd,
			  void *buf, unsigned int len)
{
	if (fifo_len(fifo) < len) {
		int ret = refill_fifo(td, fifo, fd);

		if (ret < 0)
			return ret;
	}

	return fifo_get(fifo, buf, len);
}

/*
 * Just discard the pdu by reading it out of the fifo and dropping it.
 */
static int discard_pdu(struct thread_data *td, struct fifo *fifo, int fd,
		       struct blk_io_trace *t)
{
	if (t->pdu_len == 0)
		return 0;

	dprint(FD_BLKTRACE, "discard pdu len %u\n", t->pdu_len);
	return trace_fifo_get(td, fifo, fd, NULL, t->pdu_len);
}

/*
 * Check if this is a blktrace binary data file. We read a single trace
 * into memory and check for the magic signature.
 */
int is_blktrace(const char *filename, int *need_swap)
{
	struct blk_io_trace t;
	int fd, ret;

	fd = open(filename, O_RDONLY);
	if (fd < 0)
		return 0;

	ret = read(fd, &t, sizeof(t));
	close(fd);

	if (ret < 0) {
		perror("read blktrace");
		return 0;
	} else if (ret != sizeof(t)) {
		log_err("fio: short read on blktrace file\n");
		return 0;
	}

	if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
		*need_swap = 0;
		return 1;
	}

	/*
	 * Maybe it needs to be endian swapped...
	 */
	t.magic = fio_swap32(t.magic);
	if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC) {
		*need_swap = 1;
		return 1;
	}

	return 0;
}

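/*
 * Recursively scan 'path' (normally /dev) for a block device matching
 * maj/min. On success the resolved device path is copied back into 'path'
 * and 1 is returned; 0 means no match. If replay_redirect is set, any
 * block device found is overridden with that path instead.
 */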
static int lookup_device(struct thread_data *td, char *path, unsigned int maj,
			 unsigned int min)
{
	struct dirent *dir;
	struct stat st;
	int found = 0;
	DIR *D;

	D = opendir(path);
	if (!D)
		return 0;

	while ((dir = readdir(D)) != NULL) {
		char full_path[256];

		if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
			continue;

		sprintf(full_path, "%s%s%s", path, FIO_OS_PATH_SEPARATOR, dir->d_name);
		if (lstat(full_path, &st) == -1) {
			perror("lstat");
			break;
		}

		if (S_ISDIR(st.st_mode)) {
			found = lookup_device(td, full_path, maj, min);
			if (found) {
				strcpy(path, full_path);
				break;
			}
		}

		if (!S_ISBLK(st.st_mode))
			continue;

		/*
		 * If replay_redirect is set, always return that device for
		 * the lookup, overriding the major/minor found in the
		 * actual blktrace.
		 */
		if (td->o.replay_redirect) {
			dprint(FD_BLKTRACE, "device lookup: %d/%d overridden"
					" with: %s\n", maj, min,
					td->o.replay_redirect);
			strcpy(path, td->o.replay_redirect);
			found = 1;
			break;
		}

		if (maj == major(st.st_rdev) && min == minor(st.st_rdev)) {
			dprint(FD_BLKTRACE, "device lookup: %d/%d\n", maj, min);
			strcpy(path, full_path);
			found = 1;
			break;
		}
	}

	closedir(D);
	return found;
}

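/*
 * The trace encodes the device as a 32-bit number with a 20-bit minor;
 * these helpers split out the major and minor parts.
 */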
#define FMINORBITS	20
#define FMINORMASK	((1U << FMINORBITS) - 1)
#define FMAJOR(dev)	((unsigned int) ((dev) >> FMINORBITS))
#define FMINOR(dev)	((unsigned int) ((dev) & FMINORMASK))

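/*
 * Add a synthetic file open or close action for 'fileno' to the io log,
 * so the replay opens and closes the device at the appropriate points.
 */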
static void trace_add_open_close_event(struct thread_data *td, int fileno, enum file_log_act action)
{
	struct io_piece *ipo;

	ipo = calloc(1, sizeof(*ipo));
	init_ipo(ipo);

	ipo->ddir = DDIR_INVAL;
	ipo->fileno = fileno;
	ipo->file_action = action;
	flist_add_tail(&ipo->list, &td->io_log_list);
}

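/*
 * Map the trace's device number to an fio file, adding the device (and
 * logging an open event) the first time it is seen. The most recent
 * lookup is cached, since consecutive traces usually hit the same device.
 */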
static int trace_add_file(struct thread_data *td, __u32 device)
{
	static unsigned int last_maj, last_min, last_fileno;
	unsigned int maj = FMAJOR(device);
	unsigned int min = FMINOR(device);
	struct fio_file *f;
	char dev[256];
	unsigned int i;

	if (last_maj == maj && last_min == min)
		return last_fileno;

	last_maj = maj;
	last_min = min;

	/*
	 * check for this file in our list
	 */
	for_each_file(td, f, i)
		if (f->major == maj && f->minor == min) {
			last_fileno = f->fileno;
			return last_fileno;
		}

	strcpy(dev, "/dev");
	if (lookup_device(td, dev, maj, min)) {
		int fileno;

		dprint(FD_BLKTRACE, "add device %s\n", dev);
		fileno = add_file_exclusive(td, dev);
		td->o.open_files++;
		td->files[fileno]->major = maj;
		td->files[fileno]->minor = min;
		trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE);
		last_fileno = fileno;
	}

	return last_fileno;
}

/*
 * Store blk_io_trace data in an ipo for later retrieval.
 */
static void store_ipo(struct thread_data *td, unsigned long long offset,
		      unsigned int bytes, int rw, unsigned long long ttime,
		      int fileno)
{
	struct io_piece *ipo = malloc(sizeof(*ipo));

	init_ipo(ipo);

	/*
	 * the 512 is wrong here, it should be the hardware sector size...
	 */
	ipo->offset = offset * 512;
	ipo->len = bytes;
	ipo->delay = ttime / 1000;
	if (rw)
		ipo->ddir = DDIR_WRITE;
	else
		ipo->ddir = DDIR_READ;
	ipo->fileno = fileno;

	dprint(FD_BLKTRACE, "store ddir=%d, off=%llu, len=%lu, delay=%lu\n",
							ipo->ddir, ipo->offset,
							ipo->len, ipo->delay);
	queue_io_piece(td, ipo);
}

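/*
 * Notify traces carry metadata (process name, timestamp, message) rather
 * than io; log process and timestamp notifications and ignore messages,
 * instead of queueing io_pieces.
 */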
static void handle_trace_notify(struct blk_io_trace *t)
{
	switch (t->action) {
	case BLK_TN_PROCESS:
		log_info("blktrace: got process notify: %x, %d\n",
				t->action, t->pid);
		break;
	case BLK_TN_TIMESTAMP:
		log_info("blktrace: got timestamp notify: %x, %d\n",
				t->action, t->pid);
		break;
	case BLK_TN_MESSAGE:
		break;
	default:
		dprint(FD_BLKTRACE, "unknown trace act %x\n", t->action);
		break;
	}
}

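/*
 * Turn a discard trace into a DDIR_TRIM io_piece and queue it. For
 * accounting purposes it is counted with the writes.
 */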
static void handle_trace_discard(struct thread_data *td, struct blk_io_trace *t,
				 unsigned long long ttime, unsigned long *ios)
{
	struct io_piece *ipo = malloc(sizeof(*ipo));
	int fileno;

	init_ipo(ipo);
	fileno = trace_add_file(td, t->device);

	ios[DDIR_WRITE]++;
	td->o.size += t->bytes;

	memset(ipo, 0, sizeof(*ipo));
	INIT_FLIST_HEAD(&ipo->list);

	/*
	 * the 512 is wrong here, it should be the hardware sector size...
	 */
	ipo->offset = t->sector * 512;
	ipo->len = t->bytes;
	ipo->delay = ttime / 1000;
	ipo->ddir = DDIR_TRIM;
	ipo->fileno = fileno;

	dprint(FD_BLKTRACE, "store discard, off=%llu, len=%lu, delay=%lu\n",
							ipo->offset, ipo->len,
							ipo->delay);
	queue_io_piece(td, ipo);
}

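/*
 * Turn a regular read/write queue trace into an io_piece, tracking the
 * largest block size seen per direction.
 */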
static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
			    unsigned long long ttime, unsigned long *ios,
			    unsigned int *bs)
{
	int rw;
	int fileno;

	fileno = trace_add_file(td, t->device);

	rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;

	if (t->bytes > bs[rw])
		bs[rw] = t->bytes;

	ios[rw]++;
	td->o.size += t->bytes;
	store_ipo(td, t->sector, t->bytes, rw, ttime, fileno);
}

/*
 * We only care about queue traces; most of the others are side effects
 * of the block layer's internal workings.
 */
static void handle_trace(struct thread_data *td, struct blk_io_trace *t,
			 unsigned long long ttime, unsigned long *ios,
			 unsigned int *bs)
{
	if ((t->action & 0xffff) != __BLK_TA_QUEUE)
		return;
	if (t->action & BLK_TC_ACT(BLK_TC_PC))
		return;

	if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY))
		handle_trace_notify(t);
	else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
		handle_trace_discard(td, t, ttime, ios);
	else
		handle_trace_fs(td, t, ttime, ios, bs);
}

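/*
 * Byte swap every field of a trace that was recorded on a machine of the
 * opposite endianness.
 */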
static void byteswap_trace(struct blk_io_trace *t)
{
	t->magic = fio_swap32(t->magic);
	t->sequence = fio_swap32(t->sequence);
	t->time = fio_swap64(t->time);
	t->sector = fio_swap64(t->sector);
	t->bytes = fio_swap32(t->bytes);
	t->action = fio_swap32(t->action);
	t->pid = fio_swap32(t->pid);
	t->device = fio_swap32(t->device);
	t->cpu = fio_swap32(t->cpu);
	t->error = fio_swap16(t->error);
	t->pdu_len = fio_swap16(t->pdu_len);
}

/*
 * Load a blktrace file by reading all the blk_io_trace entries, and storing
 * them as io_pieces like the fio text version would do.
 */
int load_blktrace(struct thread_data *td, const char *filename, int need_swap)
{
	unsigned long long ttime, delay;
	struct blk_io_trace t;
	unsigned long ios[2], skipped_writes;
	unsigned int cpu;
	unsigned int rw_bs[2];
	struct fifo *fifo;
	int fd, i, old_state;
	struct fio_file *f;
	int this_depth, depth;

	fd = open(filename, O_RDONLY);
	if (fd < 0) {
		td_verror(td, errno, "open blktrace file");
		return 1;
	}

	fifo = fifo_alloc(TRACE_FIFO_SIZE);

	old_state = td_bump_runstate(td, TD_SETTING_UP);

	td->o.size = 0;

	cpu = 0;
	ttime = 0;
	ios[0] = ios[1] = 0;
	rw_bs[0] = rw_bs[1] = 0;
	skipped_writes = 0;
	this_depth = depth = 0;
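	/*
	 * Read one trace at a time from the fifo-backed file: validate the
	 * magic and version, skip the variable-length pdu payload, then
	 * convert the trace into io_pieces for replay.
	 */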
	do {
		int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t));

		if (ret < 0)
			goto err;
		else if (!ret)
			break;
		else if (ret < (int) sizeof(t)) {
			log_err("fio: short fifo get\n");
			break;
		}

		if (need_swap)
			byteswap_trace(&t);

		if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
			log_err("fio: bad magic in blktrace data: %x\n",
								t.magic);
			goto err;
		}
		if ((t.magic & 0xff) != BLK_IO_TRACE_VERSION) {
			log_err("fio: bad blktrace version %d\n",
								t.magic & 0xff);
			goto err;
		}
		ret = discard_pdu(td, fifo, fd, &t);
		if (ret < 0) {
			td_verror(td, ret, "blktrace lseek");
			goto err;
		} else if (t.pdu_len != ret) {
			log_err("fio: discarded %d of %d\n", ret, t.pdu_len);
			goto err;
		}
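		/*
		 * For real io traces, estimate the achieved queue depth from
		 * QUEUE/COMPLETE pairs and compute the replay delay from the
		 * time elapsed since the previous trace on the same CPU.
		 */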
		if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) {
			if ((t.action & 0xffff) == __BLK_TA_QUEUE)
				this_depth++;
			else if ((t.action & 0xffff) == __BLK_TA_COMPLETE) {
				depth = max(depth, this_depth);
				this_depth = 0;
			}
			if (!ttime) {
				ttime = t.time;
				cpu = t.cpu;
			}

			delay = 0;
			if (cpu == t.cpu)
				delay = t.time - ttime;
			if ((t.action & BLK_TC_ACT(BLK_TC_WRITE)) && read_only)
				skipped_writes++;
			else {
				/*
				 * set delay to zero if no_stall enabled for
				 * fast replay
				 */
				if (td->o.no_stall)
					delay = 0;

				handle_trace(td, &t, delay, ios, rw_bs);
			}

			ttime = t.time;
			cpu = t.cpu;
		} else {
			delay = 0;
			handle_trace(td, &t, delay, ios, rw_bs);
		}
	} while (1);

	for (i = 0; i < td->files_index; i++) {
		f = td->files[i];
		trace_add_open_close_event(td, f->fileno, FIO_LOG_CLOSE_FILE);
	}

	fifo_free(fifo);
	close(fd);

	td_restore_runstate(td, old_state);

	if (!td->files_index) {
		log_err("fio: did not find replay device(s)\n");
		return 1;
	}

	/*
	 * For stacked devices, we don't always get a COMPLETE event so
	 * the depth grows to insane values. Limit it to something sane(r).
	 */
	if (!depth || depth > 1024)
		depth = 1024;

	if (skipped_writes)
		log_err("fio: %s skips replay of %lu writes due to read-only\n",
						td->o.name, skipped_writes);

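	/*
	 * Set the job's data direction and max block sizes based on what
	 * the trace actually contained.
	 */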
	if (!ios[DDIR_READ] && !ios[DDIR_WRITE]) {
		log_err("fio: found no ios in blktrace data\n");
		return 1;
	} else if (ios[DDIR_READ] && !ios[DDIR_WRITE]) {
		td->o.td_ddir = TD_DDIR_READ;
		td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
	} else if (!ios[DDIR_READ] && ios[DDIR_WRITE]) {
		td->o.td_ddir = TD_DDIR_WRITE;
		td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE];
	} else {
		td->o.td_ddir = TD_DDIR_RW;
		td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
		td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE];
	}

	/*
	 * We need to do direct/raw ios to the device, to keep read-ahead
	 * from getting in our way.
	 */
	td->o.odirect = 1;

	/*
	 * We don't know if the iodepth option was set or not; it defaults
	 * to 1, so assume it wasn't touched if it's still 1 and override
	 * it with the depth seen in the trace.
	 */
	if (td->o.iodepth == 1)
		td->o.iodepth = depth;

	return 0;
err:
	close(fd);
	fifo_free(fifo);
	return 1;
}