Home | History | Annotate | Download | only in btreplay
      1 /*
      2  * Blktrace replay utility - Play traces back
      3  *
      4  * Copyright (C) 2007 Alan D. Brunelle <Alan.Brunelle (at) hp.com>
      5  *
      6  *  This program is free software; you can redistribute it and/or modify
      7  *  it under the terms of the GNU General Public License as published by
      8  *  the Free Software Foundation; either version 2 of the License, or
      9  *  (at your option) any later version.
     10  *
     11  *  This program is distributed in the hope that it will be useful,
     12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14  *  GNU General Public License for more details.
     15  *
     16  *  You should have received a copy of the GNU General Public License
     17  *  along with this program; if not, write to the Free Software
     18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
     19  */
     20 
     21 static char build_date[] = __DATE__ " at "__TIME__;
     22 
     23 #include <assert.h>
     24 #include <errno.h>
     25 #include <fcntl.h>
     26 #include <libaio.h>
     27 #include <pthread.h>
     28 #include <sched.h>
     29 #include <signal.h>
     30 #include <stdio.h>
     31 #include <stdlib.h>
     32 #include <string.h>
     33 #include <time.h>
     34 #include <unistd.h>
     35 #include <sys/param.h>
     36 #include <sys/stat.h>
     37 #include <sys/time.h>
     38 #include <sys/types.h>
     39 #include <dirent.h>
     40 #include <stdarg.h>
     41 
     42 #if !defined(_GNU_SOURCE)
     43 #	define _GNU_SOURCE
     44 #endif
     45 #include <getopt.h>
     46 
     47 #include "list.h"
     48 #include "btrecord.h"
     49 
     50 /*
     51  * ========================================================================
     52  * ==== STRUCTURE DEFINITIONS =============================================
     53  * ========================================================================
     54  */
     55 
     56 /**
     57  * Each device map has one of these:
     58  *
     59  * @head:	Linked on to map_devs
     60  * @from_dev:	Device name as seen on recorded system
     61  * @to_dev:	Device name to be used on replay system
     62  */
     63 struct map_dev {
     64 	struct list_head head;
     65 	char *from_dev, *to_dev;
     66 };
     67 
     68 /**
     69  * Each device name specified has one of these (until threads are created)
     70  *
     71  * @head: 	Linked onto input_devs
     72  * @devnm: 	Device name -- 'sd*'
     73  */
     74 struct dev_info {
     75 	struct list_head head;
     76 	char *devnm;
     77 };
     78 
     79 /*
     80  * Per input file information
     81  *
     82  * @head: 	Used to link up on input_files
     83  * @free_iocbs: List of free iocb's available for use
     84  * @used_iocbs: List of iocb's currently outstanding
     85  * @mutex: 	Mutex used with condition variable to protect volatile values
     86  * @cond: 	Condition variable used when waiting on a volatile value change
     87  * @naios_out: 	Current number of AIOs outstanding on this context
     88  * @naios_free: Number of AIOs on the free list (short cut for list_len)
     89  * @send_wait: 	Boolean: When true, the sub thread is waiting on free IOCBs
     90  * @reap_wait: 	Boolean: When true, the rec thread is waiting on used IOCBs
     91  * @send_done: 	Boolean: When true, the sub thread has completed work
     92  * @reap_done: 	Boolean: When true, the rec thread has completed work
     93  * @sub_thread: Thread used to submit IOs.
     94  * @rec_thread: Thread used to reclaim IOs.
     95  * @ctx: 	IO context
     96  * @devnm: 	Copy of the device name being managed by this thread
     97  * @file_name: 	Full name of the input file
     98  * @cpu: 	CPU this thread is pinned to
     99  * @ifd: 	Input file descriptor
    100  * @ofd: 	Output file descriptor
    101  * @iterations: Remaining iterations to process
    102  * @vfp:	For verbose dumping of actions performed
    103  */
    104 struct thr_info {
    105 	struct list_head head, free_iocbs, used_iocbs;
    106 	pthread_mutex_t mutex;
    107 	pthread_cond_t cond;
    108 	volatile long naios_out, naios_free;
    109 	volatile int send_wait, reap_wait, send_done, reap_done;
    110 	pthread_t sub_thread, rec_thread;
    111 	io_context_t ctx;
    112 	char *devnm, *file_name;
    113 	int cpu, ifd, ofd, iterations;
    114 	FILE *vfp;
    115 };
    116 
    117 /*
    118  * Every Asynchronous IO used has one of these (naios per file/device).
    119  *
    120  * @iocb:	IOCB sent down via io_submit
    121  * @head:	Linked onto file_list.free_iocbs or file_list.used_iocbs
    122  * @tip:	Pointer to per-thread information this IO is associated with
    123  * @nbytes:	Number of bytes in buffer associated with iocb
    124  */
    125 struct iocb_pkt {
    126 	struct iocb iocb;
    127 	struct list_head head;
    128 	struct thr_info *tip;
    129 	int nbytes;
    130 };
    131 
    132 /*
    133  * ========================================================================
    134  * ==== GLOBAL VARIABLES ==================================================
    135  * ========================================================================
    136  */
    137 
    138 static volatile int signal_done = 0;	// Boolean: Signal'ed, need to quit
    139 
    140 static char *ibase = "replay";		// Input base name
    141 static char *idir = ".";		// Input directory base
    142 static int cpus_to_use = -1;		// Number of CPUs to use
    143 static int def_iterations = 1;		// Default number of iterations
    144 static int naios = 512;			// Number of AIOs per thread
    145 static int ncpus = 0;			// Number of CPUs in the system
    146 static int verbose = 0;			// Boolean: Output some extra info
    147 static int write_enabled = 0;		// Boolean: Enable writing
    148 static __u64 genesis = ~0;		// Earliest time seen
    149 static __u64 rgenesis;			// Our start time
    150 static size_t pgsize;			// System Page size
    151 static int nb_sec = 512;		// Number of bytes per sector
    152 static LIST_HEAD(input_devs);		// List of devices to handle
    153 static LIST_HEAD(input_files);		// List of input files to handle
    154 static LIST_HEAD(map_devs);		// List of device maps
    155 static int nfiles = 0;			// Number of files to handle
    156 static int no_stalls = 0;		// Boolean: Disable pre-stalls
    157 static unsigned acc_factor = 1;		// Int: Acceleration factor
    158 static int find_records = 0;		// Boolean: Find record files auto
    159 
    160 /*
    161  * Variables managed under control of condition variables.
    162  *
    163  * n_reclaims_done: 	Counts number of reclaim threads that have completed.
    164  * n_replays_done:	Counts number of replay threads that have completed.
    165  * n_replays_ready:	Counts number of replay threads ready to start.
    166  * n_iters_done:	Counts number of replay threads done one iteration.
    167  * iter_start:		Starts an iteration for the replay threads.
    168  */
    169 static volatile int n_reclaims_done = 0;
    170 static pthread_mutex_t reclaim_done_mutex = PTHREAD_MUTEX_INITIALIZER;
    171 static pthread_cond_t reclaim_done_cond = PTHREAD_COND_INITIALIZER;
    172 
    173 static volatile int n_replays_done = 0;
    174 static pthread_mutex_t replay_done_mutex = PTHREAD_MUTEX_INITIALIZER;
    175 static pthread_cond_t replay_done_cond = PTHREAD_COND_INITIALIZER;
    176 
    177 static volatile int n_replays_ready = 0;
    178 static pthread_mutex_t replay_ready_mutex = PTHREAD_MUTEX_INITIALIZER;
    179 static pthread_cond_t replay_ready_cond = PTHREAD_COND_INITIALIZER;
    180 
    181 static volatile int n_iters_done = 0;
    182 static pthread_mutex_t iter_done_mutex = PTHREAD_MUTEX_INITIALIZER;
    183 static pthread_cond_t iter_done_cond = PTHREAD_COND_INITIALIZER;
    184 
    185 static volatile int iter_start = 0;
    186 static pthread_mutex_t iter_start_mutex = PTHREAD_MUTEX_INITIALIZER;
    187 static pthread_cond_t iter_start_cond = PTHREAD_COND_INITIALIZER;
    188 
    189 /*
    190  * ========================================================================
 * ==== FORWARD REFERENCES ================================================
    192  * ========================================================================
    193  */
    194 
    195 static void *replay_sub(void *arg);
    196 static void *replay_rec(void *arg);
    197 static char usage_str[];
    198 
    199 /*
    200  * ========================================================================
    201  * ==== INLINE ROUTINES ===================================================
    202  * ========================================================================
    203  */
    204 
    205 /*
    206  * The 'fatal' macro will output a perror message (if errstring is !NULL)
    207  * and display a string (with variable arguments) and then exit with the
    208  * specified exit value.
    209  */
    210 #define ERR_ARGS			1
    211 #define ERR_SYSCALL			2
    212 static inline void fatal(const char *errstring, const int exitval,
    213 			 const char *fmt, ...)
    214 {
    215 	va_list ap;
    216 
    217 	if (errstring)
    218 		perror(errstring);
    219 
    220 	va_start(ap, fmt);
    221 	vfprintf(stderr, fmt, ap);
    222 	va_end(ap);
    223 
    224 	exit(exitval);
    225 	/*NOTREACHED*/
    226 }
    227 
    228 static inline long long unsigned du64_to_sec(__u64 du64)
    229 {
    230 	return (long long unsigned)du64 / (1000 * 1000 * 1000);
    231 }
    232 
    233 static inline long long unsigned du64_to_nsec(__u64 du64)
    234 {
    235 	return llabs((long long)du64) % (1000 * 1000 * 1000);
    236 }
    237 
    238 /**
    239  * min - Return minimum of two integers
    240  */
    241 static inline int min(int a, int b)
    242 {
    243 	return a < b ? a : b;
    244 }
    245 
    246 /**
    247  * minl - Return minimum of two longs
    248  */
    249 static inline long minl(long a, long b)
    250 {
    251 	return a < b ? a : b;
    252 }
    253 
    254 /**
    255  * usage - Display usage string and version
    256  */
    257 static inline void usage(void)
    258 {
    259 	fprintf(stderr, "Usage: btreplay -- version %s\n%s",
    260 		my_btversion, usage_str);
    261 }
    262 
    263 /**
    264  * is_send_done - Returns true if sender should quit early
    265  * @tip: Per-thread information
    266  */
    267 static inline int is_send_done(struct thr_info *tip)
    268 {
    269 	return signal_done || tip->send_done;
    270 }
    271 
    272 /**
    273  * is_reap_done - Returns true if reaper should quit early
    274  * @tip: Per-thread information
    275  */
    276 static inline int is_reap_done(struct thr_info *tip)
    277 {
    278 	return tip->send_done && tip->naios_out == 0;
    279 }
    280 
    281 /**
    282  * ts2ns - Convert timespec values to a nanosecond value
    283  */
    284 #define NS_TICKS		((__u64)1000 * (__u64)1000 * (__u64)1000)
    285 static inline __u64 ts2ns(struct timespec *ts)
    286 {
    287 	return ((__u64)(ts->tv_sec) * NS_TICKS) + (__u64)(ts->tv_nsec);
    288 }
    289 
    290 /**
    291  * ts2ns - Convert timeval values to a nanosecond value
    292  */
    293 static inline __u64 tv2ns(struct timeval *tp)
    294 {
    295 	return ((__u64)(tp->tv_sec)) + ((__u64)(tp->tv_usec) * (__u64)1000);
    296 }
    297 
    298 /**
    299  * touch_memory - Force physical memory to be allocating it
    300  *
    301  * For malloc()ed memory we need to /touch/ it to make it really
    302  * exist. Otherwise, for write's (to storage) things may not work
    303  * as planned - we see Linux just use a single area to /read/ from
    304  * (as there isn't any memory that has been associated with the
    305  * allocated virtual addresses yet).
    306  */
    307 static inline void touch_memory(char *buf, size_t bsize)
    308 {
    309 #if defined(PREP_BUFS)
    310 	memset(buf, 0, bsize);
    311 #else
    312 	size_t i;
    313 
    314 	for (i = 0; i < bsize; i += pgsize)
    315 		buf[i] = 0;
    316 #endif
    317 }
    318 
    319 /**
    320  * buf_alloc - Returns a page-aligned buffer of the specified size
    321  * @nbytes: Number of bytes to allocate
    322  */
    323 static inline void *buf_alloc(size_t nbytes)
    324 {
    325 	void *buf;
    326 
    327 	if (posix_memalign(&buf, pgsize, nbytes)) {
    328 		fatal("posix_memalign", ERR_SYSCALL, "Allocation failed\n");
    329 		/*NOTREACHED*/
    330 	}
    331 
    332 	return buf;
    333 }
    334 
    335 /**
    336  * gettime - Returns current time
    337  */
    338 static inline __u64 gettime(void)
    339 {
    340 	static int use_clock_gettime = -1;		// Which clock to use
    341 
    342 	if (use_clock_gettime < 0) {
    343 		use_clock_gettime = clock_getres(CLOCK_MONOTONIC, NULL) == 0;
    344 		if (use_clock_gettime) {
    345 			struct timespec ts = {
    346 				.tv_sec = 0,
    347 				.tv_nsec = 0
    348 			};
    349 			clock_settime(CLOCK_MONOTONIC, &ts);
    350 		}
    351 	}
    352 
    353 	if (use_clock_gettime) {
    354 		struct timespec ts;
    355 		clock_gettime(CLOCK_MONOTONIC, &ts);
    356 		return ts2ns(&ts);
    357 	}
    358 	else {
    359 		struct timeval tp;
    360 		gettimeofday(&tp, NULL);
    361 		return tv2ns(&tp);
    362 	}
    363 }
    364 
    365 /**
    366  * setup_signal - Set up a signal handler for the specified signum
    367  */
    368 static inline void setup_signal(int signum, sighandler_t handler)
    369 {
    370 	if (signal(signum, handler) == SIG_ERR) {
    371 		fatal("signal", ERR_SYSCALL, "Failed to set signal %d\n",
    372 			signum);
    373 		/*NOTREACHED*/
    374 	}
    375 }
    376 
    377 /*
    378  * ========================================================================
    379  * ==== CONDITION VARIABLE ROUTINES =======================================
    380  * ========================================================================
    381  */
    382 
    383 /**
    384  * __set_cv - Increments a variable under condition variable control.
    385  * @pmp: 	Pointer to the associated mutex
    386  * @pcp: 	Pointer to the associated condition variable
    387  * @vp: 	Pointer to the variable being incremented
    388  * @mxv: 	Max value for variable (Used only when ASSERTS are on)
    389  */
    390 static inline void __set_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp,
    391 			    volatile int *vp,
    392 			    __attribute__((__unused__))int mxv)
    393 {
    394 	pthread_mutex_lock(pmp);
    395 	assert(*vp < mxv);
    396 	*vp += 1;
    397 	pthread_cond_signal(pcp);
    398 	pthread_mutex_unlock(pmp);
    399 }
    400 
    401 /**
    402  * __wait_cv - Waits for a variable under cond var control to hit a value
    403  * @pmp: 	Pointer to the associated mutex
    404  * @pcp: 	Pointer to the associated condition variable
    405  * @vp: 	Pointer to the variable being incremented
    406  * @mxv: 	Value to wait for
    407  */
    408 static inline void __wait_cv(pthread_mutex_t *pmp, pthread_cond_t *pcp,
    409 			     volatile int *vp, int mxv)
    410 {
    411 	pthread_mutex_lock(pmp);
    412 	while (*vp < mxv)
    413 		pthread_cond_wait(pcp, pmp);
    414 	*vp = 0;
    415 	pthread_mutex_unlock(pmp);
    416 }
    417 
    418 static inline void set_reclaim_done(void)
    419 {
    420 	__set_cv(&reclaim_done_mutex, &reclaim_done_cond, &n_reclaims_done,
    421 		 nfiles);
    422 }
    423 
    424 static inline void wait_reclaims_done(void)
    425 {
    426 	__wait_cv(&reclaim_done_mutex, &reclaim_done_cond, &n_reclaims_done,
    427 		  nfiles);
    428 }
    429 
    430 static inline void set_replay_ready(void)
    431 {
    432 	__set_cv(&replay_ready_mutex, &replay_ready_cond, &n_replays_ready,
    433 		 nfiles);
    434 }
    435 
    436 static inline void wait_replays_ready(void)
    437 {
    438 	__wait_cv(&replay_ready_mutex, &replay_ready_cond, &n_replays_ready,
    439 		  nfiles);
    440 }
    441 
    442 static inline void set_replay_done(void)
    443 {
    444 	__set_cv(&replay_done_mutex, &replay_done_cond, &n_replays_done,
    445 		nfiles);
    446 }
    447 
    448 static inline void wait_replays_done(void)
    449 {
    450 	__wait_cv(&replay_done_mutex, &replay_done_cond, &n_replays_done,
    451 		  nfiles);
    452 }
    453 
    454 static inline void set_iter_done(void)
    455 {
    456 	__set_cv(&iter_done_mutex, &iter_done_cond, &n_iters_done,
    457 		nfiles);
    458 }
    459 
    460 static inline void wait_iters_done(void)
    461 {
    462 	__wait_cv(&iter_done_mutex, &iter_done_cond, &n_iters_done,
    463 		  nfiles);
    464 }
    465 
    466 /**
    467  * wait_iter_start - Wait for an iteration to start
    468  *
    469  * This is /slightly/ different: we are waiting for a value to become
    470  * non-zero, and then we decrement it and go on.
    471  */
    472 static inline void wait_iter_start(void)
    473 {
    474 	pthread_mutex_lock(&iter_start_mutex);
    475 	while (iter_start == 0)
    476 		pthread_cond_wait(&iter_start_cond, &iter_start_mutex);
    477 	assert(1 <= iter_start && iter_start <= nfiles);
    478 	iter_start--;
    479 	pthread_mutex_unlock(&iter_start_mutex);
    480 }
    481 
    482 /**
    483  * start_iter - Start an iteration at the replay thread level
    484  */
    485 static inline void start_iter(void)
    486 {
    487 	pthread_mutex_lock(&iter_start_mutex);
    488 	assert(iter_start == 0);
    489 	iter_start = nfiles;
    490 	pthread_cond_broadcast(&iter_start_cond);
    491 	pthread_mutex_unlock(&iter_start_mutex);
    492 }
    493 
    494 /*
    495  * ========================================================================
    496  * ==== CPU RELATED ROUTINES ==============================================
    497  * ========================================================================
    498  */
    499 
    500 /**
    501  * get_ncpus - Sets up the global 'ncpus' value
    502  */
    503 static void get_ncpus(void)
    504 {
    505 	cpu_set_t cpus;
    506 
    507 	if (sched_getaffinity(getpid(), sizeof(cpus), &cpus)) {
    508 		fatal("sched_getaffinity", ERR_SYSCALL, "Can't get CPU info\n");
    509 		/*NOTREACHED*/
    510 	}
    511 
    512 	/*
    513 	 * XXX This assumes (perhaps wrongly) that there are no /holes/
    514 	 * XXX in the mask.
    515 	 */
    516 	for (ncpus = 0; ncpus < CPU_SETSIZE && CPU_ISSET(ncpus, &cpus); ncpus++)
    517 		;
    518 	if (ncpus == 0) {
    519 		fatal(NULL, ERR_SYSCALL, "Insufficient number of CPUs\n");
    520 		/*NOTREACHED*/
    521 	}
    522 }
    523 
    524 /**
    525  * pin_to_cpu - Pin this thread to a specific CPU
    526  * @tip: Thread information
    527  */
    528 static void pin_to_cpu(struct thr_info *tip)
    529 {
    530 	cpu_set_t cpus;
    531 
    532 	assert(0 <= tip->cpu && tip->cpu < ncpus);
    533 
    534 	CPU_ZERO(&cpus);
    535 	CPU_SET(tip->cpu, &cpus);
    536 	if (sched_setaffinity(getpid(), sizeof(cpus), &cpus)) {
    537 		fatal("sched_setaffinity", ERR_SYSCALL, "Failed to pin CPU\n");
    538 		/*NOTREACHED*/
    539 	}
    540 
    541 	if (verbose > 1) {
    542 		int i;
    543 		cpu_set_t now;
    544 
    545 		(void)sched_getaffinity(getpid(), sizeof(now), &now);
    546 		fprintf(tip->vfp, "Pinned to CPU %02d ", tip->cpu);
    547 		for (i = 0; i < ncpus; i++)
    548 			fprintf(tip->vfp, "%1d", CPU_ISSET(i, &now));
    549 		fprintf(tip->vfp, "\n");
    550 	}
    551 }
    552 
    553 /*
    554  * ========================================================================
    555  * ==== INPUT DEVICE HANDLERS =============================================
    556  * ========================================================================
    557  */
    558 
    559 /**
    560  * add_input_dev - Add a device ('sd*') to the list of devices to handle
    561  */
    562 static void add_input_dev(char *devnm)
    563 {
    564 	struct list_head *p;
    565 	struct dev_info *dip;
    566 
    567 	__list_for_each(p, &input_devs) {
    568 		dip = list_entry(p, struct dev_info, head);
    569 		if (strcmp(dip->devnm, devnm) == 0)
    570 			return;
    571 	}
    572 
    573 	dip = malloc(sizeof(*dip));
    574 	dip->devnm = strdup(devnm);
    575 	list_add_tail(&dip->head, &input_devs);
    576 }
    577 
    578 /**
    579  * rem_input_dev - Remove resources associated with this device
    580  */
    581 static void rem_input_dev(struct dev_info *dip)
    582 {
    583 	list_del(&dip->head);
    584 	free(dip->devnm);
    585 	free(dip);
    586 }
    587 
    588 static void find_input_devs(char *idir)
    589 {
    590 	struct dirent *ent;
    591 	DIR *dir = opendir(idir);
    592 
    593 	if (dir == NULL) {
    594 		fatal(idir, ERR_ARGS, "Unable to open %s\n", idir);
    595 		/*NOTREACHED*/
    596 	}
    597 
    598 	while ((ent = readdir(dir)) != NULL) {
    599 		char *p, *dsf = malloc(256);
    600 
    601 		if (strstr(ent->d_name, ".replay.") == NULL)
    602 			continue;
    603 
    604 		dsf = strdup(ent->d_name);
    605 		p = index(dsf, '.');
    606 		assert(p != NULL);
    607 		*p = '\0';
    608 		add_input_dev(dsf);
    609 		free(dsf);
    610 	}
    611 
    612 	closedir(dir);
    613 }
    614 
    615 /*
    616  * ========================================================================
    617  * ==== MAP DEVICE INTERFACES =============================================
    618  * ========================================================================
    619  */
    620 
    621 /**
    622  * read_map_devs - Read in a set of device mapping from the provided file.
    623  * @file_name:	File containing device maps
    624  *
    625  * We support the notion of multiple such files being specifed on the cmd line
    626  */
    627 static void read_map_devs(char *file_name)
    628 {
    629 	FILE *fp;
    630 	char *from_dev, *to_dev;
    631 
    632 	fp = fopen(file_name, "r");
    633 	if (!fp) {
    634 		fatal(file_name, ERR_SYSCALL, "Could not open map devs file\n");
    635 		/*NOTREACHED*/
    636 	}
    637 
    638 	while (fscanf(fp, "%as %as", &from_dev, &to_dev) == 2) {
    639 		struct map_dev *mdp = malloc(sizeof(*mdp));
    640 
    641 		mdp->from_dev = from_dev;
    642 		mdp->to_dev = to_dev;
    643 		list_add_tail(&mdp->head, &map_devs);
    644 	}
    645 
    646 	fclose(fp);
    647 }
    648 
    649 /**
    650  * release_map_devs - Release resources associated with device mappings.
    651  */
    652 static void release_map_devs(void)
    653 {
    654 	struct list_head *p, *q;
    655 
    656 	list_for_each_safe(p, q, &map_devs) {
    657 		struct map_dev *mdp = list_entry(p, struct map_dev, head);
    658 
    659 		list_del(&mdp->head);
    660 
    661 		free(mdp->from_dev);
    662 		free(mdp->to_dev);
    663 		free(mdp);
    664 	}
    665 }
    666 
    667 /**
    668  * map_dev - Return the mapped device for that specified
    669  * @from_dev:	Device name as seen on recorded system
    670  *
    671  * Note: If there is no such mapping, we return the same name.
    672  */
    673 static char *map_dev(char *from_dev)
    674 {
    675 	struct list_head *p;
    676 
    677 	__list_for_each(p, &map_devs) {
    678 		struct map_dev *mdp = list_entry(p, struct map_dev, head);
    679 
    680 		if (strcmp(from_dev, mdp->from_dev) == 0)
    681 			return mdp->to_dev;
    682 	}
    683 
    684 	return from_dev;
    685 }
    686 
    687 /*
    688  * ========================================================================
    689  * ==== IOCB MANAGEMENT ROUTINES ==========================================
    690  * ========================================================================
    691  */
    692 
    693 /**
    694  * iocb_init - Initialize the fields of an IOCB
    695  * @tip: Per-thread information
    696  * iocbp: IOCB pointer to update
    697  */
    698 static void iocb_init(struct thr_info *tip, struct iocb_pkt *iocbp)
    699 {
    700 	iocbp->tip = tip;
    701 	iocbp->nbytes = 0;
    702 	iocbp->iocb.u.c.buf = NULL;
    703 }
    704 
    705 /**
    706  * iocb_setup - Set up an iocb with this AIOs information
    707  * @iocbp: IOCB pointer to update
    708  * @rw: Direction (0 == write, 1 == read)
    709  * @n: Number of bytes to transfer
    710  * @off: Offset (in bytes)
    711  */
    712 static void iocb_setup(struct iocb_pkt *iocbp, int rw, int n, long long off)
    713 {
    714 	char *buf;
    715 	struct iocb *iop = &iocbp->iocb;
    716 
    717 	assert(rw == 0 || rw == 1);
    718 	assert(0 < n && (n % nb_sec) == 0);
    719 	assert(0 <= off);
    720 
    721 	if (iocbp->nbytes) {
    722 		if (iocbp->nbytes >= n) {
    723 			buf = iop->u.c.buf;
    724 			goto prep;
    725 		}
    726 
    727 		assert(iop->u.c.buf);
    728 		free(iop->u.c.buf);
    729 	}
    730 
    731 	buf = buf_alloc(n);
    732 	iocbp->nbytes = n;
    733 
    734 prep:
    735 	if (rw)
    736 		io_prep_pread(iop, iocbp->tip->ofd, buf, n, off);
    737 	else {
    738 		assert(write_enabled);
    739 		io_prep_pwrite(iop, iocbp->tip->ofd, buf, n, off);
    740 		touch_memory(buf, n);
    741 	}
    742 
    743 	iop->data = iocbp;
    744 }
    745 
    746 /*
    747  * ========================================================================
    748  * ==== PER-THREAD SET UP & TEAR DOWN =====================================
    749  * ========================================================================
    750  */
    751 
    752 /**
    753  * tip_init - Per thread initialization function
    754  */
    755 static void tip_init(struct thr_info *tip)
    756 {
    757 	int i;
    758 
    759 	INIT_LIST_HEAD(&tip->free_iocbs);
    760 	INIT_LIST_HEAD(&tip->used_iocbs);
    761 
    762 	pthread_mutex_init(&tip->mutex, NULL);
    763 	pthread_cond_init(&tip->cond, NULL);
    764 
    765 	if (io_setup(naios, &tip->ctx)) {
    766 		fatal("io_setup", ERR_SYSCALL, "io_setup failed\n");
    767 		/*NOTREACHED*/
    768 	}
    769 
    770 	tip->ofd = -1;
    771 	tip->naios_out = 0;
    772 	tip->send_done = tip->reap_done = 0;
    773 	tip->send_wait = tip->reap_wait = 0;
    774 
    775 	memset(&tip->sub_thread, 0, sizeof(tip->sub_thread));
    776 	memset(&tip->rec_thread, 0, sizeof(tip->rec_thread));
    777 
    778 	for (i = 0; i < naios; i++) {
    779 		struct iocb_pkt *iocbp = buf_alloc(sizeof(*iocbp));
    780 
    781 		iocb_init(tip, iocbp);
    782 		list_add_tail(&iocbp->head, &tip->free_iocbs);
    783 	}
    784 	tip->naios_free = naios;
    785 
    786 	if (verbose > 1) {
    787 		char fn[MAXPATHLEN];
    788 
    789 		sprintf(fn, "%s/%s.%s.%d.rep", idir, tip->devnm, ibase,
    790 			tip->cpu);
    791 		tip->vfp = fopen(fn, "w");
    792 		if (!tip->vfp) {
    793 			fatal(fn, ERR_SYSCALL, "Failed to open report\n");
    794 			/*NOTREACHED*/
    795 		}
    796 
    797 		setlinebuf(tip->vfp);
    798 	}
    799 
    800 	if (pthread_create(&tip->sub_thread, NULL, replay_sub, tip)) {
    801 		fatal("pthread_create", ERR_SYSCALL,
    802 			"thread create failed\n");
    803 		/*NOTREACHED*/
    804 	}
    805 
    806 	if (pthread_create(&tip->rec_thread, NULL, replay_rec, tip)) {
    807 		fatal("pthread_create", ERR_SYSCALL,
    808 			"thread create failed\n");
    809 		/*NOTREACHED*/
    810 	}
    811 }
    812 
    813 /**
    814  * tip_release - Release resources associated with this thread
    815  */
    816 static void tip_release(struct thr_info *tip)
    817 {
    818 	struct list_head *p, *q;
    819 
    820 	assert(tip->send_done);
    821 	assert(tip->reap_done);
    822 	assert(list_len(&tip->used_iocbs) == 0);
    823 	assert(tip->naios_free == naios);
    824 
    825 	if (pthread_join(tip->sub_thread, NULL)) {
    826 		fatal("pthread_join", ERR_SYSCALL, "pthread sub join failed\n");
    827 		/*NOTREACHED*/
    828 	}
    829 	if (pthread_join(tip->rec_thread, NULL)) {
    830 		fatal("pthread_join", ERR_SYSCALL, "pthread rec join failed\n");
    831 		/*NOTREACHED*/
    832 	}
    833 
    834 	io_destroy(tip->ctx);
    835 
    836 	list_splice(&tip->used_iocbs, &tip->free_iocbs);
    837 	list_for_each_safe(p, q, &tip->free_iocbs) {
    838 		struct iocb_pkt *iocbp = list_entry(p, struct iocb_pkt, head);
    839 
    840 		list_del(&iocbp->head);
    841 		if (iocbp->nbytes)
    842 			free(iocbp->iocb.u.c.buf);
    843 		free(iocbp);
    844 	}
    845 
    846 	pthread_cond_destroy(&tip->cond);
    847 	pthread_mutex_destroy(&tip->mutex);
    848 }
    849 
    850 /**
    851  * add_input_file - Allocate and initialize per-input file structure
    852  * @cpu: CPU for this file
    853  * @devnm: Device name for this file
    854  * @file_name: Fully qualifed input file name
    855  */
    856 static void add_input_file(int cpu, char *devnm, char *file_name)
    857 {
    858 	struct stat buf;
    859 	struct io_file_hdr hdr;
    860 	struct thr_info *tip = buf_alloc(sizeof(*tip));
    861 	__u64 my_version = mk_btversion(btver_mjr, btver_mnr, btver_sub);
    862 
    863 	assert(0 <= cpu && cpu < ncpus);
    864 
    865 	memset(&hdr, 0, sizeof(hdr));
    866 	memset(tip, 0, sizeof(*tip));
    867 	tip->cpu = cpu % cpus_to_use;
    868 	tip->iterations = def_iterations;
    869 
    870 	tip->ifd = open(file_name, O_RDONLY);
    871 	if (tip->ifd < 0) {
    872 		fatal(file_name, ERR_ARGS, "Unable to open\n");
    873 		/*NOTREACHED*/
    874 	}
    875 	if (fstat(tip->ifd, &buf) < 0) {
    876 		fatal(file_name, ERR_SYSCALL, "fstat failed\n");
    877 		/*NOTREACHED*/
    878 	}
    879 	if (buf.st_size < (off_t)sizeof(hdr)) {
    880 		if (verbose)
    881 			fprintf(stderr, "\t%s empty\n", file_name);
    882 		goto empty_file;
    883 	}
    884 
    885 	if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
    886 		fatal(file_name, ERR_ARGS, "Header read failed\n");
    887 		/*NOTREACHED*/
    888 	}
    889 
    890 	if (hdr.version != my_version) {
    891 		fprintf(stderr, "%llx %llx %llx %llx\n",
    892 			(long long unsigned)hdr.version,
    893 			(long long unsigned)hdr.genesis,
    894 			(long long unsigned)hdr.nbunches,
    895 			(long long unsigned)hdr.total_pkts);
    896 		fatal(NULL, ERR_ARGS,
    897 			"BT version mismatch: %lx versus my %lx\n",
    898 			(long)hdr.version, (long)my_version);
    899 
    900 	}
    901 
    902 	if (hdr.nbunches == 0) {
    903 empty_file:
    904 		close(tip->ifd);
    905 		free(tip);
    906 		return;
    907 	}
    908 
    909 	if (hdr.genesis < genesis) {
    910 		if (verbose > 1)
    911 			fprintf(stderr, "Setting genesis to %llu.%llu\n",
    912 				du64_to_sec(hdr.genesis),
    913 				du64_to_nsec(hdr.genesis));
    914 		genesis = hdr.genesis;
    915 	}
    916 
    917 	tip->devnm = strdup(devnm);
    918 	tip->file_name = strdup(file_name);
    919 
    920 	list_add_tail(&tip->head, &input_files);
    921 
    922 	if (verbose)
    923 		fprintf(stderr, "Added %s %llu\n", file_name,
    924 			(long long)hdr.genesis);
    925 }
    926 
    927 /**
    928  * rem_input_file - Release resources associated with an input file
    929  * @tip: Per-input file information
    930  */
    931 static void rem_input_file(struct thr_info *tip)
    932 {
    933 	list_del(&tip->head);
    934 
    935 	tip_release(tip);
    936 
    937 	close(tip->ofd);
    938 	close(tip->ifd);
    939 	free(tip->file_name);
    940 	free(tip->devnm);
    941 	free(tip);
    942 }
    943 
    944 /**
    945  * rem_input_files - Remove all input files
    946  */
    947 static void rem_input_files(void)
    948 {
    949 	struct list_head *p, *q;
    950 
    951 	list_for_each_safe(p, q, &input_files) {
    952 		rem_input_file(list_entry(p, struct thr_info, head));
    953 	}
    954 }
    955 
    956 /**
    957  * __find_input_files - Find input files associated with this device (per cpu)
    958  */
    959 static void __find_input_files(struct dev_info *dip)
    960 {
    961 	int cpu = 0;
    962 
    963 	for (;;) {
    964 		char full_name[MAXPATHLEN];
    965 
    966 		sprintf(full_name, "%s/%s.%s.%d", idir, dip->devnm, ibase, cpu);
    967 		if (access(full_name, R_OK) != 0)
    968 			break;
    969 
    970 		add_input_file(cpu, dip->devnm, full_name);
    971 		cpu++;
    972 	}
    973 
    974 	if (!cpu) {
    975 		fatal(NULL, ERR_ARGS, "No traces found for %s\n", dip->devnm);
    976 		/*NOTREACHED*/
    977 	}
    978 
    979 	rem_input_dev(dip);
    980 }
    981 
    982 
    983 /**
    984  * find_input_files - Find input files for all devices
    985  */
    986 static void find_input_files(void)
    987 {
    988 	struct list_head *p, *q;
    989 
    990 	list_for_each_safe(p, q, &input_devs) {
    991 		__find_input_files(list_entry(p, struct dev_info, head));
    992 	}
    993 }
    994 
    995 /*
    996  * ========================================================================
    997  * ==== RECLAIM ROUTINES ==================================================
    998  * ========================================================================
    999  */
   1000 
   1001 /**
   1002  * reap_wait_aios - Wait for and return number of outstanding AIOs
   1003  *
   1004  * Will return 0 if we are done
   1005  */
   1006 static int reap_wait_aios(struct thr_info *tip)
   1007 {
   1008 	int naios = 0;
   1009 
   1010 	if (!is_reap_done(tip)) {
   1011 		pthread_mutex_lock(&tip->mutex);
   1012 		while (tip->naios_out == 0) {
   1013 			tip->reap_wait = 1;
   1014 			if (pthread_cond_wait(&tip->cond, &tip->mutex)) {
   1015 				fatal("pthread_cond_wait", ERR_SYSCALL,
   1016 					"nfree_current cond wait failed\n");
   1017 				/*NOTREACHED*/
   1018 			}
   1019 		}
   1020 		naios = tip->naios_out;
   1021 		pthread_mutex_unlock(&tip->mutex);
   1022 	}
   1023 	assert(is_reap_done(tip) || naios > 0);
   1024 
   1025 	return is_reap_done(tip) ? 0 : naios;
   1026 }
   1027 
   1028 /**
   1029  * reclaim_ios - Reclaim AIOs completed, recycle IOCBs
   1030  * @tip: Per-thread information
   1031  * @naios_out: Number of AIOs we have outstanding (min)
   1032  */
   1033 static void reclaim_ios(struct thr_info *tip, long naios_out)
   1034 {
   1035 	long i, ndone;
   1036 	struct io_event *evp, events[naios_out];
   1037 
   1038 again:
   1039 	assert(naios > 0);
   1040 	for (;;) {
   1041 		ndone = io_getevents(tip->ctx, 1, naios_out, events, NULL);
   1042 		if (ndone > 0)
   1043 			break;
   1044 
   1045 		if (errno && errno != EINTR) {
   1046 			fatal("io_getevents", ERR_SYSCALL,
   1047 				"io_getevents failed\n");
   1048 			/*NOTREACHED*/
   1049 		}
   1050 	}
   1051 	assert(0 < ndone && ndone <= naios_out);
   1052 
   1053 	pthread_mutex_lock(&tip->mutex);
   1054 	for (i = 0, evp = events; i < ndone; i++, evp++) {
   1055 		struct iocb_pkt *iocbp = evp->data;
   1056 
   1057                 if (evp->res != iocbp->iocb.u.c.nbytes) {
   1058                         fatal(NULL, ERR_SYSCALL,
   1059                               "Event failure %ld/%ld\t(%ld + %ld)\n",
   1060                               (long)evp->res, (long)evp->res2,
   1061                               (long)iocbp->iocb.u.c.offset / nb_sec,
   1062 			      (long)iocbp->iocb.u.c.nbytes / nb_sec);
   1063                         /*NOTREACHED*/
   1064                 }
   1065 
   1066 		list_move_tail(&iocbp->head, &tip->free_iocbs);
   1067 	}
   1068 
   1069 	tip->naios_free += ndone;
   1070 	tip->naios_out -= ndone;
   1071 	naios_out = minl(naios_out, tip->naios_out);
   1072 
   1073 	if (tip->send_wait) {
   1074 		tip->send_wait = 0;
   1075 		pthread_cond_signal(&tip->cond);
   1076 	}
   1077 	pthread_mutex_unlock(&tip->mutex);
   1078 
   1079 	/*
   1080 	 * Short cut: If we /know/ there are some more AIOs, go handle them
   1081 	 */
   1082 	if (naios_out)
   1083 		goto again;
   1084 }
   1085 
   1086 /**
   1087  * replay_rec - Worker thread to reclaim AIOs
   1088  * @arg: Pointer to thread information
   1089  */
   1090 static void *replay_rec(void *arg)
   1091 {
   1092 	long naios_out;
   1093 	struct thr_info *tip = arg;
   1094 
   1095 	while ((naios_out = reap_wait_aios(tip)) > 0)
   1096 		reclaim_ios(tip, naios_out);
   1097 
   1098 	assert(tip->send_done);
   1099 	tip->reap_done = 1;
   1100 	set_reclaim_done();
   1101 
   1102 	return NULL;
   1103 }
   1104 
   1105 /*
   1106  * ========================================================================
   1107  * ==== REPLAY ROUTINES ===================================================
   1108  * ========================================================================
   1109  */
   1110 
   1111 /**
   1112  * next_bunch - Retrieve next bunch of AIOs to process
   1113  * @tip: Per-thread information
   1114  * @bunch: Bunch information
   1115  *
   1116  * Returns TRUE if we recovered a bunch of IOs, else hit EOF
   1117  */
   1118 static int next_bunch(struct thr_info *tip, struct io_bunch *bunch)
   1119 {
   1120 	size_t count, result;
   1121 
   1122 	result = read(tip->ifd, &bunch->hdr, sizeof(bunch->hdr));
   1123 	if (result != sizeof(bunch->hdr)) {
   1124 		if (result == 0)
   1125 			return 0;
   1126 
   1127 		fatal(tip->file_name, ERR_SYSCALL, "Short hdr(%ld)\n",
   1128 			(long)result);
   1129 		/*NOTREACHED*/
   1130 	}
   1131 	assert(bunch->hdr.npkts <= BT_MAX_PKTS);
   1132 
   1133 	count = bunch->hdr.npkts * sizeof(struct io_pkt);
   1134 	result = read(tip->ifd, &bunch->pkts, count);
   1135 	if (result != count) {
   1136 		fatal(tip->file_name, ERR_SYSCALL, "Short pkts(%ld/%ld)\n",
   1137 			(long)result, (long)count);
   1138 		/*NOTREACHED*/
   1139 	}
   1140 
   1141 	return 1;
   1142 }
   1143 
   1144 /**
   1145  * nfree_current - Returns current number of AIOs that are free
   1146  *
   1147  * Will wait for available ones...
   1148  *
   1149  * Returns 0 if we have some condition that causes us to exit
   1150  */
   1151 static int nfree_current(struct thr_info *tip)
   1152 {
   1153 	int nfree = 0;
   1154 
   1155 	pthread_mutex_lock(&tip->mutex);
   1156 	while (!is_send_done(tip) && ((nfree = tip->naios_free) == 0)) {
   1157 		tip->send_wait = 1;
   1158 		if (pthread_cond_wait(&tip->cond, &tip->mutex)) {
   1159 			fatal("pthread_cond_wait", ERR_SYSCALL,
   1160 				"nfree_current cond wait failed\n");
   1161 			/*NOTREACHED*/
   1162 		}
   1163 	}
   1164 	pthread_mutex_unlock(&tip->mutex);
   1165 
   1166 	return nfree;
   1167 }
   1168 
   1169 /**
   1170  * stall - Stall for the number of nanoseconds requested
   1171  *
   1172  * We may be late, in which case we just return.
   1173  */
   1174 static void stall(struct thr_info *tip, long long oclock)
   1175 {
   1176 	struct timespec req;
   1177 	long long dreal, tclock = gettime() - rgenesis;
   1178 
   1179 	oclock /= acc_factor;
   1180 
   1181 	if (verbose > 1)
   1182 		fprintf(tip->vfp, "   stall(%lld.%09lld, %lld.%09lld)\n",
   1183 			du64_to_sec(oclock), du64_to_nsec(oclock),
   1184 			du64_to_sec(tclock), du64_to_nsec(tclock));
   1185 
   1186 	while (!is_send_done(tip) && tclock < oclock) {
   1187 		dreal = oclock - tclock;
   1188 		req.tv_sec = dreal / (1000 * 1000 * 1000);
   1189 		req.tv_nsec = dreal % (1000 * 1000 * 1000);
   1190 
   1191 		if (verbose > 1) {
   1192 			fprintf(tip->vfp, "++ stall(%lld.%09lld) ++\n",
   1193 				(long long)req.tv_sec,
   1194 				(long long)req.tv_nsec);
   1195 		}
   1196 
   1197 		if (nanosleep(&req, NULL) < 0 && signal_done)
   1198 			break;
   1199 
   1200 		tclock = gettime() - rgenesis;
   1201 	}
   1202 }
   1203 
   1204 /**
   1205  * iocbs_map - Map a set of AIOs onto a set of IOCBs
   1206  * @tip: Per-thread information
   1207  * @list: List of AIOs created
   1208  * @pkts: AIOs to map
   1209  * @ntodo: Number of AIOs to map
   1210  */
   1211 static void iocbs_map(struct thr_info *tip, struct iocb **list,
   1212 					     struct io_pkt *pkts, int ntodo)
   1213 {
   1214 	int i;
   1215 	struct io_pkt *pkt;
   1216 
   1217 	assert(0 < ntodo && ntodo <= naios);
   1218 
   1219 	pthread_mutex_lock(&tip->mutex);
   1220 	assert(ntodo <= list_len(&tip->free_iocbs));
   1221 	for (i = 0, pkt = pkts; i < ntodo; i++, pkt++) {
   1222 		__u32 rw = pkt->rw;
   1223 		struct iocb_pkt *iocbp;
   1224 
   1225 		if (!pkt->rw && !write_enabled)
   1226 			rw = 1;
   1227 
   1228 		if (verbose > 1)
   1229 			fprintf(tip->vfp, "\t%10llu + %10llu %c%c\n",
   1230 				(unsigned long long)pkt->sector,
   1231 				(unsigned long long)pkt->nbytes / nb_sec,
   1232 				rw ? 'R' : 'W',
   1233 				(rw == 1 && pkt->rw == 0) ? '!' : ' ');
   1234 
   1235 		iocbp = list_entry(tip->free_iocbs.next, struct iocb_pkt, head);
   1236 		iocb_setup(iocbp, rw, pkt->nbytes, pkt->sector * nb_sec);
   1237 
   1238 		list_move_tail(&iocbp->head, &tip->used_iocbs);
   1239 		list[i] = &iocbp->iocb;
   1240 	}
   1241 
   1242 	tip->naios_free -= ntodo;
   1243 	assert(tip->naios_free >= 0);
   1244 	pthread_mutex_unlock(&tip->mutex);
   1245 }
   1246 
   1247 /**
   1248  * process_bunch - Process a bunch of requests
   1249  * @tip: Per-thread information
   1250  * @bunch: Bunch to process
   1251  */
   1252 static void process_bunch(struct thr_info *tip, struct io_bunch *bunch)
   1253 {
   1254 	__u64 i = 0;
   1255 	struct iocb *list[bunch->hdr.npkts];
   1256 
   1257 	assert(0 < bunch->hdr.npkts && bunch->hdr.npkts <= BT_MAX_PKTS);
   1258 	while (!is_send_done(tip) && (i < bunch->hdr.npkts)) {
   1259 		long ndone;
   1260 		int ntodo = min(nfree_current(tip), bunch->hdr.npkts - i);
   1261 
   1262 		assert(0 < ntodo && ntodo <= naios);
   1263 		iocbs_map(tip, list, &bunch->pkts[i], ntodo);
   1264 		if (!no_stalls)
   1265 			stall(tip, bunch->hdr.time_stamp - genesis);
   1266 
   1267 		if (ntodo) {
   1268 			if (verbose > 1)
   1269 				fprintf(tip->vfp, "submit(%d)\n", ntodo);
   1270 			ndone = io_submit(tip->ctx, ntodo, list);
   1271 			if (ndone != (long)ntodo) {
   1272 				fatal("io_submit", ERR_SYSCALL,
   1273 					"%d: io_submit(%d:%ld) failed (%s)\n",
   1274 					tip->cpu, ntodo, ndone,
   1275 					strerror(labs(ndone)));
   1276 				/*NOTREACHED*/
   1277 			}
   1278 
   1279 			pthread_mutex_lock(&tip->mutex);
   1280 			tip->naios_out += ndone;
   1281 			assert(tip->naios_out <= naios);
   1282 			if (tip->reap_wait) {
   1283 				tip->reap_wait = 0;
   1284 				pthread_cond_signal(&tip->cond);
   1285 			}
   1286 			pthread_mutex_unlock(&tip->mutex);
   1287 
   1288 			i += ndone;
   1289 			assert(i <= bunch->hdr.npkts);
   1290 		}
   1291 	}
   1292 }
   1293 
   1294 /**
   1295  * reset_input_file - Reset the input file for the next iteration
   1296  * @tip: Thread information
   1297  *
   1298  * We also do a dummy read of the file header to get us to the first bunch.
   1299  */
   1300 static void reset_input_file(struct thr_info *tip)
   1301 {
   1302 	struct io_file_hdr hdr;
   1303 
   1304 	lseek(tip->ifd, 0, 0);
   1305 
   1306 	if (read(tip->ifd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
   1307 		fatal(tip->file_name, ERR_ARGS, "Header reread failed\n");
   1308 		/*NOTREACHED*/
   1309 	}
   1310 }
   1311 
   1312 /**
   1313  * replay_sub - Worker thread to submit AIOs that are being replayed
   1314  */
   1315 static void *replay_sub(void *arg)
   1316 {
   1317 	char path[MAXPATHLEN];
   1318 	struct io_bunch bunch;
   1319 	struct thr_info *tip = arg;
   1320 	int oflags;
   1321 
   1322 	pin_to_cpu(tip);
   1323 
   1324 	sprintf(path, "/dev/%s", map_dev(tip->devnm));
   1325 
   1326 #ifdef O_NOATIME
   1327 	oflags = O_NOATIME;
   1328 #else
   1329 	oflags = 0;
   1330 #endif
   1331 	tip->ofd = open(path, O_RDWR | O_DIRECT | oflags);
   1332 	if (tip->ofd < 0) {
   1333 		fatal(path, ERR_SYSCALL, "Failed device open\n");
   1334 		/*NOTREACHED*/
   1335 	}
   1336 
   1337 	set_replay_ready();
   1338 	while (!is_send_done(tip) && tip->iterations--) {
   1339 		wait_iter_start();
   1340 		if (verbose > 1)
   1341 			fprintf(tip->vfp, "\n=== %d ===\n", tip->iterations);
   1342 		while (!is_send_done(tip) && next_bunch(tip, &bunch))
   1343 			process_bunch(tip, &bunch);
   1344 		set_iter_done();
   1345 		reset_input_file(tip);
   1346 	}
   1347 	tip->send_done = 1;
   1348 	set_replay_done();
   1349 
   1350 	return NULL;
   1351 }
   1352 
   1353 /*
   1354  * ========================================================================
   1355  * ==== COMMAND LINE ARGUMENT HANDLING ====================================
   1356  * ========================================================================
   1357  */
   1358 
   1359 static char usage_str[] = 						\
   1360         "\n"								\
   1361         "\t[ -c <cpus> : --cpus=<cpus>           ] Default: 1\n"        \
   1362         "\t[ -d <dir>  : --input-directory=<dir> ] Default: .\n"        \
   1363 	"\t[ -F        : --find-records          ] Default: Off\n"	\
   1364         "\t[ -h        : --help                  ] Default: Off\n"      \
   1365         "\t[ -i <base> : --input-base=<base>     ] Default: replay\n"   \
   1366         "\t[ -I <iters>: --iterations=<iters>    ] Default: 1\n"        \
   1367         "\t[ -M <file> : --map-devs=<file>       ] Default: None\n"     \
   1368         "\t[ -N        : --no-stalls             ] Default: Off\n"      \
   1369         "\t[ -x        : --acc-factor            ] Default: 1\n"	\
   1370         "\t[ -v        : --verbose               ] Default: Off\n"      \
   1371         "\t[ -V        : --version               ] Default: Off\n"      \
   1372         "\t[ -W        : --write-enable          ] Default: Off\n"      \
   1373         "\t<dev...>                                Default: None\n"     \
   1374         "\n";
   1375 
   1376 #define S_OPTS	"c:d:Fhi:I:M:Nx:t:vVW"
   1377 static struct option l_opts[] = {
   1378 	{
   1379 		.name = "cpus",
   1380 		.has_arg = required_argument,
   1381 		.flag = NULL,
   1382 		.val = 'c'
   1383 	},
   1384 	{
   1385 		.name = "input-directory",
   1386 		.has_arg = required_argument,
   1387 		.flag = NULL,
   1388 		.val = 'd'
   1389 	},
   1390 	{
   1391 		.name = "find-records",
   1392 		.has_arg = no_argument,
   1393 		.flag = NULL,
   1394 		.val = 'F'
   1395 	},
   1396 	{
   1397 		.name = "help",
   1398 		.has_arg = no_argument,
   1399 		.flag = NULL,
   1400 		.val = 'h'
   1401 	},
   1402 	{
   1403 		.name = "input-base",
   1404 		.has_arg = required_argument,
   1405 		.flag = NULL,
   1406 		.val = 'i'
   1407 	},
   1408 	{
   1409 		.name = "iterations",
   1410 		.has_arg = required_argument,
   1411 		.flag = NULL,
   1412 		.val = 'I'
   1413 	},
   1414 	{
   1415 		.name = "map-devs",
   1416 		.has_arg = required_argument,
   1417 		.flag = NULL,
   1418 		.val = 'M'
   1419 	},
   1420 	{
   1421 		.name = "no-stalls",
   1422 		.has_arg = no_argument,
   1423 		.flag = NULL,
   1424 		.val = 'N'
   1425 	},
   1426 	{
   1427 		.name = "acc-factor",
   1428 		.has_arg = required_argument,
   1429 		.flag = NULL,
   1430 		.val = 'x'
   1431 	},
   1432 	{
   1433 		.name = "verbose",
   1434 		.has_arg = no_argument,
   1435 		.flag = NULL,
   1436 		.val = 'v'
   1437 	},
   1438 	{
   1439 		.name = "version",
   1440 		.has_arg = no_argument,
   1441 		.flag = NULL,
   1442 		.val = 'V'
   1443 	},
   1444 	{
   1445 		.name = "write-enable",
   1446 		.has_arg = no_argument,
   1447 		.flag = NULL,
   1448 		.val = 'W'
   1449 	},
   1450 	{
   1451 		.name = NULL
   1452 	}
   1453 };
   1454 
   1455 /**
   1456  * handle_args: Parse passed in argument list
   1457  * @argc: Number of arguments in argv
   1458  * @argv: Arguments passed in
   1459  *
   1460  * Does rudimentary parameter verification as well.
   1461  */
   1462 static void handle_args(int argc, char *argv[])
   1463 {
   1464 	int c;
   1465 	int r;
   1466 
   1467 	while ((c = getopt_long(argc, argv, S_OPTS, l_opts, NULL)) != -1) {
   1468 		switch (c) {
   1469 		case 'c':
   1470 			cpus_to_use = atoi(optarg);
   1471 			if (cpus_to_use <= 0 || cpus_to_use > ncpus) {
   1472 				fatal(NULL, ERR_ARGS,
   1473 				      "Invalid number of cpus %d (0<x<%d)\n",
   1474 				      cpus_to_use, ncpus);
   1475 				/*NOTREACHED*/
   1476 			}
   1477 			break;
   1478 
   1479 		case 'd':
   1480 			idir = optarg;
   1481 			if (access(idir, R_OK | X_OK) != 0) {
   1482 				fatal(idir, ERR_ARGS,
   1483 				      "Invalid input directory specified\n");
   1484 				/*NOTREACHED*/
   1485 			}
   1486 			break;
   1487 
   1488 		case 'F':
   1489 			find_records = 1;
   1490 			break;
   1491 
   1492 		case 'h':
   1493 			usage();
   1494 			exit(0);
   1495 			/*NOTREACHED*/
   1496 
   1497 		case 'i':
   1498 			ibase = optarg;
   1499 			break;
   1500 
   1501 		case 'I':
   1502 			def_iterations = atoi(optarg);
   1503 			if (def_iterations <= 0) {
   1504 				fprintf(stderr,
   1505 					"Invalid number of iterations %d\n",
   1506 					def_iterations);
   1507 				exit(ERR_ARGS);
   1508 				/*NOTREACHED*/
   1509 			}
   1510 			break;
   1511 
   1512 		case 'M':
   1513 			read_map_devs(optarg);
   1514 			break;
   1515 
   1516 		case 'N':
   1517 			no_stalls = 1;
   1518 			break;
   1519 
   1520 		case 'x':
   1521 			r = sscanf(optarg,"%u",&acc_factor);
   1522 			if (r!=1) {
   1523 				fprintf(stderr,
   1524 					"Invalid acceleration factor\n");
   1525 				exit(ERR_ARGS);
   1526 				/*NOTREACHED*/
   1527 			}
   1528 			break;
   1529 
   1530 		case 'V':
   1531 			fprintf(stderr, "btreplay -- version %s\n",
   1532 				my_btversion);
   1533 			fprintf(stderr, "            Built on %s\n",
   1534 				build_date);
   1535 			exit(0);
   1536 			/*NOTREACHED*/
   1537 
   1538 		case 'v':
   1539 			verbose++;
   1540 			break;
   1541 
   1542 		case 'W':
   1543 			write_enabled = 1;
   1544 			break;
   1545 
   1546 		default:
   1547 			usage();
   1548 			fatal(NULL, ERR_ARGS,
   1549 			      "Invalid command line argument %c\n", c);
   1550 			/*NOTREACHED*/
   1551 		}
   1552 	}
   1553 
   1554 	while (optind < argc)
   1555 		add_input_dev(argv[optind++]);
   1556 
   1557 	if (find_records)
   1558 		find_input_devs(idir);
   1559 
   1560 	if (list_len(&input_devs) == 0) {
   1561 		fatal(NULL, ERR_ARGS, "Missing required input dev name(s)\n");
   1562 		/*NOTREACHED*/
   1563 	}
   1564 
   1565 	if (cpus_to_use < 0)
   1566 		cpus_to_use = ncpus;
   1567 }
   1568 
   1569 /*
   1570  * ========================================================================
   1571  * ==== MAIN ROUTINE ======================================================
   1572  * ========================================================================
   1573  */
   1574 
   1575 /**
   1576  * set_signal_done - Signal handler, catches signals & sets signal_done
   1577  */
   1578 static void set_signal_done(__attribute__((__unused__))int signum)
   1579 {
   1580 	signal_done = 1;
   1581 }
   1582 
   1583 /**
   1584  * main -
   1585  * @argc: Number of arguments
   1586  * @argv: Array of arguments
   1587  */
   1588 int main(int argc, char *argv[])
   1589 {
   1590 	int i;
   1591 	struct list_head *p;
   1592 
   1593 	pgsize = getpagesize();
   1594 	assert(pgsize > 0);
   1595 
   1596 	setup_signal(SIGINT, set_signal_done);
   1597 	setup_signal(SIGTERM, set_signal_done);
   1598 
   1599 	get_ncpus();
   1600 	handle_args(argc, argv);
   1601 	find_input_files();
   1602 
   1603 	nfiles = list_len(&input_files);
   1604 	__list_for_each(p, &input_files) {
   1605 		tip_init(list_entry(p, struct thr_info, head));
   1606 	}
   1607 
   1608 	wait_replays_ready();
   1609 	for (i = 0; i < def_iterations; i++) {
   1610 		rgenesis = gettime();
   1611 		start_iter();
   1612 		if (verbose)
   1613 			fprintf(stderr, "I");
   1614 		wait_iters_done();
   1615 	}
   1616 
   1617 	wait_replays_done();
   1618 	wait_reclaims_done();
   1619 
   1620 	if (verbose)
   1621 		fprintf(stderr, "\n");
   1622 
   1623 	rem_input_files();
   1624 	release_map_devs();
   1625 
   1626 	return 0;
   1627 }
   1628