      1 #include <unistd.h>
      2 #include <fcntl.h>
      3 #include <string.h>
      4 #include <signal.h>
      5 #include <time.h>
      6 #include <assert.h>
      7 
      8 #include "fio.h"
      9 #include "hash.h"
     10 #include "verify.h"
     11 #include "trim.h"
     12 #include "lib/rand.h"
     13 #include "lib/axmap.h"
     14 #include "err.h"
     15 
     16 struct io_completion_data {
     17 	int nr;				/* input */
     18 
     19 	int error;			/* output */
     20 	uint64_t bytes_done[DDIR_RWDIR_CNT];	/* output */
     21 	struct timeval time;		/* output */
     22 };
     23 
     24 /*
      25  * The ->io_axmap contains a map of the blocks we have and have not yet done
      26  * IO to. It is used to make sure we cover the entire range in a fair fashion.
     27  */
     28 static int random_map_free(struct fio_file *f, const uint64_t block)
     29 {
     30 	return !axmap_isset(f->io_axmap, block);
     31 }
     32 
     33 /*
     34  * Mark a given offset as used in the map.
     35  */
     36 static void mark_random_map(struct thread_data *td, struct io_u *io_u)
     37 {
     38 	unsigned int min_bs = td->o.rw_min_bs;
     39 	struct fio_file *f = io_u->file;
     40 	unsigned int nr_blocks;
     41 	uint64_t block;
     42 
     43 	block = (io_u->offset - f->file_offset) / (uint64_t) min_bs;
     44 	nr_blocks = (io_u->buflen + min_bs - 1) / min_bs;
     45 
     46 	if (!(io_u->flags & IO_U_F_BUSY_OK))
     47 		nr_blocks = axmap_set_nr(f->io_axmap, block, nr_blocks);
     48 
     49 	if ((nr_blocks * min_bs) < io_u->buflen)
     50 		io_u->buflen = nr_blocks * min_bs;
     51 }
     52 
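         /*
          * Return the number of blocks (in units of the block alignment) covered
          * by the usable size of the file, capped to the zone range if one is set.
          */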
     53 static uint64_t last_block(struct thread_data *td, struct fio_file *f,
     54 			   enum fio_ddir ddir)
     55 {
     56 	uint64_t max_blocks;
     57 	uint64_t max_size;
     58 
     59 	assert(ddir_rw(ddir));
     60 
     61 	/*
     62 	 * Hmm, should we make sure that ->io_size <= ->real_file_size?
     63 	 */
     64 	max_size = f->io_size;
     65 	if (max_size > f->real_file_size)
     66 		max_size = f->real_file_size;
     67 
     68 	if (td->o.zone_range)
     69 		max_size = td->o.zone_range;
     70 
     71 	if (td->o.min_bs[ddir] > td->o.ba[ddir])
     72 		max_size -= td->o.min_bs[ddir] - td->o.ba[ddir];
     73 
     74 	max_blocks = max_size / (uint64_t) td->o.ba[ddir];
     75 	if (!max_blocks)
     76 		return 0;
     77 
     78 	return max_blocks;
     79 }
     80 
     81 struct rand_off {
     82 	struct flist_head list;
     83 	uint64_t off;
     84 };
     85 
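         /*
          * Generate the next random block number, using either the Tausworthe PRNG
          * or an LFSR. If a random map is kept, skip to the next free block when
          * the chosen one has already been done.
          */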
     86 static int __get_next_rand_offset(struct thread_data *td, struct fio_file *f,
     87 				  enum fio_ddir ddir, uint64_t *b)
     88 {
     89 	uint64_t r;
     90 
     91 	if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE) {
     92 		uint64_t lastb;
     93 
     94 		lastb = last_block(td, f, ddir);
     95 		if (!lastb)
     96 			return 1;
     97 
     98 		r = __rand(&td->random_state);
     99 
    100 		dprint(FD_RANDOM, "off rand %llu\n", (unsigned long long) r);
    101 
    102 		*b = lastb * (r / ((uint64_t) FRAND_MAX + 1.0));
    103 	} else {
    104 		uint64_t off = 0;
    105 
    106 		assert(fio_file_lfsr(f));
    107 
    108 		if (lfsr_next(&f->lfsr, &off))
    109 			return 1;
    110 
    111 		*b = off;
    112 	}
    113 
    114 	/*
    115 	 * if we are not maintaining a random map, we are done.
    116 	 */
    117 	if (!file_randommap(td, f))
    118 		goto ret;
    119 
    120 	/*
    121 	 * calculate map offset and check if it's free
    122 	 */
    123 	if (random_map_free(f, *b))
    124 		goto ret;
    125 
    126 	dprint(FD_RANDOM, "get_next_rand_offset: offset %llu busy\n",
    127 						(unsigned long long) *b);
    128 
    129 	*b = axmap_next_free(f->io_axmap, *b);
    130 	if (*b == (uint64_t) -1ULL)
    131 		return 1;
    132 ret:
    133 	return 0;
    134 }
    135 
    136 static int __get_next_rand_offset_zipf(struct thread_data *td,
    137 				       struct fio_file *f, enum fio_ddir ddir,
    138 				       uint64_t *b)
    139 {
    140 	*b = zipf_next(&f->zipf);
    141 	return 0;
    142 }
    143 
    144 static int __get_next_rand_offset_pareto(struct thread_data *td,
    145 					 struct fio_file *f, enum fio_ddir ddir,
    146 					 uint64_t *b)
    147 {
    148 	*b = pareto_next(&f->zipf);
    149 	return 0;
    150 }
    151 
    152 static int flist_cmp(void *data, struct flist_head *a, struct flist_head *b)
    153 {
    154 	struct rand_off *r1 = flist_entry(a, struct rand_off, list);
    155 	struct rand_off *r2 = flist_entry(b, struct rand_off, list);
    156 
    157 	return r1->off - r2->off;
    158 }
    159 
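         /*
          * Dispatch to the offset generator matching the configured random
          * distribution (purely random, zipf or pareto).
          */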
    160 static int get_off_from_method(struct thread_data *td, struct fio_file *f,
    161 			       enum fio_ddir ddir, uint64_t *b)
    162 {
    163 	if (td->o.random_distribution == FIO_RAND_DIST_RANDOM)
    164 		return __get_next_rand_offset(td, f, ddir, b);
    165 	else if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
    166 		return __get_next_rand_offset_zipf(td, f, ddir, b);
    167 	else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
    168 		return __get_next_rand_offset_pareto(td, f, ddir, b);
    169 
    170 	log_err("fio: unknown random distribution: %d\n", td->o.random_distribution);
    171 	return 1;
    172 }
    173 
    174 /*
    175  * Sort the reads for a verify phase in batches of verifysort_nr, if
    176  * specified.
    177  */
    178 static inline int should_sort_io(struct thread_data *td)
    179 {
    180 	if (!td->o.verifysort_nr || !td->o.do_verify)
    181 		return 0;
    182 	if (!td_random(td))
    183 		return 0;
    184 	if (td->runstate != TD_VERIFYING)
    185 		return 0;
    186 	if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE)
    187 		return 0;
    188 
    189 	return 1;
    190 }
    191 
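         /*
          * For workloads mixing sequential and random IO, decide whether the next
          * IO in this direction should be random, based on the configured
          * percentage.
          */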
    192 static int should_do_random(struct thread_data *td, enum fio_ddir ddir)
    193 {
    194 	unsigned int v;
    195 	unsigned long r;
    196 
    197 	if (td->o.perc_rand[ddir] == 100)
    198 		return 1;
    199 
    200 	r = __rand(&td->seq_rand_state[ddir]);
    201 	v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0)));
    202 
    203 	return v <= td->o.perc_rand[ddir];
    204 }
    205 
    206 static int get_next_rand_offset(struct thread_data *td, struct fio_file *f,
    207 				enum fio_ddir ddir, uint64_t *b)
    208 {
    209 	struct rand_off *r;
    210 	int i, ret = 1;
    211 
    212 	if (!should_sort_io(td))
    213 		return get_off_from_method(td, f, ddir, b);
    214 
    215 	if (!flist_empty(&td->next_rand_list)) {
    216 fetch:
    217 		r = flist_first_entry(&td->next_rand_list, struct rand_off, list);
    218 		flist_del(&r->list);
    219 		*b = r->off;
    220 		free(r);
    221 		return 0;
    222 	}
    223 
    224 	for (i = 0; i < td->o.verifysort_nr; i++) {
    225 		r = malloc(sizeof(*r));
    226 
    227 		ret = get_off_from_method(td, f, ddir, &r->off);
    228 		if (ret) {
    229 			free(r);
    230 			break;
    231 		}
    232 
    233 		flist_add(&r->list, &td->next_rand_list);
    234 	}
    235 
    236 	if (ret && !i)
    237 		return ret;
    238 
    239 	assert(!flist_empty(&td->next_rand_list));
    240 	flist_sort(NULL, &td->next_rand_list, flist_cmp);
    241 	goto fetch;
    242 }
    243 
    244 static int get_next_rand_block(struct thread_data *td, struct fio_file *f,
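         /*
          * Get the next random block. If the generator is exhausted and the job
          * is time based, reset the file and try once more.
          */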
    245 			       enum fio_ddir ddir, uint64_t *b)
    246 {
    247 	if (!get_next_rand_offset(td, f, ddir, b))
    248 		return 0;
    249 
    250 	if (td->o.time_based) {
    251 		fio_file_reset(td, f);
    252 		if (!get_next_rand_offset(td, f, ddir, b))
    253 			return 0;
    254 	}
    255 
    256 	dprint(FD_IO, "%s: rand offset failed, last=%llu, size=%llu\n",
    257 			f->file_name, (unsigned long long) f->last_pos[ddir],
    258 			(unsigned long long) f->real_file_size);
    259 	return 1;
    260 }
    261 
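         /*
          * Compute the next sequential offset, wrapping around for time based
          * jobs and applying any ddir_seq_add increment for holed IO.
          */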
    262 static int get_next_seq_offset(struct thread_data *td, struct fio_file *f,
    263 			       enum fio_ddir ddir, uint64_t *offset)
    264 {
    265 	struct thread_options *o = &td->o;
    266 
    267 	assert(ddir_rw(ddir));
    268 
    269 	if (f->last_pos[ddir] >= f->io_size + get_start_offset(td, f) &&
    270 	    o->time_based)
    271 		f->last_pos[ddir] = f->last_pos[ddir] - f->io_size;
    272 
    273 	if (f->last_pos[ddir] < f->real_file_size) {
    274 		uint64_t pos;
    275 
    276 		if (f->last_pos[ddir] == f->file_offset && o->ddir_seq_add < 0)
    277 			f->last_pos[ddir] = f->real_file_size;
    278 
    279 		pos = f->last_pos[ddir] - f->file_offset;
    280 		if (pos && o->ddir_seq_add) {
    281 			pos += o->ddir_seq_add;
    282 
    283 			/*
    284 			 * If we reach beyond the end of the file
    285 			 * with holed IO, wrap around to the
    286 			 * beginning again.
    287 			 */
    288 			if (pos >= f->real_file_size)
    289 				pos = f->file_offset;
    290 		}
    291 
    292 		*offset = pos;
    293 		return 0;
    294 	}
    295 
    296 	return 1;
    297 }
    298 
    299 static int get_next_block(struct thread_data *td, struct io_u *io_u,
    300 			  enum fio_ddir ddir, int rw_seq,
    301 			  unsigned int *is_random)
    302 {
    303 	struct fio_file *f = io_u->file;
    304 	uint64_t b, offset;
    305 	int ret;
    306 
    307 	assert(ddir_rw(ddir));
    308 
    309 	b = offset = -1ULL;
    310 
    311 	if (rw_seq) {
    312 		if (td_random(td)) {
    313 			if (should_do_random(td, ddir)) {
    314 				ret = get_next_rand_block(td, f, ddir, &b);
    315 				*is_random = 1;
    316 			} else {
    317 				*is_random = 0;
    318 				io_u->flags |= IO_U_F_BUSY_OK;
    319 				ret = get_next_seq_offset(td, f, ddir, &offset);
    320 				if (ret)
    321 					ret = get_next_rand_block(td, f, ddir, &b);
    322 			}
    323 		} else {
    324 			*is_random = 0;
    325 			ret = get_next_seq_offset(td, f, ddir, &offset);
    326 		}
    327 	} else {
    328 		io_u->flags |= IO_U_F_BUSY_OK;
    329 		*is_random = 0;
    330 
    331 		if (td->o.rw_seq == RW_SEQ_SEQ) {
    332 			ret = get_next_seq_offset(td, f, ddir, &offset);
    333 			if (ret) {
    334 				ret = get_next_rand_block(td, f, ddir, &b);
    335 				*is_random = 0;
    336 			}
    337 		} else if (td->o.rw_seq == RW_SEQ_IDENT) {
    338 			if (f->last_start[ddir] != -1ULL)
    339 				offset = f->last_start[ddir] - f->file_offset;
    340 			else
    341 				offset = 0;
    342 			ret = 0;
    343 		} else {
    344 			log_err("fio: unknown rw_seq=%d\n", td->o.rw_seq);
    345 			ret = 1;
    346 		}
    347 	}
    348 
    349 	if (!ret) {
    350 		if (offset != -1ULL)
    351 			io_u->offset = offset;
    352 		else if (b != -1ULL)
    353 			io_u->offset = b * td->o.ba[ddir];
    354 		else {
    355 			log_err("fio: bug in offset generation: offset=%llu, b=%llu\n", (unsigned long long) offset, (unsigned long long) b);
    356 			ret = 1;
    357 		}
    358 	}
    359 
    360 	return ret;
    361 }
    362 
    363 /*
    364  * For random io, generate a random new block and see if it's used. Repeat
    365  * until we find a free one. For sequential io, just return the end of
    366  * the last io issued.
    367  */
    368 static int __get_next_offset(struct thread_data *td, struct io_u *io_u,
    369 			     unsigned int *is_random)
    370 {
    371 	struct fio_file *f = io_u->file;
    372 	enum fio_ddir ddir = io_u->ddir;
    373 	int rw_seq_hit = 0;
    374 
    375 	assert(ddir_rw(ddir));
    376 
    377 	if (td->o.ddir_seq_nr && !--td->ddir_seq_nr) {
    378 		rw_seq_hit = 1;
    379 		td->ddir_seq_nr = td->o.ddir_seq_nr;
    380 	}
    381 
    382 	if (get_next_block(td, io_u, ddir, rw_seq_hit, is_random))
    383 		return 1;
    384 
    385 	if (io_u->offset >= f->io_size) {
    386 		dprint(FD_IO, "get_next_offset: offset %llu >= io_size %llu\n",
    387 					(unsigned long long) io_u->offset,
    388 					(unsigned long long) f->io_size);
    389 		return 1;
    390 	}
    391 
    392 	io_u->offset += f->file_offset;
    393 	if (io_u->offset >= f->real_file_size) {
    394 		dprint(FD_IO, "get_next_offset: offset %llu >= size %llu\n",
    395 					(unsigned long long) io_u->offset,
    396 					(unsigned long long) f->real_file_size);
    397 		return 1;
    398 	}
    399 
    400 	return 0;
    401 }
    402 
    403 static int get_next_offset(struct thread_data *td, struct io_u *io_u,
    404 			   unsigned int *is_random)
    405 {
    406 	if (td->flags & TD_F_PROFILE_OPS) {
    407 		struct prof_io_ops *ops = &td->prof_io_ops;
    408 
    409 		if (ops->fill_io_u_off)
    410 			return ops->fill_io_u_off(td, io_u, is_random);
    411 	}
    412 
    413 	return __get_next_offset(td, io_u, is_random);
    414 }
    415 
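         /*
          * Check whether an IO of 'buflen' bytes at the current offset still fits
          * within the file's IO region.
          */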
    416 static inline int io_u_fits(struct thread_data *td, struct io_u *io_u,
    417 			    unsigned int buflen)
    418 {
    419 	struct fio_file *f = io_u->file;
    420 
    421 	return io_u->offset + buflen <= f->io_size + get_start_offset(td, f);
    422 }
    423 
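         /*
          * Pick the next buffer length: a fixed size if min and max block sizes
          * match, otherwise a random size or one drawn from the bssplit table,
          * rounded to the verify interval and block size as required.
          */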
    424 static unsigned int __get_next_buflen(struct thread_data *td, struct io_u *io_u,
    425 				      unsigned int is_random)
    426 {
    427 	int ddir = io_u->ddir;
    428 	unsigned int buflen = 0;
    429 	unsigned int minbs, maxbs;
    430 	unsigned long r;
    431 
    432 	assert(ddir_rw(ddir));
    433 
    434 	if (td->o.bs_is_seq_rand)
     435 		ddir = is_random ? DDIR_WRITE : DDIR_READ;
    436 
    437 	minbs = td->o.min_bs[ddir];
    438 	maxbs = td->o.max_bs[ddir];
    439 
    440 	if (minbs == maxbs)
    441 		return minbs;
    442 
    443 	/*
    444 	 * If we can't satisfy the min block size from here, then fail
    445 	 */
    446 	if (!io_u_fits(td, io_u, minbs))
    447 		return 0;
    448 
    449 	do {
    450 		r = __rand(&td->bsrange_state);
    451 
    452 		if (!td->o.bssplit_nr[ddir]) {
    453 			buflen = 1 + (unsigned int) ((double) maxbs *
    454 					(r / (FRAND_MAX + 1.0)));
    455 			if (buflen < minbs)
    456 				buflen = minbs;
    457 		} else {
    458 			long perc = 0;
    459 			unsigned int i;
    460 
    461 			for (i = 0; i < td->o.bssplit_nr[ddir]; i++) {
    462 				struct bssplit *bsp = &td->o.bssplit[ddir][i];
    463 
    464 				buflen = bsp->bs;
    465 				perc += bsp->perc;
    466 				if ((r <= ((FRAND_MAX / 100L) * perc)) &&
    467 				    io_u_fits(td, io_u, buflen))
    468 					break;
    469 			}
    470 		}
    471 
    472 		if (td->o.do_verify && td->o.verify != VERIFY_NONE)
    473 			buflen = (buflen + td->o.verify_interval - 1) &
    474 				~(td->o.verify_interval - 1);
    475 
    476 		if (!td->o.bs_unaligned && is_power_of_2(minbs))
    477 			buflen = (buflen + minbs - 1) & ~(minbs - 1);
    478 
    479 	} while (!io_u_fits(td, io_u, buflen));
    480 
    481 	return buflen;
    482 }
    483 
    484 static unsigned int get_next_buflen(struct thread_data *td, struct io_u *io_u,
    485 				    unsigned int is_random)
    486 {
    487 	if (td->flags & TD_F_PROFILE_OPS) {
    488 		struct prof_io_ops *ops = &td->prof_io_ops;
    489 
    490 		if (ops->fill_io_u_size)
    491 			return ops->fill_io_u_size(td, io_u, is_random);
    492 	}
    493 
    494 	return __get_next_buflen(td, io_u, is_random);
    495 }
    496 
    497 static void set_rwmix_bytes(struct thread_data *td)
    498 {
    499 	unsigned int diff;
    500 
    501 	/*
     502 	 * We do a time- or byte-based switch. This is needed because
     503 	 * buffered writes may be issued a lot quicker than they complete,
     504 	 * whereas reads are not.
    505 	 */
    506 	diff = td->o.rwmix[td->rwmix_ddir ^ 1];
    507 	td->rwmix_issues = (td->io_issues[td->rwmix_ddir] * diff) / 100;
    508 }
    509 
    510 static inline enum fio_ddir get_rand_ddir(struct thread_data *td)
    511 {
    512 	unsigned int v;
    513 	unsigned long r;
    514 
    515 	r = __rand(&td->rwmix_state);
    516 	v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0)));
    517 
    518 	if (v <= td->o.rwmix[DDIR_READ])
    519 		return DDIR_READ;
    520 
    521 	return DDIR_WRITE;
    522 }
    523 
    524 void io_u_quiesce(struct thread_data *td)
    525 {
    526 	/*
     527 	 * We are going to sleep, so ensure that we flush anything pending
     528 	 * so as not to skew our latency numbers.
     529 	 *
     530 	 * Changed to only monitor 'in flight' requests here instead of
     531 	 * td->cur_depth, because td->cur_depth does not accurately represent
     532 	 * IOs that have actually been submitted to an async engine,
     533 	 * and cur_depth is meaningless for sync engines.
    534 	 */
    535 	if (td->io_u_queued || td->cur_depth) {
    536 		int fio_unused ret;
    537 
    538 		ret = td_io_commit(td);
    539 	}
    540 
    541 	while (td->io_u_in_flight) {
    542 		int fio_unused ret;
    543 
    544 		ret = io_u_queued_complete(td, 1, NULL);
    545 	}
    546 }
    547 
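         /*
          * If rate limiting has accumulated pending sleep time in this direction,
          * either switch to the other direction or quiesce and sleep off the debt
          * before returning the direction to use.
          */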
    548 static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
    549 {
    550 	enum fio_ddir odir = ddir ^ 1;
    551 	long usec;
    552 
    553 	assert(ddir_rw(ddir));
    554 
    555 	if (td->rate_pending_usleep[ddir] <= 0)
    556 		return ddir;
    557 
    558 	/*
    559 	 * We have too much pending sleep in this direction. See if we
    560 	 * should switch.
    561 	 */
    562 	if (td_rw(td) && td->o.rwmix[odir]) {
    563 		/*
    564 		 * Other direction does not have too much pending, switch
    565 		 */
    566 		if (td->rate_pending_usleep[odir] < 100000)
    567 			return odir;
    568 
    569 		/*
    570 		 * Both directions have pending sleep. Sleep the minimum time
    571 		 * and deduct from both.
    572 		 */
    573 		if (td->rate_pending_usleep[ddir] <=
    574 			td->rate_pending_usleep[odir]) {
    575 			usec = td->rate_pending_usleep[ddir];
    576 		} else {
    577 			usec = td->rate_pending_usleep[odir];
    578 			ddir = odir;
    579 		}
    580 	} else
    581 		usec = td->rate_pending_usleep[ddir];
    582 
    583 	io_u_quiesce(td);
    584 
    585 	usec = usec_sleep(td, usec);
    586 
    587 	td->rate_pending_usleep[ddir] -= usec;
    588 
    589 	odir = ddir ^ 1;
    590 	if (td_rw(td) && __should_check_rate(td, odir))
    591 		td->rate_pending_usleep[odir] -= usec;
    592 
    593 	if (ddir == DDIR_TRIM)
    594 		return DDIR_TRIM;
    595 
    596 	return ddir;
    597 }
    598 
    599 /*
    600  * Return the data direction for the next io_u. If the job is a
    601  * mixed read/write workload, check the rwmix cycle and switch if
    602  * necessary.
    603  */
    604 static enum fio_ddir get_rw_ddir(struct thread_data *td)
    605 {
    606 	enum fio_ddir ddir;
    607 
    608 	/*
    609 	 * see if it's time to fsync
    610 	 */
    611 	if (td->o.fsync_blocks &&
    612 	   !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks) &&
    613 	     td->io_issues[DDIR_WRITE] && should_fsync(td))
    614 		return DDIR_SYNC;
    615 
    616 	/*
    617 	 * see if it's time to fdatasync
    618 	 */
    619 	if (td->o.fdatasync_blocks &&
    620 	   !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks) &&
    621 	     td->io_issues[DDIR_WRITE] && should_fsync(td))
    622 		return DDIR_DATASYNC;
    623 
    624 	/*
    625 	 * see if it's time to sync_file_range
    626 	 */
    627 	if (td->sync_file_range_nr &&
    628 	   !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr) &&
    629 	     td->io_issues[DDIR_WRITE] && should_fsync(td))
    630 		return DDIR_SYNC_FILE_RANGE;
    631 
    632 	if (td_rw(td)) {
    633 		/*
    634 		 * Check if it's time to seed a new data direction.
    635 		 */
    636 		if (td->io_issues[td->rwmix_ddir] >= td->rwmix_issues) {
    637 			/*
    638 			 * Put a top limit on how many bytes we do for
    639 			 * one data direction, to avoid overflowing the
    640 			 * ranges too much
    641 			 */
    642 			ddir = get_rand_ddir(td);
    643 
    644 			if (ddir != td->rwmix_ddir)
    645 				set_rwmix_bytes(td);
    646 
    647 			td->rwmix_ddir = ddir;
    648 		}
    649 		ddir = td->rwmix_ddir;
    650 	} else if (td_read(td))
    651 		ddir = DDIR_READ;
    652 	else if (td_write(td))
    653 		ddir = DDIR_WRITE;
    654 	else
    655 		ddir = DDIR_TRIM;
    656 
    657 	td->rwmix_ddir = rate_ddir(td, ddir);
    658 	return td->rwmix_ddir;
    659 }
    660 
    661 static void set_rw_ddir(struct thread_data *td, struct io_u *io_u)
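         /*
          * Set the data direction for this io_u, flagging it as a barrier write
          * when the configured barrier interval is hit.
          */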
    662 {
    663 	io_u->ddir = io_u->acct_ddir = get_rw_ddir(td);
    664 
    665 	if (io_u->ddir == DDIR_WRITE && (td->io_ops->flags & FIO_BARRIER) &&
    666 	    td->o.barrier_blocks &&
    667 	   !(td->io_issues[DDIR_WRITE] % td->o.barrier_blocks) &&
    668 	     td->io_issues[DDIR_WRITE])
    669 		io_u->flags |= IO_U_F_BARRIER;
    670 }
    671 
    672 void put_file_log(struct thread_data *td, struct fio_file *f)
    673 {
    674 	unsigned int ret = put_file(td, f);
    675 
    676 	if (ret)
    677 		td_verror(td, ret, "file close");
    678 }
    679 
    680 void put_io_u(struct thread_data *td, struct io_u *io_u)
    681 {
    682 	td_io_u_lock(td);
    683 
    684 	if (io_u->file && !(io_u->flags & IO_U_F_NO_FILE_PUT))
    685 		put_file_log(td, io_u->file);
    686 
    687 	io_u->file = NULL;
    688 	io_u->flags |= IO_U_F_FREE;
    689 
    690 	if (io_u->flags & IO_U_F_IN_CUR_DEPTH)
    691 		td->cur_depth--;
    692 	io_u_qpush(&td->io_u_freelist, io_u);
    693 	td_io_u_unlock(td);
    694 	td_io_u_free_notify(td);
    695 }
    696 
    697 void clear_io_u(struct thread_data *td, struct io_u *io_u)
    698 {
    699 	io_u->flags &= ~IO_U_F_FLIGHT;
    700 	put_io_u(td, io_u);
    701 }
    702 
    703 void requeue_io_u(struct thread_data *td, struct io_u **io_u)
    704 {
    705 	struct io_u *__io_u = *io_u;
    706 	enum fio_ddir ddir = acct_ddir(__io_u);
    707 
    708 	dprint(FD_IO, "requeue %p\n", __io_u);
    709 
    710 	td_io_u_lock(td);
    711 
    712 	__io_u->flags |= IO_U_F_FREE;
    713 	if ((__io_u->flags & IO_U_F_FLIGHT) && ddir_rw(ddir))
    714 		td->io_issues[ddir]--;
    715 
    716 	__io_u->flags &= ~IO_U_F_FLIGHT;
    717 	if (__io_u->flags & IO_U_F_IN_CUR_DEPTH)
    718 		td->cur_depth--;
    719 
    720 	io_u_rpush(&td->io_u_requeues, __io_u);
    721 	td_io_u_unlock(td);
    722 	*io_u = NULL;
    723 }
    724 
    725 static int fill_io_u(struct thread_data *td, struct io_u *io_u)
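         /*
          * Fill in the data direction, offset and buffer length for an io_u, and
          * mark the chosen blocks in the random map. Returns non-zero on failure.
          */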
    726 {
    727 	unsigned int is_random;
    728 
    729 	if (td->io_ops->flags & FIO_NOIO)
    730 		goto out;
    731 
    732 	set_rw_ddir(td, io_u);
    733 
    734 	/*
    735 	 * fsync() or fdatasync() or trim etc, we are done
    736 	 */
    737 	if (!ddir_rw(io_u->ddir))
    738 		goto out;
    739 
    740 	/*
    741 	 * See if it's time to switch to a new zone
    742 	 */
    743 	if (td->zone_bytes >= td->o.zone_size && td->o.zone_skip) {
    744 		struct fio_file *f = io_u->file;
    745 
    746 		td->zone_bytes = 0;
    747 		f->file_offset += td->o.zone_range + td->o.zone_skip;
    748 
    749 		/*
     750 		 * Wrap around to the beginning if we exceed the file size
    751 		 */
    752 		if (f->file_offset >= f->real_file_size)
    753 			f->file_offset = f->real_file_size - f->file_offset;
    754 		f->last_pos[io_u->ddir] = f->file_offset;
    755 		td->io_skip_bytes += td->o.zone_skip;
    756 	}
    757 
    758 	/*
    759 	 * No log, let the seq/rand engine retrieve the next buflen and
    760 	 * position.
    761 	 */
    762 	if (get_next_offset(td, io_u, &is_random)) {
    763 		dprint(FD_IO, "io_u %p, failed getting offset\n", io_u);
    764 		return 1;
    765 	}
    766 
    767 	io_u->buflen = get_next_buflen(td, io_u, is_random);
    768 	if (!io_u->buflen) {
    769 		dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u);
    770 		return 1;
    771 	}
    772 
    773 	if (io_u->offset + io_u->buflen > io_u->file->real_file_size) {
    774 		dprint(FD_IO, "io_u %p, offset too large\n", io_u);
    775 		dprint(FD_IO, "  off=%llu/%lu > %llu\n",
    776 			(unsigned long long) io_u->offset, io_u->buflen,
    777 			(unsigned long long) io_u->file->real_file_size);
    778 		return 1;
    779 	}
    780 
    781 	/*
    782 	 * mark entry before potentially trimming io_u
    783 	 */
    784 	if (td_random(td) && file_randommap(td, io_u->file))
    785 		mark_random_map(td, io_u);
    786 
    787 out:
    788 	dprint_io_u(io_u, "fill_io_u");
    789 	td->zone_bytes += io_u->buflen;
    790 	return 0;
    791 }
    792 
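         /*
          * Bump the histogram bucket covering 'nr', used for the per-call submit
          * and complete counts.
          */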
    793 static void __io_u_mark_map(unsigned int *map, unsigned int nr)
    794 {
    795 	int idx = 0;
    796 
    797 	switch (nr) {
    798 	default:
    799 		idx = 6;
    800 		break;
    801 	case 33 ... 64:
    802 		idx = 5;
    803 		break;
    804 	case 17 ... 32:
    805 		idx = 4;
    806 		break;
    807 	case 9 ... 16:
    808 		idx = 3;
    809 		break;
    810 	case 5 ... 8:
    811 		idx = 2;
    812 		break;
    813 	case 1 ... 4:
    814 		idx = 1;
    815 	case 0:
    816 		break;
    817 	}
    818 
    819 	map[idx]++;
    820 }
    821 
    822 void io_u_mark_submit(struct thread_data *td, unsigned int nr)
    823 {
    824 	__io_u_mark_map(td->ts.io_u_submit, nr);
    825 	td->ts.total_submit++;
    826 }
    827 
    828 void io_u_mark_complete(struct thread_data *td, unsigned int nr)
    829 {
    830 	__io_u_mark_map(td->ts.io_u_complete, nr);
    831 	td->ts.total_complete++;
    832 }
    833 
    834 void io_u_mark_depth(struct thread_data *td, unsigned int nr)
    835 {
    836 	int idx = 0;
    837 
    838 	switch (td->cur_depth) {
    839 	default:
    840 		idx = 6;
    841 		break;
    842 	case 32 ... 63:
    843 		idx = 5;
    844 		break;
    845 	case 16 ... 31:
    846 		idx = 4;
    847 		break;
    848 	case 8 ... 15:
    849 		idx = 3;
    850 		break;
    851 	case 4 ... 7:
    852 		idx = 2;
    853 		break;
    854 	case 2 ... 3:
    855 		idx = 1;
    856 	case 1:
    857 		break;
    858 	}
    859 
    860 	td->ts.io_u_map[idx] += nr;
    861 }
    862 
    863 static void io_u_mark_lat_usec(struct thread_data *td, unsigned long usec)
    864 {
    865 	int idx = 0;
    866 
    867 	assert(usec < 1000);
    868 
    869 	switch (usec) {
    870 	case 750 ... 999:
    871 		idx = 9;
    872 		break;
    873 	case 500 ... 749:
    874 		idx = 8;
    875 		break;
    876 	case 250 ... 499:
    877 		idx = 7;
    878 		break;
    879 	case 100 ... 249:
    880 		idx = 6;
    881 		break;
    882 	case 50 ... 99:
    883 		idx = 5;
    884 		break;
    885 	case 20 ... 49:
    886 		idx = 4;
    887 		break;
    888 	case 10 ... 19:
    889 		idx = 3;
    890 		break;
    891 	case 4 ... 9:
    892 		idx = 2;
    893 		break;
    894 	case 2 ... 3:
    895 		idx = 1;
    896 	case 0 ... 1:
    897 		break;
    898 	}
    899 
    900 	assert(idx < FIO_IO_U_LAT_U_NR);
    901 	td->ts.io_u_lat_u[idx]++;
    902 }
    903 
    904 static void io_u_mark_lat_msec(struct thread_data *td, unsigned long msec)
    905 {
    906 	int idx = 0;
    907 
    908 	switch (msec) {
    909 	default:
    910 		idx = 11;
    911 		break;
    912 	case 1000 ... 1999:
    913 		idx = 10;
    914 		break;
    915 	case 750 ... 999:
    916 		idx = 9;
    917 		break;
    918 	case 500 ... 749:
    919 		idx = 8;
    920 		break;
    921 	case 250 ... 499:
    922 		idx = 7;
    923 		break;
    924 	case 100 ... 249:
    925 		idx = 6;
    926 		break;
    927 	case 50 ... 99:
    928 		idx = 5;
    929 		break;
    930 	case 20 ... 49:
    931 		idx = 4;
    932 		break;
    933 	case 10 ... 19:
    934 		idx = 3;
    935 		break;
    936 	case 4 ... 9:
    937 		idx = 2;
    938 		break;
    939 	case 2 ... 3:
    940 		idx = 1;
    941 	case 0 ... 1:
    942 		break;
    943 	}
    944 
    945 	assert(idx < FIO_IO_U_LAT_M_NR);
    946 	td->ts.io_u_lat_m[idx]++;
    947 }
    948 
    949 static void io_u_mark_latency(struct thread_data *td, unsigned long usec)
    950 {
    951 	if (usec < 1000)
    952 		io_u_mark_lat_usec(td, usec);
    953 	else
    954 		io_u_mark_lat_msec(td, usec / 1000);
    955 }
    956 
    957 /*
    958  * Get next file to service by choosing one at random
    959  */
    960 static struct fio_file *get_next_file_rand(struct thread_data *td,
    961 					   enum fio_file_flags goodf,
    962 					   enum fio_file_flags badf)
    963 {
    964 	struct fio_file *f;
    965 	int fno;
    966 
    967 	do {
    968 		int opened = 0;
    969 		unsigned long r;
    970 
    971 		r = __rand(&td->next_file_state);
    972 		fno = (unsigned int) ((double) td->o.nr_files
    973 				* (r / (FRAND_MAX + 1.0)));
    974 
    975 		f = td->files[fno];
    976 		if (fio_file_done(f))
    977 			continue;
    978 
    979 		if (!fio_file_open(f)) {
    980 			int err;
    981 
    982 			if (td->nr_open_files >= td->o.open_files)
    983 				return ERR_PTR(-EBUSY);
    984 
    985 			err = td_io_open_file(td, f);
    986 			if (err)
    987 				continue;
    988 			opened = 1;
    989 		}
    990 
    991 		if ((!goodf || (f->flags & goodf)) && !(f->flags & badf)) {
    992 			dprint(FD_FILE, "get_next_file_rand: %p\n", f);
    993 			return f;
    994 		}
    995 		if (opened)
    996 			td_io_close_file(td, f);
    997 	} while (1);
    998 }
    999 
   1000 /*
   1001  * Get next file to service by doing round robin between all available ones
   1002  */
   1003 static struct fio_file *get_next_file_rr(struct thread_data *td, int goodf,
   1004 					 int badf)
   1005 {
   1006 	unsigned int old_next_file = td->next_file;
   1007 	struct fio_file *f;
   1008 
   1009 	do {
   1010 		int opened = 0;
   1011 
   1012 		f = td->files[td->next_file];
   1013 
   1014 		td->next_file++;
   1015 		if (td->next_file >= td->o.nr_files)
   1016 			td->next_file = 0;
   1017 
   1018 		dprint(FD_FILE, "trying file %s %x\n", f->file_name, f->flags);
   1019 		if (fio_file_done(f)) {
   1020 			f = NULL;
   1021 			continue;
   1022 		}
   1023 
   1024 		if (!fio_file_open(f)) {
   1025 			int err;
   1026 
   1027 			if (td->nr_open_files >= td->o.open_files)
   1028 				return ERR_PTR(-EBUSY);
   1029 
   1030 			err = td_io_open_file(td, f);
   1031 			if (err) {
   1032 				dprint(FD_FILE, "error %d on open of %s\n",
   1033 					err, f->file_name);
   1034 				f = NULL;
   1035 				continue;
   1036 			}
   1037 			opened = 1;
   1038 		}
   1039 
   1040 		dprint(FD_FILE, "goodf=%x, badf=%x, ff=%x\n", goodf, badf,
   1041 								f->flags);
   1042 		if ((!goodf || (f->flags & goodf)) && !(f->flags & badf))
   1043 			break;
   1044 
   1045 		if (opened)
   1046 			td_io_close_file(td, f);
   1047 
   1048 		f = NULL;
   1049 	} while (td->next_file != old_next_file);
   1050 
   1051 	dprint(FD_FILE, "get_next_file_rr: %p\n", f);
   1052 	return f;
   1053 }
   1054 
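         /*
          * Select the next file to do IO against: stick with the current service
          * file if its share is not used up, otherwise pick a new one round-robin
          * or at random.
          */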
   1055 static struct fio_file *__get_next_file(struct thread_data *td)
   1056 {
   1057 	struct fio_file *f;
   1058 
   1059 	assert(td->o.nr_files <= td->files_index);
   1060 
   1061 	if (td->nr_done_files >= td->o.nr_files) {
   1062 		dprint(FD_FILE, "get_next_file: nr_open=%d, nr_done=%d,"
   1063 				" nr_files=%d\n", td->nr_open_files,
   1064 						  td->nr_done_files,
   1065 						  td->o.nr_files);
   1066 		return NULL;
   1067 	}
   1068 
   1069 	f = td->file_service_file;
   1070 	if (f && fio_file_open(f) && !fio_file_closing(f)) {
   1071 		if (td->o.file_service_type == FIO_FSERVICE_SEQ)
   1072 			goto out;
   1073 		if (td->file_service_left--)
   1074 			goto out;
   1075 	}
   1076 
   1077 	if (td->o.file_service_type == FIO_FSERVICE_RR ||
   1078 	    td->o.file_service_type == FIO_FSERVICE_SEQ)
   1079 		f = get_next_file_rr(td, FIO_FILE_open, FIO_FILE_closing);
   1080 	else
   1081 		f = get_next_file_rand(td, FIO_FILE_open, FIO_FILE_closing);
   1082 
   1083 	if (IS_ERR(f))
   1084 		return f;
   1085 
   1086 	td->file_service_file = f;
   1087 	td->file_service_left = td->file_service_nr - 1;
   1088 out:
   1089 	if (f)
   1090 		dprint(FD_FILE, "get_next_file: %p [%s]\n", f, f->file_name);
   1091 	else
   1092 		dprint(FD_FILE, "get_next_file: NULL\n");
   1093 	return f;
   1094 }
   1095 
   1096 static struct fio_file *get_next_file(struct thread_data *td)
   1097 {
   1098 	if (td->flags & TD_F_PROFILE_OPS) {
   1099 		struct prof_io_ops *ops = &td->prof_io_ops;
   1100 
   1101 		if (ops->get_next_file)
   1102 			return ops->get_next_file(td);
   1103 	}
   1104 
   1105 	return __get_next_file(td);
   1106 }
   1107 
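         /*
          * Attach a file to the io_u and fill it in; files that fail to yield an
          * IO are marked done and the next one is tried.
          */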
   1108 static long set_io_u_file(struct thread_data *td, struct io_u *io_u)
   1109 {
   1110 	struct fio_file *f;
   1111 
   1112 	do {
   1113 		f = get_next_file(td);
   1114 		if (IS_ERR_OR_NULL(f))
   1115 			return PTR_ERR(f);
   1116 
   1117 		io_u->file = f;
   1118 		get_file(f);
   1119 
   1120 		if (!fill_io_u(td, io_u))
   1121 			break;
   1122 
   1123 		put_file_log(td, f);
   1124 		td_io_close_file(td, f);
   1125 		io_u->file = NULL;
   1126 		fio_file_set_done(f);
   1127 		td->nr_done_files++;
   1128 		dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name,
   1129 					td->nr_done_files, td->o.nr_files);
   1130 	} while (1);
   1131 
   1132 	return 0;
   1133 }
   1134 
   1135 static void lat_fatal(struct thread_data *td, struct io_completion_data *icd,
   1136 		      unsigned long tusec, unsigned long max_usec)
   1137 {
   1138 	if (!td->error)
   1139 		log_err("fio: latency of %lu usec exceeds specified max (%lu usec)\n", tusec, max_usec);
   1140 	td_verror(td, ETIMEDOUT, "max latency exceeded");
   1141 	icd->error = ETIMEDOUT;
   1142 }
   1143 
   1144 static void lat_new_cycle(struct thread_data *td)
   1145 {
   1146 	fio_gettime(&td->latency_ts, NULL);
   1147 	td->latency_ios = ddir_rw_sum(td->io_blocks);
   1148 	td->latency_failed = 0;
   1149 }
   1150 
   1151 /*
   1152  * We had an IO outside the latency target. Reduce the queue depth. If we
   1153  * are at QD=1, then it's time to give up.
   1154  */
   1155 static int __lat_target_failed(struct thread_data *td)
   1156 {
   1157 	if (td->latency_qd == 1)
   1158 		return 1;
   1159 
   1160 	td->latency_qd_high = td->latency_qd;
   1161 
   1162 	if (td->latency_qd == td->latency_qd_low)
   1163 		td->latency_qd_low--;
   1164 
   1165 	td->latency_qd = (td->latency_qd + td->latency_qd_low) / 2;
   1166 
   1167 	dprint(FD_RATE, "Ramped down: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high);
   1168 
   1169 	/*
   1170 	 * When we ramp QD down, quiesce existing IO to prevent
   1171 	 * a storm of ramp downs due to pending higher depth.
   1172 	 */
   1173 	io_u_quiesce(td);
   1174 	lat_new_cycle(td);
   1175 	return 0;
   1176 }
   1177 
   1178 static int lat_target_failed(struct thread_data *td)
   1179 {
   1180 	if (td->o.latency_percentile.u.f == 100.0)
   1181 		return __lat_target_failed(td);
   1182 
   1183 	td->latency_failed++;
   1184 	return 0;
   1185 }
   1186 
   1187 void lat_target_init(struct thread_data *td)
   1188 {
   1189 	td->latency_end_run = 0;
   1190 
   1191 	if (td->o.latency_target) {
   1192 		dprint(FD_RATE, "Latency target=%llu\n", td->o.latency_target);
   1193 		fio_gettime(&td->latency_ts, NULL);
   1194 		td->latency_qd = 1;
   1195 		td->latency_qd_high = td->o.iodepth;
   1196 		td->latency_qd_low = 1;
   1197 		td->latency_ios = ddir_rw_sum(td->io_blocks);
   1198 	} else
   1199 		td->latency_qd = td->o.iodepth;
   1200 }
   1201 
   1202 void lat_target_reset(struct thread_data *td)
   1203 {
   1204 	if (!td->latency_end_run)
   1205 		lat_target_init(td);
   1206 }
   1207 
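         /*
          * The latency target was met over the last window, so raise the queue
          * depth. While no failing depth is known (latency_qd_high == iodepth)
          * the depth is doubled, e.g. 1 -> 2 -> 4 -> 8; once a failing depth
          * exists, bisect between the current depth and that upper bound.
          */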
   1208 static void lat_target_success(struct thread_data *td)
   1209 {
   1210 	const unsigned int qd = td->latency_qd;
   1211 	struct thread_options *o = &td->o;
   1212 
   1213 	td->latency_qd_low = td->latency_qd;
   1214 
   1215 	/*
    1216 	 * If we haven't failed yet, we double up to a failing value instead
    1217 	 * of bisecting from the highest possible queue depth. If we have set
    1218 	 * a limit other than td->o.iodepth, bisect between the current depth and that limit.
   1219 	 */
   1220 	if (td->latency_qd_high != o->iodepth)
   1221 		td->latency_qd = (td->latency_qd + td->latency_qd_high) / 2;
   1222 	else
   1223 		td->latency_qd *= 2;
   1224 
   1225 	if (td->latency_qd > o->iodepth)
   1226 		td->latency_qd = o->iodepth;
   1227 
   1228 	dprint(FD_RATE, "Ramped up: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high);
   1229 
   1230 	/*
    1231 	 * If the new depth is the same as the last one, we are done. Let it
    1232 	 * run a latency cycle, so we only get results from the targeted depth.
   1233 	 */
   1234 	if (td->latency_qd == qd) {
   1235 		if (td->latency_end_run) {
   1236 			dprint(FD_RATE, "We are done\n");
   1237 			td->done = 1;
   1238 		} else {
   1239 			dprint(FD_RATE, "Quiesce and final run\n");
   1240 			io_u_quiesce(td);
   1241 			td->latency_end_run = 1;
   1242 			reset_all_stats(td);
   1243 			reset_io_stats(td);
   1244 		}
   1245 	}
   1246 
   1247 	lat_new_cycle(td);
   1248 }
   1249 
   1250 /*
   1251  * Check if we can bump the queue depth
   1252  */
   1253 void lat_target_check(struct thread_data *td)
   1254 {
   1255 	uint64_t usec_window;
   1256 	uint64_t ios;
   1257 	double success_ios;
   1258 
   1259 	usec_window = utime_since_now(&td->latency_ts);
   1260 	if (usec_window < td->o.latency_window)
   1261 		return;
   1262 
   1263 	ios = ddir_rw_sum(td->io_blocks) - td->latency_ios;
   1264 	success_ios = (double) (ios - td->latency_failed) / (double) ios;
   1265 	success_ios *= 100.0;
   1266 
   1267 	dprint(FD_RATE, "Success rate: %.2f%% (target %.2f%%)\n", success_ios, td->o.latency_percentile.u.f);
   1268 
   1269 	if (success_ios >= td->o.latency_percentile.u.f)
   1270 		lat_target_success(td);
   1271 	else
   1272 		__lat_target_failed(td);
   1273 }
   1274 
   1275 /*
   1276  * If latency target is enabled, we might be ramping up or down and not
   1277  * using the full queue depth available.
   1278  */
   1279 int queue_full(const struct thread_data *td)
   1280 {
   1281 	const int qempty = io_u_qempty(&td->io_u_freelist);
   1282 
   1283 	if (qempty)
   1284 		return 1;
   1285 	if (!td->o.latency_target)
   1286 		return 0;
   1287 
   1288 	return td->cur_depth >= td->latency_qd;
   1289 }
   1290 
   1291 struct io_u *__get_io_u(struct thread_data *td)
   1292 {
   1293 	struct io_u *io_u = NULL;
   1294 
   1295 	if (td->stop_io)
   1296 		return NULL;
   1297 
   1298 	td_io_u_lock(td);
   1299 
   1300 again:
   1301 	if (!io_u_rempty(&td->io_u_requeues))
   1302 		io_u = io_u_rpop(&td->io_u_requeues);
   1303 	else if (!queue_full(td)) {
   1304 		io_u = io_u_qpop(&td->io_u_freelist);
   1305 
   1306 		io_u->file = NULL;
   1307 		io_u->buflen = 0;
   1308 		io_u->resid = 0;
   1309 		io_u->end_io = NULL;
   1310 	}
   1311 
   1312 	if (io_u) {
   1313 		assert(io_u->flags & IO_U_F_FREE);
   1314 		io_u->flags &= ~(IO_U_F_FREE | IO_U_F_NO_FILE_PUT |
   1315 				 IO_U_F_TRIMMED | IO_U_F_BARRIER |
   1316 				 IO_U_F_VER_LIST);
   1317 
   1318 		io_u->error = 0;
   1319 		io_u->acct_ddir = -1;
   1320 		td->cur_depth++;
   1321 		io_u->flags |= IO_U_F_IN_CUR_DEPTH;
   1322 		io_u->ipo = NULL;
   1323 	} else if (td->o.verify_async) {
   1324 		/*
   1325 		 * We ran out, wait for async verify threads to finish and
   1326 		 * return one
   1327 		 */
   1328 		pthread_cond_wait(&td->free_cond, &td->io_u_lock);
   1329 		goto again;
   1330 	}
   1331 
   1332 	td_io_u_unlock(td);
   1333 	return io_u;
   1334 }
   1335 
   1336 static int check_get_trim(struct thread_data *td, struct io_u *io_u)
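         /*
          * If a trim backlog is configured, see whether this io_u should become a
          * trim of a previously written range instead of new IO.
          */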
   1337 {
   1338 	if (!(td->flags & TD_F_TRIM_BACKLOG))
   1339 		return 0;
   1340 
   1341 	if (td->trim_entries) {
   1342 		int get_trim = 0;
   1343 
   1344 		if (td->trim_batch) {
   1345 			td->trim_batch--;
   1346 			get_trim = 1;
   1347 		} else if (!(td->io_hist_len % td->o.trim_backlog) &&
   1348 			 td->last_ddir != DDIR_READ) {
   1349 			td->trim_batch = td->o.trim_batch;
   1350 			if (!td->trim_batch)
   1351 				td->trim_batch = td->o.trim_backlog;
   1352 			get_trim = 1;
   1353 		}
   1354 
   1355 		if (get_trim && !get_next_trim(td, io_u))
   1356 			return 1;
   1357 	}
   1358 
   1359 	return 0;
   1360 }
   1361 
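         /*
          * If a verify backlog is configured, see whether this io_u should be
          * used to verify a previously written block instead of new IO.
          */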
   1362 static int check_get_verify(struct thread_data *td, struct io_u *io_u)
   1363 {
   1364 	if (!(td->flags & TD_F_VER_BACKLOG))
   1365 		return 0;
   1366 
   1367 	if (td->io_hist_len) {
   1368 		int get_verify = 0;
   1369 
   1370 		if (td->verify_batch)
   1371 			get_verify = 1;
   1372 		else if (!(td->io_hist_len % td->o.verify_backlog) &&
   1373 			 td->last_ddir != DDIR_READ) {
   1374 			td->verify_batch = td->o.verify_batch;
   1375 			if (!td->verify_batch)
   1376 				td->verify_batch = td->o.verify_backlog;
   1377 			get_verify = 1;
   1378 		}
   1379 
   1380 		if (get_verify && !get_next_verify(td, io_u)) {
   1381 			td->verify_batch--;
   1382 			return 1;
   1383 		}
   1384 	}
   1385 
   1386 	return 0;
   1387 }
   1388 
   1389 /*
    1390  * Fill the offset and start time into the buffer content, to prevent the
    1391  * data from being too easily compressible or de-dupable. Do this for every
   1392  * 512b block in the range, since that should be the smallest block size
   1393  * we can expect from a device.
   1394  */
   1395 static void small_content_scramble(struct io_u *io_u)
   1396 {
   1397 	unsigned int i, nr_blocks = io_u->buflen / 512;
   1398 	uint64_t boffset;
   1399 	unsigned int offset;
   1400 	void *p, *end;
   1401 
   1402 	if (!nr_blocks)
   1403 		return;
   1404 
   1405 	p = io_u->xfer_buf;
   1406 	boffset = io_u->offset;
   1407 	io_u->buf_filled_len = 0;
   1408 
   1409 	for (i = 0; i < nr_blocks; i++) {
   1410 		/*
    1411 		 * Fill the byte offset into a "random" start offset of
    1412 		 * the buffer, derived by XOR'ing the usec time with the
    1413 		 * actual byte offset.
   1414 		 */
   1415 		offset = (io_u->start_time.tv_usec ^ boffset) & 511;
   1416 		offset &= ~(sizeof(uint64_t) - 1);
   1417 		if (offset >= 512 - sizeof(uint64_t))
   1418 			offset -= sizeof(uint64_t);
   1419 		memcpy(p + offset, &boffset, sizeof(boffset));
   1420 
   1421 		end = p + 512 - sizeof(io_u->start_time);
   1422 		memcpy(end, &io_u->start_time, sizeof(io_u->start_time));
   1423 		p += 512;
   1424 		boffset += 512;
   1425 	}
   1426 }
   1427 
   1428 /*
   1429  * Return an io_u to be processed. Gets a buflen and offset, sets direction,
   1430  * etc. The returned io_u is fully ready to be prepped and submitted.
   1431  */
   1432 struct io_u *get_io_u(struct thread_data *td)
   1433 {
   1434 	struct fio_file *f;
   1435 	struct io_u *io_u;
   1436 	int do_scramble = 0;
   1437 	long ret = 0;
   1438 
   1439 	io_u = __get_io_u(td);
   1440 	if (!io_u) {
   1441 		dprint(FD_IO, "__get_io_u failed\n");
   1442 		return NULL;
   1443 	}
   1444 
   1445 	if (check_get_verify(td, io_u))
   1446 		goto out;
   1447 	if (check_get_trim(td, io_u))
   1448 		goto out;
   1449 
   1450 	/*
   1451 	 * from a requeue, io_u already setup
   1452 	 */
   1453 	if (io_u->file)
   1454 		goto out;
   1455 
   1456 	/*
   1457 	 * If using an iolog, grab next piece if any available.
   1458 	 */
   1459 	if (td->flags & TD_F_READ_IOLOG) {
   1460 		if (read_iolog_get(td, io_u))
   1461 			goto err_put;
   1462 	} else if (set_io_u_file(td, io_u)) {
   1463 		ret = -EBUSY;
   1464 		dprint(FD_IO, "io_u %p, setting file failed\n", io_u);
   1465 		goto err_put;
   1466 	}
   1467 
   1468 	f = io_u->file;
   1469 	if (!f) {
   1470 		dprint(FD_IO, "io_u %p, setting file failed\n", io_u);
   1471 		goto err_put;
   1472 	}
   1473 
   1474 	assert(fio_file_open(f));
   1475 
   1476 	if (ddir_rw(io_u->ddir)) {
   1477 		if (!io_u->buflen && !(td->io_ops->flags & FIO_NOIO)) {
   1478 			dprint(FD_IO, "get_io_u: zero buflen on %p\n", io_u);
   1479 			goto err_put;
   1480 		}
   1481 
   1482 		f->last_start[io_u->ddir] = io_u->offset;
   1483 		f->last_pos[io_u->ddir] = io_u->offset + io_u->buflen;
   1484 
   1485 		if (io_u->ddir == DDIR_WRITE) {
   1486 			if (td->flags & TD_F_REFILL_BUFFERS) {
   1487 				io_u_fill_buffer(td, io_u,
   1488 					td->o.min_bs[DDIR_WRITE],
   1489 					io_u->xfer_buflen);
   1490 			} else if ((td->flags & TD_F_SCRAMBLE_BUFFERS) &&
   1491 				   !(td->flags & TD_F_COMPRESS))
   1492 				do_scramble = 1;
   1493 			if (td->flags & TD_F_VER_NONE) {
   1494 				populate_verify_io_u(td, io_u);
   1495 				do_scramble = 0;
   1496 			}
   1497 		} else if (io_u->ddir == DDIR_READ) {
   1498 			/*
    1499 			 * Reset the buf_filled parameters so that the next time
    1500 			 * the buffer is used for writes, it is refilled.
   1501 			 */
   1502 			io_u->buf_filled_len = 0;
   1503 		}
   1504 	}
   1505 
   1506 	/*
   1507 	 * Set io data pointers.
   1508 	 */
   1509 	io_u->xfer_buf = io_u->buf;
   1510 	io_u->xfer_buflen = io_u->buflen;
   1511 
   1512 out:
   1513 	assert(io_u->file);
   1514 	if (!td_io_prep(td, io_u)) {
   1515 		if (!td->o.disable_slat)
   1516 			fio_gettime(&io_u->start_time, NULL);
   1517 		if (do_scramble)
   1518 			small_content_scramble(io_u);
   1519 		return io_u;
   1520 	}
   1521 err_put:
   1522 	dprint(FD_IO, "get_io_u failed\n");
   1523 	put_io_u(td, io_u);
   1524 	return ERR_PTR(ret);
   1525 }
   1526 
   1527 void io_u_log_error(struct thread_data *td, struct io_u *io_u)
   1528 {
   1529 	enum error_type_bit eb = td_error_type(io_u->ddir, io_u->error);
   1530 
   1531 	if (td_non_fatal_error(td, eb, io_u->error) && !td->o.error_dump)
   1532 		return;
   1533 
   1534 	log_err("fio: io_u error%s%s: %s: %s offset=%llu, buflen=%lu\n",
   1535 		io_u->file ? " on file " : "",
   1536 		io_u->file ? io_u->file->file_name : "",
   1537 		strerror(io_u->error),
   1538 		io_ddir_name(io_u->ddir),
   1539 		io_u->offset, io_u->xfer_buflen);
   1540 
   1541 	if (!td->error)
   1542 		td_verror(td, io_u->error, "io_u error");
   1543 }
   1544 
   1545 static inline int gtod_reduce(struct thread_data *td)
   1546 {
   1547 	return td->o.disable_clat && td->o.disable_lat && td->o.disable_slat
   1548 		&& td->o.disable_bw;
   1549 }
   1550 
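         /*
          * Record latency, bandwidth and IOPS samples for a completed IO, and
          * check the total latency against any configured limits.
          */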
   1551 static void account_io_completion(struct thread_data *td, struct io_u *io_u,
   1552 				  struct io_completion_data *icd,
   1553 				  const enum fio_ddir idx, unsigned int bytes)
   1554 {
   1555 	unsigned long lusec = 0;
   1556 
   1557 	if (!gtod_reduce(td))
   1558 		lusec = utime_since(&io_u->issue_time, &icd->time);
   1559 
   1560 	if (!td->o.disable_lat) {
   1561 		unsigned long tusec;
   1562 
   1563 		tusec = utime_since(&io_u->start_time, &icd->time);
   1564 		add_lat_sample(td, idx, tusec, bytes, io_u->offset);
   1565 
   1566 		if (td->flags & TD_F_PROFILE_OPS) {
   1567 			struct prof_io_ops *ops = &td->prof_io_ops;
   1568 
   1569 			if (ops->io_u_lat)
   1570 				icd->error = ops->io_u_lat(td, tusec);
   1571 		}
   1572 
   1573 		if (td->o.max_latency && tusec > td->o.max_latency)
   1574 			lat_fatal(td, icd, tusec, td->o.max_latency);
   1575 		if (td->o.latency_target && tusec > td->o.latency_target) {
   1576 			if (lat_target_failed(td))
   1577 				lat_fatal(td, icd, tusec, td->o.latency_target);
   1578 		}
   1579 	}
   1580 
   1581 	if (!td->o.disable_clat) {
   1582 		add_clat_sample(td, idx, lusec, bytes, io_u->offset);
   1583 		io_u_mark_latency(td, lusec);
   1584 	}
   1585 
   1586 	if (!td->o.disable_bw)
   1587 		add_bw_sample(td, idx, bytes, &icd->time);
   1588 
   1589 	if (!gtod_reduce(td))
   1590 		add_iops_sample(td, idx, bytes, &icd->time);
   1591 }
   1592 
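         /*
          * How long, in usecs, the IO done so far in this direction should have
          * taken at the configured rate.
          */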
   1593 static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
   1594 {
   1595 	uint64_t secs, remainder, bps, bytes;
   1596 
   1597 	bytes = td->this_io_bytes[ddir];
   1598 	bps = td->rate_bps[ddir];
   1599 	secs = bytes / bps;
   1600 	remainder = bytes % bps;
   1601 	return remainder * 1000000 / bps + secs * 1000000;
   1602 }
   1603 
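         /*
          * Per-io_u completion handling: update block and byte counters, account
          * statistics, adjust rate bookkeeping and run any end_io callback.
          */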
   1604 static void io_completed(struct thread_data *td, struct io_u **io_u_ptr,
   1605 			 struct io_completion_data *icd)
   1606 {
   1607 	struct io_u *io_u = *io_u_ptr;
   1608 	enum fio_ddir ddir = io_u->ddir;
   1609 	struct fio_file *f = io_u->file;
   1610 
   1611 	dprint_io_u(io_u, "io complete");
   1612 
   1613 	td_io_u_lock(td);
   1614 	assert(io_u->flags & IO_U_F_FLIGHT);
   1615 	io_u->flags &= ~(IO_U_F_FLIGHT | IO_U_F_BUSY_OK);
   1616 
   1617 	/*
   1618 	 * Mark IO ok to verify
   1619 	 */
   1620 	if (io_u->ipo) {
   1621 		/*
   1622 		 * Remove errored entry from the verification list
   1623 		 */
   1624 		if (io_u->error)
   1625 			unlog_io_piece(td, io_u);
   1626 		else {
   1627 			io_u->ipo->flags &= ~IP_F_IN_FLIGHT;
   1628 			write_barrier();
   1629 		}
   1630 	}
   1631 
   1632 	td_io_u_unlock(td);
   1633 
   1634 	if (ddir_sync(ddir)) {
   1635 		td->last_was_sync = 1;
   1636 		if (f) {
   1637 			f->first_write = -1ULL;
   1638 			f->last_write = -1ULL;
   1639 		}
   1640 		return;
   1641 	}
   1642 
   1643 	td->last_was_sync = 0;
   1644 	td->last_ddir = ddir;
   1645 
   1646 	if (!io_u->error && ddir_rw(ddir)) {
   1647 		unsigned int bytes = io_u->buflen - io_u->resid;
   1648 		const enum fio_ddir oddir = ddir ^ 1;
   1649 		int ret;
   1650 
   1651 		td->io_blocks[ddir]++;
   1652 		td->this_io_blocks[ddir]++;
   1653 		td->io_bytes[ddir] += bytes;
   1654 
   1655 		if (!(io_u->flags & IO_U_F_VER_LIST))
   1656 			td->this_io_bytes[ddir] += bytes;
   1657 
   1658 		if (ddir == DDIR_WRITE) {
   1659 			if (f) {
   1660 				if (f->first_write == -1ULL ||
   1661 				    io_u->offset < f->first_write)
   1662 					f->first_write = io_u->offset;
   1663 				if (f->last_write == -1ULL ||
   1664 				    ((io_u->offset + bytes) > f->last_write))
   1665 					f->last_write = io_u->offset + bytes;
   1666 			}
   1667 			if (td->last_write_comp) {
   1668 				int idx = td->last_write_idx++;
   1669 
   1670 				td->last_write_comp[idx] = io_u->offset;
   1671 				if (td->last_write_idx == td->o.iodepth)
   1672 					td->last_write_idx = 0;
   1673 			}
   1674 		}
   1675 
   1676 		if (ramp_time_over(td) && (td->runstate == TD_RUNNING ||
   1677 					   td->runstate == TD_VERIFYING)) {
   1678 			account_io_completion(td, io_u, icd, ddir, bytes);
   1679 
   1680 			if (__should_check_rate(td, ddir)) {
   1681 				td->rate_pending_usleep[ddir] =
   1682 					(usec_for_io(td, ddir) -
   1683 					 utime_since_now(&td->start));
   1684 			}
   1685 			if (ddir != DDIR_TRIM &&
   1686 			    __should_check_rate(td, oddir)) {
   1687 				td->rate_pending_usleep[oddir] =
   1688 					(usec_for_io(td, oddir) -
   1689 					 utime_since_now(&td->start));
   1690 			}
   1691 		}
   1692 
   1693 		icd->bytes_done[ddir] += bytes;
   1694 
   1695 		if (io_u->end_io) {
   1696 			ret = io_u->end_io(td, io_u_ptr);
   1697 			io_u = *io_u_ptr;
   1698 			if (ret && !icd->error)
   1699 				icd->error = ret;
   1700 		}
   1701 	} else if (io_u->error) {
   1702 		icd->error = io_u->error;
   1703 		io_u_log_error(td, io_u);
   1704 	}
   1705 	if (icd->error) {
   1706 		enum error_type_bit eb = td_error_type(ddir, icd->error);
   1707 
   1708 		if (!td_non_fatal_error(td, eb, icd->error))
   1709 			return;
   1710 
   1711 		/*
   1712 		 * If there is a non_fatal error, then add to the error count
   1713 		 * and clear all the errors.
   1714 		 */
   1715 		update_error_count(td, icd->error);
   1716 		td_clear_error(td);
   1717 		icd->error = 0;
   1718 		if (io_u)
   1719 			io_u->error = 0;
   1720 	}
   1721 }
   1722 
   1723 static void init_icd(struct thread_data *td, struct io_completion_data *icd,
   1724 		     int nr)
   1725 {
   1726 	int ddir;
   1727 
   1728 	if (!gtod_reduce(td))
   1729 		fio_gettime(&icd->time, NULL);
   1730 
   1731 	icd->nr = nr;
   1732 
   1733 	icd->error = 0;
   1734 	for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++)
   1735 		icd->bytes_done[ddir] = 0;
   1736 }
   1737 
   1738 static void ios_completed(struct thread_data *td,
   1739 			  struct io_completion_data *icd)
   1740 {
   1741 	struct io_u *io_u;
   1742 	int i;
   1743 
   1744 	for (i = 0; i < icd->nr; i++) {
   1745 		io_u = td->io_ops->event(td, i);
   1746 
   1747 		io_completed(td, &io_u, icd);
   1748 
   1749 		if (io_u)
   1750 			put_io_u(td, io_u);
   1751 	}
   1752 }
   1753 
   1754 /*
   1755  * Complete a single io_u for the sync engines.
   1756  */
   1757 int io_u_sync_complete(struct thread_data *td, struct io_u *io_u,
   1758 		       uint64_t *bytes)
   1759 {
   1760 	struct io_completion_data icd;
   1761 
   1762 	init_icd(td, &icd, 1);
   1763 	io_completed(td, &io_u, &icd);
   1764 
   1765 	if (io_u)
   1766 		put_io_u(td, io_u);
   1767 
   1768 	if (icd.error) {
   1769 		td_verror(td, icd.error, "io_u_sync_complete");
   1770 		return -1;
   1771 	}
   1772 
   1773 	if (bytes) {
   1774 		int ddir;
   1775 
   1776 		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++)
   1777 			bytes[ddir] += icd.bytes_done[ddir];
   1778 	}
   1779 
   1780 	return 0;
   1781 }
   1782 
   1783 /*
    1784  * Called to complete at least min_evts IOs for the async engines.
   1785  */
   1786 int io_u_queued_complete(struct thread_data *td, int min_evts,
   1787 			 uint64_t *bytes)
   1788 {
   1789 	struct io_completion_data icd;
   1790 	struct timespec *tvp = NULL;
   1791 	int ret;
   1792 	struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };
   1793 
    1794 	dprint(FD_IO, "io_u_queued_complete: min=%d\n", min_evts);
   1795 
   1796 	if (!min_evts)
   1797 		tvp = &ts;
   1798 	else if (min_evts > td->cur_depth)
   1799 		min_evts = td->cur_depth;
   1800 
   1801 	ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete, tvp);
   1802 	if (ret < 0) {
   1803 		td_verror(td, -ret, "td_io_getevents");
   1804 		return ret;
   1805 	} else if (!ret)
   1806 		return ret;
   1807 
   1808 	init_icd(td, &icd, ret);
   1809 	ios_completed(td, &icd);
   1810 	if (icd.error) {
   1811 		td_verror(td, icd.error, "io_u_queued_complete");
   1812 		return -1;
   1813 	}
   1814 
   1815 	if (bytes) {
   1816 		int ddir;
   1817 
   1818 		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++)
   1819 			bytes[ddir] += icd.bytes_done[ddir];
   1820 	}
   1821 
   1822 	return 0;
   1823 }
   1824 
   1825 /*
   1826  * Call when io_u is really queued, to update the submission latency.
   1827  */
   1828 void io_u_queued(struct thread_data *td, struct io_u *io_u)
   1829 {
   1830 	if (!td->o.disable_slat) {
   1831 		unsigned long slat_time;
   1832 
   1833 		slat_time = utime_since(&io_u->start_time, &io_u->issue_time);
   1834 		add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen,
   1835 				io_u->offset);
   1836 	}
   1837 }
   1838 
   1839 /*
   1840  * See if we should reuse the last seed, if dedupe is enabled
   1841  */
   1842 static struct frand_state *get_buf_state(struct thread_data *td)
   1843 {
   1844 	unsigned int v;
   1845 	unsigned long r;
   1846 
   1847 	if (!td->o.dedupe_percentage)
   1848 		return &td->buf_state;
   1849 	else if (td->o.dedupe_percentage == 100)
   1850 		return &td->buf_state_prev;
   1851 
   1852 	r = __rand(&td->dedupe_state);
   1853 	v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0)));
   1854 
   1855 	if (v <= td->o.dedupe_percentage)
   1856 		return &td->buf_state_prev;
   1857 
   1858 	return &td->buf_state;
   1859 }
   1860 
   1861 static void save_buf_state(struct thread_data *td, struct frand_state *rs)
   1862 {
   1863 	if (rs == &td->buf_state)
   1864 		frand_copy(&td->buf_state_prev, rs);
   1865 }
   1866 
   1867 void fill_io_buffer(struct thread_data *td, void *buf, unsigned int min_write,
   1868 		    unsigned int max_bs)
   1869 {
   1870 	struct thread_options *o = &td->o;
   1871 
   1872 	if (o->compress_percentage || o->dedupe_percentage) {
   1873 		unsigned int perc = td->o.compress_percentage;
   1874 		struct frand_state *rs;
   1875 		unsigned int left = max_bs;
   1876 
   1877 		do {
   1878 			rs = get_buf_state(td);
   1879 
   1880 			min_write = min(min_write, left);
   1881 
   1882 			if (perc) {
   1883 				unsigned int seg = min_write;
   1884 
   1885 				seg = min(min_write, td->o.compress_chunk);
   1886 				if (!seg)
   1887 					seg = min_write;
   1888 
   1889 				fill_random_buf_percentage(rs, buf, perc, seg,
   1890 					min_write, o->buffer_pattern,
   1891 						   o->buffer_pattern_bytes);
   1892 			} else
   1893 				fill_random_buf(rs, buf, min_write);
   1894 
   1895 			buf += min_write;
   1896 			left -= min_write;
   1897 			save_buf_state(td, rs);
   1898 		} while (left);
   1899 	} else if (o->buffer_pattern_bytes)
   1900 		fill_buffer_pattern(td, buf, max_bs);
   1901 	else
   1902 		memset(buf, 0, max_bs);
   1903 }
   1904 
   1905 /*
   1906  * "randomly" fill the buffer contents
   1907  */
   1908 void io_u_fill_buffer(struct thread_data *td, struct io_u *io_u,
   1909 		      unsigned int min_write, unsigned int max_bs)
   1910 {
   1911 	io_u->buf_filled_len = 0;
   1912 	fill_io_buffer(td, io_u->buf, min_write, max_bs);
   1913 }
   1914