Home | History | Annotate | Download | only in ext2fs
      1 /*
      2  * unix_io.c --- This is the Unix (well, really POSIX) implementation
      3  *	of the I/O manager.
      4  *
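 * Implements a small block cache (CACHE_SIZE entries).  The cache is
 * written through when CHANNEL_FLAGS_WRITETHROUGH is set on the channel
 * and written back otherwise.
 *
 * Includes support for Windows NT under Cygwin.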
      8  *
      9  * Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
     10  *	2002 by Theodore Ts'o.
     11  *
     12  * %Begin-Header%
     13  * This file may be redistributed under the terms of the GNU Library
     14  * General Public License, version 2.
     15  * %End-Header%
     16  */
     17 
     18 #if !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
     19 #define _XOPEN_SOURCE 600
     20 #define _DARWIN_C_SOURCE
     21 #define _FILE_OFFSET_BITS 64
     22 #ifndef _LARGEFILE_SOURCE
     23 #define _LARGEFILE_SOURCE
     24 #endif
     25 #ifndef _LARGEFILE64_SOURCE
     26 #define _LARGEFILE64_SOURCE
     27 #endif
     28 #ifndef _GNU_SOURCE
     29 #define _GNU_SOURCE
     30 #endif
     31 #endif
     32 
     33 #include "config.h"
     34 #include <stdio.h>
     35 #include <string.h>
     36 #if HAVE_UNISTD_H
     37 #include <unistd.h>
     38 #endif
     39 #if HAVE_ERRNO_H
     40 #include <errno.h>
     41 #endif
     42 #include <fcntl.h>
     43 #include <time.h>
     44 #ifdef __linux__
     45 #include <sys/utsname.h>
     46 #endif
     47 #if HAVE_SYS_TYPES_H
     48 #include <sys/types.h>
     49 #endif
     50 #ifdef HAVE_SYS_IOCTL_H
     51 #include <sys/ioctl.h>
     52 #endif
     53 #ifdef HAVE_SYS_MOUNT_H
     54 #include <sys/mount.h>
     55 #endif
     56 #if HAVE_SYS_STAT_H
     57 #include <sys/stat.h>
     58 #endif
     59 #if HAVE_SYS_RESOURCE_H
     60 #include <sys/resource.h>
     61 #endif
     62 #if HAVE_LINUX_FALLOC_H
     63 #include <linux/falloc.h>
     64 #endif
     65 
     66 #if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
     67 #define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
     68 #endif
     69 
     70 #undef ALIGN_DEBUG
     71 
     72 #include "ext2_fs.h"
     73 #include "ext2fs.h"
     74 
     75 /*
     76  * For checking structure magic numbers...
     77  */
     78 
/*
 * Return `code` from the enclosing function when the `magic` member of
 * the object `struct` points to differs from `code`.
 *
 * The expansion is a bare `if` statement with no trailing semicolon, so
 * it must be used as a full statement and never as the unbraced body of
 * an if/else.
 */
#define EXT2_CHECK_MAGIC(struct, code) \
	  if ((struct)->magic != (code)) return (code)
     81 
/* One entry of the per-channel block cache. */
struct unix_cache {
	char			*buf;		/* block-sized buffer, see alloc_cache() */
	unsigned long long	block;		/* block number held in buf */
	int			access_time;	/* copy of the channel counter at last use */
	unsigned		dirty:1;	/* buf still has to be written out */
	unsigned		in_use:1;	/* entry currently holds a block */
};
     89 
/* Number of entries in unix_private_data.cache[] */
#define CACHE_SIZE 8
/* Writes of more blocks than this bypass the cache (unix_write_blk64) */
#define WRITE_DIRECT_SIZE 4	/* Must be smaller than CACHE_SIZE */
/* Upper bound on the blocks gathered into one cached read request */
#define READ_DIRECT_SIZE 4	/* Should be smaller than CACHE_SIZE */
     93 
/* Per-channel state of the Unix I/O manager (io_channel->private_data). */
struct unix_private_data {
	int	magic;		/* EXT2_ET_MAGIC_UNIX_IO_CHANNEL */
	int	dev;		/* file descriptor all I/O goes through */
	int	flags;		/* IO_FLAG_* value given when the channel was opened */
	int	align;		/* not referenced in the code visible here */
	int	access_time;	/* counter bumped on every cache hit or reuse */
	ext2_loff_t offset;	/* byte offset added to every location ("offset" option) */
	struct unix_cache cache[CACHE_SIZE];
	void	*bounce;	/* one-block buffer for unaligned I/O; allocated
				 * only when channel->align is non-zero */
	struct struct_io_stats io_stats;
};
    105 
    106 #define IS_ALIGNED(n, align) ((((uintptr_t) n) & \
    107 			       ((uintptr_t) ((align)-1))) == 0)
    108 
    109 static errcode_t unix_get_stats(io_channel channel, io_stats *stats)
    110 {
    111 	errcode_t	retval = 0;
    112 
    113 	struct unix_private_data *data;
    114 
    115 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
    116 	data = (struct unix_private_data *) channel->private_data;
    117 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
    118 
    119 	if (stats)
    120 		*stats = &data->io_stats;
    121 
    122 	return retval;
    123 }
    124 
    125 /*
    126  * Here are the raw I/O functions
    127  */
    128 static errcode_t raw_read_blk(io_channel channel,
    129 			      struct unix_private_data *data,
    130 			      unsigned long long block,
    131 			      int count, void *bufv)
    132 {
    133 	errcode_t	retval;
    134 	ssize_t		size;
    135 	ext2_loff_t	location;
    136 	int		actual = 0;
    137 	unsigned char	*buf = bufv;
    138 
    139 	size = (count < 0) ? -count : count * channel->block_size;
    140 	data->io_stats.bytes_read += size;
    141 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
    142 
    143 #ifdef HAVE_PREAD64
    144 	/* Try an aligned pread */
    145 	if ((channel->align == 0) ||
    146 	    (IS_ALIGNED(buf, channel->align) &&
    147 	     IS_ALIGNED(size, channel->align))) {
    148 		actual = pread64(data->dev, buf, size, location);
    149 		if (actual == size)
    150 			return 0;
    151 	}
    152 #elif HAVE_PREAD
    153 	/* Try an aligned pread */
    154 	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
    155 	    ((channel->align == 0) ||
    156 	     (IS_ALIGNED(buf, channel->align) &&
    157 	      IS_ALIGNED(size, channel->align)))) {
    158 		actual = pread(data->dev, buf, size, location);
    159 		if (actual == size)
    160 			return 0;
    161 	}
    162 #endif /* HAVE_PREAD */
    163 
    164 	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
    165 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
    166 		goto error_out;
    167 	}
    168 	if ((channel->align == 0) ||
    169 	    (IS_ALIGNED(buf, channel->align) &&
    170 	     IS_ALIGNED(size, channel->align))) {
    171 		actual = read(data->dev, buf, size);
    172 		if (actual != size) {
    173 		short_read:
    174 			if (actual < 0)
    175 				actual = 0;
    176 			retval = EXT2_ET_SHORT_READ;
    177 			goto error_out;
    178 		}
    179 		return 0;
    180 	}
    181 
    182 #ifdef ALIGN_DEBUG
    183 	printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
    184 	       (unsigned long) size);
    185 #endif
    186 
    187 	/*
    188 	 * The buffer or size which we're trying to read isn't aligned
    189 	 * to the O_DIRECT rules, so we need to do this the hard way...
    190 	 */
    191 	while (size > 0) {
    192 		actual = read(data->dev, data->bounce, channel->block_size);
    193 		if (actual != channel->block_size)
    194 			goto short_read;
    195 		actual = size;
    196 		if (size > channel->block_size)
    197 			actual = channel->block_size;
    198 		memcpy(buf, data->bounce, actual);
    199 		size -= actual;
    200 		buf += actual;
    201 	}
    202 	return 0;
    203 
    204 error_out:
    205 	memset((char *) buf+actual, 0, size-actual);
    206 	if (channel->read_error)
    207 		retval = (channel->read_error)(channel, block, count, buf,
    208 					       size, actual, retval);
    209 	return retval;
    210 }
    211 
    212 static errcode_t raw_write_blk(io_channel channel,
    213 			       struct unix_private_data *data,
    214 			       unsigned long long block,
    215 			       int count, const void *bufv)
    216 {
    217 	ssize_t		size;
    218 	ext2_loff_t	location;
    219 	int		actual = 0;
    220 	errcode_t	retval;
    221 	const unsigned char *buf = bufv;
    222 
    223 	if (count == 1)
    224 		size = channel->block_size;
    225 	else {
    226 		if (count < 0)
    227 			size = -count;
    228 		else
    229 			size = count * channel->block_size;
    230 	}
    231 	data->io_stats.bytes_written += size;
    232 
    233 	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
    234 
    235 #ifdef HAVE_PWRITE64
    236 	/* Try an aligned pwrite */
    237 	if ((channel->align == 0) ||
    238 	    (IS_ALIGNED(buf, channel->align) &&
    239 	     IS_ALIGNED(size, channel->align))) {
    240 		actual = pwrite64(data->dev, buf, size, location);
    241 		if (actual == size)
    242 			return 0;
    243 	}
    244 #elif HAVE_PWRITE
    245 	/* Try an aligned pwrite */
    246 	if ((sizeof(off_t) >= sizeof(ext2_loff_t)) &&
    247 	    ((channel->align == 0) ||
    248 	     (IS_ALIGNED(buf, channel->align) &&
    249 	      IS_ALIGNED(size, channel->align)))) {
    250 		actual = pwrite(data->dev, buf, size, location);
    251 		if (actual == size)
    252 			return 0;
    253 	}
    254 #endif /* HAVE_PWRITE */
    255 
    256 	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
    257 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
    258 		goto error_out;
    259 	}
    260 
    261 	if ((channel->align == 0) ||
    262 	    (IS_ALIGNED(buf, channel->align) &&
    263 	     IS_ALIGNED(size, channel->align))) {
    264 		actual = write(data->dev, buf, size);
    265 		if (actual != size) {
    266 		short_write:
    267 			retval = EXT2_ET_SHORT_WRITE;
    268 			goto error_out;
    269 		}
    270 		return 0;
    271 	}
    272 
    273 #ifdef ALIGN_DEBUG
    274 	printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
    275 	       (unsigned long) size);
    276 #endif
    277 	/*
    278 	 * The buffer or size which we're trying to write isn't aligned
    279 	 * to the O_DIRECT rules, so we need to do this the hard way...
    280 	 */
    281 	while (size > 0) {
    282 		if (size < channel->block_size) {
    283 			actual = read(data->dev, data->bounce,
    284 				      channel->block_size);
    285 			if (actual != channel->block_size) {
    286 				retval = EXT2_ET_SHORT_READ;
    287 				goto error_out;
    288 			}
    289 		}
    290 		actual = size;
    291 		if (size > channel->block_size)
    292 			actual = channel->block_size;
    293 		memcpy(data->bounce, buf, actual);
    294 		if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
    295 			retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
    296 			goto error_out;
    297 		}
    298 		actual = write(data->dev, data->bounce, channel->block_size);
    299 		if (actual != channel->block_size)
    300 			goto short_write;
    301 		size -= actual;
    302 		buf += actual;
    303 	}
    304 	return 0;
    305 
    306 error_out:
    307 	if (channel->write_error)
    308 		retval = (channel->write_error)(channel, block, count, buf,
    309 						size, actual, retval);
    310 	return retval;
    311 }
    312 
    313 
    314 /*
    315  * Here we implement the cache functions
    316  */
    317 
    318 /* Allocate the cache buffers */
    319 static errcode_t alloc_cache(io_channel channel,
    320 			     struct unix_private_data *data)
    321 {
    322 	errcode_t		retval;
    323 	struct unix_cache	*cache;
    324 	int			i;
    325 
    326 	data->access_time = 0;
    327 	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
    328 		cache->block = 0;
    329 		cache->access_time = 0;
    330 		cache->dirty = 0;
    331 		cache->in_use = 0;
    332 		if (cache->buf)
    333 			ext2fs_free_mem(&cache->buf);
    334 		retval = io_channel_alloc_buf(channel, 0, &cache->buf);
    335 		if (retval)
    336 			return retval;
    337 	}
    338 	if (channel->align) {
    339 		if (data->bounce)
    340 			ext2fs_free_mem(&data->bounce);
    341 		retval = io_channel_alloc_buf(channel, 0, &data->bounce);
    342 	}
    343 	return retval;
    344 }
    345 
    346 /* Free the cache buffers */
    347 static void free_cache(struct unix_private_data *data)
    348 {
    349 	struct unix_cache	*cache;
    350 	int			i;
    351 
    352 	data->access_time = 0;
    353 	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
    354 		cache->block = 0;
    355 		cache->access_time = 0;
    356 		cache->dirty = 0;
    357 		cache->in_use = 0;
    358 		if (cache->buf)
    359 			ext2fs_free_mem(&cache->buf);
    360 	}
    361 	if (data->bounce)
    362 		ext2fs_free_mem(&data->bounce);
    363 }
    364 
    365 #ifndef NO_IO_CACHE
    366 /*
    367  * Try to find a block in the cache.  If the block is not found, and
    368  * eldest is a non-zero pointer, then fill in eldest with the cache
    369  * entry to that should be reused.
    370  */
    371 static struct unix_cache *find_cached_block(struct unix_private_data *data,
    372 					    unsigned long long block,
    373 					    struct unix_cache **eldest)
    374 {
    375 	struct unix_cache	*cache, *unused_cache, *oldest_cache;
    376 	int			i;
    377 
    378 	unused_cache = oldest_cache = 0;
    379 	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
    380 		if (!cache->in_use) {
    381 			if (!unused_cache)
    382 				unused_cache = cache;
    383 			continue;
    384 		}
    385 		if (cache->block == block) {
    386 			cache->access_time = ++data->access_time;
    387 			return cache;
    388 		}
    389 		if (!oldest_cache ||
    390 		    (cache->access_time < oldest_cache->access_time))
    391 			oldest_cache = cache;
    392 	}
    393 	if (eldest)
    394 		*eldest = (unused_cache) ? unused_cache : oldest_cache;
    395 	return 0;
    396 }
    397 
    398 /*
    399  * Reuse a particular cache entry for another block.
    400  */
    401 static void reuse_cache(io_channel channel, struct unix_private_data *data,
    402 		 struct unix_cache *cache, unsigned long long block)
    403 {
    404 	if (cache->dirty && cache->in_use)
    405 		raw_write_blk(channel, data, cache->block, 1, cache->buf);
    406 
    407 	cache->in_use = 1;
    408 	cache->dirty = 0;
    409 	cache->block = block;
    410 	cache->access_time = ++data->access_time;
    411 }
    412 
    413 /*
    414  * Flush all of the blocks in the cache
    415  */
    416 static errcode_t flush_cached_blocks(io_channel channel,
    417 				     struct unix_private_data *data,
    418 				     int invalidate)
    419 
    420 {
    421 	struct unix_cache	*cache;
    422 	errcode_t		retval, retval2;
    423 	int			i;
    424 
    425 	retval2 = 0;
    426 	for (i=0, cache = data->cache; i < CACHE_SIZE; i++, cache++) {
    427 		if (!cache->in_use)
    428 			continue;
    429 
    430 		if (invalidate)
    431 			cache->in_use = 0;
    432 
    433 		if (!cache->dirty)
    434 			continue;
    435 
    436 		retval = raw_write_blk(channel, data,
    437 				       cache->block, 1, cache->buf);
    438 		if (retval)
    439 			retval2 = retval;
    440 		else
    441 			cache->dirty = 0;
    442 	}
    443 	return retval2;
    444 }
    445 #endif /* NO_IO_CACHE */
    446 
    447 #ifdef __linux__
    448 #ifndef BLKDISCARDZEROES
    449 #define BLKDISCARDZEROES _IO(0x12,124)
    450 #endif
    451 #endif
    452 
    453 int ext2fs_open_file(const char *pathname, int flags, mode_t mode)
    454 {
    455 	if (mode)
    456 #if defined(HAVE_OPEN64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
    457 		return open64(pathname, flags, mode);
    458 	else
    459 		return open64(pathname, flags);
    460 #else
    461 		return open(pathname, flags, mode);
    462 	else
    463 		return open(pathname, flags);
    464 #endif
    465 }
    466 
    467 int ext2fs_stat(const char *path, ext2fs_struct_stat *buf)
    468 {
    469 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
    470 	return stat64(path, buf);
    471 #else
    472 	return stat(path, buf);
    473 #endif
    474 }
    475 
    476 int ext2fs_fstat(int fd, ext2fs_struct_stat *buf)
    477 {
    478 #if defined(HAVE_FSTAT64) && !defined(__OSX_AVAILABLE_BUT_DEPRECATED)
    479 	return fstat64(fd, buf);
    480 #else
    481 	return fstat(fd, buf);
    482 #endif
    483 }
    484 
    485 
    486 static errcode_t unix_open_channel(const char *name, int fd,
    487 				   int flags, io_channel *channel,
    488 				   io_manager io_mgr)
    489 {
    490 	io_channel	io = NULL;
    491 	struct unix_private_data *data = NULL;
    492 	errcode_t	retval;
    493 	ext2fs_struct_stat st;
    494 #ifdef __linux__
    495 	struct		utsname ut;
    496 #endif
    497 
    498 	retval = ext2fs_get_mem(sizeof(struct struct_io_channel), &io);
    499 	if (retval)
    500 		goto cleanup;
    501 	memset(io, 0, sizeof(struct struct_io_channel));
    502 	io->magic = EXT2_ET_MAGIC_IO_CHANNEL;
    503 	retval = ext2fs_get_mem(sizeof(struct unix_private_data), &data);
    504 	if (retval)
    505 		goto cleanup;
    506 
    507 	io->manager = io_mgr;
    508 	retval = ext2fs_get_mem(strlen(name)+1, &io->name);
    509 	if (retval)
    510 		goto cleanup;
    511 
    512 	strcpy(io->name, name);
    513 	io->private_data = data;
    514 	io->block_size = 1024;
    515 	io->read_error = 0;
    516 	io->write_error = 0;
    517 	io->refcount = 1;
    518 
    519 	memset(data, 0, sizeof(struct unix_private_data));
    520 	data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
    521 	data->io_stats.num_fields = 2;
    522 	data->flags = flags;
    523 	data->dev = fd;
    524 
    525 #if defined(O_DIRECT)
    526 	if (flags & IO_FLAG_DIRECT_IO)
    527 		io->align = ext2fs_get_dio_alignment(data->dev);
    528 #elif defined(F_NOCACHE)
    529 	if (flags & IO_FLAG_DIRECT_IO)
    530 		io->align = 4096;
    531 #endif
    532 
    533 	/*
    534 	 * If the device is really a block device, then set the
    535 	 * appropriate flag, otherwise we can set DISCARD_ZEROES flag
    536 	 * because we are going to use punch hole instead of discard
    537 	 * and if it succeed, subsequent read from sparse area returns
    538 	 * zero.
    539 	 */
    540 	if (ext2fs_fstat(data->dev, &st) == 0) {
    541 		if (S_ISBLK(st.st_mode))
    542 			io->flags |= CHANNEL_FLAGS_BLOCK_DEVICE;
    543 		else
    544 			io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
    545 	}
    546 
    547 #ifdef BLKDISCARDZEROES
    548 	{
    549 		int zeroes = 0;
    550 		if (ioctl(data->dev, BLKDISCARDZEROES, &zeroes) == 0 &&
    551 		    zeroes)
    552 			io->flags |= CHANNEL_FLAGS_DISCARD_ZEROES;
    553 	}
    554 #endif
    555 
    556 #if defined(__CYGWIN__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
    557 	/*
    558 	 * Some operating systems require that the buffers be aligned,
    559 	 * regardless of O_DIRECT
    560 	 */
    561 	if (!io->align)
    562 		io->align = 512;
    563 #endif
    564 
    565 
    566 	if ((retval = alloc_cache(io, data)))
    567 		goto cleanup;
    568 
    569 #ifdef BLKROGET
    570 	if (flags & IO_FLAG_RW) {
    571 		int error;
    572 		int readonly = 0;
    573 
    574 		/* Is the block device actually writable? */
    575 		error = ioctl(data->dev, BLKROGET, &readonly);
    576 		if (!error && readonly) {
    577 			retval = EPERM;
    578 			goto cleanup;
    579 		}
    580 	}
    581 #endif
    582 
    583 #ifdef __linux__
    584 #undef RLIM_INFINITY
    585 #if (defined(__alpha__) || ((defined(__sparc__) || defined(__mips__)) && (SIZEOF_LONG == 4)))
    586 #define RLIM_INFINITY	((unsigned long)(~0UL>>1))
    587 #else
    588 #define RLIM_INFINITY  (~0UL)
    589 #endif
    590 	/*
    591 	 * Work around a bug in 2.4.10-2.4.18 kernels where writes to
    592 	 * block devices are wrongly getting hit by the filesize
    593 	 * limit.  This workaround isn't perfect, since it won't work
    594 	 * if glibc wasn't built against 2.2 header files.  (Sigh.)
    595 	 *
    596 	 */
    597 	if ((flags & IO_FLAG_RW) &&
    598 	    (uname(&ut) == 0) &&
    599 	    ((ut.release[0] == '2') && (ut.release[1] == '.') &&
    600 	     (ut.release[2] == '4') && (ut.release[3] == '.') &&
    601 	     (ut.release[4] == '1') && (ut.release[5] >= '0') &&
    602 	     (ut.release[5] < '8')) &&
    603 	    (ext2fs_fstat(data->dev, &st) == 0) &&
    604 	    (S_ISBLK(st.st_mode))) {
    605 		struct rlimit	rlim;
    606 
    607 		rlim.rlim_cur = rlim.rlim_max = (unsigned long) RLIM_INFINITY;
    608 		setrlimit(RLIMIT_FSIZE, &rlim);
    609 		getrlimit(RLIMIT_FSIZE, &rlim);
    610 		if (((unsigned long) rlim.rlim_cur) <
    611 		    ((unsigned long) rlim.rlim_max)) {
    612 			rlim.rlim_cur = rlim.rlim_max;
    613 			setrlimit(RLIMIT_FSIZE, &rlim);
    614 		}
    615 	}
    616 #endif
    617 	*channel = io;
    618 	return 0;
    619 
    620 cleanup:
    621 	if (data) {
    622 		if (data->dev >= 0)
    623 			close(data->dev);
    624 		free_cache(data);
    625 		ext2fs_free_mem(&data);
    626 	}
    627 	if (io) {
    628 		if (io->name) {
    629 			ext2fs_free_mem(&io->name);
    630 		}
    631 		ext2fs_free_mem(&io);
    632 	}
    633 	return retval;
    634 }
    635 
    636 static errcode_t unixfd_open(const char *str_fd, int flags,
    637 			     io_channel *channel)
    638 {
    639 	int fd;
    640 	int fd_flags;
    641 
    642 	fd = atoi(str_fd);
    643 #if defined(HAVE_FCNTL)
    644 	fd_flags = fcntl(fd, F_GETFD);
    645 	if (fd_flags == -1)
    646 		return -EBADF;
    647 
    648 	flags = 0;
    649 	if (fd_flags & O_RDWR)
    650 		flags |= IO_FLAG_RW;
    651 	if (fd_flags & O_EXCL)
    652 		flags |= IO_FLAG_EXCLUSIVE;
    653 #if defined(O_DIRECT)
    654 	if (fd_flags & O_DIRECT)
    655 		flags |= IO_FLAG_DIRECT_IO;
    656 #endif
    657 #endif  /* HAVE_FCNTL */
    658 
    659 	return unix_open_channel(str_fd, fd, flags, channel, unixfd_io_manager);
    660 }
    661 
    662 static errcode_t unix_open(const char *name, int flags,
    663 			   io_channel *channel)
    664 {
    665 	int fd = -1;
    666 	int open_flags;
    667 
    668 	if (name == 0)
    669 		return EXT2_ET_BAD_DEVICE_NAME;
    670 
    671 	open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
    672 	if (flags & IO_FLAG_EXCLUSIVE)
    673 		open_flags |= O_EXCL;
    674 #if defined(O_DIRECT)
    675 	if (flags & IO_FLAG_DIRECT_IO)
    676 		open_flags |= O_DIRECT;
    677 #endif
    678 	fd = ext2fs_open_file(name, open_flags, 0);
    679 	if (fd < 0)
    680 		return errno;
    681 #if defined(F_NOCACHE) && !defined(IO_DIRECT)
    682 	if (flags & IO_FLAG_DIRECT_IO) {
    683 		if (fcntl(fd, F_NOCACHE, 1) < 0)
    684 			return errno;
    685 	}
    686 #endif
    687 	return unix_open_channel(name, fd, flags, channel, unix_io_manager);
    688 }
    689 
    690 static errcode_t unix_close(io_channel channel)
    691 {
    692 	struct unix_private_data *data;
    693 	errcode_t	retval = 0;
    694 
    695 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
    696 	data = (struct unix_private_data *) channel->private_data;
    697 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
    698 
    699 	if (--channel->refcount > 0)
    700 		return 0;
    701 
    702 #ifndef NO_IO_CACHE
    703 	retval = flush_cached_blocks(channel, data, 0);
    704 #endif
    705 
    706 	if (close(data->dev) < 0)
    707 		retval = errno;
    708 	free_cache(data);
    709 
    710 	ext2fs_free_mem(&channel->private_data);
    711 	if (channel->name)
    712 		ext2fs_free_mem(&channel->name);
    713 	ext2fs_free_mem(&channel);
    714 	return retval;
    715 }
    716 
    717 static errcode_t unix_set_blksize(io_channel channel, int blksize)
    718 {
    719 	struct unix_private_data *data;
    720 	errcode_t		retval;
    721 
    722 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
    723 	data = (struct unix_private_data *) channel->private_data;
    724 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
    725 
    726 	if (channel->block_size != blksize) {
    727 #ifndef NO_IO_CACHE
    728 		if ((retval = flush_cached_blocks(channel, data, 0)))
    729 			return retval;
    730 #endif
    731 
    732 		channel->block_size = blksize;
    733 		free_cache(data);
    734 		if ((retval = alloc_cache(channel, data)))
    735 			return retval;
    736 	}
    737 	return 0;
    738 }
    739 
    740 static errcode_t unix_read_blk64(io_channel channel, unsigned long long block,
    741 			       int count, void *buf)
    742 {
    743 	struct unix_private_data *data;
    744 	struct unix_cache *cache, *reuse[READ_DIRECT_SIZE];
    745 	errcode_t	retval;
    746 	char		*cp;
    747 	int		i, j;
    748 
    749 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
    750 	data = (struct unix_private_data *) channel->private_data;
    751 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
    752 
    753 #ifdef NO_IO_CACHE
    754 	return raw_read_blk(channel, data, block, count, buf);
    755 #else
    756 	/*
    757 	 * If we're doing an odd-sized read or a very large read,
    758 	 * flush out the cache and then do a direct read.
    759 	 */
    760 	if (count < 0 || count > WRITE_DIRECT_SIZE) {
    761 		if ((retval = flush_cached_blocks(channel, data, 0)))
    762 			return retval;
    763 		return raw_read_blk(channel, data, block, count, buf);
    764 	}
    765 
    766 	cp = buf;
    767 	while (count > 0) {
    768 		/* If it's in the cache, use it! */
    769 		if ((cache = find_cached_block(data, block, &reuse[0]))) {
    770 #ifdef DEBUG
    771 			printf("Using cached block %lu\n", block);
    772 #endif
    773 			memcpy(cp, cache->buf, channel->block_size);
    774 			count--;
    775 			block++;
    776 			cp += channel->block_size;
    777 			continue;
    778 		}
    779 		if (count == 1) {
    780 			/*
    781 			 * Special case where we read directly into the
    782 			 * cache buffer; important in the O_DIRECT case
    783 			 */
    784 			cache = reuse[0];
    785 			reuse_cache(channel, data, cache, block);
    786 			if ((retval = raw_read_blk(channel, data, block, 1,
    787 						   cache->buf))) {
    788 				cache->in_use = 0;
    789 				return retval;
    790 			}
    791 			memcpy(cp, cache->buf, channel->block_size);
    792 			return 0;
    793 		}
    794 
    795 		/*
    796 		 * Find the number of uncached blocks so we can do a
    797 		 * single read request
    798 		 */
    799 		for (i=1; i < count; i++)
    800 			if (find_cached_block(data, block+i, &reuse[i]))
    801 				break;
    802 #ifdef DEBUG
    803 		printf("Reading %d blocks starting at %lu\n", i, block);
    804 #endif
    805 		if ((retval = raw_read_blk(channel, data, block, i, cp)))
    806 			return retval;
    807 
    808 		/* Save the results in the cache */
    809 		for (j=0; j < i; j++) {
    810 			count--;
    811 			cache = reuse[j];
    812 			reuse_cache(channel, data, cache, block++);
    813 			memcpy(cache->buf, cp, channel->block_size);
    814 			cp += channel->block_size;
    815 		}
    816 	}
    817 	return 0;
    818 #endif /* NO_IO_CACHE */
    819 }
    820 
    821 static errcode_t unix_read_blk(io_channel channel, unsigned long block,
    822 			       int count, void *buf)
    823 {
    824 	return unix_read_blk64(channel, block, count, buf);
    825 }
    826 
    827 static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
    828 				int count, const void *buf)
    829 {
    830 	struct unix_private_data *data;
    831 	struct unix_cache *cache, *reuse;
    832 	errcode_t	retval = 0;
    833 	const char	*cp;
    834 	int		writethrough;
    835 
    836 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
    837 	data = (struct unix_private_data *) channel->private_data;
    838 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
    839 
    840 #ifdef NO_IO_CACHE
    841 	return raw_write_blk(channel, data, block, count, buf);
    842 #else
    843 	/*
    844 	 * If we're doing an odd-sized write or a very large write,
    845 	 * flush out the cache completely and then do a direct write.
    846 	 */
    847 	if (count < 0 || count > WRITE_DIRECT_SIZE) {
    848 		if ((retval = flush_cached_blocks(channel, data, 1)))
    849 			return retval;
    850 		return raw_write_blk(channel, data, block, count, buf);
    851 	}
    852 
    853 	/*
    854 	 * For a moderate-sized multi-block write, first force a write
    855 	 * if we're in write-through cache mode, and then fill the
    856 	 * cache with the blocks.
    857 	 */
    858 	writethrough = channel->flags & CHANNEL_FLAGS_WRITETHROUGH;
    859 	if (writethrough)
    860 		retval = raw_write_blk(channel, data, block, count, buf);
    861 
    862 	cp = buf;
    863 	while (count > 0) {
    864 		cache = find_cached_block(data, block, &reuse);
    865 		if (!cache) {
    866 			cache = reuse;
    867 			reuse_cache(channel, data, cache, block);
    868 		}
    869 		if (cache->buf != cp)
    870 			memcpy(cache->buf, cp, channel->block_size);
    871 		cache->dirty = !writethrough;
    872 		count--;
    873 		block++;
    874 		cp += channel->block_size;
    875 	}
    876 	return retval;
    877 #endif /* NO_IO_CACHE */
    878 }
    879 
    880 static errcode_t unix_cache_readahead(io_channel channel,
    881 				      unsigned long long block,
    882 				      unsigned long long count)
    883 {
    884 #ifdef POSIX_FADV_WILLNEED
    885 	struct unix_private_data *data;
    886 
    887 	data = (struct unix_private_data *)channel->private_data;
    888 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
    889 	return posix_fadvise(data->dev,
    890 			     (ext2_loff_t)block * channel->block_size + data->offset,
    891 			     (ext2_loff_t)count * channel->block_size,
    892 			     POSIX_FADV_WILLNEED);
    893 #else
    894 	return EXT2_ET_OP_NOT_SUPPORTED;
    895 #endif
    896 }
    897 
    898 static errcode_t unix_write_blk(io_channel channel, unsigned long block,
    899 				int count, const void *buf)
    900 {
    901 	return unix_write_blk64(channel, block, count, buf);
    902 }
    903 
    904 static errcode_t unix_write_byte(io_channel channel, unsigned long offset,
    905 				 int size, const void *buf)
    906 {
    907 	struct unix_private_data *data;
    908 	errcode_t	retval = 0;
    909 	ssize_t		actual;
    910 
    911 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
    912 	data = (struct unix_private_data *) channel->private_data;
    913 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
    914 
    915 	if (channel->align != 0) {
    916 #ifdef ALIGN_DEBUG
    917 		printf("unix_write_byte: O_DIRECT fallback\n");
    918 #endif
    919 		return EXT2_ET_UNIMPLEMENTED;
    920 	}
    921 
    922 #ifndef NO_IO_CACHE
    923 	/*
    924 	 * Flush out the cache completely
    925 	 */
    926 	if ((retval = flush_cached_blocks(channel, data, 1)))
    927 		return retval;
    928 #endif
    929 
    930 	if (lseek(data->dev, offset + data->offset, SEEK_SET) < 0)
    931 		return errno;
    932 
    933 	actual = write(data->dev, buf, size);
    934 	if (actual != size)
    935 		return EXT2_ET_SHORT_WRITE;
    936 
    937 	return 0;
    938 }
    939 
    940 /*
    941  * Flush data buffers to disk.
    942  */
    943 static errcode_t unix_flush(io_channel channel)
    944 {
    945 	struct unix_private_data *data;
    946 	errcode_t retval = 0;
    947 
    948 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
    949 	data = (struct unix_private_data *) channel->private_data;
    950 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
    951 
    952 #ifndef NO_IO_CACHE
    953 	retval = flush_cached_blocks(channel, data, 0);
    954 #endif
    955 #ifdef HAVE_FSYNC
    956 	fsync(data->dev);
    957 #endif
    958 	return retval;
    959 }
    960 
    961 static errcode_t unix_set_option(io_channel channel, const char *option,
    962 				 const char *arg)
    963 {
    964 	struct unix_private_data *data;
    965 	unsigned long long tmp;
    966 	char *end;
    967 
    968 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
    969 	data = (struct unix_private_data *) channel->private_data;
    970 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
    971 
    972 	if (!strcmp(option, "offset")) {
    973 		if (!arg)
    974 			return EXT2_ET_INVALID_ARGUMENT;
    975 
    976 		tmp = strtoull(arg, &end, 0);
    977 		if (*end)
    978 			return EXT2_ET_INVALID_ARGUMENT;
    979 		data->offset = tmp;
    980 		if (data->offset < 0)
    981 			return EXT2_ET_INVALID_ARGUMENT;
    982 		return 0;
    983 	}
    984 	return EXT2_ET_INVALID_ARGUMENT;
    985 }
    986 
    987 #if defined(__linux__) && !defined(BLKDISCARD)
    988 #define BLKDISCARD		_IO(0x12,119)
    989 #endif
    990 
    991 static errcode_t unix_discard(io_channel channel, unsigned long long block,
    992 			      unsigned long long count)
    993 {
    994 	struct unix_private_data *data;
    995 	int		ret;
    996 
    997 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
    998 	data = (struct unix_private_data *) channel->private_data;
    999 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
   1000 
   1001 	if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
   1002 #ifdef BLKDISCARD
   1003 		__u64 range[2];
   1004 
   1005 		range[0] = (__u64)(block) * channel->block_size + data->offset;
   1006 		range[1] = (__u64)(count) * channel->block_size;
   1007 
   1008 		ret = ioctl(data->dev, BLKDISCARD, &range);
   1009 #else
   1010 		goto unimplemented;
   1011 #endif
   1012 	} else {
   1013 #if defined(HAVE_FALLOCATE) && defined(FALLOC_FL_PUNCH_HOLE)
   1014 		/*
   1015 		 * If we are not on block device, try to use punch hole
   1016 		 * to reclaim free space.
   1017 		 */
   1018 		ret = fallocate(data->dev,
   1019 				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
   1020 				(off_t)(block) * channel->block_size + data->offset,
   1021 				(off_t)(count) * channel->block_size);
   1022 #else
   1023 		goto unimplemented;
   1024 #endif
   1025 	}
   1026 	if (ret < 0) {
   1027 		if (errno == EOPNOTSUPP)
   1028 			goto unimplemented;
   1029 		return errno;
   1030 	}
   1031 	return 0;
   1032 unimplemented:
   1033 	return EXT2_ET_UNIMPLEMENTED;
   1034 }
   1035 
   1036 /* parameters might not be used if OS doesn't support zeroout */
   1037 #pragma GCC diagnostic push
   1038 #pragma GCC diagnostic ignored "-Wunused-parameter"
   1039 static errcode_t unix_zeroout(io_channel channel, unsigned long long block,
   1040 			      unsigned long long count)
   1041 {
   1042 	struct unix_private_data *data;
   1043 	int		ret;
   1044 
   1045 	EXT2_CHECK_MAGIC(channel, EXT2_ET_MAGIC_IO_CHANNEL);
   1046 	data = (struct unix_private_data *) channel->private_data;
   1047 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
   1048 
   1049 	if (getenv("UNIX_IO_NOZEROOUT"))
   1050 		goto unimplemented;
   1051 
   1052 	if (channel->flags & CHANNEL_FLAGS_BLOCK_DEVICE) {
   1053 		/* Not implemented until the BLKZEROOUT mess is fixed */
   1054 		goto unimplemented;
   1055 	} else {
   1056 		/* Regular file, try to use truncate/punch/zero. */
   1057 		struct stat statbuf;
   1058 
   1059 		if (count == 0)
   1060 			return 0;
   1061 		/*
   1062 		 * If we're trying to zero a range past the end of the file,
   1063 		 * extend the file size, then truncate everything.
   1064 		 */
   1065 		ret = fstat(data->dev, &statbuf);
   1066 		if (ret)
   1067 			goto err;
   1068 		if ((unsigned long long) statbuf.st_size <
   1069 			(block + count) * channel->block_size + data->offset) {
   1070 			ret = ftruncate(data->dev,
   1071 					(block + count) * channel->block_size + data->offset);
   1072 			if (ret)
   1073 				goto err;
   1074 		}
   1075 #if defined(HAVE_FALLOCATE) && (defined(FALLOC_FL_ZERO_RANGE) || \
   1076 	(defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)))
   1077 #if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
   1078 		ret = fallocate(data->dev,
   1079 				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
   1080 				(off_t)(block) * channel->block_size + data->offset,
   1081 				(off_t)(count) * channel->block_size);
   1082 		if (ret == 0)
   1083 			goto err;
   1084 #endif
   1085 #ifdef FALLOC_FL_ZERO_RANGE
   1086 		ret = fallocate(data->dev,
   1087 				FALLOC_FL_ZERO_RANGE,
   1088 				(off_t)(block) * channel->block_size + data->offset,
   1089 				(off_t)(count) * channel->block_size);
   1090 #endif
   1091 #else
   1092 		goto unimplemented;
   1093 #endif /* HAVE_FALLOCATE && (ZERO_RANGE || (PUNCH_HOLE && KEEP_SIZE)) */
   1094 	}
   1095 err:
   1096 	if (ret < 0) {
   1097 		if (errno == EOPNOTSUPP)
   1098 			goto unimplemented;
   1099 		return errno;
   1100 	}
   1101 	return 0;
   1102 unimplemented:
   1103 	return EXT2_ET_UNIMPLEMENTED;
   1104 }
   1105 #pragma GCC diagnostic pop
   1106 
   1107 static struct struct_io_manager struct_unix_manager = {
   1108 	.magic		= EXT2_ET_MAGIC_IO_MANAGER,
   1109 	.name		= "Unix I/O Manager",
   1110 	.open		= unix_open,
   1111 	.close		= unix_close,
   1112 	.set_blksize	= unix_set_blksize,
   1113 	.read_blk	= unix_read_blk,
   1114 	.write_blk	= unix_write_blk,
   1115 	.flush		= unix_flush,
   1116 	.write_byte	= unix_write_byte,
   1117 	.set_option	= unix_set_option,
   1118 	.get_stats	= unix_get_stats,
   1119 	.read_blk64	= unix_read_blk64,
   1120 	.write_blk64	= unix_write_blk64,
   1121 	.discard	= unix_discard,
   1122 	.cache_readahead	= unix_cache_readahead,
   1123 	.zeroout	= unix_zeroout,
   1124 };
   1125 
   1126 io_manager unix_io_manager = &struct_unix_manager;
   1127 
   1128 static struct struct_io_manager struct_unixfd_manager = {
   1129 	.magic		= EXT2_ET_MAGIC_IO_MANAGER,
   1130 	.name		= "Unix fd I/O Manager",
   1131 	.open		= unixfd_open,
   1132 	.close		= unix_close,
   1133 	.set_blksize	= unix_set_blksize,
   1134 	.read_blk	= unix_read_blk,
   1135 	.write_blk	= unix_write_blk,
   1136 	.flush		= unix_flush,
   1137 	.write_byte	= unix_write_byte,
   1138 	.set_option	= unix_set_option,
   1139 	.get_stats	= unix_get_stats,
   1140 	.read_blk64	= unix_read_blk64,
   1141 	.write_blk64	= unix_write_blk64,
   1142 	.discard	= unix_discard,
   1143 	.cache_readahead	= unix_cache_readahead,
   1144 	.zeroout	= unix_zeroout,
   1145 };
   1146 
   1147 io_manager unixfd_io_manager = &struct_unixfd_manager;
   1148