Home | History | Annotate | Download | only in direct_io
      1 /******************************************************************************/
      2 /* Copyright (c) Tim LaBerge <tim.laberge (at) quantum.com>, 2009                  */
      3 /*                                                                            */
      4 /* This program is free software;  you can redistribute it and/or modify      */
      5 /* it under the terms of the GNU General Public License as published by       */
      6 /* the Free Software Foundation; either version 2 of the License, or          */
      7 /* (at your option) any later version.                                        */
      8 /*                                                                            */
      9 /* This program is distributed in the hope that it will be useful,            */
     10 /* but WITHOUT ANY WARRANTY;  without even the implied warranty of            */
     11 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See                  */
     12 /* the GNU General Public License for more details.                           */
     13 /*                                                                            */
     14 /* You should have received a copy of the GNU General Public License          */
     15 /* along with this program;  if not, write to the Free Software Foundation,   */
     16 /* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA           */
     17 /*                                                                            */
     18 /******************************************************************************/
     19 
     20 /******************************************************************************/
     21 /*                                                                            */
     22 /* File:        dma_thread_diotest7.c                                         */
     23 /*                                                                            */
     24 /* Description: The man page for open(2) states the following:                */
     25 /*   O_DIRECT (Since Linux 2.6.10). Try to minimize cache effects of the I/O  */
     26 /*   to and from this file. In general this will degrade performance, but it  */
     27 /*   is useful in special situations, such as when applications do their own  */
     28 /*   caching. File I/O is done directly to/from user space buffers. The I/O is*/
     29 /*   synchronous, that is, at the completion of a read(2) or write(2), data is*/
/*   guaranteed to have been transferred. Under Linux 2.4 transfer sizes, and */
     31 /*   the alignment of user buffer and file  offset  must all be multiples of  */
     32 /*   the logical block size of the file system.  Under Linux 2.6 alignment to */
/*   512-byte boundaries suffices.                                            */
     34 /*   However, it appears that data corruption may occur when a multithreaded  */
     35 /*   process reads into a non-page size aligned user buffer. A test program   */
     36 /*   which reliably reproduces the problem on ext3 and xfs is attached. The   */
/*   program creates, patterns, reads, and verifies a series of files. In the */
     38 /*   read phase, a file is opened with O_DIRECT n times, where n is the       */
     39 /*   number of cpu's. A single buffer large enough to contain the file is     */
     40 /*   allocated and patterned with data not found in any of the files. The     */
     41 /*   alignment of the buffer is controlled by a command line option. Each file*/
     42 /*   is read in parallel by n threads, where n is the number of cpu's. Thread */
     43 /*   0 reads the first page of data from the file into the first page of the  */
     44 /*   buffer, thread 1 reads the second page of data in to the second page of  */
     45 /*   the buffer, and so on.  Thread n - 1 reads the remainder of the file into*/
     46 /*   the remainder of the buffer.                                             */
     47 /*   After a thread reads data into the buffer, it immediately verifies that  */
     48 /*   the contents of the buffer are correct. If the buffer contains corrupt   */
     49 /*   data, the thread dumps the data surrounding the corruption and calls     */
     50 /*   abort(). Otherwise, the thread exits.                                    */
     51 /*   Crucially, before the reader threads are dispatched, another thread is   */
     52 /*   started which calls fork()/msleep() in a loop until all reads are compl- */
     53 /*   eted. The child created by fork() does nothing but call exit(0). A comm- */
     54 /*   and line option controls whether the buffer is aligned.  In the case wh- */
     55 /*   ere the buffer is aligned on a page boundary, all is well. In the case   */
     56 /*   where the buffer is aligned on a page + 512 byte offset, corruption is   */
     57 /*   seen frequently.                                                         */
     58 /*   I believe that what is happening is that in the direct IO path, because  */
     59 /*   the user's buffer is not aligned, some user pages are being mapped twice.*/
     60 /*   When a fork() happens in between the calls to map the page, the page will*/
     61 /*   be marked as COW. When the second map happens (via get_user_pages()), a  */
     62 /*   new physical page will be allocated and copied. Thus, there is a race    */
     63 /*   between the completion of the first read from disk (and write to the user*/
     64 /*   page) and get_user_pages() mapping the page for the second time. If the  */
     65 /*   write does not complete before the page is copied, the user will see     */
     66 /*   stale data in the first 512 bytes of this page of their buffer. Indeed,  */
     67 /*   this is corruption most frequently seen. (It's also possible for the race*/
     68 /*   to be lost the other way, so that the last 3584 bytes of the page are    */
     69 /*   stale.)                                                                  */
     70 /*   The attached program (which is a heavily modified version of a program   */
     71 /*   provided by a customer seeing this problem) reliably reproduces the pro- */
     72 /*   blem on any multicore linux machine on both ext3 and xfs, although any   */
     73 /*   filesystem using the generic blockdev_direct_IO() routine is probably    */
     74 /*   vulnerable. I've seen a few threads that mention the potential for this  */
     75 /*   kind of problem, but no definitive solution or workaround (other than    */
     76 /*   "Don't do that").                                                        */
     77 /*   http://marc.info/?l=linux-mm&m=122668235304637&w=2                       */
     78 /*                                                                            */
     79 /* Total Tests: 1                                                             */
     80 /*                                                                            */
     81 /* Test Name:   dma_thread_diotest7                                           */
     82 /*                                                                            */
     83 /* Author:      Tim LaBerge <tim.laberge (at) quantum.com>                         */
     84 /*                                                                            */
     85 /* History:     Reported - Jan 07 2009 - Li Zefan <lizf (at) cn.fujitsu.com>       */
     86 /*              Ported   - Jan 23 2009 - Subrata <subrata (at) linux.vnet.ibm.com> */
     87 /*                                                                            */
     88 /******************************************************************************/
     89 
     90 #define _GNU_SOURCE 1
     91 
     92 #include <stdio.h>
     93 #include <stdint.h>
     94 #include <stdlib.h>
     95 #include <fcntl.h>
     96 #include <unistd.h>
     97 #include <memory.h>
     98 #include <pthread.h>
     99 #include <getopt.h>
    100 #include <errno.h>
    101 #include <sys/types.h>
    102 #include <sys/wait.h>
    103 #include <sys/mount.h>
    104 
    105 #include "test.h"
    106 #include "safe_macros.h"
    107 
/* Size in bytes of each pattern file, and of the chunk read per pass. */
#define FILESIZE	(12*1024*1024)
#define READSIZE	(1024*1024)

/* Directory created inside the test's temporary directory by setup(). */
#define MNT_POINT	"mntpoint"
/* printf-style template for the pattern file names; takes the file index. */
#define FILE_BASEPATH   MNT_POINT "/_dma_thread_test_%.04d.tmp"
/* Mode bits used when creating MNT_POINT (rwxr-xr-x). */
#define DIR_MODE	(S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP| \
			 S_IXGRP|S_IROTH|S_IXOTH)
/* Number of pattern files created by setup() and read by the test. */
#define FILECOUNT	100
/* Filler byte written over the read buffer before each pass. */
#define PATTERN		(0xfa)
/* NOTE: expands to a function call, so it is evaluated at every use. */
#define PAGE_SIZE	getpagesize()
/* Allowed range for the number of reader threads (-w option). */
#define MIN_WORKERS	2
#define MAX_WORKERS	(READSIZE/PAGE_SIZE)
    120 
/* Test identification consumed by the LTP library. */
char *TCID = "dma_thread_diotest";
int TST_TOTAL = 1;

static void setup(void);
static void dma_thread_diotest_verify(void);
static void cleanup(void);
static void help(void);

/* Shared read buffer; allocated in setup(), freed in cleanup(). */
static unsigned char *buffer;

/* Raw -a argument and the byte offset parsed from it in setup(). */
static char *align_str;
static int align;
/* Raw -w argument and the resulting number of reader threads. */
static char *workers_str;
static int workers;
/* Block device taken from LTP_BIG_DEV when the current fs is unsuitable. */
static char *device;
/* Non-zero once setup() has mounted 'device' on MNT_POINT. */
static int mount_flag;
/* Extra command line options handed to tst_parse_opts() in main(). */
static option_t options[] = {
	{"a:", NULL, &align_str},
	{"w:", NULL, &workers_str},
	{NULL, NULL, NULL}
};

/* Set to 1 by the main thread to make fork_thread() leave its loop. */
static volatile int done;
/* Set to 1 by a worker thread that finds unexpected data in its buffer. */
static volatile int tst_result;
    145 
/* Per-thread description of one read request, passed to worker_thread(). */
typedef struct {
	pthread_t tid;		/* thread handle filled in by pthread_create() */
	int worker_number;	/* index of this worker, used in messages */
	int fd;			/* descriptor the worker seeks and reads on */
	int offset;		/* file offset at which the read starts */
	int length;		/* number of bytes to read and verify */
	int pattern;		/* byte value every read byte must equal */
	unsigned char *buffer;	/* destination of the read */
} worker_t;
/* Array of 'workers' entries, allocated in setup(). */
static worker_t *worker;
    156 
    157 static void *worker_thread(void *arg)
    158 {
    159 	int i, k;
    160 	int nread;
    161 	worker_t *worker = (worker_t *)arg;
    162 	int offset = worker->offset;
    163 	int fd = worker->fd;
    164 	unsigned char *buffer = worker->buffer;
    165 	int pattern = worker->pattern;
    166 	int length = worker->length;
    167 
    168 	if (lseek(fd, offset, SEEK_SET) < 0) {
    169 		fprintf(stderr, "Failed to lseek to %d on fd %d: %s.\n",
    170 			offset, fd, strerror(errno));
    171 		return (void *) 1;
    172 	}
    173 
    174 	nread = read(fd, buffer, length);
    175 	if (nread == -1 || nread != length) {
    176 		fprintf(stderr, "read failed in worker thread%d: %s",
    177 			worker->worker_number, strerror(errno));
    178 		return (void *) 1;
    179 	}
    180 
    181 	/* Corruption check */
    182 	for (i = 0; i < length; i++) {
    183 		if (buffer[i] != pattern) {
    184 			printf("Bad data at 0x%.06x: %p, \n", i, buffer + i);
    185 			printf("Data dump starting at 0x%.06x:\n", i - 8);
    186 			printf("Expect 0x%x followed by 0x%x:\n",
    187 			       pattern, PATTERN);
    188 
    189 			for (k = 0; k < 16; k++) {
    190 				printf("%02x ", buffer[i - 8 + k]);
    191 				if (k == 7) {
    192 					printf("\n");
    193 				}
    194 			}
    195 
    196 			printf("\n");
    197 			tst_result = 1;
    198 			return NULL;
    199 		}
    200 	}
    201 
    202 	return NULL;
    203 }
    204 
    205 static void *fork_thread(void *arg)
    206 {
    207 	pid_t pid;
    208 
    209 	(void) arg;
    210 
    211 	while (!done) {
    212 		pid = tst_fork();
    213 		if (pid == 0) {
    214 			exit(0);
    215 		} else if (pid < 0) {
    216 			fprintf(stderr, "Failed to fork child: %s.\n",
    217 				strerror(errno));
    218 			return (void *) 1;
    219 		}
    220 		waitpid(pid, NULL, 0);
    221 		usleep(100);
    222 	}
    223 
    224 	return NULL;
    225 }
    226 
    227 int main(int argc, char *argv[])
    228 {
    229 	int i, lc;
    230 
    231 	workers = sysconf(_SC_NPROCESSORS_ONLN);
    232 	if (workers > MAX_WORKERS)
    233 		workers = MAX_WORKERS;
    234 	tst_parse_opts(argc, argv, options, help);
    235 
    236 	setup();
    237 
    238 	for (lc = 0; TEST_LOOPING(lc); lc++) {
    239 		tst_count = 0;
    240 
    241 		for (i = 0; i < TST_TOTAL; i++)
    242 			dma_thread_diotest_verify();
    243 	}
    244 
    245 	cleanup();
    246 	tst_exit();
    247 }
    248 
    249 static void dma_thread_diotest_verify(void)
    250 {
    251 	int n, j, offset, rc;
    252 	void *retval;
    253 	char filename[PATH_MAX];
    254 	pthread_t fork_tid;
    255 
    256 	tst_result = 0;
    257 
    258 	for (n = 1; n <= FILECOUNT; n++) {
    259 		snprintf(filename, sizeof(filename), FILE_BASEPATH, n);
    260 		for (j = 0; j < workers; j++) {
    261 			worker[j].fd = SAFE_OPEN(cleanup, filename,
    262 						 O_RDONLY | O_DIRECT);
    263 			worker[j].pattern = n;
    264 		}
    265 
    266 		tst_resm(TINFO, "Reading file %d.", n);
    267 
    268 		for (offset = 0; offset < FILESIZE; offset += READSIZE) {
    269 			memset(buffer, PATTERN, READSIZE + align);
    270 			for (j = 0; j < workers; j++) {
    271 				worker[j].offset = offset + j * PAGE_SIZE;
    272 				worker[j].buffer =
    273 				    buffer + align + j * PAGE_SIZE;
    274 				worker[j].length = PAGE_SIZE;
    275 			}
    276 			/* The final worker reads whatever is left over. */
    277 			worker[workers - 1].length =
    278 			    READSIZE - PAGE_SIZE * (workers - 1);
    279 
    280 			done = 0;
    281 
    282 			rc = pthread_create(&fork_tid, NULL, fork_thread, NULL);
    283 			if (rc != 0) {
    284 				tst_brkm(TBROK, cleanup, "pthread_create "
    285 					 "failed: %s", strerror(rc));
    286 			}
    287 
    288 			for (j = 0; j < workers; j++) {
    289 				rc = pthread_create(&worker[j].tid, NULL,
    290 						    worker_thread, worker + j);
    291 				if (rc != 0) {
    292 					tst_brkm(TBROK, cleanup, "Can't create"
    293 						 "worker thread %d: %s",
    294 						 j, strerror(rc));
    295 				}
    296 			}
    297 
    298 			for (j = 0; j < workers; j++) {
    299 				rc = pthread_join(worker[j].tid, &retval);
    300 				if (rc != 0) {
    301 					tst_brkm(TBROK, cleanup, "Failed to "
    302 						 "join worker thread %d: %s.",
    303 						 j, strerror(rc));
    304 				}
    305 				if ((intptr_t)retval != 0) {
    306 					tst_brkm(TBROK, cleanup, "there is"
    307 						 "some errors in worker[%d],"
    308 						 "return value: %ld",
    309 						 j, (intptr_t)retval);
    310 				}
    311 			}
    312 
    313 			/* Let the fork thread know it's ok to exit */
    314 			done = 1;
    315 
    316 			rc = pthread_join(fork_tid, &retval);
    317 			if (rc != 0) {
    318 				tst_brkm(TBROK, cleanup,
    319 					 "Failed to join fork thread: %s.",
    320 					 strerror(rc));
    321 			}
    322 			if ((intptr_t)retval != 0) {
    323 				tst_brkm(TBROK, cleanup,
    324 					 "fork() failed in fork thread:"
    325 					 "return value: %ld", (intptr_t)retval);
    326 			}
    327 		}
    328 
    329 		/* Close the fd's for the next file. */
    330 		for (j = 0; j < workers; j++)
    331 			SAFE_CLOSE(cleanup, worker[j].fd);
    332 		if (tst_result)
    333 			break;
    334 	}
    335 
    336 	if (tst_result)
    337 		tst_resm(TFAIL, "data corruption is detected");
    338 	else
    339 		tst_resm(TPASS, "data corruption is not detected");
    340 }
    341 
    342 static void setup(void)
    343 {
    344 	char filename[PATH_MAX];
    345 	int n, j, fd, directflag = 1;
    346 	long type;
    347 
    348 	if (align_str) {
    349 		align = atoi(align_str);
    350 		if (align < 0 || align > PAGE_SIZE)
    351 			tst_brkm(TCONF, NULL, "Bad alignment %d.", align);
    352 	}
    353 	tst_resm(TINFO, "using alignment %d", align);
    354 
    355 	if (workers_str) {
    356 		workers = atoi(workers_str);
    357 		if (workers < MIN_WORKERS || workers > MAX_WORKERS) {
    358 			tst_brkm(TCONF, NULL, "Worker count %d not between "
    359 				 "%d and %d, inclusive",
    360 				 workers, MIN_WORKERS, MAX_WORKERS);
    361 		}
    362 	}
    363 	tst_resm(TINFO, "using %d workers.", workers);
    364 
    365 	tst_sig(FORK, DEF_HANDLER, NULL);
    366 	tst_require_root();
    367 
    368 	TEST_PAUSE;
    369 
    370 	tst_tmpdir();
    371 
    372 	/*
    373 	 * Some file systems may not implement the O_DIRECT flag and open() will
    374 	 * fail with EINVAL if it is used. So add this check for current
    375 	 * filesystem current directory is in, if not supported, we choose to
    376 	 * have this test in LTP_BIG_DEV and mkfs it as ext3.
    377 	 */
    378 	fd = open("testfile", O_CREAT | O_DIRECT, 0644);
    379 	if (fd < 0 && errno == EINVAL) {
    380 		type = tst_fs_type(NULL, ".");
    381 		tst_resm(TINFO, "O_DIRECT flag is not supported on %s "
    382 			 "filesystem", tst_fs_type_name(type));
    383 		directflag = 0;
    384 	} else if (fd > 0) {
    385 		SAFE_CLOSE(NULL, fd);
    386 	}
    387 
    388 	SAFE_MKDIR(cleanup, MNT_POINT, DIR_MODE);
    389 
    390 	/*
    391 	 * verify whether the current directory has enough free space,
    392 	 * if it is not satisfied, we will use the LTP_BIG_DEV, which
    393 	 * will be exported by runltp with "-z" option.
    394 	 */
    395 	if (!directflag || !tst_fs_has_free(NULL, ".", 1300, TST_MB)) {
    396 		device = getenv("LTP_BIG_DEV");
    397 		if (device == NULL) {
    398 			tst_brkm(TCONF, NULL,
    399 				 "you must specify a big blockdevice(>1.3G)");
    400 		} else {
    401 			tst_mkfs(NULL, device, "ext3", NULL, NULL);
    402 		}
    403 
    404 		if (mount(device, MNT_POINT, "ext3", 0, NULL) < 0) {
    405 			tst_brkm(TBROK | TERRNO, NULL,
    406 				 "mount device:%s failed", device);
    407 		}
    408 		mount_flag = 1;
    409 	}
    410 
    411 	worker = SAFE_MALLOC(cleanup, workers * sizeof(worker_t));
    412 
    413 	for (j = 0; j < workers; j++)
    414 		worker[j].worker_number = j;
    415 
    416 	for (n = 1; n <= FILECOUNT; n++) {
    417 		snprintf(filename, sizeof(filename), FILE_BASEPATH, n);
    418 
    419 		if (tst_fill_file(filename, n, FILESIZE, 1)) {
    420 			tst_brkm(TBROK, cleanup, "failed to create file: %s",
    421 				 filename);
    422 		}
    423 	}
    424 
    425 	if (posix_memalign((void **)&buffer, PAGE_SIZE, READSIZE + align) != 0)
    426 		tst_brkm(TBROK, cleanup, "call posix_memalign failed");
    427 }
    428 
    429 static void cleanup(void)
    430 {
    431 	free(buffer);
    432 
    433 	if (mount_flag && tst_umount(MNT_POINT) < 0)
    434 		tst_resm(TWARN | TERRNO, "umount device:%s failed", device);
    435 
    436 	free(worker);
    437 
    438 	tst_rmdir();
    439 }
    440 
    441 static void help(void)
    442 {
    443 	printf("-a align read buffer to offset <alignment>.\n");
    444 	printf("-w number of worker threads, 2 (default) to %d,"
    445 	       " defaults to number of cores.\n", MAX_WORKERS);
    446 }
    447