/******************************************************************************/
/* Copyright (c) Tim LaBerge <tim.laberge (at) quantum.com>, 2009             */
/*                                                                            */
/* This program is free software; you can redistribute it and/or modify       */
/* it under the terms of the GNU General Public License as published by       */
/* the Free Software Foundation; either version 2 of the License, or          */
/* (at your option) any later version.                                        */
/*                                                                            */
/* This program is distributed in the hope that it will be useful,            */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of             */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See                   */
/* the GNU General Public License for more details.                           */
/*                                                                            */
/* You should have received a copy of the GNU General Public License          */
/* along with this program; if not, write to the Free Software Foundation,    */
/* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA           */
/*                                                                            */
/******************************************************************************/

/******************************************************************************/
/*                                                                            */
/* File: dma_thread_diotest7.c                                                */
/*                                                                            */
/* Description: The man page for open(2) states the following:                */
/* O_DIRECT (Since Linux 2.6.10). Try to minimize cache effects of the I/O    */
/* to and from this file. In general this will degrade performance, but it    */
/* is useful in special situations, such as when applications do their own    */
/* caching. File I/O is done directly to/from user space buffers. The I/O is  */
/* synchronous, that is, at the completion of a read(2) or write(2), data is  */
/* guaranteed to have been transferred. Under Linux 2.4 transfer sizes, and   */
/* the alignment of user buffer and file offset must all be multiples of      */
/* the logical block size of the file system. Under Linux 2.6 alignment to    */
/* 512-byte boundaries suffices.
*/
/* However, it appears that data corruption may occur when a multithreaded    */
/* process reads into a non-page size aligned user buffer. A test program     */
/* which reliably reproduces the problem on ext3 and xfs is attached. The     */
/* program creates, patterns, reads, and verifies a series of files. In the   */
/* read phase, a file is opened with O_DIRECT n times, where n is the         */
/* number of cpu's. A single buffer large enough to contain the file is       */
/* allocated and patterned with data not found in any of the files. The       */
/* alignment of the buffer is controlled by a command line option. Each file  */
/* is read in parallel by n threads, where n is the number of cpu's. Thread   */
/* 0 reads the first page of data from the file into the first page of the    */
/* buffer, thread 1 reads the second page of data into the second page of     */
/* the buffer, and so on. Thread n - 1 reads the remainder of the file into   */
/* the remainder of the buffer.                                               */
/* After a thread reads data into the buffer, it immediately verifies that    */
/* the contents of the buffer are correct. If the buffer contains corrupt     */
/* data, the thread dumps the data surrounding the corruption and calls       */
/* abort(). Otherwise, the thread exits.                                      */
/* Crucially, before the reader threads are dispatched, another thread is     */
/* started which calls fork()/msleep() in a loop until all reads are          */
/* completed. The child created by fork() does nothing but call exit(0). A    */
/* command line option controls whether the buffer is aligned. In the case    */
/* where the buffer is aligned on a page boundary, all is well. In the case   */
/* where the buffer is aligned on a page + 512 byte offset, corruption is     */
/* seen frequently.
*/
/* I believe that what is happening is that in the direct IO path, because    */
/* the user's buffer is not aligned, some user pages are being mapped twice.  */
/* When a fork() happens in between the calls to map the page, the page will  */
/* be marked as COW. When the second map happens (via get_user_pages()), a    */
/* new physical page will be allocated and copied. Thus, there is a race      */
/* between the completion of the first read from disk (and write to the user  */
/* page) and get_user_pages() mapping the page for the second time. If the    */
/* write does not complete before the page is copied, the user will see       */
/* stale data in the first 512 bytes of this page of their buffer. Indeed,    */
/* this is the corruption most frequently seen. (It's also possible for the   */
/* race to be lost the other way, so that the last 3584 bytes of the page     */
/* are stale.)                                                                */
/* The attached program (which is a heavily modified version of a program     */
/* provided by a customer seeing this problem) reliably reproduces the        */
/* problem on any multicore Linux machine on both ext3 and xfs, although any  */
/* filesystem using the generic blockdev_direct_IO() routine is probably      */
/* vulnerable. I've seen a few threads that mention the potential for this    */
/* kind of problem, but no definitive solution or workaround (other than      */
/* "Don't do that").
*/ 77 /* http://marc.info/?l=linux-mm&m=122668235304637&w=2 */ 78 /* */ 79 /* Total Tests: 1 */ 80 /* */ 81 /* Test Name: dma_thread_diotest7 */ 82 /* */ 83 /* Author: Tim LaBerge <tim.laberge (at) quantum.com> */ 84 /* */ 85 /* History: Reported - Jan 07 2009 - Li Zefan <lizf (at) cn.fujitsu.com> */ 86 /* Ported - Jan 23 2009 - Subrata <subrata (at) linux.vnet.ibm.com> */ 87 /* */ 88 /******************************************************************************/ 89 90 #define _GNU_SOURCE 1 91 92 #include <stdio.h> 93 #include <stdint.h> 94 #include <stdlib.h> 95 #include <fcntl.h> 96 #include <unistd.h> 97 #include <memory.h> 98 #include <pthread.h> 99 #include <getopt.h> 100 #include <errno.h> 101 #include <sys/types.h> 102 #include <sys/wait.h> 103 #include <sys/mount.h> 104 105 #include "test.h" 106 #include "safe_macros.h" 107 108 #define FILESIZE (12*1024*1024) 109 #define READSIZE (1024*1024) 110 111 #define MNT_POINT "mntpoint" 112 #define FILE_BASEPATH MNT_POINT "/_dma_thread_test_%.04d.tmp" 113 #define DIR_MODE (S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP| \ 114 S_IXGRP|S_IROTH|S_IXOTH) 115 #define FILECOUNT 100 116 #define PATTERN (0xfa) 117 #define PAGE_SIZE getpagesize() 118 #define MIN_WORKERS 2 119 #define MAX_WORKERS (READSIZE/PAGE_SIZE) 120 121 char *TCID = "dma_thread_diotest"; 122 int TST_TOTAL = 1; 123 124 static void setup(void); 125 static void dma_thread_diotest_verify(void); 126 static void cleanup(void); 127 static void help(void); 128 129 static unsigned char *buffer; 130 131 static char *align_str; 132 static int align; 133 static char *workers_str; 134 static int workers; 135 static char *device; 136 static int mount_flag; 137 static option_t options[] = { 138 {"a:", NULL, &align_str}, 139 {"w:", NULL, &workers_str}, 140 {NULL, NULL, NULL} 141 }; 142 143 static volatile int done; 144 static volatile int tst_result; 145 146 typedef struct { 147 pthread_t tid; 148 int worker_number; 149 int fd; 150 int offset; 151 int length; 152 int pattern; 153 
unsigned char *buffer; 154 } worker_t; 155 static worker_t *worker; 156 157 static void *worker_thread(void *arg) 158 { 159 int i, k; 160 int nread; 161 worker_t *worker = (worker_t *)arg; 162 int offset = worker->offset; 163 int fd = worker->fd; 164 unsigned char *buffer = worker->buffer; 165 int pattern = worker->pattern; 166 int length = worker->length; 167 168 if (lseek(fd, offset, SEEK_SET) < 0) { 169 fprintf(stderr, "Failed to lseek to %d on fd %d: %s.\n", 170 offset, fd, strerror(errno)); 171 return (void *) 1; 172 } 173 174 nread = read(fd, buffer, length); 175 if (nread == -1 || nread != length) { 176 fprintf(stderr, "read failed in worker thread%d: %s", 177 worker->worker_number, strerror(errno)); 178 return (void *) 1; 179 } 180 181 /* Corruption check */ 182 for (i = 0; i < length; i++) { 183 if (buffer[i] != pattern) { 184 printf("Bad data at 0x%.06x: %p, \n", i, buffer + i); 185 printf("Data dump starting at 0x%.06x:\n", i - 8); 186 printf("Expect 0x%x followed by 0x%x:\n", 187 pattern, PATTERN); 188 189 for (k = 0; k < 16; k++) { 190 printf("%02x ", buffer[i - 8 + k]); 191 if (k == 7) { 192 printf("\n"); 193 } 194 } 195 196 printf("\n"); 197 tst_result = 1; 198 return NULL; 199 } 200 } 201 202 return NULL; 203 } 204 205 static void *fork_thread(void *arg) 206 { 207 pid_t pid; 208 209 (void) arg; 210 211 while (!done) { 212 pid = tst_fork(); 213 if (pid == 0) { 214 exit(0); 215 } else if (pid < 0) { 216 fprintf(stderr, "Failed to fork child: %s.\n", 217 strerror(errno)); 218 return (void *) 1; 219 } 220 waitpid(pid, NULL, 0); 221 usleep(100); 222 } 223 224 return NULL; 225 } 226 227 int main(int argc, char *argv[]) 228 { 229 int i, lc; 230 231 workers = sysconf(_SC_NPROCESSORS_ONLN); 232 if (workers > MAX_WORKERS) 233 workers = MAX_WORKERS; 234 tst_parse_opts(argc, argv, options, help); 235 236 setup(); 237 238 for (lc = 0; TEST_LOOPING(lc); lc++) { 239 tst_count = 0; 240 241 for (i = 0; i < TST_TOTAL; i++) 242 dma_thread_diotest_verify(); 243 } 244 
245 cleanup(); 246 tst_exit(); 247 } 248 249 static void dma_thread_diotest_verify(void) 250 { 251 int n, j, offset, rc; 252 void *retval; 253 char filename[PATH_MAX]; 254 pthread_t fork_tid; 255 256 tst_result = 0; 257 258 for (n = 1; n <= FILECOUNT; n++) { 259 snprintf(filename, sizeof(filename), FILE_BASEPATH, n); 260 for (j = 0; j < workers; j++) { 261 worker[j].fd = SAFE_OPEN(cleanup, filename, 262 O_RDONLY | O_DIRECT); 263 worker[j].pattern = n; 264 } 265 266 tst_resm(TINFO, "Reading file %d.", n); 267 268 for (offset = 0; offset < FILESIZE; offset += READSIZE) { 269 memset(buffer, PATTERN, READSIZE + align); 270 for (j = 0; j < workers; j++) { 271 worker[j].offset = offset + j * PAGE_SIZE; 272 worker[j].buffer = 273 buffer + align + j * PAGE_SIZE; 274 worker[j].length = PAGE_SIZE; 275 } 276 /* The final worker reads whatever is left over. */ 277 worker[workers - 1].length = 278 READSIZE - PAGE_SIZE * (workers - 1); 279 280 done = 0; 281 282 rc = pthread_create(&fork_tid, NULL, fork_thread, NULL); 283 if (rc != 0) { 284 tst_brkm(TBROK, cleanup, "pthread_create " 285 "failed: %s", strerror(rc)); 286 } 287 288 for (j = 0; j < workers; j++) { 289 rc = pthread_create(&worker[j].tid, NULL, 290 worker_thread, worker + j); 291 if (rc != 0) { 292 tst_brkm(TBROK, cleanup, "Can't create" 293 "worker thread %d: %s", 294 j, strerror(rc)); 295 } 296 } 297 298 for (j = 0; j < workers; j++) { 299 rc = pthread_join(worker[j].tid, &retval); 300 if (rc != 0) { 301 tst_brkm(TBROK, cleanup, "Failed to " 302 "join worker thread %d: %s.", 303 j, strerror(rc)); 304 } 305 if ((intptr_t)retval != 0) { 306 tst_brkm(TBROK, cleanup, "there is" 307 "some errors in worker[%d]," 308 "return value: %ld", 309 j, (intptr_t)retval); 310 } 311 } 312 313 /* Let the fork thread know it's ok to exit */ 314 done = 1; 315 316 rc = pthread_join(fork_tid, &retval); 317 if (rc != 0) { 318 tst_brkm(TBROK, cleanup, 319 "Failed to join fork thread: %s.", 320 strerror(rc)); 321 } 322 if ((intptr_t)retval 
!= 0) { 323 tst_brkm(TBROK, cleanup, 324 "fork() failed in fork thread:" 325 "return value: %ld", (intptr_t)retval); 326 } 327 } 328 329 /* Close the fd's for the next file. */ 330 for (j = 0; j < workers; j++) 331 SAFE_CLOSE(cleanup, worker[j].fd); 332 if (tst_result) 333 break; 334 } 335 336 if (tst_result) 337 tst_resm(TFAIL, "data corruption is detected"); 338 else 339 tst_resm(TPASS, "data corruption is not detected"); 340 } 341 342 static void setup(void) 343 { 344 char filename[PATH_MAX]; 345 int n, j, fd, directflag = 1; 346 long type; 347 348 if (align_str) { 349 align = atoi(align_str); 350 if (align < 0 || align > PAGE_SIZE) 351 tst_brkm(TCONF, NULL, "Bad alignment %d.", align); 352 } 353 tst_resm(TINFO, "using alignment %d", align); 354 355 if (workers_str) { 356 workers = atoi(workers_str); 357 if (workers < MIN_WORKERS || workers > MAX_WORKERS) { 358 tst_brkm(TCONF, NULL, "Worker count %d not between " 359 "%d and %d, inclusive", 360 workers, MIN_WORKERS, MAX_WORKERS); 361 } 362 } 363 tst_resm(TINFO, "using %d workers.", workers); 364 365 tst_sig(FORK, DEF_HANDLER, NULL); 366 tst_require_root(); 367 368 TEST_PAUSE; 369 370 tst_tmpdir(); 371 372 /* 373 * Some file systems may not implement the O_DIRECT flag and open() will 374 * fail with EINVAL if it is used. So add this check for current 375 * filesystem current directory is in, if not supported, we choose to 376 * have this test in LTP_BIG_DEV and mkfs it as ext3. 
377 */ 378 fd = open("testfile", O_CREAT | O_DIRECT, 0644); 379 if (fd < 0 && errno == EINVAL) { 380 type = tst_fs_type(NULL, "."); 381 tst_resm(TINFO, "O_DIRECT flag is not supported on %s " 382 "filesystem", tst_fs_type_name(type)); 383 directflag = 0; 384 } else if (fd > 0) { 385 SAFE_CLOSE(NULL, fd); 386 } 387 388 SAFE_MKDIR(cleanup, MNT_POINT, DIR_MODE); 389 390 /* 391 * verify whether the current directory has enough free space, 392 * if it is not satisfied, we will use the LTP_BIG_DEV, which 393 * will be exported by runltp with "-z" option. 394 */ 395 if (!directflag || !tst_fs_has_free(NULL, ".", 1300, TST_MB)) { 396 device = getenv("LTP_BIG_DEV"); 397 if (device == NULL) { 398 tst_brkm(TCONF, NULL, 399 "you must specify a big blockdevice(>1.3G)"); 400 } else { 401 tst_mkfs(NULL, device, "ext3", NULL, NULL); 402 } 403 404 SAFE_MOUNT(NULL, device, MNT_POINT, "ext3", 0, NULL); 405 mount_flag = 1; 406 } 407 408 worker = SAFE_MALLOC(cleanup, workers * sizeof(worker_t)); 409 410 for (j = 0; j < workers; j++) 411 worker[j].worker_number = j; 412 413 for (n = 1; n <= FILECOUNT; n++) { 414 snprintf(filename, sizeof(filename), FILE_BASEPATH, n); 415 416 if (tst_fill_file(filename, n, FILESIZE, 1)) { 417 tst_brkm(TBROK, cleanup, "failed to create file: %s", 418 filename); 419 } 420 } 421 422 if (posix_memalign((void **)&buffer, PAGE_SIZE, READSIZE + align) != 0) 423 tst_brkm(TBROK, cleanup, "call posix_memalign failed"); 424 } 425 426 static void cleanup(void) 427 { 428 free(buffer); 429 430 if (mount_flag && tst_umount(MNT_POINT) < 0) 431 tst_resm(TWARN | TERRNO, "umount device:%s failed", device); 432 433 free(worker); 434 435 tst_rmdir(); 436 } 437 438 static void help(void) 439 { 440 printf("-a align read buffer to offset <alignment>.\n"); 441 printf("-w number of worker threads, 2 (default) to %d," 442 " defaults to number of cores.\n", MAX_WORKERS); 443 } 444