
/* This is an example of a program which does atomic memory operations
   between two processes which share a page. Valgrind 3.4.1 and
   earlier produce incorrect answers because it does not preserve
   atomicity of the relevant instructions in the generated code; but
   the post-DCAS-merge versions of Valgrind do behave correctly. */

/* On ARM, this can be compiled into either ARM or Thumb code, so as
   to test both A and T encodings of LDREX/STREX et al. Also on ARM,
   it tests doubleword atomics (LDREXD, STREXD) which I don't think it
   does on any other platform. */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <sys/wait.h>
#include "tests/sys_mman.h"

/* Number of iterations each process performs; both parent and child
   add to the same shared counters NNN times, so the expected final
   values checked in main() are 2*NNN (masked to the operand width). */
#define NNN 3456987

#define IS_8_ALIGNED(_ptr) (0 == (((unsigned long)(_ptr)) & 7))


/* Atomically do *p += n for an 8-bit location, using whatever
   atomic-RMW mechanism the target architecture provides.  Callers
   guarantee p is 8-aligned.  Only the low 8 bits of n are meaningful.
   Architecture notes:
   - x86/amd64: LOCK-prefixed addb; 'block' passes p and n to the asm
     through a single register (esi/rsi).
   - ppc32/ppc64: lwarx/stwcx (ldarx/stdcx) reservation loop over the
     whole 4- or 8-byte word containing *p, with n pre-shifted into the
     most significant byte; the CR0 "store succeeded" bit is extracted
     via mfcr/srwi/andi and the loop retries until it is 1.
   - arm: ldrexb/strexb loop; block[0]=p, block[1]=n on input, and
     block[2] receives the strexb status (0 = success).
   - s390x: compare-and-swap (cs) loop on the containing word, with
     icm/stcm used to splice the updated byte in.
   - mips32: ll/sc loop; block[2] receives the sc status (1 = success).
     The EB variant additionally byte-swaps (wsbh/rotr) so the updated
     byte lands in the big-endian byte position. */
__attribute__((noinline)) void atomic_add_8bit ( char* p, int n )
{
#if defined(VGA_x86)
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax" "\n\t"
      "movl 4(%%esi),%%ebx" "\n\t"
      "lock; addb %%bl,(%%eax)" "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax" "\n\t"
      "movq 8(%%rsi),%%rbx" "\n\t"
      "lock; addb %%bl,(%%rax)" "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   /* Nasty hack. Does correctly atomically do *p += n, but only if p
      is 4-aligned -- guaranteed by caller. */
   /* n is shifted into the top (big-endian first) byte of the word;
      adding it to the whole word updates just that byte provided any
      carry stays out of the lower bytes (true for this test's data). */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx 15,0,%1" "\n\t"
         "add 15,15,%2" "\n\t"
         "stwcx. 15,0,%1" "\n\t"
         "mfcr %0" "\n\t"
         "srwi %0,%0,29" "\n\t"
         "andi. %0,%0,1" "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 24)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64)
   /* Nasty hack. Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx 15,0,%1" "\n\t"
         "add 15,15,%2" "\n\t"
         "stdcx. 15,0,%1" "\n\t"
         "mfcr %0" "\n\t"
         "srwi %0,%0,29" "\n\t"
         "andi. %0,%0,1" "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 56)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* block[0] = p, block[1] = n, block[2] = strexb status (0=success),
      pre-set to 0xFFFFFFFF so the first iteration always runs. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov r5, %0" "\n\t"
         "ldr r9, [r5, #0]" "\n\t" // p
         "ldr r10, [r5, #4]" "\n\t" // n
         "ldrexb r8, [r9]" "\n\t"
         "add r8, r8, r10" "\n\t"
         "strexb r4, r8, [r9]" "\n\t"
         "str r4, [r5, #8]" "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   int dummy;
   __asm__ __volatile__(
      " l 0,%0\n\t"
      "0: st 0,%1\n\t"
      " icm 1,1,%1\n\t"
      " ar 1,%2\n\t"
      " stcm 1,1,%1\n\t"
      " l 1,%1\n\t"
      " cs 0,1,%0\n\t"
      " jl 0b\n\t"
      : "+m" (*p), "+m" (dummy)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
#if defined (_MIPSEL)
   /* block[0] = p, block[1] = n, block[2] = sc status (1=success). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "move $t0, %0" "\n\t"
         "lw $t1, 0($t0)" "\n\t" // p
         "lw $t2, 4($t0)" "\n\t" // n
         "ll $t3, 0($t1)" "\n\t"
         "addu $t3, $t3, $t2" "\n\t"
         "andi $t3, $t3, 0xFF" "\n\t"
         "sc $t3, 0($t1)" "\n\t"
         "sw $t3, 8($t0)" "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian: after masking the sum to 8 bits, wsbh/rotr replicate
      the byte into the big-endian byte position before the sc. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "move $t0, %0" "\n\t"
         "lw $t1, 0($t0)" "\n\t" // p
         "lw $t2, 4($t0)" "\n\t" // n
         "li $t4, 0x000000FF" "\n\t"
         "ll $t3, 0($t1)" "\n\t"
         "addu $t3, $t3, $t2" "\n\t"
         "and $t3, $t3, $t4" "\n\t"
         "wsbh $t4, $t3" "\n\t"
         "rotr $t4, $t4, 16" "\n\t"
         "or $t3, $t4, $t3" "\n\t"
         "sc $t3, 0($t1)" "\n\t"
         "sw $t3, 8($t0)" "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "t0", "t1", "t2", "t3", "t4"
      );
   } while (block[2] != 1);
#endif
#else
# error "Unsupported arch"
#endif
}


/* Atomically do *p += n for a 16-bit location.  Same per-architecture
   strategies as atomic_add_8bit, with halfword encodings (addw,
   ldrexh/strexh, icm/stcm mask 3, 0xFFFF mask) and the PPC shifted
   operand placed in the top halfword (<<16 / <<48). */
__attribute__((noinline)) void atomic_add_16bit ( short* p, int n )
{
#if defined(VGA_x86)
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax" "\n\t"
      "movl 4(%%esi),%%ebx" "\n\t"
      "lock; addw %%bx,(%%eax)" "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax" "\n\t"
      "movq 8(%%rsi),%%rbx" "\n\t"
      "lock; addw %%bx,(%%rax)" "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   /* Nasty hack. Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx 15,0,%1" "\n\t"
         "add 15,15,%2" "\n\t"
         "stwcx. 15,0,%1" "\n\t"
         "mfcr %0" "\n\t"
         "srwi %0,%0,29" "\n\t"
         "andi. %0,%0,1" "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 16)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64)
   /* Nasty hack. Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx 15,0,%1" "\n\t"
         "add 15,15,%2" "\n\t"
         "stdcx. 15,0,%1" "\n\t"
         "mfcr %0" "\n\t"
         "srwi %0,%0,29" "\n\t"
         "andi. %0,%0,1" "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 48)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* block[0] = p, block[1] = n, block[2] = strexh status (0=success). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov r5, %0" "\n\t"
         "ldr r9, [r5, #0]" "\n\t" // p
         "ldr r10, [r5, #4]" "\n\t" // n
         "ldrexh r8, [r9]" "\n\t"
         "add r8, r8, r10" "\n\t"
         "strexh r4, r8, [r9]" "\n\t"
         "str r4, [r5, #8]" "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   int dummy;
   __asm__ __volatile__(
      " l 0,%0\n\t"
      "0: st 0,%1\n\t"
      " icm 1,3,%1\n\t"
      " ar 1,%2\n\t"
      " stcm 1,3,%1\n\t"
      " l 1,%1\n\t"
      " cs 0,1,%0\n\t"
      " jl 0b\n\t"
      : "+m" (*p), "+m" (dummy)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
#if defined (_MIPSEL)
   /* block[0] = p, block[1] = n, block[2] = sc status (1=success). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "move $t0, %0" "\n\t"
         "lw $t1, 0($t0)" "\n\t" // p
         "lw $t2, 4($t0)" "\n\t" // n
         "ll $t3, 0($t1)" "\n\t"
         "addu $t3, $t3, $t2" "\n\t"
         "andi $t3, $t3, 0xFFFF" "\n\t"
         "sc $t3, 0($t1)" "\n\t"
         "sw $t3, 8($t0)" "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* NOTE(review): unlike every other variant in this file, this does
      NOT perform an atomic (or even a real) add.  It non-atomically
      stores the literal 32694 -- the exact final value main() expects
      for the 16-bit counter -- into the big-endian upper halfword of
      the word at p, and unconditionally writes 1 (success) to
      block[2], so the loop runs once.  Presumably a placeholder for a
      proper ll/sc sequence; confirm against upstream before relying on
      this path to actually test atomicity. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "move $t0, %0" "\n\t"
         "lw $t1, 0($t0)" "\n\t" // p
         "li $t2, 32694" "\n\t" // n
         "li $t3, 0x1" "\n\t"
         "sll $t2, $t2, 16" "\n\t"
         "sw $t2, 0($t1)" "\n\t"
         "sw $t3, 8($t0)" "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#else
# error "Unsupported arch"
#endif
}

/* Atomically do *p += n for a 32-bit location.  On ppc32 the word is
   the operand itself, so no shift hack is needed; on ppc64 the value
   is shifted into the top word of the containing doubleword (<<32). */
__attribute__((noinline)) void atomic_add_32bit ( int* p, int n )
{
#if defined(VGA_x86)
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax" "\n\t"
      "movl 4(%%esi),%%ebx" "\n\t"
      "lock; addl %%ebx,(%%eax)" "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax" "\n\t"
      "movq 8(%%rsi),%%rbx" "\n\t"
      "lock; addl %%ebx,(%%rax)" "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx 15,0,%1" "\n\t"
         "add 15,15,%2" "\n\t"
         "stwcx. 15,0,%1" "\n\t"
         "mfcr %0" "\n\t"
         "srwi %0,%0,29" "\n\t"
         "andi. %0,%0,1" "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(n)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64)
   /* Nasty hack. Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx 15,0,%1" "\n\t"
         "add 15,15,%2" "\n\t"
         "stdcx. 15,0,%1" "\n\t"
         "mfcr %0" "\n\t"
         "srwi %0,%0,29" "\n\t"
         "andi. %0,%0,1" "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 32)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* block[0] = p, block[1] = n, block[2] = strex status (0=success). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov r5, %0" "\n\t"
         "ldr r9, [r5, #0]" "\n\t" // p
         "ldr r10, [r5, #4]" "\n\t" // n
         "ldrex r8, [r9]" "\n\t"
         "add r8, r8, r10" "\n\t"
         "strex r4, r8, [r9]" "\n\t"
         "str r4, [r5, #8]" "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* Plain compare-and-swap loop: retry while cs reports a mismatch. */
   __asm__ __volatile__(
      " l 0,%0\n\t"
      "0: lr 1,0\n\t"
      " ar 1,%1\n\t"
      " cs 0,1,%0\n\t"
      " jl 0b\n\t"
      : "+m" (*p)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
   /* Full-word ll/sc needs no mask and is endianness-independent. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "move $t0, %0" "\n\t"
         "lw $t1, 0($t0)" "\n\t" // p
         "lw $t2, 4($t0)" "\n\t" // n
         "ll $t3, 0($t1)" "\n\t"
         "addu $t3, $t3, $t2" "\n\t"
         "sc $t3, 0($t1)" "\n\t"
         "sw $t3, 8($t0)" "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#else
# error "Unsupported arch"
#endif
}

/* Atomically do *p += n for a 64-bit location.  A no-op on 32-bit
   targets without doubleword atomics (x86, ppc32, mips32); main()'s
   PASS check accepts 0 for the 64-bit counter to cover that case. */
__attribute__((noinline)) void atomic_add_64bit ( long long int* p, int n )
{
#if defined(VGA_x86) || defined(VGA_ppc32) || defined(VGA_mips32)
   /* do nothing; is not supported */
#elif defined(VGA_amd64)
   // this is a bit subtle. It relies on the fact that, on a 64-bit platform,
   // sizeof(unsigned long long int) == sizeof(unsigned long) == sizeof(void*)
   unsigned long long int block[2];
   block[0] = (unsigned long long int)(unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax" "\n\t"
      "movq 8(%%rsi),%%rbx" "\n\t"
      "lock; addq %%rbx,(%%rax)" "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc64)
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx 15,0,%1" "\n\t"
         "add 15,15,%2" "\n\t"
         "stdcx. 15,0,%1" "\n\t"
         "mfcr %0" "\n\t"
         "srwi %0,%0,29" "\n\t"
         "andi. %0,%0,1" "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(n)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* block[0] = p, block[1] = n (64-bit), block[2] starts as all-ones.
      The 32-bit strexd status in r1 is stored into the LOW word of
      block[2] (assumes little-endian ARM -- the high word stays
      0xFFFFFFFF), so success is block[2] == 0xFFFFFFFF00000000. */
   unsigned long long int block[3]
     = { (unsigned long long int)(unsigned long)p,
         (unsigned long long int)n,
         0xFFFFFFFFFFFFFFFFULL };
   do {
      __asm__ __volatile__(
         "mov r5, %0" "\n\t"
         "ldr r8, [r5, #0]" "\n\t" // p
         "ldrd r2, r3, [r5, #8]" "\n\t" // n
         "ldrexd r0, r1, [r8]" "\n\t"
         "adds r2, r2, r0" "\n\t"
         "adc r3, r3, r1" "\n\t"
         "strexd r1, r2, r3, [r8]" "\n\t"
         "str r1, [r5, #16]" "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r0", "r1", "r8", "r2", "r3"
      );
   } while (block[2] != 0xFFFFFFFF00000000ULL);
#elif defined(VGA_s390x)
   /* 64-bit compare-and-swap (csg) loop. */
   __asm__ __volatile__(
      " lg 0,%0\n\t"
      "0: lgr 1,0\n\t"
      " agr 1,%1\n\t"
      " csg 0,1,%0\n\t"
      " jl 0b\n\t"
      : "+m" (*p)
      : "d" (n)
      : "cc", "memory", "0", "1");
#else
# error "Unsupported arch"
#endif
}

/* Maps a shared anonymous page, forks, and has parent and child each
   perform NNN atomic adds on four shared counters (8/16/32/64 bit).
   If the tool preserves atomicity, the final values are exactly
   2*NNN = 6913974 truncated to each width: -74 (8-bit, signed view),
   32694 (16-bit), 6913974 (32-bit), and 2*NNN*98765 = 682858642110
   (64-bit; 0 is also accepted where 64-bit atomics are unsupported). */
int main ( int argc, char** argv )
{
   int i, status;
   char* page;
   char* p8;
   short* p16;
   int* p32;
   long long int* p64;
   pid_t child, p2;

   printf("parent, pre-fork\n");

   /* MAP_SHARED so the counters are genuinely shared across fork(). */
   page = mmap( 0, sysconf(_SC_PAGESIZE),
                PROT_READ|PROT_WRITE,
                MAP_ANONYMOUS|MAP_SHARED, -1, 0 );
   if (page == MAP_FAILED) {
      perror("mmap failed");
      exit(1);
   }

   /* Counters spaced 256 bytes apart; all 8-aligned, as the ppc paths
      require. */
   p8 = (char*)(page+0);
   p16 = (short*)(page+256);
   p32 = (int*)(page+512);
   p64 = (long long int*)(page+768);

   assert( IS_8_ALIGNED(p8) );
   assert( IS_8_ALIGNED(p16) );
   assert( IS_8_ALIGNED(p32) );
   assert( IS_8_ALIGNED(p64) );

   memset(page, 0, 1024);

   *p8 = 0;
   *p16 = 0;
   *p32 = 0;
   *p64 = 0;

   child = fork();
   if (child == -1) {
      perror("fork() failed\n");
      return 1;
   }

   if (child == 0) {
      /* --- CHILD --- */
      printf("child\n");
      for (i = 0; i < NNN; i++) {
         atomic_add_8bit(p8, 1);
         atomic_add_16bit(p16, 1);
         atomic_add_32bit(p32, 1);
         atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
      }
      return 1;
      /* NOTREACHED */

   }

   /* --- PARENT --- */

   printf("parent\n");

   for (i = 0; i < NNN; i++) {
      atomic_add_8bit(p8, 1);
      atomic_add_16bit(p16, 1);
      atomic_add_32bit(p32, 1);
      atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
   }

   p2 = waitpid(child, &status, 0);
   assert(p2 == child);

   /* assert that child finished normally */
   assert(WIFEXITED(status));

   printf("FINAL VALUES: 8 bit %d, 16 bit %d, 32 bit %d, 64 bit %lld\n",
          (int)(*(signed char*)p8), (int)(*p16), *p32, *p64 );

   /* Expected: 2*NNN adds of 1 (truncated per width); 64-bit counter
      is 2*NNN*98765, or 0 on targets where atomic_add_64bit is a no-op. */
   if (-74 == (int)(*(signed char*)p8)
       && 32694 == (int)(*p16)
       && 6913974 == *p32
       && (0LL == *p64 || 682858642110LL == *p64)) {
      printf("PASS\n");
   } else {
      printf("FAIL -- see source code for expected values\n");
   }

   printf("parent exits\n");

   return 0;
}