/* tests/atomic_incs.c -- Valgrind regression test: atomic increments
   performed concurrently by two processes sharing a page. */
      1 
      2 /* This is an example of a program which does atomic memory operations
      3    between two processes which share a page.  Valgrind 3.4.1 and
      4    earlier produce incorrect answers because it does not preserve
      5    atomicity of the relevant instructions in the generated code; but
      6    the post-DCAS-merge versions of Valgrind do behave correctly. */
      7 
      8 /* On ARM, this can be compiled into either ARM or Thumb code, so as
      9    to test both A and T encodings of LDREX/STREX et al.  Also on ARM,
     10    it tests doubleword atomics (LDREXD, STREXD) which I don't think it
     11    does on any other platform. */
     12 
     13 #include <stdlib.h>
     14 #include <stdio.h>
     15 #include <string.h>
     16 #include <assert.h>
     17 #include <unistd.h>
     18 #include <sys/wait.h>
     19 #include "tests/sys_mman.h"
     20 
     21 #define NNN 3456987
     22 
     23 #define IS_8_ALIGNED(_ptr)   (0 == (((unsigned long)(_ptr)) & 7))
     24 
     25 
/* Atomically do *p += n on an 8-bit location, using each target's
   native atomic primitive: a locked add (x86/amd64), a
   load-linked/store-conditional retry loop (ppc, arm, arm64, mips),
   or a compare-and-swap loop (s390x, tilegx).  Only the low 8 bits
   of n are significant.  Alignment preconditions on p are noted per
   branch and are guaranteed by the caller (main). */
__attribute__((noinline)) void atomic_add_8bit ( char* p, int n )
{
#if defined(VGA_x86)
   /* Marshal p and n through a two-word block so the asm needs only
      one input register; "lock; addb" does the atomic byte add. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax"      "\n\t"
      "movl 4(%%esi),%%ebx"      "\n\t"
      "lock; addb %%bl,(%%eax)"  "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   /* Same block-marshalling scheme, with 64-bit registers. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"      "\n\t"
      "movq 8(%%rsi),%%rbx"      "\n\t"
      "lock; addb %%bl,(%%rax)"  "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 4-aligned -- guaranteed by caller. */
   /* lwarx/stwcx. retry loop on the containing word, with n
      pre-shifted into the top byte lane (big-endian byte 0).  The
      mfcr/srwi/andi. tail extracts what is presumably CR0's EQ
      (store-succeeded) bit into 'success'. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stwcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 24)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64be)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   /* ldarx/stdcx. retry loop on the containing doubleword; n is
      shifted into the most-significant byte lane for big-endian. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 56)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64le)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   /* Little-endian: byte 0 is the lowest byte of the doubleword, so
      no shift of n is needed here. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n))
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* LDREXB/STREXB retry loop.  block[] carries p, n, and the STREX
      status word; STREXB writes 0 on success, so loop until
      block[2] == 0. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov    r5, %0"         "\n\t"
         "ldr    r9, [r5, #0]"   "\n\t" // p
         "ldr    r10, [r5, #4]"  "\n\t" // n
         "ldrexb r8, [r9]"       "\n\t"
         "add    r8, r8, r10"    "\n\t"
         "strexb r4, r8, [r9]"   "\n\t"
         "str    r4, [r5, #8]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_arm64)
   /* LDXRB/STXRB retry loop; same status-word convention as the arm
      branch (0 in block[2] means the store succeeded). */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxrb w8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxrb w4, w8, [x9]"    "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* Compare-and-swap loop: splice the updated byte into a scratch
      word (icm/stcm with mask 1 select one byte), then cs the whole
      word back; jl retries on contention. */
   int dummy;
   __asm__ __volatile__(
      "   l	0,%0\n\t"
      "0: st	0,%1\n\t"
      "   icm	1,1,%1\n\t"
      "   ar	1,%2\n\t"
      "   stcm  1,1,%1\n\t"
      "   l     1,%1\n\t"
      "   cs	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p), "+m" (dummy)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
      exception that can cause this function to fail. */
#if defined (_MIPSEL)
   /* ll/sc loop that rebuilds the word with only the low byte
      updated; sc leaves 1 in $t3 on success, saved to block[2]. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"           "\n\t"
         "lw   $t1, 0($t0)"       "\n\t"  // p
         "lw   $t2, 4($t0)"       "\n\t"  // n
         "andi $t2, $t2, 0xFF"    "\n\t"  // n = n and 0xFF
         "li   $t4, 0xFF"         "\n\t"
         "nor  $t4, $t4, $zero"   "\n\t"  // $t4 = 0xFFFFFF00
         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
         "and  $t4, $t4, $t3"     "\n\t"  // $t4 = $t3 and 0xFFFFFF00
         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
         "andi $t3, $t3, 0xFF"    "\n\t"  // $t3 = $t3 and 0xFF
         "or   $t3, $t3, $t4"     "\n\t"  // $t3 = $t3 or $t4
         "sc   $t3, 0($t1)"       "\n\t"
         "sw   $t3, 8($t0)"       "\n\t"  // save result
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "t4"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian: the target byte is the word's MSB, so add n << 24
      directly to the whole word inside an ll/sc loop. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n << 24, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"          "\n\t"
         "lw   $t1, 0($t0)"      "\n\t"  // p
         "lw   $t2, 4($t0)"      "\n\t"  // n
         "ll   $t3, 0($t1)"      "\n\t"
         "addu $t3, $t3, $t2"    "\n\t"
         "sc   $t3, 0($t1)"      "\n\t"
         "sw   $t3, 8($t0)"      "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#elif defined(VGA_mips64)
   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
      exception that can cause this function to fail. */
#if defined (_MIPSEL)
   /* 64-bit variant of the mips32 EL scheme: byte-splicing ll/sc
      loop on the containing 32-bit word. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move $t0, %0"           "\n\t"
         "ld   $t1, 0($t0)"       "\n\t"  // p
         "ld   $t2, 8($t0)"       "\n\t"  // n
         "andi $t2, $t2, 0xFF"    "\n\t"  // n = n and 0xFF
         "li   $s0, 0xFF"         "\n\t"
         "nor  $s0, $s0, $zero"   "\n\t"  // $s0 = 0xFFFFFF00
         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
         "and  $s0, $s0, $t3"     "\n\t"  // $s0 = $t3 and 0xFFFFFF00
         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
         "andi $t3, $t3, 0xFF"    "\n\t"  // $t3 = $t3 and 0xFF
         "or   $t3, $t3, $s0"     "\n\t"  // $t3 = $t3 or $s0
         "sc   $t3, 0($t1)"       "\n\t"
         "sw   $t3, 16($t0)"      "\n\t"  // save result
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "s0"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian doubleword: target byte is the MSB, so add n << 56
      inside an lld/scd loop. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n << 56, 0x0 };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"          "\n\t"
         "ld    $t1, 0($t0)"      "\n\t"  // p
         "ld    $t2, 8($t0)"      "\n\t"  // n
         "lld   $t3, 0($t1)"      "\n\t"
         "daddu $t3, $t3, $t2"    "\n\t"
         "scd   $t3, 0($t1)"      "\n\t"
         "sd    $t3, 16($t0)"     "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#elif defined(VGA_tilegx)
   /* cmpexch4 CAS loop on the aligned containing word; the byte lane
      is selected with 'mask'.  NOTE(review): 'i' is unused here, and
      0x2780 is presumably the CMPEXCH_VALUE SPR -- confirm against
      the TILE-Gx ISA docs. */
   int i;
   unsigned int *p4 = (unsigned int *)(((unsigned long long)p + 3) & (~3ULL));
   unsigned int  mask = (0xff) << ((int)p & 3);
   unsigned int  add = (n & 0xff) << ((int)p & 3);
   unsigned int x, new;

   while(1) {
      x = *p4;
      new = (x & (~mask)) | ((x + add) & mask);
      __insn_mtspr(0x2780, x);
      if ( __insn_cmpexch4(p4, new) == x)
         break;
   }
#else
# error "Unsupported arch"
#endif
}
    254 
    255 
/* Atomically do *p += n on a 16-bit location.  Structure mirrors the
   8-bit case: locked add on x86/amd64, LL/SC retry loops on
   ppc/arm/arm64/mips, compare-and-swap loops on s390x/tilegx.  Only
   the low 16 bits of n are significant.  Alignment preconditions on
   p are noted per branch and guaranteed by the caller (main). */
__attribute__((noinline)) void atomic_add_16bit ( short* p, int n )
{
#if defined(VGA_x86)
   /* "lock; addw" performs the atomic 16-bit add; p and n are passed
      through a block so the asm needs one input register. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax"      "\n\t"
      "movl 4(%%esi),%%ebx"      "\n\t"
      "lock; addw %%bx,(%%eax)"  "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   /* Same scheme, 64-bit registers, still a 16-bit locked add. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"      "\n\t"
      "movq 8(%%rsi),%%rbx"      "\n\t"
      "lock; addw %%bx,(%%rax)"  "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   /* lwarx/stwcx. retry loop; n is pre-shifted into the upper
      (big-endian) halfword lane of the containing word. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stwcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 16)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64be)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   /* ldarx/stdcx. retry loop; n shifted into the most-significant
      halfword lane for big-endian. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 48)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64le)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   /* Little-endian: the target halfword is the low end of the
      doubleword, so no shift is needed. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n))
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* LDREXH/STREXH retry loop; block[2] receives the STREX status
      (0 == store succeeded). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov    r5, %0"         "\n\t"
         "ldr    r9, [r5, #0]"   "\n\t" // p
         "ldr    r10, [r5, #4]"  "\n\t" // n
         "ldrexh r8, [r9]"       "\n\t"
         "add    r8, r8, r10"    "\n\t"
         "strexh r4, r8, [r9]"   "\n\t"
         "str    r4, [r5, #8]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_arm64)
   /* LDXRH/STXRH retry loop; same status convention as the arm
      branch. */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxrh w8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxrh w4, w8, [x9]"    "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* Compare-and-swap loop; icm/stcm with mask 3 select the two-byte
      lane, cs publishes the updated word, jl retries on contention. */
   int dummy;
   __asm__ __volatile__(
      "   l	0,%0\n\t"
      "0: st	0,%1\n\t"
      "   icm	1,3,%1\n\t"
      "   ar	1,%2\n\t"
      "   stcm  1,3,%1\n\t"
      "   l     1,%1\n\t"
      "   cs	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p), "+m" (dummy)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
      exception that can cause this function to fail. */
#if defined (_MIPSEL)
   /* ll/sc loop rebuilding the word with only the low halfword
      updated; sc leaves 1 in $t3 on success, saved to block[2]. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"           "\n\t"
         "lw   $t1, 0($t0)"       "\n\t"  // p
         "lw   $t2, 4($t0)"       "\n\t"  // n
         "andi $t2, $t2, 0xFFFF"  "\n\t"  // n = n and 0xFFFF
         "li   $t4, 0xFFFF"       "\n\t"
         "nor  $t4, $t4, $zero"   "\n\t"  // $t4 = 0xFFFF0000
         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
         "and  $t4, $t4, $t3"     "\n\t"  // $t4 = $t3 and 0xFFFF0000
         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
         "andi $t3, $t3, 0xFFFF"  "\n\t"  // $t3 = $t3 and 0xFFFF
         "or   $t3, $t3, $t4"     "\n\t"  // $t3 = $t3 or $t4
         "sc   $t3, 0($t1)"       "\n\t"
         "sw   $t3, 8($t0)"       "\n\t"  // save result
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "t4"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian: target halfword is the word's upper half, so add
      n << 16 directly to the whole word inside an ll/sc loop. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n << 16, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"          "\n\t"
         "lw   $t1, 0($t0)"      "\n\t"  // p
         "lw   $t2, 4($t0)"      "\n\t"  // n
         "ll   $t3, 0($t1)"      "\n\t"
         "addu $t3, $t3, $t2"    "\n\t"
         "sc   $t3, 0($t1)"      "\n\t"
         "sw   $t3, 8($t0)"      "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#elif defined(VGA_mips64)
   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
      exception that can cause this function to fail. */
#if defined (_MIPSEL)
   /* 64-bit variant of the mips32 EL scheme: halfword-splicing ll/sc
      loop on the containing 32-bit word. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move $t0, %0"           "\n\t"
         "ld   $t1, 0($t0)"       "\n\t"  // p
         "ld   $t2, 8($t0)"       "\n\t"  // n
         "andi $t2, $t2, 0xFFFF"  "\n\t"  // n = n and 0xFFFF
         "li   $s0, 0xFFFF"       "\n\t"
         "nor  $s0, $s0, $zero"   "\n\t"  // $s0= 0xFFFF0000
         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
         "and  $s0, $s0, $t3"     "\n\t"  // $s0 = $t3 and 0xFFFF0000
         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
         "andi $t3, $t3, 0xFFFF"  "\n\t"  // $t3 = $t3 and 0xFFFF
         "or   $t3, $t3, $s0"     "\n\t"  // $t3 = $t3 or $s0
         "sc   $t3, 0($t1)"       "\n\t"
         "sw   $t3, 16($t0)"      "\n\t"  // save result
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "s0"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian doubleword: target halfword is the top 16 bits, so
      add n << 48 inside an lld/scd loop. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n << 48, 0x0 };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"          "\n\t"
         "ld    $t1, 0($t0)"      "\n\t"  // p
         "ld    $t2, 8($t0)"      "\n\t"  // n
         "lld   $t3, 0($t1)"      "\n\t"
         "daddu $t3, $t3, $t2"    "\n\t"
         "scd   $t3, 0($t1)"      "\n\t"
         "sd    $t3, 16($t0)"     "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#elif defined(VGA_tilegx)
   /* cmpexch4 CAS loop on the aligned containing word; 'mask'
      selects the halfword lane.  NOTE(review): 'i' is unused, and
      0x2780 is presumably the CMPEXCH_VALUE SPR -- confirm against
      the TILE-Gx ISA docs. */
   int i;
   unsigned int *p4 = (unsigned int *)(((unsigned long long)p + 3) & (~3ULL));
   unsigned int  mask = (0xffff) << ((int)p & 3);
   unsigned int  add = (n & 0xffff) << ((int)p & 3);
   unsigned int x, new;

   while(1) {
      x = *p4;
      new = (x & (~mask)) | ((x + add) & mask);
      __insn_mtspr(0x2780, x);
      if ( __insn_cmpexch4(p4, new) == x)
         break;
   }
#else
# error "Unsupported arch"
#endif
}
    484 
/* Atomically do *p += n on a 32-bit location.  Simpler than the 8/16
   bit cases since every target has a native word-sized primitive:
   locked add (x86/amd64), LL/SC retry loop (ppc/arm/arm64/mips),
   compare-and-swap loop (s390x), or fetch-and-add (tilegx). */
__attribute__((noinline)) void atomic_add_32bit ( int* p, int n )
{
#if defined(VGA_x86)
   /* "lock; addl" does the atomic 32-bit add. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax"       "\n\t"
      "movl 4(%%esi),%%ebx"       "\n\t"
      "lock; addl %%ebx,(%%eax)"  "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   /* Same scheme, 64-bit addressing, 32-bit locked add. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"       "\n\t"
      "movq 8(%%rsi),%%rbx"       "\n\t"
      "lock; addl %%ebx,(%%rax)"  "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   /* Natural word size here: straightforward lwarx/stwcx. retry
      loop, no lane shifting required. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stwcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(n)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64be)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   /* ldarx/stdcx. loop; n shifted into the upper (big-endian) word
      lane of the doubleword. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 32)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64le)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller. */
   /* Little-endian: target word is the low half of the doubleword,
      so no shift of n is needed. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n))
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* LDREX/STREX retry loop; block[2] receives the STREX status
      (0 == store succeeded). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov   r5, %0"         "\n\t"
         "ldr   r9, [r5, #0]"   "\n\t" // p
         "ldr   r10, [r5, #4]"  "\n\t" // n
         "ldrex r8, [r9]"       "\n\t"
         "add   r8, r8, r10"    "\n\t"
         "strex r4, r8, [r9]"   "\n\t"
         "str   r4, [r5, #8]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_arm64)
   /* LDXR/STXR (32-bit w-register forms) retry loop; same status
      convention as the arm branch. */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxr  w8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxr  w4, w8, [x9]"    "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* Classic cs (compare-and-swap) loop on the whole word; jl
      retries while cs reports a mismatch. */
   __asm__ __volatile__(
      "   l	0,%0\n\t"
      "0: lr	1,0\n\t"
      "   ar	1,%1\n\t"
      "   cs	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
   /* Plain ll/addu/sc retry loop; sc leaves 1 in $t3 on success,
      saved to block[2]. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"        "\n\t"
         "lw   $t1, 0($t0)"    "\n\t"  // p
         "lw   $t2, 4($t0)"    "\n\t"  // n
         "ll   $t3, 0($t1)"    "\n\t"
         "addu $t3, $t3, $t2"  "\n\t"
         "sc   $t3, 0($t1)"    "\n\t"
         "sw   $t3, 8($t0)"    "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#elif defined(VGA_mips64)
   /* Same ll/sc loop with 64-bit block bookkeeping; the operation
      itself is still a 32-bit ll/addu/sc. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"        "\n\t"
         "ld    $t1, 0($t0)"    "\n\t"  // p
         "ld    $t2, 8($t0)"    "\n\t"  // n
         "ll    $t3, 0($t1)"    "\n\t"
         "addu  $t3, $t3, $t2"  "\n\t"
         "sc    $t3, 0($t1)"    "\n\t"
         "sd    $t3, 16($t0)"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#elif defined(VGA_tilegx)
   /* Native 32-bit atomic fetch-and-add. */
    __insn_fetchadd4(p, n);
#else
# error "Unsupported arch"
#endif
}
    641 
/* Atomically do *p += n on a 64-bit location.  Deliberately a no-op
   on 32-bit targets that lack a doubleword atomic (x86, ppc32,
   mips32) -- main() accepts 0 as a valid final value for the 64-bit
   counter for exactly this reason.  32-bit arm, however, does have
   doubleword exclusives (LDREXD/STREXD) and uses them here. */
__attribute__((noinline)) void atomic_add_64bit ( long long int* p, int n )
{
#if defined(VGA_x86) || defined(VGA_ppc32) || defined(VGA_mips32)
   /* do nothing; is not supported */
#elif defined(VGA_amd64)
   // this is a bit subtle.  It relies on the fact that, on a 64-bit platform,
   // sizeof(unsigned long long int) == sizeof(unsigned long) == sizeof(void*)
   unsigned long long int block[2];
   block[0] = (unsigned long long int)(unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"      "\n\t"
      "movq 8(%%rsi),%%rbx"      "\n\t"
      "lock; addq %%rbx,(%%rax)" "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc64be) || defined(VGA_ppc64le)
   /* Natural doubleword size: ldarx/stdcx. retry loop, no lane
      shifting needed on either endianness. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(n)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* LDREXD/STREXD retry loop with a 64-bit add spread over a
      32-bit register pair (adds/adc).  The 32-bit STREXD status is
      stored into one half of block[2]; the loop exits once that
      half is 0 while the other half keeps its 0xFFFFFFFF fill --
      presumably relying on little-endian layout. */
   unsigned long long int block[3]
     = { (unsigned long long int)(unsigned long)p,
         (unsigned long long int)n,
         0xFFFFFFFFFFFFFFFFULL };
   do {
      __asm__ __volatile__(
         "mov    r5, %0"             "\n\t"
         "ldr    r8,     [r5, #0]"   "\n\t" // p
         "ldrd   r2, r3, [r5, #8]"   "\n\t" // n
         "ldrexd r0, r1, [r8]"       "\n\t"
         "adds   r2, r2, r0"         "\n\t"
         "adc    r3, r3, r1"         "\n\t"
         "strexd r1, r2, r3, [r8]"   "\n\t"
         "str    r1, [r5, #16]"      "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r0", "r1", "r8", "r2", "r3"
      );
   } while (block[2] != 0xFFFFFFFF00000000ULL);
#elif defined(VGA_arm64)
   /* LDXR/STXR (64-bit x-register forms) retry loop; block[2]
      receives the status (0 == store succeeded). */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxr  x8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxr  w4, x8, [x9]"   "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* csg (64-bit compare-and-swap) loop; jl retries on mismatch. */
   __asm__ __volatile__(
      "   lg	0,%0\n\t"
      "0: lgr	1,0\n\t"
      "   agr	1,%1\n\t"
      "   csg	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips64)
   /* lld/daddu/scd retry loop; scd leaves 1 in $t3 on success,
      saved to block[2]. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"        "\n\t"
         "ld    $t1, 0($t0)"    "\n\t" // p
         "ld    $t2, 8($t0)"    "\n\t" // n
         "lld   $t3, 0($t1)"    "\n\t"
         "daddu $t3, $t3, $t2"  "\n\t"
         "scd   $t3, 0($t1)"    "\n\t"
         "sd    $t3, 16($t0)"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#elif defined(VGA_tilegx)
   /* Native 64-bit atomic fetch-and-add. */
    __insn_fetchadd(p, n);
#else
# error "Unsupported arch"
#endif
}
    744 
    745 int main ( int argc, char** argv )
    746 {
    747    int    i, status;
    748    char*  page;
    749    char*  p8;
    750    short* p16;
    751    int*   p32;
    752    long long int* p64;
    753    pid_t  child, p2;
    754 
    755    printf("parent, pre-fork\n");
    756 
    757    page = mmap( 0, sysconf(_SC_PAGESIZE),
    758                    PROT_READ|PROT_WRITE,
    759                    MAP_ANONYMOUS|MAP_SHARED, -1, 0 );
    760    if (page == MAP_FAILED) {
    761       perror("mmap failed");
    762       exit(1);
    763    }
    764 
    765    p8  = (char*)(page+0);
    766    p16 = (short*)(page+256);
    767    p32 = (int*)(page+512);
    768    p64 = (long long int*)(page+768);
    769 
    770    assert( IS_8_ALIGNED(p8) );
    771    assert( IS_8_ALIGNED(p16) );
    772    assert( IS_8_ALIGNED(p32) );
    773    assert( IS_8_ALIGNED(p64) );
    774 
    775    memset(page, 0, 1024);
    776 
    777    *p8  = 0;
    778    *p16 = 0;
    779    *p32 = 0;
    780    *p64 = 0;
    781 
    782    child = fork();
    783    if (child == -1) {
    784       perror("fork() failed\n");
    785       return 1;
    786    }
    787 
    788    if (child == 0) {
    789       /* --- CHILD --- */
    790       printf("child\n");
    791       for (i = 0; i < NNN; i++) {
    792          atomic_add_8bit(p8, 1);
    793          atomic_add_16bit(p16, 1);
    794          atomic_add_32bit(p32, 1);
    795          atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
    796       }
    797       return 1;
    798       /* NOTREACHED */
    799 
    800    }
    801 
    802    /* --- PARENT --- */
    803 
    804    printf("parent\n");
    805 
    806    for (i = 0; i < NNN; i++) {
    807       atomic_add_8bit(p8, 1);
    808       atomic_add_16bit(p16, 1);
    809       atomic_add_32bit(p32, 1);
    810       atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
    811    }
    812 
    813    p2 = waitpid(child, &status, 0);
    814    assert(p2 == child);
    815 
    816    /* assert that child finished normally */
    817    assert(WIFEXITED(status));
    818 
    819    printf("FINAL VALUES:  8 bit %d,  16 bit %d,  32 bit %d,  64 bit %lld\n",
    820           (int)(*(signed char*)p8), (int)(*p16), *p32, *p64 );
    821 
    822    if (-74 == (int)(*(signed char*)p8)
    823        && 32694 == (int)(*p16)
    824        && 6913974 == *p32
    825        && (0LL == *p64 || 682858642110LL == *p64)) {
    826       printf("PASS\n");
    827    } else {
    828       printf("FAIL -- see source code for expected values\n");
    829    }
    830 
    831    printf("parent exits\n");
    832 
    833    return 0;
    834 }
    835