
/* This is an example of a program which does atomic memory operations
   between two processes which share a page.  Valgrind 3.4.1 and
   earlier produce incorrect answers because it does not preserve
   atomicity of the relevant instructions in the generated code; but
   the post-DCAS-merge versions of Valgrind do behave correctly. */

/* On ARM, this can be compiled into either ARM or Thumb code, so as
   to test both A and T encodings of LDREX/STREX et al.  Also on ARM,
   it tests doubleword atomics (LDREXD, STREXD) which I don't think it
   does on any other platform. */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <sys/wait.h>
#include "tests/sys_mman.h"

#define NNN 3456987

#define IS_8_ALIGNED(_ptr)   (0 == (((unsigned long)(_ptr)) & 7))

     25 
/* Atomically do *p += n on an 8-bit location that may be shared with
   another process.  One inline-assembly implementation per target,
   selected by the VGA_* macros defined by the Valgrind build system;
   an unknown target is a compile-time error.  noinline so the atomic
   sequence is emitted exactly as written and is easy to locate in the
   disassembly under test. */
__attribute__((noinline)) void atomic_add_8bit ( char* p, int n )
{
#if defined(VGA_x86)
   /* p and n are passed via 'block' so the asm needs only one input
      register; the atomic add itself is a LOCK-prefixed byte add. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax"      "\n\t"
      "movl 4(%%esi),%%ebx"      "\n\t"
      "lock; addb %%bl,(%%eax)"  "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   /* Same scheme as x86, with 64-bit registers and offsets. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"      "\n\t"
      "movq 8(%%rsi),%%rbx"      "\n\t"
      "lock; addb %%bl,(%%rax)"  "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 4-aligned -- guaranteed by caller.  The byte add is done as a
      word-wide lwarx/stwcx. with n shifted into the top (big-endian
      first) byte; mfcr/srwi/andi. extract CR0.EQ, so success == 1
      iff the stwcx. committed. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stwcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 24)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller.  Doubleword variant of the
      ppc32 scheme: n is shifted into the top byte of a 64-bit value
      and added via ldarx/stdcx. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 56)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* LDREXB/STREXB retry loop.  block = { p, n, status }; the strexb
      status (0 on success) is written to block[2]. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov    r5, %0"         "\n\t"
         "ldr    r9, [r5, #0]"   "\n\t" // p
         "ldr    r10, [r5, #4]"  "\n\t" // n
         "ldrexb r8, [r9]"       "\n\t"
         "add    r8, r8, r10"    "\n\t"
         "strexb r4, r8, [r9]"   "\n\t"
         "str    r4, [r5, #8]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_arm64)
   /* LDXRB/STXRB retry loop; block[2] receives the store-exclusive
      status (0 on success). */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxrb w8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxrb w4, w8, [x9]"    "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* Compare-and-swap loop: copy the old word to 'dummy', splice the
      incremented byte in with icm/stcm, then CS it back; jl retries
      while the CS finds the word changed underneath us. */
   int dummy;
   __asm__ __volatile__(
      "   l	0,%0\n\t"
      "0: st	0,%1\n\t"
      "   icm	1,1,%1\n\t"
      "   ar	1,%2\n\t"
      "   stcm  1,1,%1\n\t"
      "   l     1,%1\n\t"
      "   cs	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p), "+m" (dummy)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
      exception that can cause this function to fail. */
#if defined (_MIPSEL)
   /* LL/SC loop on the containing word, updating only the low
      (little-endian first) byte; block[2] gets the sc status
      (1 on success). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"           "\n\t"
         "lw   $t1, 0($t0)"       "\n\t"  // p
         "lw   $t2, 4($t0)"       "\n\t"  // n
         "andi $t2, $t2, 0xFF"    "\n\t"  // n = n and 0xFF
         "li   $t4, 0xFF"         "\n\t"
         "nor  $t4, $t4, $zero"   "\n\t"  // $t4 = 0xFFFFFF00
         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
         "and  $t4, $t4, $t3"     "\n\t"  // $t4 = $t3 and 0xFFFFFF00
         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
         "andi $t3, $t3, 0xFF"    "\n\t"  // $t3 = $t3 and 0xFF
         "or   $t3, $t3, $t4"     "\n\t"  // $t3 = $t3 or $t4
         "sc   $t3, 0($t1)"       "\n\t"
         "sw   $t3, 8($t0)"       "\n\t"  // save result
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "t4"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian: the addressed byte is the word's top byte, so add
      n << 24 to the whole word via LL/SC. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n << 24, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"          "\n\t"
         "lw   $t1, 0($t0)"      "\n\t"  // p
         "lw   $t2, 4($t0)"      "\n\t"  // n
         "ll   $t3, 0($t1)"      "\n\t"
         "addu $t3, $t3, $t2"    "\n\t"
         "sc   $t3, 0($t1)"      "\n\t"
         "sw   $t3, 8($t0)"      "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#elif defined(VGA_mips64)
   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
      exception that can cause this function to fail. */
#if defined (_MIPSEL)
   /* Same masked LL/SC scheme as mips32 little-endian, with 64-bit
      'block' slots. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move $t0, %0"           "\n\t"
         "ld   $t1, 0($t0)"       "\n\t"  // p
         "ld   $t2, 8($t0)"       "\n\t"  // n
         "andi $t2, $t2, 0xFF"    "\n\t"  // n = n and 0xFF
         "li   $s0, 0xFF"         "\n\t"
         "nor  $s0, $s0, $zero"   "\n\t"  // $s0 = 0xFFFFFF00
         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
         "and  $s0, $s0, $t3"     "\n\t"  // $s0 = $t3 and 0xFFFFFF00
         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
         "andi $t3, $t3, 0xFF"    "\n\t"  // $t3 = $t3 and 0xFF
         "or   $t3, $t3, $s0"     "\n\t"  // $t3 = $t3 or $s0
         "sc   $t3, 0($t1)"       "\n\t"
         "sw   $t3, 16($t0)"      "\n\t"  // save result
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "s0"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian 64-bit: the byte is the doubleword's top byte, so add
      n << 56 via LLD/SCD. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n << 56, 0x0 };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"          "\n\t"
         "ld    $t1, 0($t0)"      "\n\t"  // p
         "ld    $t2, 8($t0)"      "\n\t"  // n
         "lld   $t3, 0($t1)"      "\n\t"
         "daddu $t3, $t3, $t2"    "\n\t"
         "scd   $t3, 0($t1)"      "\n\t"
         "sd    $t3, 16($t0)"     "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#else
# error "Unsupported arch"
#endif
}
    223 
    224 
/* Atomically do *p += n on a 16-bit location that may be shared with
   another process.  Structure mirrors atomic_add_8bit: one inline-asm
   implementation per VGA_* target, with halfword instructions or a
   halfword-sized shift/mask in place of the byte ones. */
__attribute__((noinline)) void atomic_add_16bit ( short* p, int n )
{
#if defined(VGA_x86)
   /* LOCK-prefixed 16-bit add; p and n travel via 'block' so only one
      input register is needed. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax"      "\n\t"
      "movl 4(%%esi),%%ebx"      "\n\t"
      "lock; addw %%bx,(%%eax)"  "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   /* Same scheme with 64-bit registers and offsets. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"      "\n\t"
      "movq 8(%%rsi),%%rbx"      "\n\t"
      "lock; addw %%bx,(%%rax)"  "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller.  Word-wide lwarx/stwcx.
      with n shifted into the top halfword; success == 1 iff the
      stwcx. committed (CR0.EQ via mfcr/srwi/andi.). */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stwcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 16)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller.  Doubleword variant:
      n << 48 added via ldarx/stdcx. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 48)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* LDREXH/STREXH retry loop; block[2] receives the strexh status
      (0 on success). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov    r5, %0"         "\n\t"
         "ldr    r9, [r5, #0]"   "\n\t" // p
         "ldr    r10, [r5, #4]"  "\n\t" // n
         "ldrexh r8, [r9]"       "\n\t"
         "add    r8, r8, r10"    "\n\t"
         "strexh r4, r8, [r9]"   "\n\t"
         "str    r4, [r5, #8]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_arm64)
   /* LDXRH/STXRH retry loop; block[2] = store-exclusive status
      (0 on success). */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxrh w8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxrh w4, w8, [x9]"    "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* Compare-and-swap loop as in atomic_add_8bit, but icm/stcm mask 3
      selects two bytes (the halfword) instead of one. */
   int dummy;
   __asm__ __volatile__(
      "   l	0,%0\n\t"
      "0: st	0,%1\n\t"
      "   icm	1,3,%1\n\t"
      "   ar	1,%2\n\t"
      "   stcm  1,3,%1\n\t"
      "   l     1,%1\n\t"
      "   cs	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p), "+m" (dummy)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
      exception that can cause this function to fail. */
#if defined (_MIPSEL)
   /* LL/SC loop on the containing word, updating only the low
      halfword; block[2] gets the sc status (1 on success). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"           "\n\t"
         "lw   $t1, 0($t0)"       "\n\t"  // p
         "lw   $t2, 4($t0)"       "\n\t"  // n
         "andi $t2, $t2, 0xFFFF"  "\n\t"  // n = n and 0xFFFF
         "li   $t4, 0xFFFF"       "\n\t"
         "nor  $t4, $t4, $zero"   "\n\t"  // $t4 = 0xFFFF0000
         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
         "and  $t4, $t4, $t3"     "\n\t"  // $t4 = $t3 and 0xFFFF0000
         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
         "andi $t3, $t3, 0xFFFF"  "\n\t"  // $t3 = $t3 and 0xFFFF
         "or   $t3, $t3, $t4"     "\n\t"  // $t3 = $t3 or $t4
         "sc   $t3, 0($t1)"       "\n\t"
         "sw   $t3, 8($t0)"       "\n\t"  // save result
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "t4"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian: the addressed halfword is the word's top halfword,
      so add n << 16 to the whole word via LL/SC. */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n << 16, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"          "\n\t"
         "lw   $t1, 0($t0)"      "\n\t"  // p
         "lw   $t2, 4($t0)"      "\n\t"  // n
         "ll   $t3, 0($t1)"      "\n\t"
         "addu $t3, $t3, $t2"    "\n\t"
         "sc   $t3, 0($t1)"      "\n\t"
         "sw   $t3, 8($t0)"      "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#elif defined(VGA_mips64)
   /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
      exception that can cause this function to fail. */
#if defined (_MIPSEL)
   /* Same masked LL/SC scheme as mips32 little-endian, with 64-bit
      'block' slots. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move $t0, %0"           "\n\t"
         "ld   $t1, 0($t0)"       "\n\t"  // p
         "ld   $t2, 8($t0)"       "\n\t"  // n
         "andi $t2, $t2, 0xFFFF"  "\n\t"  // n = n and 0xFFFF
         "li   $s0, 0xFFFF"       "\n\t"
         "nor  $s0, $s0, $zero"   "\n\t"  // $s0= 0xFFFF0000
         "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
         "and  $s0, $s0, $t3"     "\n\t"  // $s0 = $t3 and 0xFFFF0000
         "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
         "andi $t3, $t3, 0xFFFF"  "\n\t"  // $t3 = $t3 and 0xFFFF
         "or   $t3, $t3, $s0"     "\n\t"  // $t3 = $t3 or $s0
         "sc   $t3, 0($t1)"       "\n\t"
         "sw   $t3, 16($t0)"      "\n\t"  // save result
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3", "s0"
      );
   } while (block[2] != 1);
#elif defined (_MIPSEB)
   /* Big-endian 64-bit: the halfword is the doubleword's top
      halfword, so add n << 48 via LLD/SCD. */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n << 48, 0x0 };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"          "\n\t"
         "ld    $t1, 0($t0)"      "\n\t"  // p
         "ld    $t2, 8($t0)"      "\n\t"  // n
         "lld   $t3, 0($t1)"      "\n\t"
         "daddu $t3, $t3, $t2"    "\n\t"
         "scd   $t3, 0($t1)"      "\n\t"
         "sd    $t3, 16($t0)"     "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#endif
#else
# error "Unsupported arch"
#endif
}
    422 
/* Atomically do *p += n on a 32-bit location that may be shared with
   another process.  Word-sized native atomics exist on every target,
   so no shift/mask games are needed here. */
__attribute__((noinline)) void atomic_add_32bit ( int* p, int n )
{
#if defined(VGA_x86)
   /* LOCK-prefixed 32-bit add; p and n travel via 'block'. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movl 0(%%esi),%%eax"       "\n\t"
      "movl 4(%%esi),%%ebx"       "\n\t"
      "lock; addl %%ebx,(%%eax)"  "\n"
      : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
   );
#elif defined(VGA_amd64)
   /* Same scheme; the add stays 32-bit (addl) on the 64-bit target. */
   unsigned long block[2];
   block[0] = (unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"       "\n\t"
      "movq 8(%%rsi),%%rbx"       "\n\t"
      "lock; addl %%ebx,(%%rax)"  "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc32)
   /* Plain lwarx/stwcx. retry loop; success == 1 iff the stwcx.
      committed (CR0.EQ via mfcr/srwi/andi.). */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "lwarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stwcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(n)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_ppc64)
   /* Nasty hack.  Does correctly atomically do *p += n, but only if p
      is 8-aligned -- guaranteed by caller.  Doubleword ldarx/stdcx.
      with n shifted into the upper word. */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(((unsigned long)n) << 32)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* LDREX/STREX retry loop; block[2] receives the strex status
      (0 on success). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
   do {
      __asm__ __volatile__(
         "mov   r5, %0"         "\n\t"
         "ldr   r9, [r5, #0]"   "\n\t" // p
         "ldr   r10, [r5, #4]"  "\n\t" // n
         "ldrex r8, [r9]"       "\n\t"
         "add   r8, r8, r10"    "\n\t"
         "strex r4, r8, [r9]"   "\n\t"
         "str   r4, [r5, #8]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
      );
   } while (block[2] != 0);
#elif defined(VGA_arm64)
   /* LDXR/STXR (32-bit w-register) retry loop; block[2] = status
      (0 on success). */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxr  w8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxr  w4, w8, [x9]"    "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* Classic compare-and-swap loop: old value in r0, updated value in
      r1, CS retries (jl) until it wins. */
   __asm__ __volatile__(
      "   l	0,%0\n\t"
      "0: lr	1,0\n\t"
      "   ar	1,%1\n\t"
      "   cs	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips32)
   /* LL/SC retry loop; block[2] gets the sc status (1 on success). */
   unsigned int block[3]
      = { (unsigned int)p, (unsigned int)n, 0x0 };
   do {
      __asm__ __volatile__(
         "move $t0, %0"        "\n\t"
         "lw   $t1, 0($t0)"    "\n\t"  // p
         "lw   $t2, 4($t0)"    "\n\t"  // n
         "ll   $t3, 0($t1)"    "\n\t"
         "addu $t3, $t3, $t2"  "\n\t"
         "sc   $t3, 0($t1)"    "\n\t"
         "sw   $t3, 8($t0)"    "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#elif defined(VGA_mips64)
   /* Same LL/SC loop with 64-bit 'block' slots; the atomic access
      itself stays 32-bit (ll/sc). */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"        "\n\t"
         "ld    $t1, 0($t0)"    "\n\t"  // p
         "ld    $t2, 8($t0)"    "\n\t"  // n
         "ll    $t3, 0($t1)"    "\n\t"
         "addu  $t3, $t3, $t2"  "\n\t"
         "sc    $t3, 0($t1)"    "\n\t"
         "sd    $t3, 16($t0)"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#else
# error "Unsupported arch"
#endif
}
    560 
/* Atomically do *p += n on a 64-bit location that may be shared with
   another process.  On 32-bit targets without doubleword atomics
   (x86, ppc32, mips32) this is deliberately a no-op, which the PASS
   check in main() accounts for by also accepting *p64 == 0. */
__attribute__((noinline)) void atomic_add_64bit ( long long int* p, int n )
{
#if defined(VGA_x86) || defined(VGA_ppc32) || defined(VGA_mips32)
   /* do nothing; is not supported */
#elif defined(VGA_amd64)
   // this is a bit subtle.  It relies on the fact that, on a 64-bit platform,
   // sizeof(unsigned long long int) == sizeof(unsigned long) == sizeof(void*)
   unsigned long long int block[2];
   block[0] = (unsigned long long int)(unsigned long)p;
   block[1] = n;
   __asm__ __volatile__(
      "movq 0(%%rsi),%%rax"      "\n\t"
      "movq 8(%%rsi),%%rbx"      "\n\t"
      "lock; addq %%rbx,(%%rax)" "\n"
      : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
   );
#elif defined(VGA_ppc64)
   /* Plain ldarx/stdcx. retry loop; success == 1 iff the stdcx.
      committed (CR0.EQ via mfcr/srwi/andi.). */
   unsigned long success;
   do {
      __asm__ __volatile__(
         "ldarx  15,0,%1"    "\n\t"
         "add    15,15,%2"   "\n\t"
         "stdcx. 15,0,%1"    "\n\t"
         "mfcr   %0"         "\n\t"
         "srwi   %0,%0,29"   "\n\t"
         "andi.  %0,%0,1"    "\n"
         : /*out*/"=b"(success)
         : /*in*/ "b"(p), "b"(n)
         : /*trash*/ "memory", "cc", "r15"
      );
   } while (success != 1);
#elif defined(VGA_arm)
   /* Doubleword LDREXD/STREXD retry loop with a 64-bit add spread
      over adds/adc.  The 32-bit str of the strexd status overwrites
      only the low word (little-endian) of block[2], so a successful
      store (status 0) leaves block[2] == 0xFFFFFFFF00000000. */
   unsigned long long int block[3]
     = { (unsigned long long int)(unsigned long)p,
         (unsigned long long int)n,
         0xFFFFFFFFFFFFFFFFULL };
   do {
      __asm__ __volatile__(
         "mov    r5, %0"             "\n\t"
         "ldr    r8,     [r5, #0]"   "\n\t" // p
         "ldrd   r2, r3, [r5, #8]"   "\n\t" // n
         "ldrexd r0, r1, [r8]"       "\n\t"
         "adds   r2, r2, r0"         "\n\t"
         "adc    r3, r3, r1"         "\n\t"
         "strexd r1, r2, r3, [r8]"   "\n\t"
         "str    r1, [r5, #16]"      "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "r5", "r0", "r1", "r8", "r2", "r3"
      );
   } while (block[2] != 0xFFFFFFFF00000000ULL);
#elif defined(VGA_arm64)
   /* LDXR/STXR (64-bit x-register) retry loop; block[2] = status
      (0 on success). */
   unsigned long long int block[3]
      = { (unsigned long long int)p, (unsigned long long int)n,
          0xFFFFFFFFFFFFFFFFULL};
   do {
      __asm__ __volatile__(
         "mov   x5, %0"         "\n\t"
         "ldr   x9, [x5, #0]"   "\n\t" // p
         "ldr   x10, [x5, #8]"  "\n\t" // n
         "ldxr  x8, [x9]"       "\n\t"
         "add   x8, x8, x10"    "\n\t"
         "stxr  w4, x8, [x9]"   "\n\t"
         "str   x4, [x5, #16]"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
      );
   } while (block[2] != 0);
#elif defined(VGA_s390x)
   /* 64-bit compare-and-swap loop (lg/lgr/agr/csg); jl retries until
      the CSG wins. */
   __asm__ __volatile__(
      "   lg	0,%0\n\t"
      "0: lgr	1,0\n\t"
      "   agr	1,%1\n\t"
      "   csg	0,1,%0\n\t"
      "   jl    0b\n\t"
      : "+m" (*p)
      : "d" (n)
      : "cc", "memory", "0", "1");
#elif defined(VGA_mips64)
   /* Doubleword LLD/SCD retry loop; block[2] gets the scd status
      (1 on success). */
   unsigned long block[3]
      = { (unsigned long)p, (unsigned long)n, 0x0ULL };
   do {
      __asm__ __volatile__(
         "move  $t0, %0"        "\n\t"
         "ld    $t1, 0($t0)"    "\n\t" // p
         "ld    $t2, 8($t0)"    "\n\t" // n
         "lld   $t3, 0($t1)"    "\n\t"
         "daddu $t3, $t3, $t2"  "\n\t"
         "scd   $t3, 0($t1)"    "\n\t"
         "sd    $t3, 16($t0)"   "\n\t"
         : /*out*/
         : /*in*/ "r"(&block[0])
         : /*trash*/ "memory", "t0", "t1", "t2", "t3"
      );
   } while (block[2] != 1);
#else
# error "Unsupported arch"
#endif
}
    661 
    662 int main ( int argc, char** argv )
    663 {
    664    int    i, status;
    665    char*  page;
    666    char*  p8;
    667    short* p16;
    668    int*   p32;
    669    long long int* p64;
    670    pid_t  child, p2;
    671 
    672    printf("parent, pre-fork\n");
    673 
    674    page = mmap( 0, sysconf(_SC_PAGESIZE),
    675                    PROT_READ|PROT_WRITE,
    676                    MAP_ANONYMOUS|MAP_SHARED, -1, 0 );
    677    if (page == MAP_FAILED) {
    678       perror("mmap failed");
    679       exit(1);
    680    }
    681 
    682    p8  = (char*)(page+0);
    683    p16 = (short*)(page+256);
    684    p32 = (int*)(page+512);
    685    p64 = (long long int*)(page+768);
    686 
    687    assert( IS_8_ALIGNED(p8) );
    688    assert( IS_8_ALIGNED(p16) );
    689    assert( IS_8_ALIGNED(p32) );
    690    assert( IS_8_ALIGNED(p64) );
    691 
    692    memset(page, 0, 1024);
    693 
    694    *p8  = 0;
    695    *p16 = 0;
    696    *p32 = 0;
    697    *p64 = 0;
    698 
    699    child = fork();
    700    if (child == -1) {
    701       perror("fork() failed\n");
    702       return 1;
    703    }
    704 
    705    if (child == 0) {
    706       /* --- CHILD --- */
    707       printf("child\n");
    708       for (i = 0; i < NNN; i++) {
    709          atomic_add_8bit(p8, 1);
    710          atomic_add_16bit(p16, 1);
    711          atomic_add_32bit(p32, 1);
    712          atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
    713       }
    714       return 1;
    715       /* NOTREACHED */
    716 
    717    }
    718 
    719    /* --- PARENT --- */
    720 
    721    printf("parent\n");
    722 
    723    for (i = 0; i < NNN; i++) {
    724       atomic_add_8bit(p8, 1);
    725       atomic_add_16bit(p16, 1);
    726       atomic_add_32bit(p32, 1);
    727       atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
    728    }
    729 
    730    p2 = waitpid(child, &status, 0);
    731    assert(p2 == child);
    732 
    733    /* assert that child finished normally */
    734    assert(WIFEXITED(status));
    735 
    736    printf("FINAL VALUES:  8 bit %d,  16 bit %d,  32 bit %d,  64 bit %lld\n",
    737           (int)(*(signed char*)p8), (int)(*p16), *p32, *p64 );
    738 
    739    if (-74 == (int)(*(signed char*)p8)
    740        && 32694 == (int)(*p16)
    741        && 6913974 == *p32
    742        && (0LL == *p64 || 682858642110LL == *p64)) {
    743       printf("PASS\n");
    744    } else {
    745       printf("FAIL -- see source code for expected values\n");
    746    }
    747 
    748    printf("parent exits\n");
    749 
    750    return 0;
    751 }
    752