
/* This is an example of a program which does atomic memory operations
   between two processes which share a page.  Valgrind 3.4.1 and
   earlier produce incorrect answers because they do not preserve
   atomicity of the relevant instructions in the generated code; but
   the post-DCAS-merge versions of Valgrind do behave correctly. */
      7 
/* On ARM, this can be compiled into either ARM or Thumb code, so as
   to test both A and T encodings of LDREX/STREX et al.  Also on ARM,
   it tests doubleword atomics (LDREXD, STREXD) which I don't think it
   does on any other platform. */
     12 
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>
#include <sys/wait.h>
#include "tests/sys_mman.h"
     20 
     21 #define NNN 3456987
     22 
     23 #define IS_8_ALIGNED(_ptr)   (0 == (((unsigned long)(_ptr)) & 7))
     24 
     25 
     26 __attribute__((noinline)) void atomic_add_8bit ( char* p, int n )
     27 {
     28 #if defined(VGA_x86)
     29    unsigned long block[2];
     30    block[0] = (unsigned long)p;
     31    block[1] = n;
     32    __asm__ __volatile__(
     33       "movl 0(%%esi),%%eax"      "\n\t"
     34       "movl 4(%%esi),%%ebx"      "\n\t"
     35       "lock; addb %%bl,(%%eax)"  "\n"
     36       : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
     37    );
     38 #elif defined(VGA_amd64)
     39    unsigned long block[2];
     40    block[0] = (unsigned long)p;
     41    block[1] = n;
     42    __asm__ __volatile__(
     43       "movq 0(%%rsi),%%rax"      "\n\t"
     44       "movq 8(%%rsi),%%rbx"      "\n\t"
     45       "lock; addb %%bl,(%%rax)"  "\n"
     46       : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
     47    );
     48 #elif defined(VGA_ppc32)
     49    /* Nasty hack.  Does correctly atomically do *p += n, but only if p
     50       is 4-aligned -- guaranteed by caller. */
     51    unsigned long success;
     52    do {
     53       __asm__ __volatile__(
     54          "lwarx  15,0,%1"    "\n\t"
     55          "add    15,15,%2"   "\n\t"
     56          "stwcx. 15,0,%1"    "\n\t"
     57          "mfcr   %0"         "\n\t"
     58          "srwi   %0,%0,29"   "\n\t"
     59          "andi.  %0,%0,1"    "\n"
     60          : /*out*/"=b"(success)
     61          : /*in*/ "b"(p), "b"(((unsigned long)n) << 24)
     62          : /*trash*/ "memory", "cc", "r15"
     63       );
     64    } while (success != 1);
     65 #elif defined(VGA_ppc64be)
     66    /* Nasty hack.  Does correctly atomically do *p += n, but only if p
     67       is 8-aligned -- guaranteed by caller. */
     68    unsigned long success;
     69    do {
     70       __asm__ __volatile__(
     71          "ldarx  15,0,%1"    "\n\t"
     72          "add    15,15,%2"   "\n\t"
     73          "stdcx. 15,0,%1"    "\n\t"
     74          "mfcr   %0"         "\n\t"
     75          "srwi   %0,%0,29"   "\n\t"
     76          "andi.  %0,%0,1"    "\n"
     77          : /*out*/"=b"(success)
     78          : /*in*/ "b"(p), "b"(((unsigned long)n) << 56)
     79          : /*trash*/ "memory", "cc", "r15"
     80       );
     81    } while (success != 1);
     82 #elif defined(VGA_ppc64le)
     83    /* Nasty hack.  Does correctly atomically do *p += n, but only if p
     84       is 8-aligned -- guaranteed by caller. */
     85    unsigned long success;
     86    do {
     87       __asm__ __volatile__(
     88          "ldarx  15,0,%1"    "\n\t"
     89          "add    15,15,%2"   "\n\t"
     90          "stdcx. 15,0,%1"    "\n\t"
     91          "mfcr   %0"         "\n\t"
     92          "srwi   %0,%0,29"   "\n\t"
     93          "andi.  %0,%0,1"    "\n"
     94          : /*out*/"=b"(success)
     95          : /*in*/ "b"(p), "b"(((unsigned long)n))
     96          : /*trash*/ "memory", "cc", "r15"
     97       );
     98    } while (success != 1);
     99 #elif defined(VGA_arm)
    100    unsigned int block[3]
    101       = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
    102    do {
    103       __asm__ __volatile__(
    104          "mov    r5, %0"         "\n\t"
    105          "ldr    r9, [r5, #0]"   "\n\t" // p
    106          "ldr    r10, [r5, #4]"  "\n\t" // n
    107          "ldrexb r8, [r9]"       "\n\t"
    108          "add    r8, r8, r10"    "\n\t"
    109          "strexb r4, r8, [r9]"   "\n\t"
    110          "str    r4, [r5, #8]"   "\n\t"
    111          : /*out*/
    112          : /*in*/ "r"(&block[0])
    113          : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
    114       );
    115    } while (block[2] != 0);
    116 #elif defined(VGA_arm64)
    117    unsigned long long int block[3]
    118       = { (unsigned long long int)p, (unsigned long long int)n,
    119           0xFFFFFFFFFFFFFFFFULL};
    120    do {
    121       __asm__ __volatile__(
    122          "mov   x5, %0"         "\n\t"
    123          "ldr   x9, [x5, #0]"   "\n\t" // p
    124          "ldr   x10, [x5, #8]"  "\n\t" // n
    125          "ldxrb w8, [x9]"       "\n\t"
    126          "add   x8, x8, x10"    "\n\t"
    127          "stxrb w4, w8, [x9]"    "\n\t"
    128          "str   x4, [x5, #16]"   "\n\t"
    129          : /*out*/
    130          : /*in*/ "r"(&block[0])
    131          : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
    132       );
    133    } while (block[2] != 0);
    134 #elif defined(VGA_s390x)
    135    int dummy;
    136    __asm__ __volatile__(
    137       "   l	0,%0\n\t"
    138       "0: st	0,%1\n\t"
    139       "   icm	1,1,%1\n\t"
    140       "   ar	1,%2\n\t"
    141       "   stcm  1,1,%1\n\t"
    142       "   l     1,%1\n\t"
    143       "   cs	0,1,%0\n\t"
    144       "   jl    0b\n\t"
    145       : "+m" (*p), "+m" (dummy)
    146       : "d" (n)
    147       : "cc", "memory", "0", "1");
    148 #elif defined(VGA_mips32)
    149    /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
    150       exception that can cause this function to fail. */
    151 #if defined (_MIPSEL)
    152    unsigned int block[3]
    153       = { (unsigned int)p, (unsigned int)n, 0x0 };
    154    do {
    155       __asm__ __volatile__(
    156          "move $t0, %0"           "\n\t"
    157          "lw   $t1, 0($t0)"       "\n\t"  // p
    158          "lw   $t2, 4($t0)"       "\n\t"  // n
    159          "andi $t2, $t2, 0xFF"    "\n\t"  // n = n and 0xFF
    160          "li   $t4, 0xFF"         "\n\t"
    161          "nor  $t4, $t4, $zero"   "\n\t"  // $t4 = 0xFFFFFF00
    162          "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
    163          "and  $t4, $t4, $t3"     "\n\t"  // $t4 = $t3 and 0xFFFFFF00
    164          "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
    165          "andi $t3, $t3, 0xFF"    "\n\t"  // $t3 = $t3 and 0xFF
    166          "or   $t3, $t3, $t4"     "\n\t"  // $t3 = $t3 or $t4
    167          "sc   $t3, 0($t1)"       "\n\t"
    168          "sw   $t3, 8($t0)"       "\n\t"  // save result
    169          : /*out*/
    170          : /*in*/ "r"(&block[0])
    171          : /*trash*/ "memory", "t0", "t1", "t2", "t3", "t4"
    172       );
    173    } while (block[2] != 1);
    174 #elif defined (_MIPSEB)
    175    unsigned int block[3]
    176       = { (unsigned int)p, (unsigned int)n << 24, 0x0 };
    177    do {
    178       __asm__ __volatile__(
    179          "move $t0, %0"          "\n\t"
    180          "lw   $t1, 0($t0)"      "\n\t"  // p
    181          "lw   $t2, 4($t0)"      "\n\t"  // n
    182          "ll   $t3, 0($t1)"      "\n\t"
    183          "addu $t3, $t3, $t2"    "\n\t"
    184          "sc   $t3, 0($t1)"      "\n\t"
    185          "sw   $t3, 8($t0)"      "\n\t"
    186          : /*out*/
    187          : /*in*/ "r"(&block[0])
    188          : /*trash*/ "memory", "t0", "t1", "t2", "t3"
    189       );
    190    } while (block[2] != 1);
    191 #endif
    192 #elif defined(VGA_mips64)
    193    /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
    194       exception that can cause this function to fail. */
    195 #if defined (_MIPSEL)
    196    unsigned long block[3]
    197       = { (unsigned long)p, (unsigned long)n, 0x0ULL };
    198    do {
    199       __asm__ __volatile__(
    200          "move $t0, %0"           "\n\t"
    201          "ld   $t1, 0($t0)"       "\n\t"  // p
    202          "ld   $t2, 8($t0)"       "\n\t"  // n
    203          "andi $t2, $t2, 0xFF"    "\n\t"  // n = n and 0xFF
    204          "li   $s0, 0xFF"         "\n\t"
    205          "nor  $s0, $s0, $zero"   "\n\t"  // $s0 = 0xFFFFFF00
    206          "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
    207          "and  $s0, $s0, $t3"     "\n\t"  // $s0 = $t3 and 0xFFFFFF00
    208          "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
    209          "andi $t3, $t3, 0xFF"    "\n\t"  // $t3 = $t3 and 0xFF
    210          "or   $t3, $t3, $s0"     "\n\t"  // $t3 = $t3 or $s0
    211          "sc   $t3, 0($t1)"       "\n\t"
    212          "sw   $t3, 16($t0)"      "\n\t"  // save result
    213          : /*out*/
    214          : /*in*/ "r"(&block[0])
    215          : /*trash*/ "memory", "t0", "t1", "t2", "t3", "s0"
    216       );
    217    } while (block[2] != 1);
    218 #elif defined (_MIPSEB)
    219    unsigned long block[3]
    220       = { (unsigned long)p, (unsigned long)n << 56, 0x0 };
    221    do {
    222       __asm__ __volatile__(
    223          "move  $t0, %0"          "\n\t"
    224          "ld    $t1, 0($t0)"      "\n\t"  // p
    225          "ld    $t2, 8($t0)"      "\n\t"  // n
    226          "lld   $t3, 0($t1)"      "\n\t"
    227          "daddu $t3, $t3, $t2"    "\n\t"
    228          "scd   $t3, 0($t1)"      "\n\t"
    229          "sd    $t3, 16($t0)"     "\n\t"
    230          : /*out*/
    231          : /*in*/ "r"(&block[0])
    232          : /*trash*/ "memory", "t0", "t1", "t2", "t3"
    233       );
    234    } while (block[2] != 1);
    235 #endif
    236 #else
    237 # error "Unsupported arch"
    238 #endif
    239 }
    240 
    241 
    242 __attribute__((noinline)) void atomic_add_16bit ( short* p, int n )
    243 {
    244 #if defined(VGA_x86)
    245    unsigned long block[2];
    246    block[0] = (unsigned long)p;
    247    block[1] = n;
    248    __asm__ __volatile__(
    249       "movl 0(%%esi),%%eax"      "\n\t"
    250       "movl 4(%%esi),%%ebx"      "\n\t"
    251       "lock; addw %%bx,(%%eax)"  "\n"
    252       : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
    253    );
    254 #elif defined(VGA_amd64)
    255    unsigned long block[2];
    256    block[0] = (unsigned long)p;
    257    block[1] = n;
    258    __asm__ __volatile__(
    259       "movq 0(%%rsi),%%rax"      "\n\t"
    260       "movq 8(%%rsi),%%rbx"      "\n\t"
    261       "lock; addw %%bx,(%%rax)"  "\n"
    262       : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
    263    );
    264 #elif defined(VGA_ppc32)
    265    /* Nasty hack.  Does correctly atomically do *p += n, but only if p
    266       is 8-aligned -- guaranteed by caller. */
    267    unsigned long success;
    268    do {
    269       __asm__ __volatile__(
    270          "lwarx  15,0,%1"    "\n\t"
    271          "add    15,15,%2"   "\n\t"
    272          "stwcx. 15,0,%1"    "\n\t"
    273          "mfcr   %0"         "\n\t"
    274          "srwi   %0,%0,29"   "\n\t"
    275          "andi.  %0,%0,1"    "\n"
    276          : /*out*/"=b"(success)
    277          : /*in*/ "b"(p), "b"(((unsigned long)n) << 16)
    278          : /*trash*/ "memory", "cc", "r15"
    279       );
    280    } while (success != 1);
    281 #elif defined(VGA_ppc64be)
    282    /* Nasty hack.  Does correctly atomically do *p += n, but only if p
    283       is 8-aligned -- guaranteed by caller. */
    284    unsigned long success;
    285    do {
    286       __asm__ __volatile__(
    287          "ldarx  15,0,%1"    "\n\t"
    288          "add    15,15,%2"   "\n\t"
    289          "stdcx. 15,0,%1"    "\n\t"
    290          "mfcr   %0"         "\n\t"
    291          "srwi   %0,%0,29"   "\n\t"
    292          "andi.  %0,%0,1"    "\n"
    293          : /*out*/"=b"(success)
    294          : /*in*/ "b"(p), "b"(((unsigned long)n) << 48)
    295          : /*trash*/ "memory", "cc", "r15"
    296       );
    297    } while (success != 1);
    298 #elif defined(VGA_ppc64le)
    299    /* Nasty hack.  Does correctly atomically do *p += n, but only if p
    300       is 8-aligned -- guaranteed by caller. */
    301    unsigned long success;
    302    do {
    303       __asm__ __volatile__(
    304          "ldarx  15,0,%1"    "\n\t"
    305          "add    15,15,%2"   "\n\t"
    306          "stdcx. 15,0,%1"    "\n\t"
    307          "mfcr   %0"         "\n\t"
    308          "srwi   %0,%0,29"   "\n\t"
    309          "andi.  %0,%0,1"    "\n"
    310          : /*out*/"=b"(success)
    311          : /*in*/ "b"(p), "b"(((unsigned long)n))
    312          : /*trash*/ "memory", "cc", "r15"
    313       );
    314    } while (success != 1);
    315 #elif defined(VGA_arm)
    316    unsigned int block[3]
    317       = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
    318    do {
    319       __asm__ __volatile__(
    320          "mov    r5, %0"         "\n\t"
    321          "ldr    r9, [r5, #0]"   "\n\t" // p
    322          "ldr    r10, [r5, #4]"  "\n\t" // n
    323          "ldrexh r8, [r9]"       "\n\t"
    324          "add    r8, r8, r10"    "\n\t"
    325          "strexh r4, r8, [r9]"   "\n\t"
    326          "str    r4, [r5, #8]"   "\n\t"
    327          : /*out*/
    328          : /*in*/ "r"(&block[0])
    329          : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
    330       );
    331    } while (block[2] != 0);
    332 #elif defined(VGA_arm64)
    333    unsigned long long int block[3]
    334       = { (unsigned long long int)p, (unsigned long long int)n,
    335           0xFFFFFFFFFFFFFFFFULL};
    336    do {
    337       __asm__ __volatile__(
    338          "mov   x5, %0"         "\n\t"
    339          "ldr   x9, [x5, #0]"   "\n\t" // p
    340          "ldr   x10, [x5, #8]"  "\n\t" // n
    341          "ldxrh w8, [x9]"       "\n\t"
    342          "add   x8, x8, x10"    "\n\t"
    343          "stxrh w4, w8, [x9]"    "\n\t"
    344          "str   x4, [x5, #16]"   "\n\t"
    345          : /*out*/
    346          : /*in*/ "r"(&block[0])
    347          : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
    348       );
    349    } while (block[2] != 0);
    350 #elif defined(VGA_s390x)
    351    int dummy;
    352    __asm__ __volatile__(
    353       "   l	0,%0\n\t"
    354       "0: st	0,%1\n\t"
    355       "   icm	1,3,%1\n\t"
    356       "   ar	1,%2\n\t"
    357       "   stcm  1,3,%1\n\t"
    358       "   l     1,%1\n\t"
    359       "   cs	0,1,%0\n\t"
    360       "   jl    0b\n\t"
    361       : "+m" (*p), "+m" (dummy)
    362       : "d" (n)
    363       : "cc", "memory", "0", "1");
    364 #elif defined(VGA_mips32)
    365    /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
    366       exception that can cause this function to fail. */
    367 #if defined (_MIPSEL)
    368    unsigned int block[3]
    369       = { (unsigned int)p, (unsigned int)n, 0x0 };
    370    do {
    371       __asm__ __volatile__(
    372          "move $t0, %0"           "\n\t"
    373          "lw   $t1, 0($t0)"       "\n\t"  // p
    374          "lw   $t2, 4($t0)"       "\n\t"  // n
    375          "andi $t2, $t2, 0xFFFF"  "\n\t"  // n = n and 0xFFFF
    376          "li   $t4, 0xFFFF"       "\n\t"
    377          "nor  $t4, $t4, $zero"   "\n\t"  // $t4 = 0xFFFF0000
    378          "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
    379          "and  $t4, $t4, $t3"     "\n\t"  // $t4 = $t3 and 0xFFFF0000
    380          "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
    381          "andi $t3, $t3, 0xFFFF"  "\n\t"  // $t3 = $t3 and 0xFFFF
    382          "or   $t3, $t3, $t4"     "\n\t"  // $t3 = $t3 or $t4
    383          "sc   $t3, 0($t1)"       "\n\t"
    384          "sw   $t3, 8($t0)"       "\n\t"  // save result
    385          : /*out*/
    386          : /*in*/ "r"(&block[0])
    387          : /*trash*/ "memory", "t0", "t1", "t2", "t3", "t4"
    388       );
    389    } while (block[2] != 1);
    390 #elif defined (_MIPSEB)
    391    unsigned int block[3]
    392       = { (unsigned int)p, (unsigned int)n << 16, 0x0 };
    393    do {
    394       __asm__ __volatile__(
    395          "move $t0, %0"          "\n\t"
    396          "lw   $t1, 0($t0)"      "\n\t"  // p
    397          "lw   $t2, 4($t0)"      "\n\t"  // n
    398          "ll   $t3, 0($t1)"      "\n\t"
    399          "addu $t3, $t3, $t2"    "\n\t"
    400          "sc   $t3, 0($t1)"      "\n\t"
    401          "sw   $t3, 8($t0)"      "\n\t"
    402          : /*out*/
    403          : /*in*/ "r"(&block[0])
    404          : /*trash*/ "memory", "t0", "t1", "t2", "t3"
    405       );
    406    } while (block[2] != 1);
    407 #endif
    408 #elif defined(VGA_mips64)
    409    /* We rely on the fact that p is 4-aligned. Otherwise 'll' may throw an
    410       exception that can cause this function to fail. */
    411 #if defined (_MIPSEL)
    412    unsigned long block[3]
    413       = { (unsigned long)p, (unsigned long)n, 0x0ULL };
    414    do {
    415       __asm__ __volatile__(
    416          "move $t0, %0"           "\n\t"
    417          "ld   $t1, 0($t0)"       "\n\t"  // p
    418          "ld   $t2, 8($t0)"       "\n\t"  // n
    419          "andi $t2, $t2, 0xFFFF"  "\n\t"  // n = n and 0xFFFF
    420          "li   $s0, 0xFFFF"       "\n\t"
    421          "nor  $s0, $s0, $zero"   "\n\t"  // $s0= 0xFFFF0000
    422          "ll   $t3, 0($t1)"       "\n\t"  // $t3 = old value
    423          "and  $s0, $s0, $t3"     "\n\t"  // $s0 = $t3 and 0xFFFF0000
    424          "addu $t3, $t3, $t2"     "\n\t"  // $t3 = $t3 + n
    425          "andi $t3, $t3, 0xFFFF"  "\n\t"  // $t3 = $t3 and 0xFFFF
    426          "or   $t3, $t3, $s0"     "\n\t"  // $t3 = $t3 or $s0
    427          "sc   $t3, 0($t1)"       "\n\t"
    428          "sw   $t3, 16($t0)"      "\n\t"  // save result
    429          : /*out*/
    430          : /*in*/ "r"(&block[0])
    431          : /*trash*/ "memory", "t0", "t1", "t2", "t3", "s0"
    432       );
    433    } while (block[2] != 1);
    434 #elif defined (_MIPSEB)
    435    unsigned long block[3]
    436       = { (unsigned long)p, (unsigned long)n << 48, 0x0 };
    437    do {
    438       __asm__ __volatile__(
    439          "move  $t0, %0"          "\n\t"
    440          "ld    $t1, 0($t0)"      "\n\t"  // p
    441          "ld    $t2, 8($t0)"      "\n\t"  // n
    442          "lld   $t3, 0($t1)"      "\n\t"
    443          "daddu $t3, $t3, $t2"    "\n\t"
    444          "scd   $t3, 0($t1)"      "\n\t"
    445          "sd    $t3, 16($t0)"     "\n\t"
    446          : /*out*/
    447          : /*in*/ "r"(&block[0])
    448          : /*trash*/ "memory", "t0", "t1", "t2", "t3"
    449       );
    450    } while (block[2] != 1);
    451 #endif
    452 #else
    453 # error "Unsupported arch"
    454 #endif
    455 }
    456 
    457 __attribute__((noinline)) void atomic_add_32bit ( int* p, int n )
    458 {
    459 #if defined(VGA_x86)
    460    unsigned long block[2];
    461    block[0] = (unsigned long)p;
    462    block[1] = n;
    463    __asm__ __volatile__(
    464       "movl 0(%%esi),%%eax"       "\n\t"
    465       "movl 4(%%esi),%%ebx"       "\n\t"
    466       "lock; addl %%ebx,(%%eax)"  "\n"
    467       : : "S"(&block[0])/* S means "esi only" */ : "memory","cc","eax","ebx"
    468    );
    469 #elif defined(VGA_amd64)
    470    unsigned long block[2];
    471    block[0] = (unsigned long)p;
    472    block[1] = n;
    473    __asm__ __volatile__(
    474       "movq 0(%%rsi),%%rax"       "\n\t"
    475       "movq 8(%%rsi),%%rbx"       "\n\t"
    476       "lock; addl %%ebx,(%%rax)"  "\n"
    477       : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
    478    );
    479 #elif defined(VGA_ppc32)
    480    unsigned long success;
    481    do {
    482       __asm__ __volatile__(
    483          "lwarx  15,0,%1"    "\n\t"
    484          "add    15,15,%2"   "\n\t"
    485          "stwcx. 15,0,%1"    "\n\t"
    486          "mfcr   %0"         "\n\t"
    487          "srwi   %0,%0,29"   "\n\t"
    488          "andi.  %0,%0,1"    "\n"
    489          : /*out*/"=b"(success)
    490          : /*in*/ "b"(p), "b"(n)
    491          : /*trash*/ "memory", "cc", "r15"
    492       );
    493    } while (success != 1);
    494 #elif defined(VGA_ppc64be)
    495    /* Nasty hack.  Does correctly atomically do *p += n, but only if p
    496       is 8-aligned -- guaranteed by caller. */
    497    unsigned long success;
    498    do {
    499       __asm__ __volatile__(
    500          "ldarx  15,0,%1"    "\n\t"
    501          "add    15,15,%2"   "\n\t"
    502          "stdcx. 15,0,%1"    "\n\t"
    503          "mfcr   %0"         "\n\t"
    504          "srwi   %0,%0,29"   "\n\t"
    505          "andi.  %0,%0,1"    "\n"
    506          : /*out*/"=b"(success)
    507          : /*in*/ "b"(p), "b"(((unsigned long)n) << 32)
    508          : /*trash*/ "memory", "cc", "r15"
    509       );
    510    } while (success != 1);
    511 #elif defined(VGA_ppc64le)
    512    /* Nasty hack.  Does correctly atomically do *p += n, but only if p
    513       is 8-aligned -- guaranteed by caller. */
    514    unsigned long success;
    515    do {
    516       __asm__ __volatile__(
    517          "ldarx  15,0,%1"    "\n\t"
    518          "add    15,15,%2"   "\n\t"
    519          "stdcx. 15,0,%1"    "\n\t"
    520          "mfcr   %0"         "\n\t"
    521          "srwi   %0,%0,29"   "\n\t"
    522          "andi.  %0,%0,1"    "\n"
    523          : /*out*/"=b"(success)
    524          : /*in*/ "b"(p), "b"(((unsigned long)n))
    525          : /*trash*/ "memory", "cc", "r15"
    526       );
    527    } while (success != 1);
    528 #elif defined(VGA_arm)
    529    unsigned int block[3]
    530       = { (unsigned int)p, (unsigned int)n, 0xFFFFFFFF };
    531    do {
    532       __asm__ __volatile__(
    533          "mov   r5, %0"         "\n\t"
    534          "ldr   r9, [r5, #0]"   "\n\t" // p
    535          "ldr   r10, [r5, #4]"  "\n\t" // n
    536          "ldrex r8, [r9]"       "\n\t"
    537          "add   r8, r8, r10"    "\n\t"
    538          "strex r4, r8, [r9]"   "\n\t"
    539          "str   r4, [r5, #8]"   "\n\t"
    540          : /*out*/
    541          : /*in*/ "r"(&block[0])
    542          : /*trash*/ "memory", "cc", "r5", "r8", "r9", "r10", "r4"
    543       );
    544    } while (block[2] != 0);
    545 #elif defined(VGA_arm64)
    546    unsigned long long int block[3]
    547       = { (unsigned long long int)p, (unsigned long long int)n,
    548           0xFFFFFFFFFFFFFFFFULL};
    549    do {
    550       __asm__ __volatile__(
    551          "mov   x5, %0"         "\n\t"
    552          "ldr   x9, [x5, #0]"   "\n\t" // p
    553          "ldr   x10, [x5, #8]"  "\n\t" // n
    554          "ldxr  w8, [x9]"       "\n\t"
    555          "add   x8, x8, x10"    "\n\t"
    556          "stxr  w4, w8, [x9]"    "\n\t"
    557          "str   x4, [x5, #16]"   "\n\t"
    558          : /*out*/
    559          : /*in*/ "r"(&block[0])
    560          : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
    561       );
    562    } while (block[2] != 0);
    563 #elif defined(VGA_s390x)
    564    __asm__ __volatile__(
    565       "   l	0,%0\n\t"
    566       "0: lr	1,0\n\t"
    567       "   ar	1,%1\n\t"
    568       "   cs	0,1,%0\n\t"
    569       "   jl    0b\n\t"
    570       : "+m" (*p)
    571       : "d" (n)
    572       : "cc", "memory", "0", "1");
    573 #elif defined(VGA_mips32)
    574    unsigned int block[3]
    575       = { (unsigned int)p, (unsigned int)n, 0x0 };
    576    do {
    577       __asm__ __volatile__(
    578          "move $t0, %0"        "\n\t"
    579          "lw   $t1, 0($t0)"    "\n\t"  // p
    580          "lw   $t2, 4($t0)"    "\n\t"  // n
    581          "ll   $t3, 0($t1)"    "\n\t"
    582          "addu $t3, $t3, $t2"  "\n\t"
    583          "sc   $t3, 0($t1)"    "\n\t"
    584          "sw   $t3, 8($t0)"    "\n\t"
    585          : /*out*/
    586          : /*in*/ "r"(&block[0])
    587          : /*trash*/ "memory", "t0", "t1", "t2", "t3"
    588       );
    589    } while (block[2] != 1);
    590 #elif defined(VGA_mips64)
    591    unsigned long block[3]
    592       = { (unsigned long)p, (unsigned long)n, 0x0ULL };
    593    do {
    594       __asm__ __volatile__(
    595          "move  $t0, %0"        "\n\t"
    596          "ld    $t1, 0($t0)"    "\n\t"  // p
    597          "ld    $t2, 8($t0)"    "\n\t"  // n
    598          "ll    $t3, 0($t1)"    "\n\t"
    599          "addu  $t3, $t3, $t2"  "\n\t"
    600          "sc    $t3, 0($t1)"    "\n\t"
    601          "sd    $t3, 16($t0)"   "\n\t"
    602          : /*out*/
    603          : /*in*/ "r"(&block[0])
    604          : /*trash*/ "memory", "t0", "t1", "t2", "t3"
    605       );
    606    } while (block[2] != 1);
    607 #else
    608 # error "Unsupported arch"
    609 #endif
    610 }
    611 
    612 __attribute__((noinline)) void atomic_add_64bit ( long long int* p, int n )
    613 {
    614 #if defined(VGA_x86) || defined(VGA_ppc32) || defined(VGA_mips32)
    615    /* do nothing; is not supported */
    616 #elif defined(VGA_amd64)
    617    // this is a bit subtle.  It relies on the fact that, on a 64-bit platform,
    618    // sizeof(unsigned long long int) == sizeof(unsigned long) == sizeof(void*)
    619    unsigned long long int block[2];
    620    block[0] = (unsigned long long int)(unsigned long)p;
    621    block[1] = n;
    622    __asm__ __volatile__(
    623       "movq 0(%%rsi),%%rax"      "\n\t"
    624       "movq 8(%%rsi),%%rbx"      "\n\t"
    625       "lock; addq %%rbx,(%%rax)" "\n"
    626       : : "S"(&block[0])/* S means "rsi only" */ : "memory","cc","rax","rbx"
    627    );
    628 #elif defined(VGA_ppc64be) || defined(VGA_ppc64le)
    629    unsigned long success;
    630    do {
    631       __asm__ __volatile__(
    632          "ldarx  15,0,%1"    "\n\t"
    633          "add    15,15,%2"   "\n\t"
    634          "stdcx. 15,0,%1"    "\n\t"
    635          "mfcr   %0"         "\n\t"
    636          "srwi   %0,%0,29"   "\n\t"
    637          "andi.  %0,%0,1"    "\n"
    638          : /*out*/"=b"(success)
    639          : /*in*/ "b"(p), "b"(n)
    640          : /*trash*/ "memory", "cc", "r15"
    641       );
    642    } while (success != 1);
    643 #elif defined(VGA_arm)
    644    unsigned long long int block[3]
    645      = { (unsigned long long int)(unsigned long)p,
    646          (unsigned long long int)n,
    647          0xFFFFFFFFFFFFFFFFULL };
    648    do {
    649       __asm__ __volatile__(
    650          "mov    r5, %0"             "\n\t"
    651          "ldr    r8,     [r5, #0]"   "\n\t" // p
    652          "ldrd   r2, r3, [r5, #8]"   "\n\t" // n
    653          "ldrexd r0, r1, [r8]"       "\n\t"
    654          "adds   r2, r2, r0"         "\n\t"
    655          "adc    r3, r3, r1"         "\n\t"
    656          "strexd r1, r2, r3, [r8]"   "\n\t"
    657          "str    r1, [r5, #16]"      "\n\t"
    658          : /*out*/
    659          : /*in*/ "r"(&block[0])
    660          : /*trash*/ "memory", "cc", "r5", "r0", "r1", "r8", "r2", "r3"
    661       );
    662    } while (block[2] != 0xFFFFFFFF00000000ULL);
    663 #elif defined(VGA_arm64)
    664    unsigned long long int block[3]
    665       = { (unsigned long long int)p, (unsigned long long int)n,
    666           0xFFFFFFFFFFFFFFFFULL};
    667    do {
    668       __asm__ __volatile__(
    669          "mov   x5, %0"         "\n\t"
    670          "ldr   x9, [x5, #0]"   "\n\t" // p
    671          "ldr   x10, [x5, #8]"  "\n\t" // n
    672          "ldxr  x8, [x9]"       "\n\t"
    673          "add   x8, x8, x10"    "\n\t"
    674          "stxr  w4, x8, [x9]"   "\n\t"
    675          "str   x4, [x5, #16]"   "\n\t"
    676          : /*out*/
    677          : /*in*/ "r"(&block[0])
    678          : /*trash*/ "memory", "cc", "x5", "x8", "x9", "x10", "x4"
    679       );
    680    } while (block[2] != 0);
    681 #elif defined(VGA_s390x)
    682    __asm__ __volatile__(
    683       "   lg	0,%0\n\t"
    684       "0: lgr	1,0\n\t"
    685       "   agr	1,%1\n\t"
    686       "   csg	0,1,%0\n\t"
    687       "   jl    0b\n\t"
    688       : "+m" (*p)
    689       : "d" (n)
    690       : "cc", "memory", "0", "1");
    691 #elif defined(VGA_mips64)
    692    unsigned long block[3]
    693       = { (unsigned long)p, (unsigned long)n, 0x0ULL };
    694    do {
    695       __asm__ __volatile__(
    696          "move  $t0, %0"        "\n\t"
    697          "ld    $t1, 0($t0)"    "\n\t" // p
    698          "ld    $t2, 8($t0)"    "\n\t" // n
    699          "lld   $t3, 0($t1)"    "\n\t"
    700          "daddu $t3, $t3, $t2"  "\n\t"
    701          "scd   $t3, 0($t1)"    "\n\t"
    702          "sd    $t3, 16($t0)"   "\n\t"
    703          : /*out*/
    704          : /*in*/ "r"(&block[0])
    705          : /*trash*/ "memory", "t0", "t1", "t2", "t3"
    706       );
    707    } while (block[2] != 1);
    708 #else
    709 # error "Unsupported arch"
    710 #endif
    711 }
    712 
    713 int main ( int argc, char** argv )
    714 {
    715    int    i, status;
    716    char*  page;
    717    char*  p8;
    718    short* p16;
    719    int*   p32;
    720    long long int* p64;
    721    pid_t  child, p2;
    722 
    723    printf("parent, pre-fork\n");
    724 
    725    page = mmap( 0, sysconf(_SC_PAGESIZE),
    726                    PROT_READ|PROT_WRITE,
    727                    MAP_ANONYMOUS|MAP_SHARED, -1, 0 );
    728    if (page == MAP_FAILED) {
    729       perror("mmap failed");
    730       exit(1);
    731    }
    732 
    733    p8  = (char*)(page+0);
    734    p16 = (short*)(page+256);
    735    p32 = (int*)(page+512);
    736    p64 = (long long int*)(page+768);
    737 
    738    assert( IS_8_ALIGNED(p8) );
    739    assert( IS_8_ALIGNED(p16) );
    740    assert( IS_8_ALIGNED(p32) );
    741    assert( IS_8_ALIGNED(p64) );
    742 
    743    memset(page, 0, 1024);
    744 
    745    *p8  = 0;
    746    *p16 = 0;
    747    *p32 = 0;
    748    *p64 = 0;
    749 
    750    child = fork();
    751    if (child == -1) {
    752       perror("fork() failed\n");
    753       return 1;
    754    }
    755 
    756    if (child == 0) {
    757       /* --- CHILD --- */
    758       printf("child\n");
    759       for (i = 0; i < NNN; i++) {
    760          atomic_add_8bit(p8, 1);
    761          atomic_add_16bit(p16, 1);
    762          atomic_add_32bit(p32, 1);
    763          atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
    764       }
    765       return 1;
    766       /* NOTREACHED */
    767 
    768    }
    769 
    770    /* --- PARENT --- */
    771 
    772    printf("parent\n");
    773 
    774    for (i = 0; i < NNN; i++) {
    775       atomic_add_8bit(p8, 1);
    776       atomic_add_16bit(p16, 1);
    777       atomic_add_32bit(p32, 1);
    778       atomic_add_64bit(p64, 98765 ); /* ensure we hit the upper 32 bits */
    779    }
    780 
    781    p2 = waitpid(child, &status, 0);
    782    assert(p2 == child);
    783 
    784    /* assert that child finished normally */
    785    assert(WIFEXITED(status));
    786 
    787    printf("FINAL VALUES:  8 bit %d,  16 bit %d,  32 bit %d,  64 bit %lld\n",
    788           (int)(*(signed char*)p8), (int)(*p16), *p32, *p64 );
    789 
    790    if (-74 == (int)(*(signed char*)p8)
    791        && 32694 == (int)(*p16)
    792        && 6913974 == *p32
    793        && (0LL == *p64 || 682858642110LL == *p64)) {
    794       printf("PASS\n");
    795    } else {
    796       printf("FAIL -- see source code for expected values\n");
    797    }
    798 
    799    printf("parent exits\n");
    800 
    801    return 0;
    802 }
    803