Home | History | Annotate | Download | only in amd64
      1 
      2 /* A program to test SSE4.1/SSE4.2 instructions.
      3    Revisions:  Nov.208     - wrote this file
      4                Apr.10.2010 - added PEXTR* tests
      5                Apr.16.2010 - added PINS*  tests
      6 */
      7 
      8 /* HOW TO COMPILE:
      9    gcc -m64 -g -O -Wall -o sse4-64 sse4-64.c
     10 */
     11 
     12 #include <stdio.h>
     13 #include <stdlib.h>
     14 #include <assert.h>
     15 #include "tests/malloc.h"
     16 #include <string.h>
     17 
     18 
     19 typedef  unsigned char           V128[16];
     20 typedef  unsigned int            UInt;
     21 typedef  signed int              Int;
     22 typedef  unsigned char           UChar;
     23 typedef  unsigned long long int  ULong;
     24 
     25 typedef  unsigned char           Bool;
     26 #define False ((Bool)0)
     27 #define True  ((Bool)1)
     28 
     29 
     30 typedef
     31    struct {
     32       V128 arg1;
     33       V128 arg2;
     34       V128 res;
     35    }
     36    RRArgs;
     37 
     38 typedef
     39    struct {
     40       V128 arg1;
     41       V128 res;
     42    }
     43    RMArgs;
     44 
     45 static void do64HLtoV128 ( /*OUT*/V128* res, ULong wHi, ULong wLo )
     46 {
     47    // try to sidestep strict-aliasing snafus by memcpying explicitly
     48    UChar* p = (UChar*)res;
     49    memcpy(&p[8], (UChar*)&wHi, 8);
     50    memcpy(&p[0], (UChar*)&wLo, 8);
     51 }
     52 
     53 static UChar randUChar ( void )
     54 {
     55    static UInt seed = 80021;
     56    seed = 1103515245 * seed + 12345;
     57    return (seed >> 17) & 0xFF;
     58 }
     59 
     60 static ULong randULong ( void )
     61 {
     62    Int i;
     63    ULong r = 0;
     64    for (i = 0; i < 8; i++) {
     65       r = (r << 8) | (ULong)(0xFF & randUChar());
     66    }
     67    return r;
     68 }
     69 
     70 static void randV128 ( V128* v )
     71 {
     72    Int i;
     73    for (i = 0; i < 16; i++)
     74       (*v)[i] = randUChar();
     75 }
     76 
     77 static void showV128 ( V128* v )
     78 {
     79    Int i;
     80    for (i = 15; i >= 0; i--)
     81       printf("%02x", (Int)(*v)[i]);
     82 }
     83 
     84 static void showMaskedV128 ( V128* v, V128* mask )
     85 {
     86    Int i;
     87    for (i = 15; i >= 0; i--)
     88       printf("%02x", (Int)( ((*v)[i]) & ((*mask)[i]) ));
     89 }
     90 
     91 static void showIGVV( char* rOrM, char* op, Int imm,
     92                       ULong src64, V128* dst, V128* res )
     93 {
     94    printf("%s %10s $%d ", rOrM, op, imm);
     95    printf("%016llx", src64);
     96    printf(" ");
     97    showV128(dst);
     98    printf(" ");
     99    showV128(res);
    100    printf("\n");
    101 }
    102 
    103 static void showIAG ( char* rOrM, char* op, Int imm,
    104                       V128* argL, ULong argR, ULong res )
    105 {
    106    printf("%s %10s $%d ", rOrM, op, imm);
    107    showV128(argL);
    108    printf(" ");
    109    printf("%016llx", argR);
    110    printf(" ");
    111    printf("%016llx", res);
    112    printf("\n");
    113 }
    114 
    115 static void showIAA ( char* rOrM, char* op, Int imm, RRArgs* rra, V128* rmask )
    116 {
    117    printf("%s %10s $%d ", rOrM, op, imm);
    118    showV128(&rra->arg1);
    119    printf(" ");
    120    showV128(&rra->arg2);
    121    printf(" ");
    122    showMaskedV128(&rra->res, rmask);
    123    printf("\n");
    124 }
    125 
    126 static void showAA ( char* rOrM, char* op, RRArgs* rra, V128* rmask )
    127 {
    128    printf("%s %10s ", rOrM, op);
    129    showV128(&rra->arg1);
    130    printf(" ");
    131    showV128(&rra->arg2);
    132    printf(" ");
    133    showMaskedV128(&rra->res, rmask);
    134    printf("\n");
    135 }
    136 
    137 /* Note: these are little endian.  Hence first byte is the least
    138    significant byte of lane zero. */
    139 
    140 /* Mask for insns where all result bits are non-approximated. */
    141 static V128 AllMask  = { 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
    142                          0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };
    143 
    144 /* Mark for insns which produce approximated vector short results. */
    145 __attribute__((unused))
    146 static V128 ApproxPS = { 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF,
    147                          0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF };
    148 
    149 /* Mark for insns which produce approximated scalar short results. */
    150 __attribute__((unused))
    151 static V128 ApproxSS = { 0x00,0x00,0x80,0xFF, 0xFF,0xFF,0xFF,0xFF,
    152                          0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };
    153 
    154 static V128 fives    = { 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55,
    155                          0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55 };
    156 
    157 static V128 zeroes   = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
    158                          0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
    159 
    160 double mkPosInf ( void ) { return 1.0 / 0.0; }
    161 double mkNegInf ( void ) { return -mkPosInf(); }
    162 double mkPosNan ( void ) { return 0.0 / 0.0; }
    163 double mkNegNan ( void ) { return -mkPosNan(); }
    164 
    165 __attribute__((noinline))
    166 UInt get_mxcsr ( void )
    167 {
    168    ULong w64;
    169    __asm__ __volatile__(
    170       "subq    $8, %%rsp"    "\n\t"
    171       "stmxcsr (%%rsp)"      "\n\t"
    172       "movq    (%%rsp), %0"  "\n"
    173       "addq    $8, %%rsp"
    174       : /*OUT*/"=r"(w64) : /*IN*/ : "memory","cc"
    175    );
    176    if (0) printf("get %08x\n", (UInt)w64);
    177    return (UInt)w64;
    178 }
    179 
    180 __attribute__((noinline))
    181 void set_mxcsr ( UInt w32 )
    182 {
    183    if (0) printf("set %08x\n", w32);
    184    ULong w64 = (ULong)w32;
    185    __asm__ __volatile__(
    186       "subq    $8, %%rsp"    "\n\t"
    187       "movq    %0, (%%rsp)"  "\n\t"
    188       "ldmxcsr (%%rsp)"      "\n\t"
    189       "addq    $8, %%rsp"
    190       : /*OUT*/ : /*IN*/"r"(w64) : "memory",/*"mxcsr",*/"cc"
    191    );
    192 }
    193 
    194 UInt get_sse_roundingmode ( void )
    195 {
    196    UInt w = get_mxcsr();
    197    return (w >> 13) & 3;
    198 }
    199 
    200 void set_sse_roundingmode ( UInt m )
    201 {
    202    UInt w;
    203    assert(0 == (m & ~3));
    204    w = get_mxcsr();
    205    w &= ~(3 << 13);
    206    w |= (m << 13);
    207    set_mxcsr(w);
    208 }
    209 
    210 
    211 #define DO_imm_r_r(_opname, _imm, _src, _dst)  \
    212    {  \
    213       V128 _tmp;  \
    214       __asm__ __volatile__(  \
    215          "movupd (%0), %%xmm2"    "\n\t"  \
    216          "movupd (%1), %%xmm11"   "\n\t"  \
    217          _opname " $" #_imm ", %%xmm2, %%xmm11"  "\n\t"  \
    218          "movupd %%xmm11, (%2)" "\n"  \
    219          : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp))  \
    220          : "cc", "memory", "xmm2", "xmm11"                            \
    221       );  \
    222       RRArgs rra;  \
    223       memcpy(&rra.arg1, &(_src), sizeof(V128));  \
    224       memcpy(&rra.arg2, &(_dst), sizeof(V128));  \
    225       memcpy(&rra.res,  &(_tmp), sizeof(V128));  \
    226       showIAA("r", (_opname), (_imm), &rra, &AllMask);  \
    227    }
    228 
    229 #define DO_imm_m_r(_opname, _imm, _src, _dst)  \
    230    {  \
    231       V128 _tmp;  \
    232       V128* _srcM = memalign16(sizeof(V128));  \
    233       memcpy(_srcM, &(_src), sizeof(V128));  \
    234       __asm__ __volatile__(  \
    235          "movupd (%1), %%xmm11"   "\n\t"  \
    236          _opname " $" #_imm ", (%0), %%xmm11"  "\n\t"  \
    237          "movupd %%xmm11, (%2)" "\n"  \
    238          : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp))  \
    239          : "cc", "memory", "xmm11"  \
    240       );  \
    241       RRArgs rra;  \
    242       memcpy(&rra.arg1, &(_src), sizeof(V128));  \
    243       memcpy(&rra.arg2, &(_dst), sizeof(V128));  \
    244       memcpy(&rra.res,  &(_tmp), sizeof(V128));  \
    245       showIAA("m", (_opname), (_imm), &rra, &AllMask);  \
    246       free(_srcM);  \
    247    }
    248 
    249 #define DO_imm_mandr_r(_opname, _imm, _src, _dst)  \
    250       DO_imm_r_r( _opname, _imm, _src, _dst ) \
    251       DO_imm_m_r( _opname, _imm, _src, _dst )
    252 
    253 
    254 
    255 
    256 
    257 #define DO_r_r(_opname, _src, _dst)  \
    258    {  \
    259       V128 _tmp;  \
    260       __asm__ __volatile__(  \
    261          "movupd (%0), %%xmm2"    "\n\t"  \
    262          "movupd (%1), %%xmm11"   "\n\t"  \
    263          _opname " %%xmm2, %%xmm11"  "\n\t"  \
    264          "movupd %%xmm11, (%2)" "\n"  \
    265          : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp))  \
    266          : "cc", "memory", "xmm2", "xmm11"  \
    267       );  \
    268       RRArgs rra;  \
    269       memcpy(&rra.arg1, &(_src), sizeof(V128));  \
    270       memcpy(&rra.arg2, &(_dst), sizeof(V128));  \
    271       memcpy(&rra.res,  &(_tmp), sizeof(V128));  \
    272       showAA("r", (_opname), &rra, &AllMask);  \
    273    }
    274 
    275 #define DO_m_r(_opname, _src, _dst)  \
    276    {  \
    277       V128 _tmp;  \
    278       V128* _srcM = memalign16(sizeof(V128));  \
    279       memcpy(_srcM, &(_src), sizeof(V128));  \
    280       __asm__ __volatile__(  \
    281          "movupd (%1), %%xmm11"   "\n\t"  \
    282          _opname " (%0), %%xmm11"  "\n\t"  \
    283          "movupd %%xmm11, (%2)" "\n"  \
    284          : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp))  \
    285          : "cc", "memory", "xmm11"  \
    286       );  \
    287       RRArgs rra;  \
    288       memcpy(&rra.arg1, &(_src), sizeof(V128));  \
    289       memcpy(&rra.arg2, &(_dst), sizeof(V128));  \
    290       memcpy(&rra.res,  &(_tmp), sizeof(V128));  \
    291       showAA("m", (_opname), &rra, &AllMask);  \
    292       free(_srcM);  \
    293    }
    294 
    295 #define DO_mandr_r(_opname, _src, _dst)  \
    296       DO_r_r(_opname, _src, _dst) \
    297       DO_m_r(_opname, _src, _dst)
    298 
    299 
    300 
    301 
    302 #define DO_imm_r_to_rscalar(_opname, _imm, _src, _dstsuffix)       \
    303    {  \
    304       ULong _scbefore = 0x5555555555555555ULL;  \
    305       ULong _scafter  = 0xAAAAAAAAAAAAAAAAULL; \
    306       /* This assumes that gcc won't make any of %0, %1, %2 */ \
    307       /* be r11.  That should be ensured (cough, cough) */ \
    308       /* by declaring r11 to be clobbered. */ \
    309       __asm__ __volatile__(  \
    310          "movupd (%0), %%xmm2"    "\n\t"  \
    311          "movq   (%1), %%r11"   "\n\t"  \
    312          _opname " $" #_imm ", %%xmm2, %%r11" _dstsuffix  "\n\t"  \
    313          "movq   %%r11, (%2)" "\n"  \
    314          : /*out*/ \
    315          : /*in*/ "r"(&(_src)), "r"(&(_scbefore)), "r"(&(_scafter))  \
    316          : "cc", "memory", "xmm2", "r11"  \
    317       );  \
    318       showIAG("r", (_opname), (_imm), &(_src), (_scbefore), (_scafter));  \
    319    }
    320 
    321 #define DO_imm_r_to_mscalar(_opname, _imm, _src)   \
    322    {  \
    323       ULong _scbefore = 0x5555555555555555ULL;  \
    324       ULong _scafter = _scbefore; \
    325       __asm__ __volatile__(  \
    326          "movupd (%0), %%xmm2"    "\n\t"  \
    327          _opname " $" #_imm ", %%xmm2, (%1)"  "\n\t"  \
    328          : /*out*/ \
    329          : /*in*/ "r"(&(_src)), "r"(&(_scafter))  \
    330          : "cc", "memory", "xmm2"  \
    331       );  \
    332       showIAG("m", (_opname), (_imm), &(_src), (_scbefore), (_scafter));  \
    333    }
    334 
    335 #define DO_imm_r_to_mandrscalar(_opname, _imm, _src, _dstsuffix)   \
    336       DO_imm_r_to_rscalar( _opname, _imm, _src, _dstsuffix )       \
    337       DO_imm_r_to_mscalar( _opname, _imm, _src )
    338 
    339 
    340 
    341 
    342 
    343 
    344 
    345 
    346 #define DO_imm_rscalar_to_r(_opname, _imm, _src, _srcsuffix)       \
    347    {  \
    348       V128  dstv;         \
    349       V128  res;          \
    350       ULong src64 = (ULong)(_src); \
    351       memcpy(dstv, fives, sizeof(dstv)); \
    352       memcpy(res,  zeroes, sizeof(res)); \
    353       /* This assumes that gcc won't make any of %0, %1, %2 */ \
    354       /* be r11.  That should be ensured (cough, cough) */ \
    355       /* by declaring r11 to be clobbered. */ \
    356       __asm__ __volatile__(  \
    357          "movupd (%0), %%xmm2"    "\n\t"   /*dstv*/   \
    358          "movq   (%1), %%r11"     "\n\t"   /*src64*/  \
    359          _opname " $" #_imm ", %%r11" _srcsuffix ", %%xmm2"   "\n\t"  \
    360          "movupd  %%xmm2, (%2)" "\n" /*res*/                          \
    361          : /*out*/ \
    362          : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res)  \
    363          : "cc", "memory", "xmm2", "r11"  \
    364       );  \
    365       showIGVV("r", (_opname), (_imm), src64, &dstv, &res); \
    366    }
    367 #define DO_imm_mscalar_to_r(_opname, _imm, _src)       \
    368    {  \
    369       V128  dstv;         \
    370       V128  res;          \
    371       ULong src64 = (ULong)(_src); \
    372       memcpy(dstv, fives, sizeof(dstv)); \
    373       memcpy(res,  zeroes, sizeof(res)); \
    374       __asm__ __volatile__(  \
    375          "movupd (%0), %%xmm2"    "\n\t"   /*dstv*/   \
    376          _opname " $" #_imm ", (%1), %%xmm2"   "\n\t"  \
    377          "movupd  %%xmm2, (%2)" "\n" /*res*/                          \
    378          : /*out*/ \
    379          : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res)  \
    380          : "cc", "memory", "xmm2"  \
    381       );  \
    382       showIGVV("m", (_opname), (_imm), src64, &dstv, &res); \
    383    }
    384 
    385 #define DO_imm_mandrscalar_to_r(_opname, _imm, _src, _dstsuffix)   \
    386       DO_imm_rscalar_to_r( _opname, _imm, _src, _dstsuffix )       \
    387       DO_imm_mscalar_to_r( _opname, _imm, _src )
    388 
    389 
    390 
    391 
    392 
    393 void test_BLENDPD ( void )
    394 {
    395    V128 src, dst;
    396    Int i;
    397    for (i = 0; i < 10; i++) {
    398       randV128(&src);
    399       randV128(&dst);
    400       DO_imm_mandr_r("blendpd", 0, src, dst);
    401       DO_imm_mandr_r("blendpd", 1, src, dst);
    402       DO_imm_mandr_r("blendpd", 2, src, dst);
    403       DO_imm_mandr_r("blendpd", 3, src, dst);
    404    }
    405 }
    406 
    407 void test_BLENDPS ( void )
    408 {
    409    V128 src, dst;
    410    Int i;
    411    for (i = 0; i < 10; i++) {
    412       randV128(&src);
    413       randV128(&dst);
    414       DO_imm_mandr_r("blendps", 0, src, dst);
    415       DO_imm_mandr_r("blendps", 1, src, dst);
    416       DO_imm_mandr_r("blendps", 2, src, dst);
    417       DO_imm_mandr_r("blendps", 3, src, dst);
    418       DO_imm_mandr_r("blendps", 4, src, dst);
    419       DO_imm_mandr_r("blendps", 5, src, dst);
    420       DO_imm_mandr_r("blendps", 6, src, dst);
    421       DO_imm_mandr_r("blendps", 7, src, dst);
    422       DO_imm_mandr_r("blendps", 8, src, dst);
    423       DO_imm_mandr_r("blendps", 9, src, dst);
    424       DO_imm_mandr_r("blendps", 10, src, dst);
    425       DO_imm_mandr_r("blendps", 11, src, dst);
    426       DO_imm_mandr_r("blendps", 12, src, dst);
    427       DO_imm_mandr_r("blendps", 13, src, dst);
    428       DO_imm_mandr_r("blendps", 14, src, dst);
    429       DO_imm_mandr_r("blendps", 15, src, dst);
    430    }
    431 }
    432 
    433 void test_DPPD ( void )
    434 {
    435    V128 src, dst;
    436    {
    437       *(double*)(&src[0]) =  1.2345;
    438       *(double*)(&src[8]) = -6.78910;
    439       *(double*)(&dst[0]) = -11.121314;
    440       *(double*)(&dst[8]) =  15.161718;
    441       DO_imm_mandr_r("dppd", 0, src, dst);
    442       DO_imm_mandr_r("dppd", 1, src, dst);
    443       DO_imm_mandr_r("dppd", 2, src, dst);
    444       DO_imm_mandr_r("dppd", 3, src, dst);
    445       DO_imm_mandr_r("dppd", 4, src, dst);
    446       DO_imm_mandr_r("dppd", 5, src, dst);
    447       DO_imm_mandr_r("dppd", 6, src, dst);
    448       DO_imm_mandr_r("dppd", 7, src, dst);
    449       DO_imm_mandr_r("dppd", 8, src, dst);
    450       DO_imm_mandr_r("dppd", 9, src, dst);
    451       DO_imm_mandr_r("dppd", 10, src, dst);
    452       DO_imm_mandr_r("dppd", 11, src, dst);
    453       DO_imm_mandr_r("dppd", 12, src, dst);
    454       DO_imm_mandr_r("dppd", 13, src, dst);
    455       DO_imm_mandr_r("dppd", 14, src, dst);
    456       DO_imm_mandr_r("dppd", 15, src, dst);
    457       DO_imm_mandr_r("dppd", 16, src, dst);
    458       DO_imm_mandr_r("dppd", 17, src, dst);
    459       DO_imm_mandr_r("dppd", 18, src, dst);
    460       DO_imm_mandr_r("dppd", 19, src, dst);
    461       DO_imm_mandr_r("dppd", 20, src, dst);
    462       DO_imm_mandr_r("dppd", 21, src, dst);
    463       DO_imm_mandr_r("dppd", 22, src, dst);
    464       DO_imm_mandr_r("dppd", 23, src, dst);
    465       DO_imm_mandr_r("dppd", 24, src, dst);
    466       DO_imm_mandr_r("dppd", 25, src, dst);
    467       DO_imm_mandr_r("dppd", 26, src, dst);
    468       DO_imm_mandr_r("dppd", 27, src, dst);
    469       DO_imm_mandr_r("dppd", 28, src, dst);
    470       DO_imm_mandr_r("dppd", 29, src, dst);
    471       DO_imm_mandr_r("dppd", 30, src, dst);
    472       DO_imm_mandr_r("dppd", 31, src, dst);
    473       DO_imm_mandr_r("dppd", 32, src, dst);
    474       DO_imm_mandr_r("dppd", 33, src, dst);
    475       DO_imm_mandr_r("dppd", 34, src, dst);
    476       DO_imm_mandr_r("dppd", 35, src, dst);
    477       DO_imm_mandr_r("dppd", 36, src, dst);
    478       DO_imm_mandr_r("dppd", 37, src, dst);
    479       DO_imm_mandr_r("dppd", 38, src, dst);
    480       DO_imm_mandr_r("dppd", 39, src, dst);
    481       DO_imm_mandr_r("dppd", 40, src, dst);
    482       DO_imm_mandr_r("dppd", 41, src, dst);
    483       DO_imm_mandr_r("dppd", 42, src, dst);
    484       DO_imm_mandr_r("dppd", 43, src, dst);
    485       DO_imm_mandr_r("dppd", 44, src, dst);
    486       DO_imm_mandr_r("dppd", 45, src, dst);
    487       DO_imm_mandr_r("dppd", 46, src, dst);
    488       DO_imm_mandr_r("dppd", 47, src, dst);
    489       DO_imm_mandr_r("dppd", 48, src, dst);
    490       DO_imm_mandr_r("dppd", 49, src, dst);
    491       DO_imm_mandr_r("dppd", 50, src, dst);
    492       DO_imm_mandr_r("dppd", 51, src, dst);
    493       DO_imm_mandr_r("dppd", 52, src, dst);
    494       DO_imm_mandr_r("dppd", 53, src, dst);
    495       DO_imm_mandr_r("dppd", 54, src, dst);
    496       DO_imm_mandr_r("dppd", 55, src, dst);
    497       DO_imm_mandr_r("dppd", 56, src, dst);
    498       DO_imm_mandr_r("dppd", 57, src, dst);
    499       DO_imm_mandr_r("dppd", 58, src, dst);
    500       DO_imm_mandr_r("dppd", 59, src, dst);
    501       DO_imm_mandr_r("dppd", 60, src, dst);
    502       DO_imm_mandr_r("dppd", 61, src, dst);
    503       DO_imm_mandr_r("dppd", 62, src, dst);
    504       DO_imm_mandr_r("dppd", 63, src, dst);
    505       DO_imm_mandr_r("dppd", 64, src, dst);
    506       DO_imm_mandr_r("dppd", 65, src, dst);
    507       DO_imm_mandr_r("dppd", 66, src, dst);
    508       DO_imm_mandr_r("dppd", 67, src, dst);
    509       DO_imm_mandr_r("dppd", 68, src, dst);
    510       DO_imm_mandr_r("dppd", 69, src, dst);
    511       DO_imm_mandr_r("dppd", 70, src, dst);
    512       DO_imm_mandr_r("dppd", 71, src, dst);
    513       DO_imm_mandr_r("dppd", 72, src, dst);
    514       DO_imm_mandr_r("dppd", 73, src, dst);
    515       DO_imm_mandr_r("dppd", 74, src, dst);
    516       DO_imm_mandr_r("dppd", 75, src, dst);
    517       DO_imm_mandr_r("dppd", 76, src, dst);
    518       DO_imm_mandr_r("dppd", 77, src, dst);
    519       DO_imm_mandr_r("dppd", 78, src, dst);
    520       DO_imm_mandr_r("dppd", 79, src, dst);
    521       DO_imm_mandr_r("dppd", 80, src, dst);
    522       DO_imm_mandr_r("dppd", 81, src, dst);
    523       DO_imm_mandr_r("dppd", 82, src, dst);
    524       DO_imm_mandr_r("dppd", 83, src, dst);
    525       DO_imm_mandr_r("dppd", 84, src, dst);
    526       DO_imm_mandr_r("dppd", 85, src, dst);
    527       DO_imm_mandr_r("dppd", 86, src, dst);
    528       DO_imm_mandr_r("dppd", 87, src, dst);
    529       DO_imm_mandr_r("dppd", 88, src, dst);
    530       DO_imm_mandr_r("dppd", 89, src, dst);
    531       DO_imm_mandr_r("dppd", 90, src, dst);
    532       DO_imm_mandr_r("dppd", 91, src, dst);
    533       DO_imm_mandr_r("dppd", 92, src, dst);
    534       DO_imm_mandr_r("dppd", 93, src, dst);
    535       DO_imm_mandr_r("dppd", 94, src, dst);
    536       DO_imm_mandr_r("dppd", 95, src, dst);
    537       DO_imm_mandr_r("dppd", 96, src, dst);
    538       DO_imm_mandr_r("dppd", 97, src, dst);
    539       DO_imm_mandr_r("dppd", 98, src, dst);
    540       DO_imm_mandr_r("dppd", 99, src, dst);
    541       DO_imm_mandr_r("dppd", 100, src, dst);
    542       DO_imm_mandr_r("dppd", 101, src, dst);
    543       DO_imm_mandr_r("dppd", 102, src, dst);
    544       DO_imm_mandr_r("dppd", 103, src, dst);
    545       DO_imm_mandr_r("dppd", 104, src, dst);
    546       DO_imm_mandr_r("dppd", 105, src, dst);
    547       DO_imm_mandr_r("dppd", 106, src, dst);
    548       DO_imm_mandr_r("dppd", 107, src, dst);
    549       DO_imm_mandr_r("dppd", 108, src, dst);
    550       DO_imm_mandr_r("dppd", 109, src, dst);
    551       DO_imm_mandr_r("dppd", 110, src, dst);
    552       DO_imm_mandr_r("dppd", 111, src, dst);
    553       DO_imm_mandr_r("dppd", 112, src, dst);
    554       DO_imm_mandr_r("dppd", 113, src, dst);
    555       DO_imm_mandr_r("dppd", 114, src, dst);
    556       DO_imm_mandr_r("dppd", 115, src, dst);
    557       DO_imm_mandr_r("dppd", 116, src, dst);
    558       DO_imm_mandr_r("dppd", 117, src, dst);
    559       DO_imm_mandr_r("dppd", 118, src, dst);
    560       DO_imm_mandr_r("dppd", 119, src, dst);
    561       DO_imm_mandr_r("dppd", 120, src, dst);
    562       DO_imm_mandr_r("dppd", 121, src, dst);
    563       DO_imm_mandr_r("dppd", 122, src, dst);
    564       DO_imm_mandr_r("dppd", 123, src, dst);
    565       DO_imm_mandr_r("dppd", 124, src, dst);
    566       DO_imm_mandr_r("dppd", 125, src, dst);
    567       DO_imm_mandr_r("dppd", 126, src, dst);
    568       DO_imm_mandr_r("dppd", 127, src, dst);
    569       DO_imm_mandr_r("dppd", 128, src, dst);
    570       DO_imm_mandr_r("dppd", 129, src, dst);
    571       DO_imm_mandr_r("dppd", 130, src, dst);
    572       DO_imm_mandr_r("dppd", 131, src, dst);
    573       DO_imm_mandr_r("dppd", 132, src, dst);
    574       DO_imm_mandr_r("dppd", 133, src, dst);
    575       DO_imm_mandr_r("dppd", 134, src, dst);
    576       DO_imm_mandr_r("dppd", 135, src, dst);
    577       DO_imm_mandr_r("dppd", 136, src, dst);
    578       DO_imm_mandr_r("dppd", 137, src, dst);
    579       DO_imm_mandr_r("dppd", 138, src, dst);
    580       DO_imm_mandr_r("dppd", 139, src, dst);
    581       DO_imm_mandr_r("dppd", 140, src, dst);
    582       DO_imm_mandr_r("dppd", 141, src, dst);
    583       DO_imm_mandr_r("dppd", 142, src, dst);
    584       DO_imm_mandr_r("dppd", 143, src, dst);
    585       DO_imm_mandr_r("dppd", 144, src, dst);
    586       DO_imm_mandr_r("dppd", 145, src, dst);
    587       DO_imm_mandr_r("dppd", 146, src, dst);
    588       DO_imm_mandr_r("dppd", 147, src, dst);
    589       DO_imm_mandr_r("dppd", 148, src, dst);
    590       DO_imm_mandr_r("dppd", 149, src, dst);
    591       DO_imm_mandr_r("dppd", 150, src, dst);
    592       DO_imm_mandr_r("dppd", 151, src, dst);
    593       DO_imm_mandr_r("dppd", 152, src, dst);
    594       DO_imm_mandr_r("dppd", 153, src, dst);
    595       DO_imm_mandr_r("dppd", 154, src, dst);
    596       DO_imm_mandr_r("dppd", 155, src, dst);
    597       DO_imm_mandr_r("dppd", 156, src, dst);
    598       DO_imm_mandr_r("dppd", 157, src, dst);
    599       DO_imm_mandr_r("dppd", 158, src, dst);
    600       DO_imm_mandr_r("dppd", 159, src, dst);
    601       DO_imm_mandr_r("dppd", 160, src, dst);
    602       DO_imm_mandr_r("dppd", 161, src, dst);
    603       DO_imm_mandr_r("dppd", 162, src, dst);
    604       DO_imm_mandr_r("dppd", 163, src, dst);
    605       DO_imm_mandr_r("dppd", 164, src, dst);
    606       DO_imm_mandr_r("dppd", 165, src, dst);
    607       DO_imm_mandr_r("dppd", 166, src, dst);
    608       DO_imm_mandr_r("dppd", 167, src, dst);
    609       DO_imm_mandr_r("dppd", 168, src, dst);
    610       DO_imm_mandr_r("dppd", 169, src, dst);
    611       DO_imm_mandr_r("dppd", 170, src, dst);
    612       DO_imm_mandr_r("dppd", 171, src, dst);
    613       DO_imm_mandr_r("dppd", 172, src, dst);
    614       DO_imm_mandr_r("dppd", 173, src, dst);
    615       DO_imm_mandr_r("dppd", 174, src, dst);
    616       DO_imm_mandr_r("dppd", 175, src, dst);
    617       DO_imm_mandr_r("dppd", 176, src, dst);
    618       DO_imm_mandr_r("dppd", 177, src, dst);
    619       DO_imm_mandr_r("dppd", 178, src, dst);
    620       DO_imm_mandr_r("dppd", 179, src, dst);
    621       DO_imm_mandr_r("dppd", 180, src, dst);
    622       DO_imm_mandr_r("dppd", 181, src, dst);
    623       DO_imm_mandr_r("dppd", 182, src, dst);
    624       DO_imm_mandr_r("dppd", 183, src, dst);
    625       DO_imm_mandr_r("dppd", 184, src, dst);
    626       DO_imm_mandr_r("dppd", 185, src, dst);
    627       DO_imm_mandr_r("dppd", 186, src, dst);
    628       DO_imm_mandr_r("dppd", 187, src, dst);
    629       DO_imm_mandr_r("dppd", 188, src, dst);
    630       DO_imm_mandr_r("dppd", 189, src, dst);
    631       DO_imm_mandr_r("dppd", 190, src, dst);
    632       DO_imm_mandr_r("dppd", 191, src, dst);
    633       DO_imm_mandr_r("dppd", 192, src, dst);
    634       DO_imm_mandr_r("dppd", 193, src, dst);
    635       DO_imm_mandr_r("dppd", 194, src, dst);
    636       DO_imm_mandr_r("dppd", 195, src, dst);
    637       DO_imm_mandr_r("dppd", 196, src, dst);
    638       DO_imm_mandr_r("dppd", 197, src, dst);
    639       DO_imm_mandr_r("dppd", 198, src, dst);
    640       DO_imm_mandr_r("dppd", 199, src, dst);
    641       DO_imm_mandr_r("dppd", 200, src, dst);
    642       DO_imm_mandr_r("dppd", 201, src, dst);
    643       DO_imm_mandr_r("dppd", 202, src, dst);
    644       DO_imm_mandr_r("dppd", 203, src, dst);
    645       DO_imm_mandr_r("dppd", 204, src, dst);
    646       DO_imm_mandr_r("dppd", 205, src, dst);
    647       DO_imm_mandr_r("dppd", 206, src, dst);
    648       DO_imm_mandr_r("dppd", 207, src, dst);
    649       DO_imm_mandr_r("dppd", 208, src, dst);
    650       DO_imm_mandr_r("dppd", 209, src, dst);
    651       DO_imm_mandr_r("dppd", 210, src, dst);
    652       DO_imm_mandr_r("dppd", 211, src, dst);
    653       DO_imm_mandr_r("dppd", 212, src, dst);
    654       DO_imm_mandr_r("dppd", 213, src, dst);
    655       DO_imm_mandr_r("dppd", 214, src, dst);
    656       DO_imm_mandr_r("dppd", 215, src, dst);
    657       DO_imm_mandr_r("dppd", 216, src, dst);
    658       DO_imm_mandr_r("dppd", 217, src, dst);
    659       DO_imm_mandr_r("dppd", 218, src, dst);
    660       DO_imm_mandr_r("dppd", 219, src, dst);
    661       DO_imm_mandr_r("dppd", 220, src, dst);
    662       DO_imm_mandr_r("dppd", 221, src, dst);
    663       DO_imm_mandr_r("dppd", 222, src, dst);
    664       DO_imm_mandr_r("dppd", 223, src, dst);
    665       DO_imm_mandr_r("dppd", 224, src, dst);
    666       DO_imm_mandr_r("dppd", 225, src, dst);
    667       DO_imm_mandr_r("dppd", 226, src, dst);
    668       DO_imm_mandr_r("dppd", 227, src, dst);
    669       DO_imm_mandr_r("dppd", 228, src, dst);
    670       DO_imm_mandr_r("dppd", 229, src, dst);
    671       DO_imm_mandr_r("dppd", 230, src, dst);
    672       DO_imm_mandr_r("dppd", 231, src, dst);
    673       DO_imm_mandr_r("dppd", 232, src, dst);
    674       DO_imm_mandr_r("dppd", 233, src, dst);
    675       DO_imm_mandr_r("dppd", 234, src, dst);
    676       DO_imm_mandr_r("dppd", 235, src, dst);
    677       DO_imm_mandr_r("dppd", 236, src, dst);
    678       DO_imm_mandr_r("dppd", 237, src, dst);
    679       DO_imm_mandr_r("dppd", 238, src, dst);
    680       DO_imm_mandr_r("dppd", 239, src, dst);
    681       DO_imm_mandr_r("dppd", 240, src, dst);
    682       DO_imm_mandr_r("dppd", 241, src, dst);
    683       DO_imm_mandr_r("dppd", 242, src, dst);
    684       DO_imm_mandr_r("dppd", 243, src, dst);
    685       DO_imm_mandr_r("dppd", 244, src, dst);
    686       DO_imm_mandr_r("dppd", 245, src, dst);
    687       DO_imm_mandr_r("dppd", 246, src, dst);
    688       DO_imm_mandr_r("dppd", 247, src, dst);
    689       DO_imm_mandr_r("dppd", 248, src, dst);
    690       DO_imm_mandr_r("dppd", 249, src, dst);
    691       DO_imm_mandr_r("dppd", 250, src, dst);
    692       DO_imm_mandr_r("dppd", 251, src, dst);
    693       DO_imm_mandr_r("dppd", 252, src, dst);
    694       DO_imm_mandr_r("dppd", 253, src, dst);
    695       DO_imm_mandr_r("dppd", 254, src, dst);
    696       DO_imm_mandr_r("dppd", 255, src, dst);
    697    }
    698 }
    699 
    700 void test_DPPS ( void )
    701 {
    702    V128 src, dst;
    703    {
    704       *(float*)(&src[0])  =   1.2;
    705       *(float*)(&src[4])  =  -3.4;
    706       *(float*)(&src[8])  =  -6.7;
    707       *(float*)(&src[12]) =   8.9;
    708       *(float*)(&dst[0])  = -10.11;
    709       *(float*)(&dst[4])  =  12.13;
    710       *(float*)(&dst[8])  =  14.15;
    711       *(float*)(&dst[12]) = -16.17;
    712       DO_imm_mandr_r("dpps", 0, src, dst);
    713       DO_imm_mandr_r("dpps", 1, src, dst);
    714       DO_imm_mandr_r("dpps", 2, src, dst);
    715       DO_imm_mandr_r("dpps", 3, src, dst);
    716       DO_imm_mandr_r("dpps", 4, src, dst);
    717       DO_imm_mandr_r("dpps", 5, src, dst);
    718       DO_imm_mandr_r("dpps", 6, src, dst);
    719       DO_imm_mandr_r("dpps", 7, src, dst);
    720       DO_imm_mandr_r("dpps", 8, src, dst);
    721       DO_imm_mandr_r("dpps", 9, src, dst);
    722       DO_imm_mandr_r("dpps", 10, src, dst);
    723       DO_imm_mandr_r("dpps", 11, src, dst);
    724       DO_imm_mandr_r("dpps", 12, src, dst);
    725       DO_imm_mandr_r("dpps", 13, src, dst);
    726       DO_imm_mandr_r("dpps", 14, src, dst);
    727       DO_imm_mandr_r("dpps", 15, src, dst);
    728       DO_imm_mandr_r("dpps", 16, src, dst);
    729       DO_imm_mandr_r("dpps", 17, src, dst);
    730       DO_imm_mandr_r("dpps", 18, src, dst);
    731       DO_imm_mandr_r("dpps", 19, src, dst);
    732       DO_imm_mandr_r("dpps", 20, src, dst);
    733       DO_imm_mandr_r("dpps", 21, src, dst);
    734       DO_imm_mandr_r("dpps", 22, src, dst);
    735       DO_imm_mandr_r("dpps", 23, src, dst);
    736       DO_imm_mandr_r("dpps", 24, src, dst);
    737       DO_imm_mandr_r("dpps", 25, src, dst);
    738       DO_imm_mandr_r("dpps", 26, src, dst);
    739       DO_imm_mandr_r("dpps", 27, src, dst);
    740       DO_imm_mandr_r("dpps", 28, src, dst);
    741       DO_imm_mandr_r("dpps", 29, src, dst);
    742       DO_imm_mandr_r("dpps", 30, src, dst);
    743       DO_imm_mandr_r("dpps", 31, src, dst);
    744       DO_imm_mandr_r("dpps", 32, src, dst);
    745       DO_imm_mandr_r("dpps", 33, src, dst);
    746       DO_imm_mandr_r("dpps", 34, src, dst);
    747       DO_imm_mandr_r("dpps", 35, src, dst);
    748       DO_imm_mandr_r("dpps", 36, src, dst);
    749       DO_imm_mandr_r("dpps", 37, src, dst);
    750       DO_imm_mandr_r("dpps", 38, src, dst);
    751       DO_imm_mandr_r("dpps", 39, src, dst);
    752       DO_imm_mandr_r("dpps", 40, src, dst);
    753       DO_imm_mandr_r("dpps", 41, src, dst);
    754       DO_imm_mandr_r("dpps", 42, src, dst);
    755       DO_imm_mandr_r("dpps", 43, src, dst);
    756       DO_imm_mandr_r("dpps", 44, src, dst);
    757       DO_imm_mandr_r("dpps", 45, src, dst);
    758       DO_imm_mandr_r("dpps", 46, src, dst);
    759       DO_imm_mandr_r("dpps", 47, src, dst);
    760       DO_imm_mandr_r("dpps", 48, src, dst);
    761       DO_imm_mandr_r("dpps", 49, src, dst);
    762       DO_imm_mandr_r("dpps", 50, src, dst);
    763       DO_imm_mandr_r("dpps", 51, src, dst);
    764       DO_imm_mandr_r("dpps", 52, src, dst);
    765       DO_imm_mandr_r("dpps", 53, src, dst);
    766       DO_imm_mandr_r("dpps", 54, src, dst);
    767       DO_imm_mandr_r("dpps", 55, src, dst);
    768       DO_imm_mandr_r("dpps", 56, src, dst);
    769       DO_imm_mandr_r("dpps", 57, src, dst);
    770       DO_imm_mandr_r("dpps", 58, src, dst);
    771       DO_imm_mandr_r("dpps", 59, src, dst);
    772       DO_imm_mandr_r("dpps", 60, src, dst);
    773       DO_imm_mandr_r("dpps", 61, src, dst);
    774       DO_imm_mandr_r("dpps", 62, src, dst);
    775       DO_imm_mandr_r("dpps", 63, src, dst);
    776       DO_imm_mandr_r("dpps", 64, src, dst);
    777       DO_imm_mandr_r("dpps", 65, src, dst);
    778       DO_imm_mandr_r("dpps", 66, src, dst);
    779       DO_imm_mandr_r("dpps", 67, src, dst);
    780       DO_imm_mandr_r("dpps", 68, src, dst);
    781       DO_imm_mandr_r("dpps", 69, src, dst);
    782       DO_imm_mandr_r("dpps", 70, src, dst);
    783       DO_imm_mandr_r("dpps", 71, src, dst);
    784       DO_imm_mandr_r("dpps", 72, src, dst);
    785       DO_imm_mandr_r("dpps", 73, src, dst);
    786       DO_imm_mandr_r("dpps", 74, src, dst);
    787       DO_imm_mandr_r("dpps", 75, src, dst);
    788       DO_imm_mandr_r("dpps", 76, src, dst);
    789       DO_imm_mandr_r("dpps", 77, src, dst);
    790       DO_imm_mandr_r("dpps", 78, src, dst);
    791       DO_imm_mandr_r("dpps", 79, src, dst);
    792       DO_imm_mandr_r("dpps", 80, src, dst);
    793       DO_imm_mandr_r("dpps", 81, src, dst);
    794       DO_imm_mandr_r("dpps", 82, src, dst);
    795       DO_imm_mandr_r("dpps", 83, src, dst);
    796       DO_imm_mandr_r("dpps", 84, src, dst);
    797       DO_imm_mandr_r("dpps", 85, src, dst);
    798       DO_imm_mandr_r("dpps", 86, src, dst);
    799       DO_imm_mandr_r("dpps", 87, src, dst);
    800       DO_imm_mandr_r("dpps", 88, src, dst);
    801       DO_imm_mandr_r("dpps", 89, src, dst);
    802       DO_imm_mandr_r("dpps", 90, src, dst);
    803       DO_imm_mandr_r("dpps", 91, src, dst);
    804       DO_imm_mandr_r("dpps", 92, src, dst);
    805       DO_imm_mandr_r("dpps", 93, src, dst);
    806       DO_imm_mandr_r("dpps", 94, src, dst);
    807       DO_imm_mandr_r("dpps", 95, src, dst);
    808       DO_imm_mandr_r("dpps", 96, src, dst);
    809       DO_imm_mandr_r("dpps", 97, src, dst);
    810       DO_imm_mandr_r("dpps", 98, src, dst);
    811       DO_imm_mandr_r("dpps", 99, src, dst);
    812       DO_imm_mandr_r("dpps", 100, src, dst);
    813       DO_imm_mandr_r("dpps", 101, src, dst);
    814       DO_imm_mandr_r("dpps", 102, src, dst);
    815       DO_imm_mandr_r("dpps", 103, src, dst);
    816       DO_imm_mandr_r("dpps", 104, src, dst);
    817       DO_imm_mandr_r("dpps", 105, src, dst);
    818       DO_imm_mandr_r("dpps", 106, src, dst);
    819       DO_imm_mandr_r("dpps", 107, src, dst);
    820       DO_imm_mandr_r("dpps", 108, src, dst);
    821       DO_imm_mandr_r("dpps", 109, src, dst);
    822       DO_imm_mandr_r("dpps", 110, src, dst);
    823       DO_imm_mandr_r("dpps", 111, src, dst);
    824       DO_imm_mandr_r("dpps", 112, src, dst);
    825       DO_imm_mandr_r("dpps", 113, src, dst);
    826       DO_imm_mandr_r("dpps", 114, src, dst);
    827       DO_imm_mandr_r("dpps", 115, src, dst);
    828       DO_imm_mandr_r("dpps", 116, src, dst);
    829       DO_imm_mandr_r("dpps", 117, src, dst);
    830       DO_imm_mandr_r("dpps", 118, src, dst);
    831       DO_imm_mandr_r("dpps", 119, src, dst);
    832       DO_imm_mandr_r("dpps", 120, src, dst);
    833       DO_imm_mandr_r("dpps", 121, src, dst);
    834       DO_imm_mandr_r("dpps", 122, src, dst);
    835       DO_imm_mandr_r("dpps", 123, src, dst);
    836       DO_imm_mandr_r("dpps", 124, src, dst);
    837       DO_imm_mandr_r("dpps", 125, src, dst);
    838       DO_imm_mandr_r("dpps", 126, src, dst);
    839       DO_imm_mandr_r("dpps", 127, src, dst);
    840       DO_imm_mandr_r("dpps", 128, src, dst);
    841       DO_imm_mandr_r("dpps", 129, src, dst);
    842       DO_imm_mandr_r("dpps", 130, src, dst);
    843       DO_imm_mandr_r("dpps", 131, src, dst);
    844       DO_imm_mandr_r("dpps", 132, src, dst);
    845       DO_imm_mandr_r("dpps", 133, src, dst);
    846       DO_imm_mandr_r("dpps", 134, src, dst);
    847       DO_imm_mandr_r("dpps", 135, src, dst);
    848       DO_imm_mandr_r("dpps", 136, src, dst);
    849       DO_imm_mandr_r("dpps", 137, src, dst);
    850       DO_imm_mandr_r("dpps", 138, src, dst);
    851       DO_imm_mandr_r("dpps", 139, src, dst);
    852       DO_imm_mandr_r("dpps", 140, src, dst);
    853       DO_imm_mandr_r("dpps", 141, src, dst);
    854       DO_imm_mandr_r("dpps", 142, src, dst);
    855       DO_imm_mandr_r("dpps", 143, src, dst);
    856       DO_imm_mandr_r("dpps", 144, src, dst);
    857       DO_imm_mandr_r("dpps", 145, src, dst);
    858       DO_imm_mandr_r("dpps", 146, src, dst);
    859       DO_imm_mandr_r("dpps", 147, src, dst);
    860       DO_imm_mandr_r("dpps", 148, src, dst);
    861       DO_imm_mandr_r("dpps", 149, src, dst);
    862       DO_imm_mandr_r("dpps", 150, src, dst);
    863       DO_imm_mandr_r("dpps", 151, src, dst);
    864       DO_imm_mandr_r("dpps", 152, src, dst);
    865       DO_imm_mandr_r("dpps", 153, src, dst);
    866       DO_imm_mandr_r("dpps", 154, src, dst);
    867       DO_imm_mandr_r("dpps", 155, src, dst);
    868       DO_imm_mandr_r("dpps", 156, src, dst);
    869       DO_imm_mandr_r("dpps", 157, src, dst);
    870       DO_imm_mandr_r("dpps", 158, src, dst);
    871       DO_imm_mandr_r("dpps", 159, src, dst);
    872       DO_imm_mandr_r("dpps", 160, src, dst);
    873       DO_imm_mandr_r("dpps", 161, src, dst);
    874       DO_imm_mandr_r("dpps", 162, src, dst);
    875       DO_imm_mandr_r("dpps", 163, src, dst);
    876       DO_imm_mandr_r("dpps", 164, src, dst);
    877       DO_imm_mandr_r("dpps", 165, src, dst);
    878       DO_imm_mandr_r("dpps", 166, src, dst);
    879       DO_imm_mandr_r("dpps", 167, src, dst);
    880       DO_imm_mandr_r("dpps", 168, src, dst);
    881       DO_imm_mandr_r("dpps", 169, src, dst);
    882       DO_imm_mandr_r("dpps", 170, src, dst);
    883       DO_imm_mandr_r("dpps", 171, src, dst);
    884       DO_imm_mandr_r("dpps", 172, src, dst);
    885       DO_imm_mandr_r("dpps", 173, src, dst);
    886       DO_imm_mandr_r("dpps", 174, src, dst);
    887       DO_imm_mandr_r("dpps", 175, src, dst);
    888       DO_imm_mandr_r("dpps", 176, src, dst);
    889       DO_imm_mandr_r("dpps", 177, src, dst);
    890       DO_imm_mandr_r("dpps", 178, src, dst);
    891       DO_imm_mandr_r("dpps", 179, src, dst);
    892       DO_imm_mandr_r("dpps", 180, src, dst);
    893       DO_imm_mandr_r("dpps", 181, src, dst);
    894       DO_imm_mandr_r("dpps", 182, src, dst);
    895       DO_imm_mandr_r("dpps", 183, src, dst);
    896       DO_imm_mandr_r("dpps", 184, src, dst);
    897       DO_imm_mandr_r("dpps", 185, src, dst);
    898       DO_imm_mandr_r("dpps", 186, src, dst);
    899       DO_imm_mandr_r("dpps", 187, src, dst);
    900       DO_imm_mandr_r("dpps", 188, src, dst);
    901       DO_imm_mandr_r("dpps", 189, src, dst);
    902       DO_imm_mandr_r("dpps", 190, src, dst);
    903       DO_imm_mandr_r("dpps", 191, src, dst);
    904       DO_imm_mandr_r("dpps", 192, src, dst);
    905       DO_imm_mandr_r("dpps", 193, src, dst);
    906       DO_imm_mandr_r("dpps", 194, src, dst);
    907       DO_imm_mandr_r("dpps", 195, src, dst);
    908       DO_imm_mandr_r("dpps", 196, src, dst);
    909       DO_imm_mandr_r("dpps", 197, src, dst);
    910       DO_imm_mandr_r("dpps", 198, src, dst);
    911       DO_imm_mandr_r("dpps", 199, src, dst);
    912       DO_imm_mandr_r("dpps", 200, src, dst);
    913       DO_imm_mandr_r("dpps", 201, src, dst);
    914       DO_imm_mandr_r("dpps", 202, src, dst);
    915       DO_imm_mandr_r("dpps", 203, src, dst);
    916       DO_imm_mandr_r("dpps", 204, src, dst);
    917       DO_imm_mandr_r("dpps", 205, src, dst);
    918       DO_imm_mandr_r("dpps", 206, src, dst);
    919       DO_imm_mandr_r("dpps", 207, src, dst);
    920       DO_imm_mandr_r("dpps", 208, src, dst);
    921       DO_imm_mandr_r("dpps", 209, src, dst);
    922       DO_imm_mandr_r("dpps", 210, src, dst);
    923       DO_imm_mandr_r("dpps", 211, src, dst);
    924       DO_imm_mandr_r("dpps", 212, src, dst);
    925       DO_imm_mandr_r("dpps", 213, src, dst);
    926       DO_imm_mandr_r("dpps", 214, src, dst);
    927       DO_imm_mandr_r("dpps", 215, src, dst);
    928       DO_imm_mandr_r("dpps", 216, src, dst);
    929       DO_imm_mandr_r("dpps", 217, src, dst);
    930       DO_imm_mandr_r("dpps", 218, src, dst);
    931       DO_imm_mandr_r("dpps", 219, src, dst);
    932       DO_imm_mandr_r("dpps", 220, src, dst);
    933       DO_imm_mandr_r("dpps", 221, src, dst);
    934       DO_imm_mandr_r("dpps", 222, src, dst);
    935       DO_imm_mandr_r("dpps", 223, src, dst);
    936       DO_imm_mandr_r("dpps", 224, src, dst);
    937       DO_imm_mandr_r("dpps", 225, src, dst);
    938       DO_imm_mandr_r("dpps", 226, src, dst);
    939       DO_imm_mandr_r("dpps", 227, src, dst);
    940       DO_imm_mandr_r("dpps", 228, src, dst);
    941       DO_imm_mandr_r("dpps", 229, src, dst);
    942       DO_imm_mandr_r("dpps", 230, src, dst);
    943       DO_imm_mandr_r("dpps", 231, src, dst);
    944       DO_imm_mandr_r("dpps", 232, src, dst);
    945       DO_imm_mandr_r("dpps", 233, src, dst);
    946       DO_imm_mandr_r("dpps", 234, src, dst);
    947       DO_imm_mandr_r("dpps", 235, src, dst);
    948       DO_imm_mandr_r("dpps", 236, src, dst);
    949       DO_imm_mandr_r("dpps", 237, src, dst);
    950       DO_imm_mandr_r("dpps", 238, src, dst);
    951       DO_imm_mandr_r("dpps", 239, src, dst);
    952       DO_imm_mandr_r("dpps", 240, src, dst);
    953       DO_imm_mandr_r("dpps", 241, src, dst);
    954       DO_imm_mandr_r("dpps", 242, src, dst);
    955       DO_imm_mandr_r("dpps", 243, src, dst);
    956       DO_imm_mandr_r("dpps", 244, src, dst);
    957       DO_imm_mandr_r("dpps", 245, src, dst);
    958       DO_imm_mandr_r("dpps", 246, src, dst);
    959       DO_imm_mandr_r("dpps", 247, src, dst);
    960       DO_imm_mandr_r("dpps", 248, src, dst);
    961       DO_imm_mandr_r("dpps", 249, src, dst);
    962       DO_imm_mandr_r("dpps", 250, src, dst);
    963       DO_imm_mandr_r("dpps", 251, src, dst);
    964       DO_imm_mandr_r("dpps", 252, src, dst);
    965       DO_imm_mandr_r("dpps", 253, src, dst);
    966       DO_imm_mandr_r("dpps", 254, src, dst);
    967       DO_imm_mandr_r("dpps", 255, src, dst);
    968    }
    969 }
    970 
    971 void test_INSERTPS ( void )
    972 {
    973    V128 src, dst;
    974    {
    975       *(float*)(&src[0])  =   1.2;
    976       *(float*)(&src[4])  =  -3.4;
    977       *(float*)(&src[8])  =  -6.7;
    978       *(float*)(&src[12]) =   8.9;
    979       *(float*)(&dst[0])  = -10.11;
    980       *(float*)(&dst[4])  =  12.13;
    981       *(float*)(&dst[8])  =  14.15;
    982       *(float*)(&dst[12]) = -16.17;
    983       DO_imm_mandr_r("insertps", 0, src, dst);
    984       DO_imm_mandr_r("insertps", 1, src, dst);
    985       DO_imm_mandr_r("insertps", 2, src, dst);
    986       DO_imm_mandr_r("insertps", 3, src, dst);
    987       DO_imm_mandr_r("insertps", 4, src, dst);
    988       DO_imm_mandr_r("insertps", 5, src, dst);
    989       DO_imm_mandr_r("insertps", 6, src, dst);
    990       DO_imm_mandr_r("insertps", 7, src, dst);
    991       DO_imm_mandr_r("insertps", 8, src, dst);
    992       DO_imm_mandr_r("insertps", 9, src, dst);
    993       DO_imm_mandr_r("insertps", 10, src, dst);
    994       DO_imm_mandr_r("insertps", 11, src, dst);
    995       DO_imm_mandr_r("insertps", 12, src, dst);
    996       DO_imm_mandr_r("insertps", 13, src, dst);
    997       DO_imm_mandr_r("insertps", 14, src, dst);
    998       DO_imm_mandr_r("insertps", 15, src, dst);
    999       DO_imm_mandr_r("insertps", 16, src, dst);
   1000       DO_imm_mandr_r("insertps", 17, src, dst);
   1001       DO_imm_mandr_r("insertps", 18, src, dst);
   1002       DO_imm_mandr_r("insertps", 19, src, dst);
   1003       DO_imm_mandr_r("insertps", 20, src, dst);
   1004       DO_imm_mandr_r("insertps", 21, src, dst);
   1005       DO_imm_mandr_r("insertps", 22, src, dst);
   1006       DO_imm_mandr_r("insertps", 23, src, dst);
   1007       DO_imm_mandr_r("insertps", 24, src, dst);
   1008       DO_imm_mandr_r("insertps", 25, src, dst);
   1009       DO_imm_mandr_r("insertps", 26, src, dst);
   1010       DO_imm_mandr_r("insertps", 27, src, dst);
   1011       DO_imm_mandr_r("insertps", 28, src, dst);
   1012       DO_imm_mandr_r("insertps", 29, src, dst);
   1013       DO_imm_mandr_r("insertps", 30, src, dst);
   1014       DO_imm_mandr_r("insertps", 31, src, dst);
   1015       DO_imm_mandr_r("insertps", 32, src, dst);
   1016       DO_imm_mandr_r("insertps", 33, src, dst);
   1017       DO_imm_mandr_r("insertps", 34, src, dst);
   1018       DO_imm_mandr_r("insertps", 35, src, dst);
   1019       DO_imm_mandr_r("insertps", 36, src, dst);
   1020       DO_imm_mandr_r("insertps", 37, src, dst);
   1021       DO_imm_mandr_r("insertps", 38, src, dst);
   1022       DO_imm_mandr_r("insertps", 39, src, dst);
   1023       DO_imm_mandr_r("insertps", 40, src, dst);
   1024       DO_imm_mandr_r("insertps", 41, src, dst);
   1025       DO_imm_mandr_r("insertps", 42, src, dst);
   1026       DO_imm_mandr_r("insertps", 43, src, dst);
   1027       DO_imm_mandr_r("insertps", 44, src, dst);
   1028       DO_imm_mandr_r("insertps", 45, src, dst);
   1029       DO_imm_mandr_r("insertps", 46, src, dst);
   1030       DO_imm_mandr_r("insertps", 47, src, dst);
   1031       DO_imm_mandr_r("insertps", 48, src, dst);
   1032       DO_imm_mandr_r("insertps", 49, src, dst);
   1033       DO_imm_mandr_r("insertps", 50, src, dst);
   1034       DO_imm_mandr_r("insertps", 51, src, dst);
   1035       DO_imm_mandr_r("insertps", 52, src, dst);
   1036       DO_imm_mandr_r("insertps", 53, src, dst);
   1037       DO_imm_mandr_r("insertps", 54, src, dst);
   1038       DO_imm_mandr_r("insertps", 55, src, dst);
   1039       DO_imm_mandr_r("insertps", 56, src, dst);
   1040       DO_imm_mandr_r("insertps", 57, src, dst);
   1041       DO_imm_mandr_r("insertps", 58, src, dst);
   1042       DO_imm_mandr_r("insertps", 59, src, dst);
   1043       DO_imm_mandr_r("insertps", 60, src, dst);
   1044       DO_imm_mandr_r("insertps", 61, src, dst);
   1045       DO_imm_mandr_r("insertps", 62, src, dst);
   1046       DO_imm_mandr_r("insertps", 63, src, dst);
   1047       DO_imm_mandr_r("insertps", 64, src, dst);
   1048       DO_imm_mandr_r("insertps", 65, src, dst);
   1049       DO_imm_mandr_r("insertps", 66, src, dst);
   1050       DO_imm_mandr_r("insertps", 67, src, dst);
   1051       DO_imm_mandr_r("insertps", 68, src, dst);
   1052       DO_imm_mandr_r("insertps", 69, src, dst);
   1053       DO_imm_mandr_r("insertps", 70, src, dst);
   1054       DO_imm_mandr_r("insertps", 71, src, dst);
   1055       DO_imm_mandr_r("insertps", 72, src, dst);
   1056       DO_imm_mandr_r("insertps", 73, src, dst);
   1057       DO_imm_mandr_r("insertps", 74, src, dst);
   1058       DO_imm_mandr_r("insertps", 75, src, dst);
   1059       DO_imm_mandr_r("insertps", 76, src, dst);
   1060       DO_imm_mandr_r("insertps", 77, src, dst);
   1061       DO_imm_mandr_r("insertps", 78, src, dst);
   1062       DO_imm_mandr_r("insertps", 79, src, dst);
   1063       DO_imm_mandr_r("insertps", 80, src, dst);
   1064       DO_imm_mandr_r("insertps", 81, src, dst);
   1065       DO_imm_mandr_r("insertps", 82, src, dst);
   1066       DO_imm_mandr_r("insertps", 83, src, dst);
   1067       DO_imm_mandr_r("insertps", 84, src, dst);
   1068       DO_imm_mandr_r("insertps", 85, src, dst);
   1069       DO_imm_mandr_r("insertps", 86, src, dst);
   1070       DO_imm_mandr_r("insertps", 87, src, dst);
   1071       DO_imm_mandr_r("insertps", 88, src, dst);
   1072       DO_imm_mandr_r("insertps", 89, src, dst);
   1073       DO_imm_mandr_r("insertps", 90, src, dst);
   1074       DO_imm_mandr_r("insertps", 91, src, dst);
   1075       DO_imm_mandr_r("insertps", 92, src, dst);
   1076       DO_imm_mandr_r("insertps", 93, src, dst);
   1077       DO_imm_mandr_r("insertps", 94, src, dst);
   1078       DO_imm_mandr_r("insertps", 95, src, dst);
   1079       DO_imm_mandr_r("insertps", 96, src, dst);
   1080       DO_imm_mandr_r("insertps", 97, src, dst);
   1081       DO_imm_mandr_r("insertps", 98, src, dst);
   1082       DO_imm_mandr_r("insertps", 99, src, dst);
   1083       DO_imm_mandr_r("insertps", 100, src, dst);
   1084       DO_imm_mandr_r("insertps", 101, src, dst);
   1085       DO_imm_mandr_r("insertps", 102, src, dst);
   1086       DO_imm_mandr_r("insertps", 103, src, dst);
   1087       DO_imm_mandr_r("insertps", 104, src, dst);
   1088       DO_imm_mandr_r("insertps", 105, src, dst);
   1089       DO_imm_mandr_r("insertps", 106, src, dst);
   1090       DO_imm_mandr_r("insertps", 107, src, dst);
   1091       DO_imm_mandr_r("insertps", 108, src, dst);
   1092       DO_imm_mandr_r("insertps", 109, src, dst);
   1093       DO_imm_mandr_r("insertps", 110, src, dst);
   1094       DO_imm_mandr_r("insertps", 111, src, dst);
   1095       DO_imm_mandr_r("insertps", 112, src, dst);
   1096       DO_imm_mandr_r("insertps", 113, src, dst);
   1097       DO_imm_mandr_r("insertps", 114, src, dst);
   1098       DO_imm_mandr_r("insertps", 115, src, dst);
   1099       DO_imm_mandr_r("insertps", 116, src, dst);
   1100       DO_imm_mandr_r("insertps", 117, src, dst);
   1101       DO_imm_mandr_r("insertps", 118, src, dst);
   1102       DO_imm_mandr_r("insertps", 119, src, dst);
   1103       DO_imm_mandr_r("insertps", 120, src, dst);
   1104       DO_imm_mandr_r("insertps", 121, src, dst);
   1105       DO_imm_mandr_r("insertps", 122, src, dst);
   1106       DO_imm_mandr_r("insertps", 123, src, dst);
   1107       DO_imm_mandr_r("insertps", 124, src, dst);
   1108       DO_imm_mandr_r("insertps", 125, src, dst);
   1109       DO_imm_mandr_r("insertps", 126, src, dst);
   1110       DO_imm_mandr_r("insertps", 127, src, dst);
   1111       DO_imm_mandr_r("insertps", 128, src, dst);
   1112       DO_imm_mandr_r("insertps", 129, src, dst);
   1113       DO_imm_mandr_r("insertps", 130, src, dst);
   1114       DO_imm_mandr_r("insertps", 131, src, dst);
   1115       DO_imm_mandr_r("insertps", 132, src, dst);
   1116       DO_imm_mandr_r("insertps", 133, src, dst);
   1117       DO_imm_mandr_r("insertps", 134, src, dst);
   1118       DO_imm_mandr_r("insertps", 135, src, dst);
   1119       DO_imm_mandr_r("insertps", 136, src, dst);
   1120       DO_imm_mandr_r("insertps", 137, src, dst);
   1121       DO_imm_mandr_r("insertps", 138, src, dst);
   1122       DO_imm_mandr_r("insertps", 139, src, dst);
   1123       DO_imm_mandr_r("insertps", 140, src, dst);
   1124       DO_imm_mandr_r("insertps", 141, src, dst);
   1125       DO_imm_mandr_r("insertps", 142, src, dst);
   1126       DO_imm_mandr_r("insertps", 143, src, dst);
   1127       DO_imm_mandr_r("insertps", 144, src, dst);
   1128       DO_imm_mandr_r("insertps", 145, src, dst);
   1129       DO_imm_mandr_r("insertps", 146, src, dst);
   1130       DO_imm_mandr_r("insertps", 147, src, dst);
   1131       DO_imm_mandr_r("insertps", 148, src, dst);
   1132       DO_imm_mandr_r("insertps", 149, src, dst);
   1133       DO_imm_mandr_r("insertps", 150, src, dst);
   1134       DO_imm_mandr_r("insertps", 151, src, dst);
   1135       DO_imm_mandr_r("insertps", 152, src, dst);
   1136       DO_imm_mandr_r("insertps", 153, src, dst);
   1137       DO_imm_mandr_r("insertps", 154, src, dst);
   1138       DO_imm_mandr_r("insertps", 155, src, dst);
   1139       DO_imm_mandr_r("insertps", 156, src, dst);
   1140       DO_imm_mandr_r("insertps", 157, src, dst);
   1141       DO_imm_mandr_r("insertps", 158, src, dst);
   1142       DO_imm_mandr_r("insertps", 159, src, dst);
   1143       DO_imm_mandr_r("insertps", 160, src, dst);
   1144       DO_imm_mandr_r("insertps", 161, src, dst);
   1145       DO_imm_mandr_r("insertps", 162, src, dst);
   1146       DO_imm_mandr_r("insertps", 163, src, dst);
   1147       DO_imm_mandr_r("insertps", 164, src, dst);
   1148       DO_imm_mandr_r("insertps", 165, src, dst);
   1149       DO_imm_mandr_r("insertps", 166, src, dst);
   1150       DO_imm_mandr_r("insertps", 167, src, dst);
   1151       DO_imm_mandr_r("insertps", 168, src, dst);
   1152       DO_imm_mandr_r("insertps", 169, src, dst);
   1153       DO_imm_mandr_r("insertps", 170, src, dst);
   1154       DO_imm_mandr_r("insertps", 171, src, dst);
   1155       DO_imm_mandr_r("insertps", 172, src, dst);
   1156       DO_imm_mandr_r("insertps", 173, src, dst);
   1157       DO_imm_mandr_r("insertps", 174, src, dst);
   1158       DO_imm_mandr_r("insertps", 175, src, dst);
   1159       DO_imm_mandr_r("insertps", 176, src, dst);
   1160       DO_imm_mandr_r("insertps", 177, src, dst);
   1161       DO_imm_mandr_r("insertps", 178, src, dst);
   1162       DO_imm_mandr_r("insertps", 179, src, dst);
   1163       DO_imm_mandr_r("insertps", 180, src, dst);
   1164       DO_imm_mandr_r("insertps", 181, src, dst);
   1165       DO_imm_mandr_r("insertps", 182, src, dst);
   1166       DO_imm_mandr_r("insertps", 183, src, dst);
   1167       DO_imm_mandr_r("insertps", 184, src, dst);
   1168       DO_imm_mandr_r("insertps", 185, src, dst);
   1169       DO_imm_mandr_r("insertps", 186, src, dst);
   1170       DO_imm_mandr_r("insertps", 187, src, dst);
   1171       DO_imm_mandr_r("insertps", 188, src, dst);
   1172       DO_imm_mandr_r("insertps", 189, src, dst);
   1173       DO_imm_mandr_r("insertps", 190, src, dst);
   1174       DO_imm_mandr_r("insertps", 191, src, dst);
   1175       DO_imm_mandr_r("insertps", 192, src, dst);
   1176       DO_imm_mandr_r("insertps", 193, src, dst);
   1177       DO_imm_mandr_r("insertps", 194, src, dst);
   1178       DO_imm_mandr_r("insertps", 195, src, dst);
   1179       DO_imm_mandr_r("insertps", 196, src, dst);
   1180       DO_imm_mandr_r("insertps", 197, src, dst);
   1181       DO_imm_mandr_r("insertps", 198, src, dst);
   1182       DO_imm_mandr_r("insertps", 199, src, dst);
   1183       DO_imm_mandr_r("insertps", 200, src, dst);
   1184       DO_imm_mandr_r("insertps", 201, src, dst);
   1185       DO_imm_mandr_r("insertps", 202, src, dst);
   1186       DO_imm_mandr_r("insertps", 203, src, dst);
   1187       DO_imm_mandr_r("insertps", 204, src, dst);
   1188       DO_imm_mandr_r("insertps", 205, src, dst);
   1189       DO_imm_mandr_r("insertps", 206, src, dst);
   1190       DO_imm_mandr_r("insertps", 207, src, dst);
   1191       DO_imm_mandr_r("insertps", 208, src, dst);
   1192       DO_imm_mandr_r("insertps", 209, src, dst);
   1193       DO_imm_mandr_r("insertps", 210, src, dst);
   1194       DO_imm_mandr_r("insertps", 211, src, dst);
   1195       DO_imm_mandr_r("insertps", 212, src, dst);
   1196       DO_imm_mandr_r("insertps", 213, src, dst);
   1197       DO_imm_mandr_r("insertps", 214, src, dst);
   1198       DO_imm_mandr_r("insertps", 215, src, dst);
   1199       DO_imm_mandr_r("insertps", 216, src, dst);
   1200       DO_imm_mandr_r("insertps", 217, src, dst);
   1201       DO_imm_mandr_r("insertps", 218, src, dst);
   1202       DO_imm_mandr_r("insertps", 219, src, dst);
   1203       DO_imm_mandr_r("insertps", 220, src, dst);
   1204       DO_imm_mandr_r("insertps", 221, src, dst);
   1205       DO_imm_mandr_r("insertps", 222, src, dst);
   1206       DO_imm_mandr_r("insertps", 223, src, dst);
   1207       DO_imm_mandr_r("insertps", 224, src, dst);
   1208       DO_imm_mandr_r("insertps", 225, src, dst);
   1209       DO_imm_mandr_r("insertps", 226, src, dst);
   1210       DO_imm_mandr_r("insertps", 227, src, dst);
   1211       DO_imm_mandr_r("insertps", 228, src, dst);
   1212       DO_imm_mandr_r("insertps", 229, src, dst);
   1213       DO_imm_mandr_r("insertps", 230, src, dst);
   1214       DO_imm_mandr_r("insertps", 231, src, dst);
   1215       DO_imm_mandr_r("insertps", 232, src, dst);
   1216       DO_imm_mandr_r("insertps", 233, src, dst);
   1217       DO_imm_mandr_r("insertps", 234, src, dst);
   1218       DO_imm_mandr_r("insertps", 235, src, dst);
   1219       DO_imm_mandr_r("insertps", 236, src, dst);
   1220       DO_imm_mandr_r("insertps", 237, src, dst);
   1221       DO_imm_mandr_r("insertps", 238, src, dst);
   1222       DO_imm_mandr_r("insertps", 239, src, dst);
   1223       DO_imm_mandr_r("insertps", 240, src, dst);
   1224       DO_imm_mandr_r("insertps", 241, src, dst);
   1225       DO_imm_mandr_r("insertps", 242, src, dst);
   1226       DO_imm_mandr_r("insertps", 243, src, dst);
   1227       DO_imm_mandr_r("insertps", 244, src, dst);
   1228       DO_imm_mandr_r("insertps", 245, src, dst);
   1229       DO_imm_mandr_r("insertps", 246, src, dst);
   1230       DO_imm_mandr_r("insertps", 247, src, dst);
   1231       DO_imm_mandr_r("insertps", 248, src, dst);
   1232       DO_imm_mandr_r("insertps", 249, src, dst);
   1233       DO_imm_mandr_r("insertps", 250, src, dst);
   1234       DO_imm_mandr_r("insertps", 251, src, dst);
   1235       DO_imm_mandr_r("insertps", 252, src, dst);
   1236       DO_imm_mandr_r("insertps", 253, src, dst);
   1237       DO_imm_mandr_r("insertps", 254, src, dst);
   1238       DO_imm_mandr_r("insertps", 255, src, dst);
   1239    }
   1240 }
   1241 
   1242 void test_MPSADBW ( void )
   1243 {
   1244    V128 src, dst;
   1245    Int i;
   1246    for (i = 0; i < 50; i++) {
   1247       randV128(&src);
   1248       randV128(&dst);
   1249       DO_imm_mandr_r("mpsadbw", 0, src, dst);
   1250       DO_imm_mandr_r("mpsadbw", 1, src, dst);
   1251       DO_imm_mandr_r("mpsadbw", 2, src, dst);
   1252       DO_imm_mandr_r("mpsadbw", 3, src, dst);
   1253       DO_imm_mandr_r("mpsadbw", 4, src, dst);
   1254       DO_imm_mandr_r("mpsadbw", 5, src, dst);
   1255       DO_imm_mandr_r("mpsadbw", 6, src, dst);
   1256       DO_imm_mandr_r("mpsadbw", 7, src, dst);
   1257    }
   1258 }
   1259 
   1260 void test_PACKUSDW ( void )
   1261 {
   1262    V128 src, dst;
   1263    Int i;
   1264    for (i = 0; i < 10; i++) {
   1265       if (i < 9) {
   1266          randV128(&src);
   1267          randV128(&dst);
   1268       } else {
   1269          memset(&src, 0, sizeof(src));
   1270          memset(&dst, 0, sizeof(src));
   1271          src[0] = 0x11; src[1] = 0x22;
   1272          src[4] = 0x33; src[5] = 0x44;
   1273          src[8] = 0x55; src[9] = 0x66;
   1274          src[12] = 0x77; src[13] = 0x88;
   1275          dst[0] = 0xaa; dst[1] = 0xbb;
   1276          dst[4] = 0xcc; dst[5] = 0xdd;
   1277          dst[8] = 0xee; dst[9] = 0xff;
   1278          dst[12] = 0xa1; dst[13] = 0xb2;
   1279       }
   1280       DO_mandr_r("packusdw", src, dst);
   1281    }
   1282 }
   1283 
   1284 void test_PBLENDW ( void )
   1285 {
   1286    V128 src, dst;
   1287    randV128(&src);
   1288    randV128(&dst);
   1289    {
   1290       DO_imm_mandr_r("pblendw", 0, src, dst);
   1291       DO_imm_mandr_r("pblendw", 1, src, dst);
   1292       DO_imm_mandr_r("pblendw", 2, src, dst);
   1293       DO_imm_mandr_r("pblendw", 3, src, dst);
   1294       DO_imm_mandr_r("pblendw", 4, src, dst);
   1295       DO_imm_mandr_r("pblendw", 5, src, dst);
   1296       DO_imm_mandr_r("pblendw", 6, src, dst);
   1297       DO_imm_mandr_r("pblendw", 7, src, dst);
   1298       DO_imm_mandr_r("pblendw", 8, src, dst);
   1299       DO_imm_mandr_r("pblendw", 9, src, dst);
   1300       DO_imm_mandr_r("pblendw", 10, src, dst);
   1301       DO_imm_mandr_r("pblendw", 11, src, dst);
   1302       DO_imm_mandr_r("pblendw", 12, src, dst);
   1303       DO_imm_mandr_r("pblendw", 13, src, dst);
   1304       DO_imm_mandr_r("pblendw", 14, src, dst);
   1305       DO_imm_mandr_r("pblendw", 15, src, dst);
   1306       DO_imm_mandr_r("pblendw", 16, src, dst);
   1307       DO_imm_mandr_r("pblendw", 17, src, dst);
   1308       DO_imm_mandr_r("pblendw", 18, src, dst);
   1309       DO_imm_mandr_r("pblendw", 19, src, dst);
   1310       DO_imm_mandr_r("pblendw", 20, src, dst);
   1311       DO_imm_mandr_r("pblendw", 21, src, dst);
   1312       DO_imm_mandr_r("pblendw", 22, src, dst);
   1313       DO_imm_mandr_r("pblendw", 23, src, dst);
   1314       DO_imm_mandr_r("pblendw", 24, src, dst);
   1315       DO_imm_mandr_r("pblendw", 25, src, dst);
   1316       DO_imm_mandr_r("pblendw", 26, src, dst);
   1317       DO_imm_mandr_r("pblendw", 27, src, dst);
   1318       DO_imm_mandr_r("pblendw", 28, src, dst);
   1319       DO_imm_mandr_r("pblendw", 29, src, dst);
   1320       DO_imm_mandr_r("pblendw", 30, src, dst);
   1321       DO_imm_mandr_r("pblendw", 31, src, dst);
   1322       DO_imm_mandr_r("pblendw", 32, src, dst);
   1323       DO_imm_mandr_r("pblendw", 33, src, dst);
   1324       DO_imm_mandr_r("pblendw", 34, src, dst);
   1325       DO_imm_mandr_r("pblendw", 35, src, dst);
   1326       DO_imm_mandr_r("pblendw", 36, src, dst);
   1327       DO_imm_mandr_r("pblendw", 37, src, dst);
   1328       DO_imm_mandr_r("pblendw", 38, src, dst);
   1329       DO_imm_mandr_r("pblendw", 39, src, dst);
   1330       DO_imm_mandr_r("pblendw", 40, src, dst);
   1331       DO_imm_mandr_r("pblendw", 41, src, dst);
   1332       DO_imm_mandr_r("pblendw", 42, src, dst);
   1333       DO_imm_mandr_r("pblendw", 43, src, dst);
   1334       DO_imm_mandr_r("pblendw", 44, src, dst);
   1335       DO_imm_mandr_r("pblendw", 45, src, dst);
   1336       DO_imm_mandr_r("pblendw", 46, src, dst);
   1337       DO_imm_mandr_r("pblendw", 47, src, dst);
   1338       DO_imm_mandr_r("pblendw", 48, src, dst);
   1339       DO_imm_mandr_r("pblendw", 49, src, dst);
   1340       DO_imm_mandr_r("pblendw", 50, src, dst);
   1341       DO_imm_mandr_r("pblendw", 51, src, dst);
   1342       DO_imm_mandr_r("pblendw", 52, src, dst);
   1343       DO_imm_mandr_r("pblendw", 53, src, dst);
   1344       DO_imm_mandr_r("pblendw", 54, src, dst);
   1345       DO_imm_mandr_r("pblendw", 55, src, dst);
   1346       DO_imm_mandr_r("pblendw", 56, src, dst);
   1347       DO_imm_mandr_r("pblendw", 57, src, dst);
   1348       DO_imm_mandr_r("pblendw", 58, src, dst);
   1349       DO_imm_mandr_r("pblendw", 59, src, dst);
   1350       DO_imm_mandr_r("pblendw", 60, src, dst);
   1351       DO_imm_mandr_r("pblendw", 61, src, dst);
   1352       DO_imm_mandr_r("pblendw", 62, src, dst);
   1353       DO_imm_mandr_r("pblendw", 63, src, dst);
   1354       DO_imm_mandr_r("pblendw", 64, src, dst);
   1355       DO_imm_mandr_r("pblendw", 65, src, dst);
   1356       DO_imm_mandr_r("pblendw", 66, src, dst);
   1357       DO_imm_mandr_r("pblendw", 67, src, dst);
   1358       DO_imm_mandr_r("pblendw", 68, src, dst);
   1359       DO_imm_mandr_r("pblendw", 69, src, dst);
   1360       DO_imm_mandr_r("pblendw", 70, src, dst);
   1361       DO_imm_mandr_r("pblendw", 71, src, dst);
   1362       DO_imm_mandr_r("pblendw", 72, src, dst);
   1363       DO_imm_mandr_r("pblendw", 73, src, dst);
   1364       DO_imm_mandr_r("pblendw", 74, src, dst);
   1365       DO_imm_mandr_r("pblendw", 75, src, dst);
   1366       DO_imm_mandr_r("pblendw", 76, src, dst);
   1367       DO_imm_mandr_r("pblendw", 77, src, dst);
   1368       DO_imm_mandr_r("pblendw", 78, src, dst);
   1369       DO_imm_mandr_r("pblendw", 79, src, dst);
   1370       DO_imm_mandr_r("pblendw", 80, src, dst);
   1371       DO_imm_mandr_r("pblendw", 81, src, dst);
   1372       DO_imm_mandr_r("pblendw", 82, src, dst);
   1373       DO_imm_mandr_r("pblendw", 83, src, dst);
   1374       DO_imm_mandr_r("pblendw", 84, src, dst);
   1375       DO_imm_mandr_r("pblendw", 85, src, dst);
   1376       DO_imm_mandr_r("pblendw", 86, src, dst);
   1377       DO_imm_mandr_r("pblendw", 87, src, dst);
   1378       DO_imm_mandr_r("pblendw", 88, src, dst);
   1379       DO_imm_mandr_r("pblendw", 89, src, dst);
   1380       DO_imm_mandr_r("pblendw", 90, src, dst);
   1381       DO_imm_mandr_r("pblendw", 91, src, dst);
   1382       DO_imm_mandr_r("pblendw", 92, src, dst);
   1383       DO_imm_mandr_r("pblendw", 93, src, dst);
   1384       DO_imm_mandr_r("pblendw", 94, src, dst);
   1385       DO_imm_mandr_r("pblendw", 95, src, dst);
   1386       DO_imm_mandr_r("pblendw", 96, src, dst);
   1387       DO_imm_mandr_r("pblendw", 97, src, dst);
   1388       DO_imm_mandr_r("pblendw", 98, src, dst);
   1389       DO_imm_mandr_r("pblendw", 99, src, dst);
   1390       DO_imm_mandr_r("pblendw", 100, src, dst);
   1391       DO_imm_mandr_r("pblendw", 101, src, dst);
   1392       DO_imm_mandr_r("pblendw", 102, src, dst);
   1393       DO_imm_mandr_r("pblendw", 103, src, dst);
   1394       DO_imm_mandr_r("pblendw", 104, src, dst);
   1395       DO_imm_mandr_r("pblendw", 105, src, dst);
   1396       DO_imm_mandr_r("pblendw", 106, src, dst);
   1397       DO_imm_mandr_r("pblendw", 107, src, dst);
   1398       DO_imm_mandr_r("pblendw", 108, src, dst);
   1399       DO_imm_mandr_r("pblendw", 109, src, dst);
   1400       DO_imm_mandr_r("pblendw", 110, src, dst);
   1401       DO_imm_mandr_r("pblendw", 111, src, dst);
   1402       DO_imm_mandr_r("pblendw", 112, src, dst);
   1403       DO_imm_mandr_r("pblendw", 113, src, dst);
   1404       DO_imm_mandr_r("pblendw", 114, src, dst);
   1405       DO_imm_mandr_r("pblendw", 115, src, dst);
   1406       DO_imm_mandr_r("pblendw", 116, src, dst);
   1407       DO_imm_mandr_r("pblendw", 117, src, dst);
   1408       DO_imm_mandr_r("pblendw", 118, src, dst);
   1409       DO_imm_mandr_r("pblendw", 119, src, dst);
   1410       DO_imm_mandr_r("pblendw", 120, src, dst);
   1411       DO_imm_mandr_r("pblendw", 121, src, dst);
   1412       DO_imm_mandr_r("pblendw", 122, src, dst);
   1413       DO_imm_mandr_r("pblendw", 123, src, dst);
   1414       DO_imm_mandr_r("pblendw", 124, src, dst);
   1415       DO_imm_mandr_r("pblendw", 125, src, dst);
   1416       DO_imm_mandr_r("pblendw", 126, src, dst);
   1417       DO_imm_mandr_r("pblendw", 127, src, dst);
   1418       DO_imm_mandr_r("pblendw", 128, src, dst);
   1419       DO_imm_mandr_r("pblendw", 129, src, dst);
   1420       DO_imm_mandr_r("pblendw", 130, src, dst);
   1421       DO_imm_mandr_r("pblendw", 131, src, dst);
   1422       DO_imm_mandr_r("pblendw", 132, src, dst);
   1423       DO_imm_mandr_r("pblendw", 133, src, dst);
   1424       DO_imm_mandr_r("pblendw", 134, src, dst);
   1425       DO_imm_mandr_r("pblendw", 135, src, dst);
   1426       DO_imm_mandr_r("pblendw", 136, src, dst);
   1427       DO_imm_mandr_r("pblendw", 137, src, dst);
   1428       DO_imm_mandr_r("pblendw", 138, src, dst);
   1429       DO_imm_mandr_r("pblendw", 139, src, dst);
   1430       DO_imm_mandr_r("pblendw", 140, src, dst);
   1431       DO_imm_mandr_r("pblendw", 141, src, dst);
   1432       DO_imm_mandr_r("pblendw", 142, src, dst);
   1433       DO_imm_mandr_r("pblendw", 143, src, dst);
   1434       DO_imm_mandr_r("pblendw", 144, src, dst);
   1435       DO_imm_mandr_r("pblendw", 145, src, dst);
   1436       DO_imm_mandr_r("pblendw", 146, src, dst);
   1437       DO_imm_mandr_r("pblendw", 147, src, dst);
   1438       DO_imm_mandr_r("pblendw", 148, src, dst);
   1439       DO_imm_mandr_r("pblendw", 149, src, dst);
   1440       DO_imm_mandr_r("pblendw", 150, src, dst);
   1441       DO_imm_mandr_r("pblendw", 151, src, dst);
   1442       DO_imm_mandr_r("pblendw", 152, src, dst);
   1443       DO_imm_mandr_r("pblendw", 153, src, dst);
   1444       DO_imm_mandr_r("pblendw", 154, src, dst);
   1445       DO_imm_mandr_r("pblendw", 155, src, dst);
   1446       DO_imm_mandr_r("pblendw", 156, src, dst);
   1447       DO_imm_mandr_r("pblendw", 157, src, dst);
   1448       DO_imm_mandr_r("pblendw", 158, src, dst);
   1449       DO_imm_mandr_r("pblendw", 159, src, dst);
   1450       DO_imm_mandr_r("pblendw", 160, src, dst);
   1451       DO_imm_mandr_r("pblendw", 161, src, dst);
   1452       DO_imm_mandr_r("pblendw", 162, src, dst);
   1453       DO_imm_mandr_r("pblendw", 163, src, dst);
   1454       DO_imm_mandr_r("pblendw", 164, src, dst);
   1455       DO_imm_mandr_r("pblendw", 165, src, dst);
   1456       DO_imm_mandr_r("pblendw", 166, src, dst);
   1457       DO_imm_mandr_r("pblendw", 167, src, dst);
   1458       DO_imm_mandr_r("pblendw", 168, src, dst);
   1459       DO_imm_mandr_r("pblendw", 169, src, dst);
   1460       DO_imm_mandr_r("pblendw", 170, src, dst);
   1461       DO_imm_mandr_r("pblendw", 171, src, dst);
   1462       DO_imm_mandr_r("pblendw", 172, src, dst);
   1463       DO_imm_mandr_r("pblendw", 173, src, dst);
   1464       DO_imm_mandr_r("pblendw", 174, src, dst);
   1465       DO_imm_mandr_r("pblendw", 175, src, dst);
   1466       DO_imm_mandr_r("pblendw", 176, src, dst);
   1467       DO_imm_mandr_r("pblendw", 177, src, dst);
   1468       DO_imm_mandr_r("pblendw", 178, src, dst);
   1469       DO_imm_mandr_r("pblendw", 179, src, dst);
   1470       DO_imm_mandr_r("pblendw", 180, src, dst);
   1471       DO_imm_mandr_r("pblendw", 181, src, dst);
   1472       DO_imm_mandr_r("pblendw", 182, src, dst);
   1473       DO_imm_mandr_r("pblendw", 183, src, dst);
   1474       DO_imm_mandr_r("pblendw", 184, src, dst);
   1475       DO_imm_mandr_r("pblendw", 185, src, dst);
   1476       DO_imm_mandr_r("pblendw", 186, src, dst);
   1477       DO_imm_mandr_r("pblendw", 187, src, dst);
   1478       DO_imm_mandr_r("pblendw", 188, src, dst);
   1479       DO_imm_mandr_r("pblendw", 189, src, dst);
   1480       DO_imm_mandr_r("pblendw", 190, src, dst);
   1481       DO_imm_mandr_r("pblendw", 191, src, dst);
   1482       DO_imm_mandr_r("pblendw", 192, src, dst);
   1483       DO_imm_mandr_r("pblendw", 193, src, dst);
   1484       DO_imm_mandr_r("pblendw", 194, src, dst);
   1485       DO_imm_mandr_r("pblendw", 195, src, dst);
   1486       DO_imm_mandr_r("pblendw", 196, src, dst);
   1487       DO_imm_mandr_r("pblendw", 197, src, dst);
   1488       DO_imm_mandr_r("pblendw", 198, src, dst);
   1489       DO_imm_mandr_r("pblendw", 199, src, dst);
   1490       DO_imm_mandr_r("pblendw", 200, src, dst);
   1491       DO_imm_mandr_r("pblendw", 201, src, dst);
   1492       DO_imm_mandr_r("pblendw", 202, src, dst);
   1493       DO_imm_mandr_r("pblendw", 203, src, dst);
   1494       DO_imm_mandr_r("pblendw", 204, src, dst);
   1495       DO_imm_mandr_r("pblendw", 205, src, dst);
   1496       DO_imm_mandr_r("pblendw", 206, src, dst);
   1497       DO_imm_mandr_r("pblendw", 207, src, dst);
   1498       DO_imm_mandr_r("pblendw", 208, src, dst);
   1499       DO_imm_mandr_r("pblendw", 209, src, dst);
   1500       DO_imm_mandr_r("pblendw", 210, src, dst);
   1501       DO_imm_mandr_r("pblendw", 211, src, dst);
   1502       DO_imm_mandr_r("pblendw", 212, src, dst);
   1503       DO_imm_mandr_r("pblendw", 213, src, dst);
   1504       DO_imm_mandr_r("pblendw", 214, src, dst);
   1505       DO_imm_mandr_r("pblendw", 215, src, dst);
   1506       DO_imm_mandr_r("pblendw", 216, src, dst);
   1507       DO_imm_mandr_r("pblendw", 217, src, dst);
   1508       DO_imm_mandr_r("pblendw", 218, src, dst);
   1509       DO_imm_mandr_r("pblendw", 219, src, dst);
   1510       DO_imm_mandr_r("pblendw", 220, src, dst);
   1511       DO_imm_mandr_r("pblendw", 221, src, dst);
   1512       DO_imm_mandr_r("pblendw", 222, src, dst);
   1513       DO_imm_mandr_r("pblendw", 223, src, dst);
   1514       DO_imm_mandr_r("pblendw", 224, src, dst);
   1515       DO_imm_mandr_r("pblendw", 225, src, dst);
   1516       DO_imm_mandr_r("pblendw", 226, src, dst);
   1517       DO_imm_mandr_r("pblendw", 227, src, dst);
   1518       DO_imm_mandr_r("pblendw", 228, src, dst);
   1519       DO_imm_mandr_r("pblendw", 229, src, dst);
   1520       DO_imm_mandr_r("pblendw", 230, src, dst);
   1521       DO_imm_mandr_r("pblendw", 231, src, dst);
   1522       DO_imm_mandr_r("pblendw", 232, src, dst);
   1523       DO_imm_mandr_r("pblendw", 233, src, dst);
   1524       DO_imm_mandr_r("pblendw", 234, src, dst);
   1525       DO_imm_mandr_r("pblendw", 235, src, dst);
   1526       DO_imm_mandr_r("pblendw", 236, src, dst);
   1527       DO_imm_mandr_r("pblendw", 237, src, dst);
   1528       DO_imm_mandr_r("pblendw", 238, src, dst);
   1529       DO_imm_mandr_r("pblendw", 239, src, dst);
   1530       DO_imm_mandr_r("pblendw", 240, src, dst);
   1531       DO_imm_mandr_r("pblendw", 241, src, dst);
   1532       DO_imm_mandr_r("pblendw", 242, src, dst);
   1533       DO_imm_mandr_r("pblendw", 243, src, dst);
   1534       DO_imm_mandr_r("pblendw", 244, src, dst);
   1535       DO_imm_mandr_r("pblendw", 245, src, dst);
   1536       DO_imm_mandr_r("pblendw", 246, src, dst);
   1537       DO_imm_mandr_r("pblendw", 247, src, dst);
   1538       DO_imm_mandr_r("pblendw", 248, src, dst);
   1539       DO_imm_mandr_r("pblendw", 249, src, dst);
   1540       DO_imm_mandr_r("pblendw", 250, src, dst);
   1541       DO_imm_mandr_r("pblendw", 251, src, dst);
   1542       DO_imm_mandr_r("pblendw", 252, src, dst);
   1543       DO_imm_mandr_r("pblendw", 253, src, dst);
   1544       DO_imm_mandr_r("pblendw", 254, src, dst);
   1545       DO_imm_mandr_r("pblendw", 255, src, dst);
   1546    }
   1547 }
   1548 
   1549 
   1550 void test_PCMPEQQ ( void )
   1551 {
           /* Feed 10 src/dst pairs to "pcmpeqq" via DO_mandr_r (a macro
              defined outside this chunk).  Every pair starts out as two
              fresh randV128() fills; iterations 6 and 7 then overwrite one
              8-byte half of BOTH operands with 0x55 so that half is
              identical in src and dst.  Presumably this is to make sure the
              "equal" outcome is exercised, which random data alone would
              practically never produce for a 64-bit lane. */
   1552    V128 src, dst;
   1553    Int i;
   1554    for (i = 0; i < 10; i++) {
   1555       randV128(&src);
   1556       randV128(&dst);
   1557       switch (i - 6) {
   1558          case 0: memset(&src[0], 0x55, 8);   /* i == 6: bytes 0..7 match */
   1559                  memset(&dst[0], 0x55, 8); break;
   1560          case 1: memset(&src[8], 0x55, 8);   /* i == 7: bytes 8..15 match */
   1561                  memset(&dst[8], 0x55, 8); break;
   1562          default:
   1563             break;
   1564       }
   1565       DO_mandr_r("pcmpeqq", src, dst);
   1566    }
   1567 }
   1568 
   1569 
   1570 void test_PEXTRB ( void )
   1571 {
           /* Exercise "pextrb" with every immediate 0..15 against a single
              randV128()-filled source.  The calls are written out one per
              immediate because the index is handed to the macro as a
              literal (presumably it ends up in the instruction text, so a
              run-time loop variable would not do -- confirm in
              DO_imm_r_to_mandrscalar, which is defined outside this chunk).
              The trailing "d" is passed through to the macro as well;
              test_PEXTRQ passes "" instead, so it presumably selects the
              width of the scalar destination -- confirm. */
   1572    V128 src;
   1573    randV128(&src);
   1574    DO_imm_r_to_mandrscalar("pextrb", 0, src, "d");
   1575    DO_imm_r_to_mandrscalar("pextrb", 1, src, "d");
   1576    DO_imm_r_to_mandrscalar("pextrb", 2, src, "d");
   1577    DO_imm_r_to_mandrscalar("pextrb", 3, src, "d");
   1578    DO_imm_r_to_mandrscalar("pextrb", 4, src, "d");
   1579    DO_imm_r_to_mandrscalar("pextrb", 5, src, "d");
   1580    DO_imm_r_to_mandrscalar("pextrb", 6, src, "d");
   1581    DO_imm_r_to_mandrscalar("pextrb", 7, src, "d");
   1582    DO_imm_r_to_mandrscalar("pextrb", 8, src, "d");
   1583    DO_imm_r_to_mandrscalar("pextrb", 9, src, "d");
   1584    DO_imm_r_to_mandrscalar("pextrb", 10, src, "d");
   1585    DO_imm_r_to_mandrscalar("pextrb", 11, src, "d");
   1586    DO_imm_r_to_mandrscalar("pextrb", 12, src, "d");
   1587    DO_imm_r_to_mandrscalar("pextrb", 13, src, "d");
   1588    DO_imm_r_to_mandrscalar("pextrb", 14, src, "d");
   1589    DO_imm_r_to_mandrscalar("pextrb", 15, src, "d");
   1590 }
   1591 
   1592 void test_PINSRB ( void )
   1593 {
           /* Exercise "pinsrb" with every immediate 0..15.  Unlike the
              PEXTR* tests, the scalar source is re-drawn from randULong()
              before each invocation, so every immediate is tried with a
              different value.  DO_imm_mandrscalar_to_r is a macro defined
              outside this chunk; the immediate is given as a literal and
              the trailing "d" is passed through to it (presumably a scalar
              register-width selector -- confirm in the macro). */
   1594    ULong src;
   1595    src = randULong();
   1596    DO_imm_mandrscalar_to_r("pinsrb", 0, src, "d");
   1597    src = randULong();
   1598    DO_imm_mandrscalar_to_r("pinsrb", 1, src, "d");
   1599    src = randULong();
   1600    DO_imm_mandrscalar_to_r("pinsrb", 2, src, "d");
   1601    src = randULong();
   1602    DO_imm_mandrscalar_to_r("pinsrb", 3, src, "d");
   1603    src = randULong();
   1604    DO_imm_mandrscalar_to_r("pinsrb", 4, src, "d");
   1605    src = randULong();
   1606    DO_imm_mandrscalar_to_r("pinsrb", 5, src, "d");
   1607    src = randULong();
   1608    DO_imm_mandrscalar_to_r("pinsrb", 6, src, "d");
   1609    src = randULong();
   1610    DO_imm_mandrscalar_to_r("pinsrb", 7, src, "d");
   1611    src = randULong();
   1612    DO_imm_mandrscalar_to_r("pinsrb", 8, src, "d");
   1613    src = randULong();
   1614    DO_imm_mandrscalar_to_r("pinsrb", 9, src, "d");
   1615    src = randULong();
   1616    DO_imm_mandrscalar_to_r("pinsrb", 10, src, "d");
   1617    src = randULong();
   1618    DO_imm_mandrscalar_to_r("pinsrb", 11, src, "d");
   1619    src = randULong();
   1620    DO_imm_mandrscalar_to_r("pinsrb", 12, src, "d");
   1621    src = randULong();
   1622    DO_imm_mandrscalar_to_r("pinsrb", 13, src, "d");
   1623    src = randULong();
   1624    DO_imm_mandrscalar_to_r("pinsrb", 14, src, "d");
   1625    src = randULong();
   1626    DO_imm_mandrscalar_to_r("pinsrb", 15, src, "d");
   1627 }
   1628 
   1629 
   1630 void test_PEXTRW ( void )
   1631 {
           /* Exercise "pextrw" with immediates 0..7 (one per 16-bit lane of
              a 128-bit vector) against a single randV128()-filled source.
              The immediate is a literal argument to the macro, hence the
              unrolled calls; "d" is passed through to the macro (presumably
              a scalar register-width selector -- confirm in
              DO_imm_r_to_mandrscalar, defined outside this chunk). */
   1632    V128 src;
   1633    randV128(&src);
   1634    DO_imm_r_to_mandrscalar("pextrw", 0, src, "d");
   1635    DO_imm_r_to_mandrscalar("pextrw", 1, src, "d");
   1636    DO_imm_r_to_mandrscalar("pextrw", 2, src, "d");
   1637    DO_imm_r_to_mandrscalar("pextrw", 3, src, "d");
   1638    DO_imm_r_to_mandrscalar("pextrw", 4, src, "d");
   1639    DO_imm_r_to_mandrscalar("pextrw", 5, src, "d");
   1640    DO_imm_r_to_mandrscalar("pextrw", 6, src, "d");
   1641    DO_imm_r_to_mandrscalar("pextrw", 7, src, "d");
   1642 }
   1643 
   1644 void test_PINSRW ( void )
   1645 {
           /* Exercise "pinsrw" with immediates 0..7, drawing a new scalar
              source from randULong() before each invocation.  The macro
              DO_imm_mandrscalar_to_r is defined outside this chunk; the
              immediate is a literal argument and "d" is passed through to
              it (presumably a scalar register-width selector -- confirm). */
   1646    ULong src;
   1647    src = randULong();
   1648    DO_imm_mandrscalar_to_r("pinsrw", 0, src, "d");
   1649    src = randULong();
   1650    DO_imm_mandrscalar_to_r("pinsrw", 1, src, "d");
   1651    src = randULong();
   1652    DO_imm_mandrscalar_to_r("pinsrw", 2, src, "d");
   1653    src = randULong();
   1654    DO_imm_mandrscalar_to_r("pinsrw", 3, src, "d");
   1655    src = randULong();
   1656    DO_imm_mandrscalar_to_r("pinsrw", 4, src, "d");
   1657    src = randULong();
   1658    DO_imm_mandrscalar_to_r("pinsrw", 5, src, "d");
   1659    src = randULong();
   1660    DO_imm_mandrscalar_to_r("pinsrw", 6, src, "d");
   1661    src = randULong();
   1662    DO_imm_mandrscalar_to_r("pinsrw", 7, src, "d");
   1663 }
   1664 
   1665 
   1666 void test_PEXTRD ( void )
   1667 {
           /* Exercise "pextrd" with immediates 0..3 (one per 32-bit lane)
              against a single randV128()-filled source.  The immediate is a
              literal macro argument; "d" is passed through to
              DO_imm_r_to_mandrscalar, which is defined outside this chunk
              (presumably a scalar register-width selector -- confirm). */
   1668    V128 src;
   1669    randV128(&src);
   1670    DO_imm_r_to_mandrscalar("pextrd", 0, src, "d");
   1671    DO_imm_r_to_mandrscalar("pextrd", 1, src, "d");
   1672    DO_imm_r_to_mandrscalar("pextrd", 2, src, "d");
   1673    DO_imm_r_to_mandrscalar("pextrd", 3, src, "d");
   1674 }
   1675 
   1676 void test_PINSRD ( void )
   1677 {
           /* Exercise "pinsrd" with immediates 0..3, drawing a new scalar
              source from randULong() before each invocation.  The macro
              DO_imm_mandrscalar_to_r is defined outside this chunk; "d" is
              passed through to it (presumably a scalar register-width
              selector -- confirm). */
   1678    ULong src;
   1679    src = randULong();
   1680    DO_imm_mandrscalar_to_r("pinsrd", 0, src, "d");
   1681    src = randULong();
   1682    DO_imm_mandrscalar_to_r("pinsrd", 1, src, "d");
   1683    src = randULong();
   1684    DO_imm_mandrscalar_to_r("pinsrd", 2, src, "d");
   1685    src = randULong();
   1686    DO_imm_mandrscalar_to_r("pinsrd", 3, src, "d");
   1687 }
   1688 
   1689 
   1690 void test_PEXTRQ ( void )
   1691 {
           /* Exercise "pextrq" with immediates 0 and 1 (one per 64-bit
              half) against a single randV128()-filled source.  Note the
              last macro argument is "" here, where the byte/word/dword
              variants pass "d" -- presumably because the 64-bit form needs
              the full-width scalar register; confirm in
              DO_imm_r_to_mandrscalar, defined outside this chunk. */
   1692    V128 src;
   1693    randV128(&src);
   1694    DO_imm_r_to_mandrscalar("pextrq", 0, src, "");
   1695    DO_imm_r_to_mandrscalar("pextrq", 1, src, "");
   1696 }
   1697 
   1698 void test_PINSRQ ( void )
   1699 {
           /* Exercise "pinsrq" with immediates 0 and 1, drawing a new
              scalar source from randULong() before each invocation.  The
              last macro argument is "" here, where the narrower PINSR*
              tests pass "d" -- presumably the full-width scalar register;
              confirm in DO_imm_mandrscalar_to_r, defined outside this
              chunk. */
   1700    ULong src;
   1701    src = randULong();
   1702    DO_imm_mandrscalar_to_r("pinsrq", 0, src, "");
   1703    src = randULong();
   1704    DO_imm_mandrscalar_to_r("pinsrq", 1, src, "");
   1705 }
   1706 
   1707 
   1708 void test_EXTRACTPS ( void )
   1709 {
           /* Exercise "extractps" with immediates 0..3 (one per 32-bit
              lane) against a single randV128()-filled source, using the
              same macro and "d" argument as test_PEXTRD.  The macro is
              defined outside this chunk, so what is executed and printed
              per call is decided there. */
   1710    V128 src;
   1711    randV128(&src);
   1712    DO_imm_r_to_mandrscalar("extractps", 0, src, "d");
   1713    DO_imm_r_to_mandrscalar("extractps", 1, src, "d");
   1714    DO_imm_r_to_mandrscalar("extractps", 2, src, "d");
   1715    DO_imm_r_to_mandrscalar("extractps", 3, src, "d");
   1716 }
   1717 
   1718 
   1719 void test_PHMINPOSUW ( void )
   1720 {
           /* Drive "phminposuw" through DO_mandr_r (macro defined outside
              this chunk) on 20 randV128()-filled operand pairs, then once
              more on a fixed pair: src filled with 0x55 bytes and dst with
              0xAA bytes.  In the fixed src every 16-bit lane holds the same
              value, so that case presumably checks how a tie between lanes
              is resolved. */
   1721    V128 src, dst;
   1722    Int i;
   1723    for (i = 0; i < 20; i++) {
   1724       randV128(&src);
   1725       randV128(&dst);
   1726       DO_mandr_r("phminposuw", src, dst);
   1727    }
   1728    memset(src, 0x55, sizeof(src));   /* all lanes identical */
   1729    memset(dst, 0xAA, sizeof(dst));
   1730    DO_mandr_r("phminposuw", src, dst);
   1731 }
   1732 
   1733 void test_PMAXSB ( void )
   1734 {
           /* Drive "pmaxsb" (by mnemonic: packed maximum, signed bytes)
              through DO_mandr_r on 10 operand pairs.  Both operands are
              refilled with randV128() each time round; DO_mandr_r is a
              macro defined outside this chunk, so what gets executed and
              printed per pair is decided there. */
   1735    V128 src, dst;
   1736    Int i;
   1737    for (i = 0; i < 10; i++) {
   1738       randV128(&src);
   1739       randV128(&dst);
   1740       DO_mandr_r("pmaxsb", src, dst);
   1741    }
   1742 }
   1743 
   1744 void test_PMAXSD ( void )
   1745 {
           /* Drive "pmaxsd" (by mnemonic: packed maximum, signed
              doublewords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1746    V128 src, dst;
   1747    Int i;
   1748    for (i = 0; i < 10; i++) {
   1749       randV128(&src);
   1750       randV128(&dst);
   1751       DO_mandr_r("pmaxsd", src, dst);
   1752    }
   1753 }
   1754 
   1755 void test_PMAXUD ( void )
   1756 {
           /* Drive "pmaxud" (by mnemonic: packed maximum, unsigned
              doublewords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1757    V128 src, dst;
   1758    Int i;
   1759    for (i = 0; i < 10; i++) {
   1760       randV128(&src);
   1761       randV128(&dst);
   1762       DO_mandr_r("pmaxud", src, dst);
   1763    }
   1764 }
   1765 
   1766 void test_PMAXUW ( void )
   1767 {
           /* Drive "pmaxuw" (by mnemonic: packed maximum, unsigned words)
              through DO_mandr_r on 10 operand pairs.  Both operands are
              refilled with randV128() each time round; DO_mandr_r is a
              macro defined outside this chunk, so what gets executed and
              printed per pair is decided there. */
   1768    V128 src, dst;
   1769    Int i;
   1770    for (i = 0; i < 10; i++) {
   1771       randV128(&src);
   1772       randV128(&dst);
   1773       DO_mandr_r("pmaxuw", src, dst);
   1774    }
   1775 }
   1776 
   1777 void test_PMINSB ( void )
   1778 {
           /* Drive "pminsb" (by mnemonic: packed minimum, signed bytes)
              through DO_mandr_r on 10 operand pairs.  Both operands are
              refilled with randV128() each time round; DO_mandr_r is a
              macro defined outside this chunk, so what gets executed and
              printed per pair is decided there. */
   1779    V128 src, dst;
   1780    Int i;
   1781    for (i = 0; i < 10; i++) {
   1782       randV128(&src);
   1783       randV128(&dst);
   1784       DO_mandr_r("pminsb", src, dst);
   1785    }
   1786 }
   1787 
   1788 void test_PMINSD ( void )
   1789 {
           /* Drive "pminsd" (by mnemonic: packed minimum, signed
              doublewords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1790    V128 src, dst;
   1791    Int i;
   1792    for (i = 0; i < 10; i++) {
   1793       randV128(&src);
   1794       randV128(&dst);
   1795       DO_mandr_r("pminsd", src, dst);
   1796    }
   1797 }
   1798 
   1799 void test_PMINUD ( void )
   1800 {
           /* Drive "pminud" (by mnemonic: packed minimum, unsigned
              doublewords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1801    V128 src, dst;
   1802    Int i;
   1803    for (i = 0; i < 10; i++) {
   1804       randV128(&src);
   1805       randV128(&dst);
   1806       DO_mandr_r("pminud", src, dst);
   1807    }
   1808 }
   1809 
   1810 void test_PMINUW ( void )
   1811 {
           /* Drive "pminuw" (by mnemonic: packed minimum, unsigned words)
              through DO_mandr_r on 10 operand pairs.  Both operands are
              refilled with randV128() each time round; DO_mandr_r is a
              macro defined outside this chunk, so what gets executed and
              printed per pair is decided there. */
   1812    V128 src, dst;
   1813    Int i;
   1814    for (i = 0; i < 10; i++) {
   1815       randV128(&src);
   1816       randV128(&dst);
   1817       DO_mandr_r("pminuw", src, dst);
   1818    }
   1819 }
   1820 
   1821 void test_PMOVSXBW ( void )
   1822 {
           /* Drive "pmovsxbw" (by mnemonic: sign-extend packed bytes to
              words) through DO_mandr_r on 10 operand pairs.  Both operands
              are refilled with randV128() each time round; DO_mandr_r is a
              macro defined outside this chunk, so what gets executed and
              printed per pair is decided there. */
   1823    V128 src, dst;
   1824    Int i;
   1825    for (i = 0; i < 10; i++) {
   1826       randV128(&src);
   1827       randV128(&dst);
   1828       DO_mandr_r("pmovsxbw", src, dst);
   1829    }
   1830 }
   1831 
   1832 void test_PMOVSXBD ( void )
   1833 {
           /* Drive "pmovsxbd" (by mnemonic: sign-extend packed bytes to
              doublewords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1834    V128 src, dst;
   1835    Int i;
   1836    for (i = 0; i < 10; i++) {
   1837       randV128(&src);
   1838       randV128(&dst);
   1839       DO_mandr_r("pmovsxbd", src, dst);
   1840    }
   1841 }
   1842 
   1843 void test_PMOVSXBQ ( void )
   1844 {
           /* Drive "pmovsxbq" (by mnemonic: sign-extend packed bytes to
              quadwords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1845    V128 src, dst;
   1846    Int i;
   1847    for (i = 0; i < 10; i++) {
   1848       randV128(&src);
   1849       randV128(&dst);
   1850       DO_mandr_r("pmovsxbq", src, dst);
   1851    }
   1852 }
   1853 
   1854 void test_PMOVSXWD ( void )
   1855 {
           /* Drive "pmovsxwd" (by mnemonic: sign-extend packed words to
              doublewords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1856    V128 src, dst;
   1857    Int i;
   1858    for (i = 0; i < 10; i++) {
   1859       randV128(&src);
   1860       randV128(&dst);
   1861       DO_mandr_r("pmovsxwd", src, dst);
   1862    }
   1863 }
   1864 
   1865 void test_PMOVSXWQ ( void )
   1866 {
           /* Drive "pmovsxwq" (by mnemonic: sign-extend packed words to
              quadwords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1867    V128 src, dst;
   1868    Int i;
   1869    for (i = 0; i < 10; i++) {
   1870       randV128(&src);
   1871       randV128(&dst);
   1872       DO_mandr_r("pmovsxwq", src, dst);
   1873    }
   1874 }
   1875 
   1876 void test_PMOVSXDQ ( void )
   1877 {
           /* Drive "pmovsxdq" (by mnemonic: sign-extend packed doublewords
              to quadwords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1878    V128 src, dst;
   1879    Int i;
   1880    for (i = 0; i < 10; i++) {
   1881       randV128(&src);
   1882       randV128(&dst);
   1883       DO_mandr_r("pmovsxdq", src, dst);
   1884    }
   1885 }
   1886 
   1887 void test_PMOVZXBW ( void )
   1888 {
           /* Drive "pmovzxbw" (by mnemonic: zero-extend packed bytes to
              words) through DO_mandr_r on 10 operand pairs.  Both operands
              are refilled with randV128() each time round; DO_mandr_r is a
              macro defined outside this chunk, so what gets executed and
              printed per pair is decided there. */
   1889    V128 src, dst;
   1890    Int i;
   1891    for (i = 0; i < 10; i++) {
   1892       randV128(&src);
   1893       randV128(&dst);
   1894       DO_mandr_r("pmovzxbw", src, dst);
   1895    }
   1896 }
   1897 
   1898 void test_PMOVZXBD ( void )
   1899 {
           /* Drive "pmovzxbd" (by mnemonic: zero-extend packed bytes to
              doublewords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1900    V128 src, dst;
   1901    Int i;
   1902    for (i = 0; i < 10; i++) {
   1903       randV128(&src);
   1904       randV128(&dst);
   1905       DO_mandr_r("pmovzxbd", src, dst);
   1906    }
   1907 }
   1908 
   1909 void test_PMOVZXBQ ( void )
   1910 {
           /* Drive "pmovzxbq" (by mnemonic: zero-extend packed bytes to
              quadwords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1911    V128 src, dst;
   1912    Int i;
   1913    for (i = 0; i < 10; i++) {
   1914       randV128(&src);
   1915       randV128(&dst);
   1916       DO_mandr_r("pmovzxbq", src, dst);
   1917    }
   1918 }
   1919 
   1920 void test_PMOVZXWD ( void )
   1921 {
           /* Drive "pmovzxwd" (by mnemonic: zero-extend packed words to
              doublewords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1922    V128 src, dst;
   1923    Int i;
   1924    for (i = 0; i < 10; i++) {
   1925       randV128(&src);
   1926       randV128(&dst);
   1927       DO_mandr_r("pmovzxwd", src, dst);
   1928    }
   1929 }
   1930 
   1931 void test_PMOVZXWQ ( void )
   1932 {
           /* Drive "pmovzxwq" (by mnemonic: zero-extend packed words to
              quadwords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1933    V128 src, dst;
   1934    Int i;
   1935    for (i = 0; i < 10; i++) {
   1936       randV128(&src);
   1937       randV128(&dst);
   1938       DO_mandr_r("pmovzxwq", src, dst);
   1939    }
   1940 }
   1941 
   1942 void test_PMOVZXDQ ( void )
   1943 {
           /* Drive "pmovzxdq" (by mnemonic: zero-extend packed doublewords
              to quadwords) through DO_mandr_r on 10 operand pairs.  Both
              operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1944    V128 src, dst;
   1945    Int i;
   1946    for (i = 0; i < 10; i++) {
   1947       randV128(&src);
   1948       randV128(&dst);
   1949       DO_mandr_r("pmovzxdq", src, dst);
   1950    }
   1951 }
   1952 
   1953 void test_PMULDQ ( void )
   1954 {
           /* Drive "pmuldq" (by mnemonic: signed doubleword multiply with
              quadword results) through DO_mandr_r on 10 operand pairs.
              Both operands are refilled with randV128() each time round;
              DO_mandr_r is a macro defined outside this chunk, so what gets
              executed and printed per pair is decided there. */
   1955    V128 src, dst;
   1956    Int i;
   1957    for (i = 0; i < 10; i++) {
   1958       randV128(&src);
   1959       randV128(&dst);
   1960       DO_mandr_r("pmuldq", src, dst);
   1961    }
   1962 }
   1963 
   1964 
   1965 void test_PMULLD ( void )
   1966 {
           /* Drive "pmulld" (by mnemonic: packed doubleword multiply,
              keeping the low half of each product) through DO_mandr_r on
              10 operand pairs.  Both operands are refilled with randV128()
              each time round; DO_mandr_r is a macro defined outside this
              chunk, so what gets executed and printed per pair is decided
              there. */
   1967    V128 src, dst;
   1968    Int i;
   1969    for (i = 0; i < 10; i++) {
   1970       randV128(&src);
   1971       randV128(&dst);
   1972       DO_mandr_r("pmulld", src, dst);
   1973    }
   1974 }
   1975 
   1976 
   1977 void test_POPCNTQ ( void )
   1978 {
           /* Test the 64-bit POPCNT in register-source and memory-source
              form, 10 rounds each, printing one "r" and one "m" line per
              round.  The asm talks to C through block[]:
                 block[0]  source operand (forced to 0 in round 0, random
                           otherwise)
                 block[1]  value preloaded into the destination register
                 block[2]  destination register after the instruction
                 block[3]  RFLAGS captured right after the instruction
              Only the flag bits selected by oszacp_mask are printed.

              Both asm statements overwrite %rax with the block pointer,
              so "rax" is listed as clobbered; without it the compiler is
              entitled to assume %rax survives the statement. */
   1979    ULong block[4];
   1980    Int i;
   1981    ULong oszacp_mask = 0x8D5;   /* bits 0,2,4,6,7,11: C,P,A,Z,S,O */
   1982    for (i = 0; i < 10; i++) {
   1983       block[0] = i == 0 ? 0 : randULong();
   1984       block[1] = randULong();
   1985       block[2] = randULong();
   1986       block[3] = randULong();
   1987       __asm__ __volatile__(
   1988          "movq %0,       %%rax"  "\n\t"   /* rax = &block[0] */
   1989          "movq 0(%%rax), %%rdi"  "\n\t"   /* rdi = block[0] */
   1990          "movq 8(%%rax), %%r11"  "\n\t"   /* r11 = block[1] */
   1991 #ifndef VGP_amd64_darwin
   1992          "popcntq %%rdi, %%r11"  "\n\t"
   1993 #else
   1994          "popcnt  %%rdi, %%r11"  "\n\t"
   1995 #endif
   1996          "movq %%r11, 16(%%rax)"  "\n\t"  /* block[2] = result */
   1997          "pushfq"                 "\n\t"
   1998          "popq %%r12"             "\n\t"
   1999          "movq %%r12, 24(%%rax)"  "\n"    /* block[3] = flags */
   2000          : /*out*/
   2001          : /*in*/"r"(&block[0])
   2002          : /*trash*/ "cc", "memory", "rax", "rdi", "r11", "r12"
   2003       );
   2004       printf("r popcntq  %016llx %016llx  %016llx %016llx\n",
   2005              block[0], block[1], block[2], block[3] & oszacp_mask);
   2006 
   2007       block[0] = i == 0 ? 0 : randULong();
   2008       block[1] = randULong();
   2009       block[2] = randULong();
   2010       block[3] = randULong();
   2011       __asm__ __volatile__(
   2012          "movq %0,       %%rax"  "\n\t"   /* rax = &block[0] */
   2013          "movq 8(%%rax), %%r11"  "\n\t"   /* r11 = block[1] */
   2014 #ifndef VGP_amd64_darwin
   2015          "popcntq 0(%%rax), %%r11"  "\n\t"
   2016 #else
   2017          "popcnt  0(%%rax), %%r11"  "\n\t"
   2018 #endif
   2019          "movq %%r11, 16(%%rax)"  "\n\t"  /* block[2] = result */
   2020          "pushfq"                 "\n\t"
   2021          "popq %%r12"             "\n\t"
   2022          "movq %%r12, 24(%%rax)"  "\n"    /* block[3] = flags */
   2023          : /*out*/
   2024          : /*in*/"r"(&block[0])
   2025          : /*trash*/ "cc", "memory", "rax", "r11", "r12"
   2026       );
   2027       printf("m popcntq  %016llx %016llx  %016llx %016llx\n",
   2028              block[0], block[1], block[2], block[3] & oszacp_mask);
   2029    }
   2030 }
   2031 
   2032 
   2033 void test_POPCNTL ( void )
   2034 {
           /* Test the 32-bit POPCNT in register-source and memory-source
              form, 10 rounds each, printing one "r" and one "m" line per
              round.  The asm talks to C through block[]:
                 block[0]  source operand (forced to 0 in round 0, random
                           otherwise); only its low 32 bits are the operand
                 block[1]  value preloaded into the full 64-bit %r11
                 block[2]  full 64-bit %r11 after the instruction, so the
                           printed value also shows what happened to the
                           bits above the 32-bit destination
                 block[3]  RFLAGS captured right after the instruction
              Only the flag bits selected by oszacp_mask are printed.

              Both asm statements overwrite %rax with the block pointer,
              so "rax" is listed as clobbered; without it the compiler is
              entitled to assume %rax survives the statement. */
   2035    ULong block[4];
   2036    Int i;
   2037    ULong oszacp_mask = 0x8D5;   /* bits 0,2,4,6,7,11: C,P,A,Z,S,O */
   2038    for (i = 0; i < 10; i++) {
   2039       block[0] = i == 0 ? 0 : randULong();
   2040       block[1] = randULong();
   2041       block[2] = randULong();
   2042       block[3] = randULong();
   2043       __asm__ __volatile__(
   2044          "movq %0,       %%rax"  "\n\t"   /* rax = &block[0] */
   2045          "movq 0(%%rax), %%rdi"  "\n\t"   /* rdi = block[0] */
   2046          "movq 8(%%rax), %%r11"  "\n\t"   /* r11 = block[1] */
   2047 #ifndef VGP_amd64_darwin
   2048          "popcntl %%edi, %%r11d"  "\n\t"
   2049 #else
   2050          "popcnt  %%edi, %%r11d"  "\n\t"
   2051 #endif
   2052          "movq %%r11, 16(%%rax)"  "\n\t"  /* block[2] = result */
   2053          "pushfq"                 "\n\t"
   2054          "popq %%r12"             "\n\t"
   2055          "movq %%r12, 24(%%rax)"  "\n"    /* block[3] = flags */
   2056          : /*out*/
   2057          : /*in*/"r"(&block[0])
   2058          : /*trash*/ "cc", "memory", "rax", "rdi", "r11", "r12"
   2059       );
   2060       printf("r popcntl  %016llx %016llx  %016llx %016llx\n",
   2061              block[0], block[1], block[2], block[3] & oszacp_mask);
   2062 
   2063       block[0] = i == 0 ? 0 : randULong();
   2064       block[1] = randULong();
   2065       block[2] = randULong();
   2066       block[3] = randULong();
   2067       __asm__ __volatile__(
   2068          "movq %0,       %%rax"  "\n\t"   /* rax = &block[0] */
   2069          "movq 8(%%rax), %%r11"  "\n\t"   /* r11 = block[1] */
   2070 #ifndef VGP_amd64_darwin
   2071          "popcntl 0(%%rax), %%r11d"  "\n\t"
   2072 #else
   2073          "popcnt  0(%%rax), %%r11d"  "\n\t"
   2074 #endif
   2075          "movq %%r11, 16(%%rax)"  "\n\t"  /* block[2] = result */
   2076          "pushfq"                 "\n\t"
   2077          "popq %%r12"             "\n\t"
   2078          "movq %%r12, 24(%%rax)"  "\n"    /* block[3] = flags */
   2079          : /*out*/
   2080          : /*in*/"r"(&block[0])
   2081          : /*trash*/ "cc", "memory", "rax", "r11", "r12"
   2082       );
   2083       printf("m popcntl  %016llx %016llx  %016llx %016llx\n",
   2084              block[0], block[1], block[2], block[3] & oszacp_mask);
   2085    }
   2086 }
   2087 
   2088 
   2089 void test_POPCNTW ( void )
   2090 {
           /* Test the 16-bit POPCNT in register-source and memory-source
              form, 10 rounds each, printing one "r" and one "m" line per
              round.  The asm talks to C through block[]:
                 block[0]  source operand (forced to 0 in round 0, random
                           otherwise); only its low 16 bits are the operand
                 block[1]  value preloaded into the full 64-bit %r11
                 block[2]  full 64-bit %r11 after the instruction, so the
                           printed value also shows what happened to the
                           bits above the 16-bit destination
                 block[3]  RFLAGS captured right after the instruction
              Only the flag bits selected by oszacp_mask are printed.

              Both asm statements overwrite %rax with the block pointer,
              so "rax" is listed as clobbered; without it the compiler is
              entitled to assume %rax survives the statement. */
   2091    ULong block[4];
   2092    Int i;
   2093    ULong oszacp_mask = 0x8D5;   /* bits 0,2,4,6,7,11: C,P,A,Z,S,O */
   2094    for (i = 0; i < 10; i++) {
   2095       block[0] = i == 0 ? 0 : randULong();
   2096       block[1] = randULong();
   2097       block[2] = randULong();
   2098       block[3] = randULong();
   2099       __asm__ __volatile__(
   2100          "movq %0,       %%rax"  "\n\t"   /* rax = &block[0] */
   2101          "movq 0(%%rax), %%rdi"  "\n\t"   /* rdi = block[0] */
   2102          "movq 8(%%rax), %%r11"  "\n\t"   /* r11 = block[1] */
   2103 #ifndef VGP_amd64_darwin
   2104          "popcntw %%di,  %%r11w"  "\n\t"
   2105 #else
   2106          "popcnt  %%di,  %%r11w"  "\n\t"
   2107 #endif
   2108          "movq %%r11, 16(%%rax)"  "\n\t"  /* block[2] = result */
   2109          "pushfq"                 "\n\t"
   2110          "popq %%r12"             "\n\t"
   2111          "movq %%r12, 24(%%rax)"  "\n"    /* block[3] = flags */
   2112          : /*out*/
   2113          : /*in*/"r"(&block[0])
   2114          : /*trash*/ "cc", "memory", "rax", "rdi", "r11", "r12"
   2115       );
   2116       printf("r popcntw  %016llx %016llx  %016llx %016llx\n",
   2117              block[0], block[1], block[2], block[3] & oszacp_mask);
   2118 
   2119       block[0] = i == 0 ? 0 : randULong();
   2120       block[1] = randULong();
   2121       block[2] = randULong();
   2122       block[3] = randULong();
   2123       __asm__ __volatile__(
   2124          "movq %0,       %%rax"  "\n\t"   /* rax = &block[0] */
   2125          "movq 8(%%rax), %%r11"  "\n\t"   /* r11 = block[1] */
   2126 #ifndef VGP_amd64_darwin
   2127          "popcntw 0(%%rax), %%r11w"  "\n\t"
   2128 #else
   2129          "popcnt  0(%%rax), %%r11w"  "\n\t"
   2130 #endif
   2131          "movq %%r11, 16(%%rax)"  "\n\t"  /* block[2] = result */
   2132          "pushfq"                 "\n\t"
   2133          "popq %%r12"             "\n\t"
   2134          "movq %%r12, 24(%%rax)"  "\n"    /* block[3] = flags */
   2135          : /*out*/
   2136          : /*in*/"r"(&block[0])
   2137          : /*trash*/ "cc", "memory", "rax", "r11", "r12"
   2138       );
   2139       printf("m popcntw  %016llx %016llx  %016llx %016llx\n",
   2140              block[0], block[1], block[2], block[3] & oszacp_mask);
   2141    }
   2142 }
   2143 
   2144 
   2145 void test_PCMPGTQ ( void )
   2146 {
           /* Drive "pcmpgtq" through DO_mandr_r (macro defined outside this
              chunk) in two phases: 10 randV128()-filled operand pairs, then
              every ordered (src, dst) combination of seven hand-picked
              vectors -- 49 invocations.

              do64HLtoV128 stores its second argument in bytes 0..7 and its
              third in bytes 8..15 of the vector.  The chosen halves are 0,
              1, and the values at or next to the top of the unsigned range
              and the signed 64-bit min/max boundary (0x7fff..., 0x8000...,
              0xffff...), i.e. the inputs where a signed/unsigned mix-up in
              a greater-than compare would show. */
   2147    V128 spec[7];
   2148    do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0xffffffffffffffffULL );
   2149    do64HLtoV128( &spec[1], 0x0000000000000001ULL, 0xfffffffffffffffeULL );
   2150    do64HLtoV128( &spec[2], 0x7fffffffffffffffULL, 0x8000000000000001ULL );
   2151    do64HLtoV128( &spec[3], 0x8000000000000000ULL, 0x8000000000000000ULL );
   2152    do64HLtoV128( &spec[4], 0x8000000000000001ULL, 0x7fffffffffffffffULL );
   2153    do64HLtoV128( &spec[5], 0xfffffffffffffffeULL, 0x0000000000000001ULL );
   2154    do64HLtoV128( &spec[6], 0xffffffffffffffffULL, 0x0000000000000000ULL );
   2155 
   2156    V128 src, dst;
   2157    Int i, j;
   2158    for (i = 0; i < 10; i++) {
   2159       randV128(&src);
   2160       randV128(&dst);
   2161       DO_mandr_r("pcmpgtq", src, dst);
   2162    }
           /* i picks the source vector, j the destination vector. */
   2163    for (i = 0; i < 7; i++) {
   2164       for (j = 0; j < 7; j++) {
   2165          memcpy(&src, &spec[i], 16);
   2166          memcpy(&dst, &spec[j], 16);
   2167          DO_mandr_r("pcmpgtq", src, dst);
   2168       }
   2169    }
   2170 }
   2171 
   2172 /* ------------ ROUNDSD ------------ */
   2173 
   2174 void do_ROUNDSD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2175 {
           /* Execute "roundsd $0" with the 16 bytes at *dst loaded into
              %xmm11 as the destination operand and *src as the source,
              then store %xmm11 back over *dst.
                 mem != 0  the source is read directly as a memory operand
                 mem == 0  the source is first loaded into %xmm2
              The asm reads *src and *dst and writes *dst through plain
              pointer inputs, so "memory" is listed as clobbered; without
              it the compiler is not told that memory is touched and may
              cache or reorder accesses to the two vectors around the
              statement.
              NOTE(review): the _000 suffix appears to be the immediate in
              binary; per the Intel SDM imm8 = 0 should select
              round-to-nearest-even taken from the immediate -- confirm. */
   2176    if (mem) {
   2177       __asm__ __volatile__(
   2178          "movupd  (%1), %%xmm11"       "\n\t"
   2179          "roundsd $0, (%0), %%xmm11"   "\n\t"
   2180          "movupd  %%xmm11, (%1)"       "\n"
   2181          : /*OUT*/
   2182          : /*IN*/ "r"(src), "r"(dst)
   2183          : /*TRASH*/ "memory", "xmm11"
   2184       );
   2185    } else {
   2186       __asm__ __volatile__(
   2187          "movupd  (%1), %%xmm11"         "\n\t"
   2188          "movupd  (%0), %%xmm2"          "\n\t"
   2189          "roundsd $0, %%xmm2, %%xmm11"   "\n\t"
   2190          "movupd  %%xmm11, (%1)"         "\n"
   2191          : /*OUT*/
   2192          : /*IN*/ "r"(src), "r"(dst)
   2193          : /*TRASH*/ "memory", "xmm11","xmm2"
   2194       );
   2195    }
   2196 }
   2197 
   2198 void do_ROUNDSD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2199 {
           /* Execute "roundsd $1" with the 16 bytes at *dst loaded into
              %xmm11 as the destination operand and *src as the source,
              then store %xmm11 back over *dst.
                 mem != 0  the source is read directly as a memory operand
                 mem == 0  the source is first loaded into %xmm2
              The asm reads *src and *dst and writes *dst through plain
              pointer inputs, so "memory" is listed as clobbered; without
              it the compiler is not told that memory is touched and may
              cache or reorder accesses to the two vectors around the
              statement.
              NOTE(review): the _001 suffix appears to be the immediate in
              binary; per the Intel SDM imm8 = 1 should select rounding
              toward minus infinity -- confirm. */
   2200    if (mem) {
   2201       __asm__ __volatile__(
   2202          "movupd  (%1), %%xmm11"       "\n\t"
   2203          "roundsd $1, (%0), %%xmm11"   "\n\t"
   2204          "movupd  %%xmm11, (%1)"       "\n"
   2205          : /*OUT*/
   2206          : /*IN*/ "r"(src), "r"(dst)
   2207          : /*TRASH*/ "memory", "xmm11"
   2208       );
   2209    } else {
   2210       __asm__ __volatile__(
   2211          "movupd  (%1), %%xmm11"         "\n\t"
   2212          "movupd  (%0), %%xmm2"          "\n\t"
   2213          "roundsd $1, %%xmm2, %%xmm11"   "\n\t"
   2214          "movupd  %%xmm11, (%1)"         "\n"
   2215          : /*OUT*/
   2216          : /*IN*/ "r"(src), "r"(dst)
   2217          : /*TRASH*/ "memory", "xmm11","xmm2"
   2218       );
   2219    }
   2220 }
   2221 
   2222 void do_ROUNDSD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2223 {
   2224    if (mem) {
   2225       __asm__ __volatile__(
   2226          "movupd  (%1), %%xmm11"       "\n\t"
   2227          "roundsd $2, (%0), %%xmm11"   "\n\t"
   2228          "movupd  %%xmm11, (%1)"       "\n"
   2229          : /*OUT*/
   2230          : /*IN*/ "r"(src), "r"(dst)
   2231          : /*TRASH*/ "xmm11"
   2232       );
   2233    } else {
   2234       __asm__ __volatile__(
   2235          "movupd  (%1), %%xmm11"         "\n\t"
   2236          "movupd  (%0), %%xmm2"          "\n\t"
   2237          "roundsd $2, %%xmm2, %%xmm11"   "\n\t"
   2238          "movupd  %%xmm11, (%1)"         "\n"
   2239          : /*OUT*/
   2240          : /*IN*/ "r"(src), "r"(dst)
   2241          : /*TRASH*/ "xmm11","xmm2"
   2242       );
   2243    }
   2244 }
   2245 
   2246 void do_ROUNDSD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2247 {
   2248    if (mem) {
   2249       __asm__ __volatile__(
   2250          "movupd  (%1), %%xmm11"       "\n\t"
   2251          "roundsd $3, (%0), %%xmm11"   "\n\t"
   2252          "movupd  %%xmm11, (%1)"       "\n"
   2253          : /*OUT*/
   2254          : /*IN*/ "r"(src), "r"(dst)
   2255          : /*TRASH*/ "xmm11"
   2256       );
   2257    } else {
   2258       __asm__ __volatile__(
   2259          "movupd  (%1), %%xmm11"         "\n\t"
   2260          "movupd  (%0), %%xmm2"          "\n\t"
   2261          "roundsd $3, %%xmm2, %%xmm11"   "\n\t"
   2262          "movupd  %%xmm11, (%1)"         "\n"
   2263          : /*OUT*/
   2264          : /*IN*/ "r"(src), "r"(dst)
   2265          : /*TRASH*/ "xmm11","xmm2"
   2266       );
   2267    }
   2268 }
   2269 
   2270 void do_ROUNDSD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
   2271 {
   2272    if (mem) {
   2273       __asm__ __volatile__(
   2274          "movupd  (%1), %%xmm11"       "\n\t"
   2275          "roundsd $4, (%0), %%xmm11"   "\n\t"
   2276          "movupd  %%xmm11, (%1)"       "\n"
   2277          : /*OUT*/
   2278          : /*IN*/ "r"(src), "r"(dst)
   2279          : /*TRASH*/ "xmm11"
   2280       );
   2281    } else {
   2282       __asm__ __volatile__(
   2283          "movupd  (%1), %%xmm11"         "\n\t"
   2284          "movupd  (%0), %%xmm2"          "\n\t"
   2285          "roundsd $4, %%xmm2, %%xmm11"   "\n\t"
   2286          "movupd  %%xmm11, (%1)"         "\n"
   2287          : /*OUT*/
   2288          : /*IN*/ "r"(src), "r"(dst)
   2289          : /*TRASH*/ "xmm11","xmm2"
   2290       );
   2291    }
   2292 }
   2293 
   2294 void test_ROUNDSD_w_immediate_rounding ( void )
   2295 {
   2296    double vals[22];
   2297    Int i = 0;
   2298    vals[i++] = 0.0;
   2299    vals[i++] = -0.0;
   2300    vals[i++] = mkPosInf();
   2301    vals[i++] = mkNegInf();
   2302    vals[i++] = mkPosNan();
   2303    vals[i++] = mkNegNan();
   2304    vals[i++] = -1.3;
   2305    vals[i++] = -1.1;
   2306    vals[i++] = -0.9;
   2307    vals[i++] = -0.7;
   2308    vals[i++] = -0.50001;
   2309    vals[i++] = -0.49999;
   2310    vals[i++] = -0.3;
   2311    vals[i++] = -0.1;
   2312    vals[i++] = 0.1;
   2313    vals[i++] = 0.3;
   2314    vals[i++] = 0.49999;
   2315    vals[i++] = 0.50001;
   2316    vals[i++] = 0.7;
   2317    vals[i++] = 0.9;
   2318    vals[i++] = 1.1;
   2319    vals[i++] = 1.3;
   2320    assert(i == 22);
   2321 
   2322    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   2323       V128 src, dst;
   2324 
   2325       randV128(&src);
   2326       randV128(&dst);
   2327       memcpy(&src[0], &vals[i], 8);
   2328       do_ROUNDSD_000(False/*reg*/, &src, &dst);
   2329       printf("r roundsd_000  ");
   2330       showV128(&src);
   2331       printf(" ");
   2332       showV128(&dst);
   2333       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2334       printf("\n");
   2335 
   2336       randV128(&src);
   2337       randV128(&dst);
   2338       memcpy(&src[0], &vals[i], 8);
   2339       do_ROUNDSD_000(True/*mem*/, &src, &dst);
   2340       printf("m roundsd_000  ");
   2341       showV128(&src);
   2342       printf(" ");
   2343       showV128(&dst);
   2344       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2345       printf("\n");
   2346 
   2347 
   2348       randV128(&src);
   2349       randV128(&dst);
   2350       memcpy(&src[0], &vals[i], 8);
   2351       do_ROUNDSD_001(False/*reg*/, &src, &dst);
   2352       printf("r roundsd_001  ");
   2353       showV128(&src);
   2354       printf(" ");
   2355       showV128(&dst);
   2356       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2357       printf("\n");
   2358 
   2359       randV128(&src);
   2360       randV128(&dst);
   2361       memcpy(&src[0], &vals[i], 8);
   2362       do_ROUNDSD_001(True/*mem*/, &src, &dst);
   2363       printf("m roundsd_001  ");
   2364       showV128(&src);
   2365       printf(" ");
   2366       showV128(&dst);
   2367       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2368       printf("\n");
   2369 
   2370 
   2371       randV128(&src);
   2372       randV128(&dst);
   2373       memcpy(&src[0], &vals[i], 8);
   2374       do_ROUNDSD_010(False/*reg*/, &src, &dst);
   2375       printf("r roundsd_010  ");
   2376       showV128(&src);
   2377       printf(" ");
   2378       showV128(&dst);
   2379       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2380       printf("\n");
   2381 
   2382       randV128(&src);
   2383       randV128(&dst);
   2384       memcpy(&src[0], &vals[i], 8);
   2385       do_ROUNDSD_010(True/*mem*/, &src, &dst);
   2386       printf("m roundsd_010  ");
   2387       showV128(&src);
   2388       printf(" ");
   2389       showV128(&dst);
   2390       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2391       printf("\n");
   2392 
   2393 
   2394       randV128(&src);
   2395       randV128(&dst);
   2396       memcpy(&src[0], &vals[i], 8);
   2397       do_ROUNDSD_011(False/*reg*/, &src, &dst);
   2398       printf("r roundsd_011  ");
   2399       showV128(&src);
   2400       printf(" ");
   2401       showV128(&dst);
   2402       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2403       printf("\n");
   2404 
   2405       randV128(&src);
   2406       randV128(&dst);
   2407       memcpy(&src[0], &vals[i], 8);
   2408       do_ROUNDSD_011(True/*mem*/, &src, &dst);
   2409       printf("m roundsd_011  ");
   2410       showV128(&src);
   2411       printf(" ");
   2412       showV128(&dst);
   2413       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2414       printf("\n");
   2415    }
   2416 }
   2417 
   2418 void test_ROUNDSD_w_mxcsr_rounding ( void )
   2419 {
   2420    UInt rm;
   2421    double vals[22];
   2422    Int i = 0;
   2423    vals[i++] = 0.0;
   2424    vals[i++] = -0.0;
   2425    vals[i++] = mkPosInf();
   2426    vals[i++] = mkNegInf();
   2427    vals[i++] = mkPosNan();
   2428    vals[i++] = mkNegNan();
   2429    vals[i++] = -1.3;
   2430    vals[i++] = -1.1;
   2431    vals[i++] = -0.9;
   2432    vals[i++] = -0.7;
   2433    vals[i++] = -0.50001;
   2434    vals[i++] = -0.49999;
   2435    vals[i++] = -0.3;
   2436    vals[i++] = -0.1;
   2437    vals[i++] = 0.1;
   2438    vals[i++] = 0.3;
   2439    vals[i++] = 0.49999;
   2440    vals[i++] = 0.50001;
   2441    vals[i++] = 0.7;
   2442    vals[i++] = 0.9;
   2443    vals[i++] = 1.1;
   2444    vals[i++] = 1.3;
   2445    assert(i == 22);
   2446 
   2447    rm = get_sse_roundingmode();
   2448    assert(rm == 0); // 0 == RN == default
   2449 
   2450    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   2451       V128 src, dst;
   2452 
   2453       for (rm = 0; rm <= 3; rm++) {
   2454          set_sse_roundingmode(rm);
   2455 
   2456          randV128(&src);
   2457          randV128(&dst);
   2458          memcpy(&src[0], &vals[i], 8);
   2459          do_ROUNDSD_1XX(False/*reg*/, &src, &dst);
   2460          printf("r (rm=%u) roundsd_1XX  ", rm);
   2461          showV128(&src);
   2462          printf(" ");
   2463          showV128(&dst);
   2464          printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2465          printf("\n");
   2466 
   2467          randV128(&src);
   2468          randV128(&dst);
   2469          memcpy(&src[0], &vals[i], 8);
   2470          do_ROUNDSD_1XX(True/*mem*/, &src, &dst);
   2471          printf("m (rm=%u) roundsd_1XX  ", rm);
   2472          showV128(&src);
   2473          printf(" ");
   2474          showV128(&dst);
   2475          printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2476          printf("\n");
   2477       }
   2478    }
   2479 
   2480    rm = get_sse_roundingmode();
   2481    assert(rm == 3);
   2482    set_sse_roundingmode(0);
   2483    rm = get_sse_roundingmode();
   2484    assert(rm == 0); // 0 == RN == default
   2485 }
   2486 
   2487 
   2488 /* ------------ ROUNDSS ------------ */
   2489 
   2490 void do_ROUNDSS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2491 {
   2492    if (mem) {
   2493       __asm__ __volatile__(
   2494          "movupd  (%1), %%xmm11"       "\n\t"
   2495          "roundss $0, (%0), %%xmm11"   "\n\t"
   2496          "movupd  %%xmm11, (%1)"       "\n"
   2497          : /*OUT*/
   2498          : /*IN*/ "r"(src), "r"(dst)
   2499          : /*TRASH*/ "xmm11"
   2500       );
   2501    } else {
   2502       __asm__ __volatile__(
   2503          "movupd  (%1), %%xmm11"         "\n\t"
   2504          "movupd  (%0), %%xmm2"          "\n\t"
   2505          "roundss $0, %%xmm2, %%xmm11"   "\n\t"
   2506          "movupd  %%xmm11, (%1)"         "\n"
   2507          : /*OUT*/
   2508          : /*IN*/ "r"(src), "r"(dst)
   2509          : /*TRASH*/ "xmm11","xmm2"
   2510       );
   2511    }
   2512 }
   2513 
   2514 void do_ROUNDSS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2515 {
   2516    if (mem) {
   2517       __asm__ __volatile__(
   2518          "movupd  (%1), %%xmm11"       "\n\t"
   2519          "roundss $1, (%0), %%xmm11"   "\n\t"
   2520          "movupd  %%xmm11, (%1)"       "\n"
   2521          : /*OUT*/
   2522          : /*IN*/ "r"(src), "r"(dst)
   2523          : /*TRASH*/ "xmm11"
   2524       );
   2525    } else {
   2526       __asm__ __volatile__(
   2527          "movupd  (%1), %%xmm11"         "\n\t"
   2528          "movupd  (%0), %%xmm2"          "\n\t"
   2529          "roundss $1, %%xmm2, %%xmm11"   "\n\t"
   2530          "movupd  %%xmm11, (%1)"         "\n"
   2531          : /*OUT*/
   2532          : /*IN*/ "r"(src), "r"(dst)
   2533          : /*TRASH*/ "xmm11","xmm2"
   2534       );
   2535    }
   2536 }
   2537 
   2538 void do_ROUNDSS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2539 {
   2540    if (mem) {
   2541       __asm__ __volatile__(
   2542          "movupd  (%1), %%xmm11"       "\n\t"
   2543          "roundss $2, (%0), %%xmm11"   "\n\t"
   2544          "movupd  %%xmm11, (%1)"       "\n"
   2545          : /*OUT*/
   2546          : /*IN*/ "r"(src), "r"(dst)
   2547          : /*TRASH*/ "xmm11"
   2548       );
   2549    } else {
   2550       __asm__ __volatile__(
   2551          "movupd  (%1), %%xmm11"         "\n\t"
   2552          "movupd  (%0), %%xmm2"          "\n\t"
   2553          "roundss $2, %%xmm2, %%xmm11"   "\n\t"
   2554          "movupd  %%xmm11, (%1)"         "\n"
   2555          : /*OUT*/
   2556          : /*IN*/ "r"(src), "r"(dst)
   2557          : /*TRASH*/ "xmm11","xmm2"
   2558       );
   2559    }
   2560 }
   2561 
   2562 void do_ROUNDSS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2563 {
   2564    if (mem) {
   2565       __asm__ __volatile__(
   2566          "movupd  (%1), %%xmm11"       "\n\t"
   2567          "roundss $3, (%0), %%xmm11"   "\n\t"
   2568          "movupd  %%xmm11, (%1)"       "\n"
   2569          : /*OUT*/
   2570          : /*IN*/ "r"(src), "r"(dst)
   2571          : /*TRASH*/ "xmm11"
   2572       );
   2573    } else {
   2574       __asm__ __volatile__(
   2575          "movupd  (%1), %%xmm11"         "\n\t"
   2576          "movupd  (%0), %%xmm2"          "\n\t"
   2577          "roundss $3, %%xmm2, %%xmm11"   "\n\t"
   2578          "movupd  %%xmm11, (%1)"         "\n"
   2579          : /*OUT*/
   2580          : /*IN*/ "r"(src), "r"(dst)
   2581          : /*TRASH*/ "xmm11","xmm2"
   2582       );
   2583    }
   2584 }
   2585 
   2586 void do_ROUNDSS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
   2587 {
   2588    if (mem) {
   2589       __asm__ __volatile__(
   2590          "movupd  (%1), %%xmm11"       "\n\t"
   2591          "roundss $4, (%0), %%xmm11"   "\n\t"
   2592          "movupd  %%xmm11, (%1)"       "\n"
   2593          : /*OUT*/
   2594          : /*IN*/ "r"(src), "r"(dst)
   2595          : /*TRASH*/ "xmm11"
   2596       );
   2597    } else {
   2598       __asm__ __volatile__(
   2599          "movupd  (%1), %%xmm11"         "\n\t"
   2600          "movupd  (%0), %%xmm2"          "\n\t"
   2601          "roundss $4, %%xmm2, %%xmm11"   "\n\t"
   2602          "movupd  %%xmm11, (%1)"         "\n"
   2603          : /*OUT*/
   2604          : /*IN*/ "r"(src), "r"(dst)
   2605          : /*TRASH*/ "xmm11","xmm2"
   2606       );
   2607    }
   2608 }
   2609 
   2610 void test_ROUNDSS_w_immediate_rounding ( void )
   2611 {
   2612    float vals[22];
   2613    Int i = 0;
   2614    vals[i++] = 0.0;
   2615    vals[i++] = -0.0;
   2616    vals[i++] = mkPosInf();
   2617    vals[i++] = mkNegInf();
   2618    vals[i++] = mkPosNan();
   2619    vals[i++] = mkNegNan();
   2620    vals[i++] = -1.3;
   2621    vals[i++] = -1.1;
   2622    vals[i++] = -0.9;
   2623    vals[i++] = -0.7;
   2624    vals[i++] = -0.50001;
   2625    vals[i++] = -0.49999;
   2626    vals[i++] = -0.3;
   2627    vals[i++] = -0.1;
   2628    vals[i++] = 0.1;
   2629    vals[i++] = 0.3;
   2630    vals[i++] = 0.49999;
   2631    vals[i++] = 0.50001;
   2632    vals[i++] = 0.7;
   2633    vals[i++] = 0.9;
   2634    vals[i++] = 1.1;
   2635    vals[i++] = 1.3;
   2636    assert(i == 22);
   2637 
   2638    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   2639       V128 src, dst;
   2640 
   2641       randV128(&src);
   2642       randV128(&dst);
   2643       memcpy(&src[0], &vals[i], 4);
   2644       do_ROUNDSS_000(False/*reg*/, &src, &dst);
   2645       printf("r roundss_000  ");
   2646       showV128(&src);
   2647       printf(" ");
   2648       showV128(&dst);
   2649       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2650       printf("\n");
   2651 
   2652       randV128(&src);
   2653       randV128(&dst);
   2654       memcpy(&src[0], &vals[i], 4);
   2655       do_ROUNDSS_000(True/*mem*/, &src, &dst);
   2656       printf("m roundss_000  ");
   2657       showV128(&src);
   2658       printf(" ");
   2659       showV128(&dst);
   2660       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2661       printf("\n");
   2662 
   2663 
   2664       randV128(&src);
   2665       randV128(&dst);
   2666       memcpy(&src[0], &vals[i], 4);
   2667       do_ROUNDSS_001(False/*reg*/, &src, &dst);
   2668       printf("r roundss_001  ");
   2669       showV128(&src);
   2670       printf(" ");
   2671       showV128(&dst);
   2672       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2673       printf("\n");
   2674 
   2675       randV128(&src);
   2676       randV128(&dst);
   2677       memcpy(&src[0], &vals[i], 4);
   2678       do_ROUNDSS_001(True/*mem*/, &src, &dst);
   2679       printf("m roundss_001  ");
   2680       showV128(&src);
   2681       printf(" ");
   2682       showV128(&dst);
   2683       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2684       printf("\n");
   2685 
   2686 
   2687       randV128(&src);
   2688       randV128(&dst);
   2689       memcpy(&src[0], &vals[i], 4);
   2690       do_ROUNDSS_010(False/*reg*/, &src, &dst);
   2691       printf("r roundss_010  ");
   2692       showV128(&src);
   2693       printf(" ");
   2694       showV128(&dst);
   2695       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2696       printf("\n");
   2697 
   2698       randV128(&src);
   2699       randV128(&dst);
   2700       memcpy(&src[0], &vals[i], 4);
   2701       do_ROUNDSS_010(True/*mem*/, &src, &dst);
   2702       printf("m roundss_010  ");
   2703       showV128(&src);
   2704       printf(" ");
   2705       showV128(&dst);
   2706       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2707       printf("\n");
   2708 
   2709 
   2710       randV128(&src);
   2711       randV128(&dst);
   2712       memcpy(&src[0], &vals[i], 4);
   2713       do_ROUNDSS_011(False/*reg*/, &src, &dst);
   2714       printf("r roundss_011  ");
   2715       showV128(&src);
   2716       printf(" ");
   2717       showV128(&dst);
   2718       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2719       printf("\n");
   2720 
   2721       randV128(&src);
   2722       randV128(&dst);
   2723       memcpy(&src[0], &vals[i], 4);
   2724       do_ROUNDSS_011(True/*mem*/, &src, &dst);
   2725       printf("m roundss_011  ");
   2726       showV128(&src);
   2727       printf(" ");
   2728       showV128(&dst);
   2729       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2730       printf("\n");
   2731    }
   2732 }
   2733 
   2734 void test_ROUNDSS_w_mxcsr_rounding ( void )
   2735 {
   2736    UInt rm;
   2737    float vals[22];
   2738    Int i = 0;
   2739    vals[i++] = 0.0;
   2740    vals[i++] = -0.0;
   2741    vals[i++] = mkPosInf();
   2742    vals[i++] = mkNegInf();
   2743    vals[i++] = mkPosNan();
   2744    vals[i++] = mkNegNan();
   2745    vals[i++] = -1.3;
   2746    vals[i++] = -1.1;
   2747    vals[i++] = -0.9;
   2748    vals[i++] = -0.7;
   2749    vals[i++] = -0.50001;
   2750    vals[i++] = -0.49999;
   2751    vals[i++] = -0.3;
   2752    vals[i++] = -0.1;
   2753    vals[i++] = 0.1;
   2754    vals[i++] = 0.3;
   2755    vals[i++] = 0.49999;
   2756    vals[i++] = 0.50001;
   2757    vals[i++] = 0.7;
   2758    vals[i++] = 0.9;
   2759    vals[i++] = 1.1;
   2760    vals[i++] = 1.3;
   2761    assert(i == 22);
   2762 
   2763    rm = get_sse_roundingmode();
   2764    assert(rm == 0); // 0 == RN == default
   2765 
   2766    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   2767       V128 src, dst;
   2768 
   2769       for (rm = 0; rm <= 3; rm++) {
   2770          set_sse_roundingmode(rm);
   2771 
   2772          randV128(&src);
   2773          randV128(&dst);
   2774          memcpy(&src[0], &vals[i], 4);
   2775          do_ROUNDSS_1XX(False/*reg*/, &src, &dst);
   2776          printf("r (rm=%u) roundss_1XX  ", rm);
   2777          showV128(&src);
   2778          printf(" ");
   2779          showV128(&dst);
   2780          printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2781          printf("\n");
   2782 
   2783          randV128(&src);
   2784          randV128(&dst);
   2785          memcpy(&src[0], &vals[i], 4);
   2786          do_ROUNDSS_1XX(True/*mem*/, &src, &dst);
   2787          printf("m (rm=%u) roundss_1XX  ", rm);
   2788          showV128(&src);
   2789          printf(" ");
   2790          showV128(&dst);
   2791          printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2792          printf("\n");
   2793       }
   2794    }
   2795 
   2796    rm = get_sse_roundingmode();
   2797    assert(rm == 3);
   2798    set_sse_roundingmode(0);
   2799    rm = get_sse_roundingmode();
   2800    assert(rm == 0); // 0 == RN == default
   2801 }
   2802 
   2803 /* ------------ ROUNDPD ------------ */
   2804 
   2805 void do_ROUNDPD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2806 {
   2807    if (mem) {
   2808       __asm__ __volatile__(
   2809          "movupd  (%1), %%xmm11"       "\n\t"
   2810          "roundpd $0, (%0), %%xmm11"   "\n\t"
   2811          "movupd  %%xmm11, (%1)"       "\n"
   2812          : /*OUT*/
   2813          : /*IN*/ "r"(src), "r"(dst)
   2814          : /*TRASH*/ "xmm11"
   2815       );
   2816    } else {
   2817       __asm__ __volatile__(
   2818          "movupd  (%1), %%xmm11"         "\n\t"
   2819          "movupd  (%0), %%xmm2"          "\n\t"
   2820          "roundpd $0, %%xmm2, %%xmm11"   "\n\t"
   2821          "movupd  %%xmm11, (%1)"         "\n"
   2822          : /*OUT*/
   2823          : /*IN*/ "r"(src), "r"(dst)
   2824          : /*TRASH*/ "xmm11","xmm2"
   2825       );
   2826    }
   2827 }
   2828 
   2829 void do_ROUNDPD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2830 {
   2831    if (mem) {
   2832       __asm__ __volatile__(
   2833          "movupd  (%1), %%xmm11"       "\n\t"
   2834          "roundpd $1, (%0), %%xmm11"   "\n\t"
   2835          "movupd  %%xmm11, (%1)"       "\n"
   2836          : /*OUT*/
   2837          : /*IN*/ "r"(src), "r"(dst)
   2838          : /*TRASH*/ "xmm11"
   2839       );
   2840    } else {
   2841       __asm__ __volatile__(
   2842          "movupd  (%1), %%xmm11"         "\n\t"
   2843          "movupd  (%0), %%xmm2"          "\n\t"
   2844          "roundpd $1, %%xmm2, %%xmm11"   "\n\t"
   2845          "movupd  %%xmm11, (%1)"         "\n"
   2846          : /*OUT*/
   2847          : /*IN*/ "r"(src), "r"(dst)
   2848          : /*TRASH*/ "xmm11","xmm2"
   2849       );
   2850    }
   2851 }
   2852 
   2853 void do_ROUNDPD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2854 {
   2855    if (mem) {
   2856       __asm__ __volatile__(
   2857          "movupd  (%1), %%xmm11"       "\n\t"
   2858          "roundpd $2, (%0), %%xmm11"   "\n\t"
   2859          "movupd  %%xmm11, (%1)"       "\n"
   2860          : /*OUT*/
   2861          : /*IN*/ "r"(src), "r"(dst)
   2862          : /*TRASH*/ "xmm11"
   2863       );
   2864    } else {
   2865       __asm__ __volatile__(
   2866          "movupd  (%1), %%xmm11"         "\n\t"
   2867          "movupd  (%0), %%xmm2"          "\n\t"
   2868          "roundpd $2, %%xmm2, %%xmm11"   "\n\t"
   2869          "movupd  %%xmm11, (%1)"         "\n"
   2870          : /*OUT*/
   2871          : /*IN*/ "r"(src), "r"(dst)
   2872          : /*TRASH*/ "xmm11","xmm2"
   2873       );
   2874    }
   2875 }
   2876 
   2877 void do_ROUNDPD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2878 {
   2879    if (mem) {
   2880       __asm__ __volatile__(
   2881          "movupd  (%1), %%xmm11"       "\n\t"
   2882          "roundpd $3, (%0), %%xmm11"   "\n\t"
   2883          "movupd  %%xmm11, (%1)"       "\n"
   2884          : /*OUT*/
   2885          : /*IN*/ "r"(src), "r"(dst)
   2886          : /*TRASH*/ "xmm11"
   2887       );
   2888    } else {
   2889       __asm__ __volatile__(
   2890          "movupd  (%1), %%xmm11"         "\n\t"
   2891          "movupd  (%0), %%xmm2"          "\n\t"
   2892          "roundpd $3, %%xmm2, %%xmm11"   "\n\t"
   2893          "movupd  %%xmm11, (%1)"         "\n"
   2894          : /*OUT*/
   2895          : /*IN*/ "r"(src), "r"(dst)
   2896          : /*TRASH*/ "xmm11","xmm2"
   2897       );
   2898    }
   2899 }
   2900 
   2901 void do_ROUNDPD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
   2902 {
   2903    if (mem) {
   2904       __asm__ __volatile__(
   2905          "movupd  (%1), %%xmm11"       "\n\t"
   2906          "roundpd $4, (%0), %%xmm11"   "\n\t"
   2907          "movupd  %%xmm11, (%1)"       "\n"
   2908          : /*OUT*/
   2909          : /*IN*/ "r"(src), "r"(dst)
   2910          : /*TRASH*/ "xmm11"
   2911       );
   2912    } else {
   2913       __asm__ __volatile__(
   2914          "movupd  (%1), %%xmm11"         "\n\t"
   2915          "movupd  (%0), %%xmm2"          "\n\t"
   2916          "roundpd $4, %%xmm2, %%xmm11"   "\n\t"
   2917          "movupd  %%xmm11, (%1)"         "\n"
   2918          : /*OUT*/
   2919          : /*IN*/ "r"(src), "r"(dst)
   2920          : /*TRASH*/ "xmm11","xmm2"
   2921       );
   2922    }
   2923 }
   2924 
   2925 void test_ROUNDPD_w_immediate_rounding ( void )
   2926 {
   2927    double vals[22];
   2928    Int i = 0;
   2929    vals[i++] = 0.0;
   2930    vals[i++] = -0.0;
   2931    vals[i++] = mkPosInf();
   2932    vals[i++] = mkNegInf();
   2933    vals[i++] = mkPosNan();
   2934    vals[i++] = mkNegNan();
   2935    vals[i++] = -1.3;
   2936    vals[i++] = -1.1;
   2937    vals[i++] = -0.9;
   2938    vals[i++] = -0.7;
   2939    vals[i++] = -0.50001;
   2940    vals[i++] = -0.49999;
   2941    vals[i++] = -0.3;
   2942    vals[i++] = -0.1;
   2943    vals[i++] = 0.1;
   2944    vals[i++] = 0.3;
   2945    vals[i++] = 0.49999;
   2946    vals[i++] = 0.50001;
   2947    vals[i++] = 0.7;
   2948    vals[i++] = 0.9;
   2949    vals[i++] = 1.1;
   2950    vals[i++] = 1.3;
   2951    assert(i == 22);
   2952 
   2953    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   2954       V128 src, dst;
   2955 
   2956       randV128(&src);
   2957       randV128(&dst);
   2958       memcpy(&src[0], &vals[i], 8);
   2959       memcpy(&src[8], &vals[(i+11)%22], 8);
   2960       do_ROUNDPD_000(False/*reg*/, &src, &dst);
   2961       printf("r roundpd_000  ");
   2962       showV128(&src);
   2963       printf(" ");
   2964       showV128(&dst);
   2965       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   2966       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   2967       printf("\n");
   2968 
   2969       randV128(&src);
   2970       randV128(&dst);
   2971       memcpy(&src[0], &vals[i], 8);
   2972       memcpy(&src[8], &vals[(i+11)%22], 8);
   2973       do_ROUNDPD_000(True/*mem*/, &src, &dst);
   2974       printf("m roundpd_000  ");
   2975       showV128(&src);
   2976       printf(" ");
   2977       showV128(&dst);
   2978       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   2979       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   2980       printf("\n");
   2981 
   2982 
   2983       randV128(&src);
   2984       randV128(&dst);
   2985       memcpy(&src[0], &vals[i], 8);
   2986       memcpy(&src[8], &vals[(i+11)%22], 8);
   2987       do_ROUNDPD_001(False/*reg*/, &src, &dst);
   2988       printf("r roundpd_001  ");
   2989       showV128(&src);
   2990       printf(" ");
   2991       showV128(&dst);
   2992       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   2993       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   2994       printf("\n");
   2995 
   2996       randV128(&src);
   2997       randV128(&dst);
   2998       memcpy(&src[0], &vals[i], 8);
   2999       memcpy(&src[8], &vals[(i+11)%22], 8);
   3000       do_ROUNDPD_001(True/*mem*/, &src, &dst);
   3001       printf("m roundpd_001  ");
   3002       showV128(&src);
   3003       printf(" ");
   3004       showV128(&dst);
   3005       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3006       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3007       printf("\n");
   3008 
   3009 
   3010       randV128(&src);
   3011       randV128(&dst);
   3012       memcpy(&src[0], &vals[i], 8);
   3013       memcpy(&src[8], &vals[(i+11)%22], 8);
   3014       do_ROUNDPD_010(False/*reg*/, &src, &dst);
   3015       printf("r roundpd_010  ");
   3016       showV128(&src);
   3017       printf(" ");
   3018       showV128(&dst);
   3019       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3020       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3021       printf("\n");
   3022 
   3023       randV128(&src);
   3024       randV128(&dst);
   3025       memcpy(&src[0], &vals[i], 8);
   3026       memcpy(&src[8], &vals[(i+11)%22], 8);
   3027       do_ROUNDPD_010(True/*mem*/, &src, &dst);
   3028       printf("m roundpd_010  ");
   3029       showV128(&src);
   3030       printf(" ");
   3031       showV128(&dst);
   3032       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3033       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3034       printf("\n");
   3035 
   3036 
   3037       randV128(&src);
   3038       randV128(&dst);
   3039       memcpy(&src[0], &vals[i], 8);
   3040       memcpy(&src[8], &vals[(i+11)%22], 8);
   3041       do_ROUNDPD_011(False/*reg*/, &src, &dst);
   3042       printf("r roundpd_011  ");
   3043       showV128(&src);
   3044       printf(" ");
   3045       showV128(&dst);
   3046       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3047       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3048       printf("\n");
   3049 
   3050       randV128(&src);
   3051       randV128(&dst);
   3052       memcpy(&src[0], &vals[i], 8);
   3053       memcpy(&src[8], &vals[(i+11)%22], 8);
   3054       do_ROUNDPD_011(True/*mem*/, &src, &dst);
   3055       printf("m roundpd_011  ");
   3056       showV128(&src);
   3057       printf(" ");
   3058       showV128(&dst);
   3059       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3060       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3061       printf("\n");
   3062    }
   3063 }
   3064 
   3065 void test_ROUNDPD_w_mxcsr_rounding ( void )
   3066 {
   3067    UInt rm;
   3068    double vals[22];
   3069    Int i = 0;
   3070    vals[i++] = 0.0;
   3071    vals[i++] = -0.0;
   3072    vals[i++] = mkPosInf();
   3073    vals[i++] = mkNegInf();
   3074    vals[i++] = mkPosNan();
   3075    vals[i++] = mkNegNan();
   3076    vals[i++] = -1.3;
   3077    vals[i++] = -1.1;
   3078    vals[i++] = -0.9;
   3079    vals[i++] = -0.7;
   3080    vals[i++] = -0.50001;
   3081    vals[i++] = -0.49999;
   3082    vals[i++] = -0.3;
   3083    vals[i++] = -0.1;
   3084    vals[i++] = 0.1;
   3085    vals[i++] = 0.3;
   3086    vals[i++] = 0.49999;
   3087    vals[i++] = 0.50001;
   3088    vals[i++] = 0.7;
   3089    vals[i++] = 0.9;
   3090    vals[i++] = 1.1;
   3091    vals[i++] = 1.3;
   3092    assert(i == 22);
   3093 
   3094    rm = get_sse_roundingmode();
   3095    assert(rm == 0); // 0 == RN == default
   3096 
   3097    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   3098       V128 src, dst;
   3099 
   3100       for (rm = 0; rm <= 3; rm++) {
   3101          set_sse_roundingmode(rm);
   3102 
   3103          randV128(&src);
   3104          randV128(&dst);
   3105          memcpy(&src[0], &vals[i], 8);
   3106          memcpy(&src[8], &vals[(i+11)%22], 8);
   3107          do_ROUNDPD_1XX(False/*reg*/, &src, &dst);
   3108          printf("r (rm=%u) roundpd_1XX  ", rm);
   3109          showV128(&src);
   3110          printf(" ");
   3111          showV128(&dst);
   3112          printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3113          printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3114          printf("\n");
   3115 
   3116          randV128(&src);
   3117          randV128(&dst);
   3118          memcpy(&src[0], &vals[i], 8);
   3119          memcpy(&src[8], &vals[(i+11)%22], 8);
   3120          do_ROUNDPD_1XX(True/*mem*/, &src, &dst);
   3121          printf("m (rm=%u) roundpd_1XX  ", rm);
   3122          showV128(&src);
   3123          printf(" ");
   3124          showV128(&dst);
   3125          printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3126          printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3127          printf("\n");
   3128       }
   3129    }
   3130 
   3131    rm = get_sse_roundingmode();
   3132    assert(rm == 3);
   3133    set_sse_roundingmode(0);
   3134    rm = get_sse_roundingmode();
   3135    assert(rm == 0); // 0 == RN == default
   3136 }
   3137 
   3138 /* ------------ ROUNDPS ------------ */
   3139 
   3140 void do_ROUNDPS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
   3141 {
   3142    if (mem) {
   3143       __asm__ __volatile__(
   3144          "movupd  (%1), %%xmm11"       "\n\t"
   3145          "roundps $0, (%0), %%xmm11"   "\n\t"
   3146          "movupd  %%xmm11, (%1)"       "\n"
   3147          : /*OUT*/
   3148          : /*IN*/ "r"(src), "r"(dst)
   3149          : /*TRASH*/ "xmm11"
   3150       );
   3151    } else {
   3152       __asm__ __volatile__(
   3153          "movupd  (%1), %%xmm11"         "\n\t"
   3154          "movupd  (%0), %%xmm2"          "\n\t"
   3155          "roundps $0, %%xmm2, %%xmm11"   "\n\t"
   3156          "movupd  %%xmm11, (%1)"         "\n"
   3157          : /*OUT*/
   3158          : /*IN*/ "r"(src), "r"(dst)
   3159          : /*TRASH*/ "xmm11","xmm2"
   3160       );
   3161    }
   3162 }
   3163 
   3164 void do_ROUNDPS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
   3165 {
   3166    if (mem) {
   3167       __asm__ __volatile__(
   3168          "movupd  (%1), %%xmm11"       "\n\t"
   3169          "roundps $1, (%0), %%xmm11"   "\n\t"
   3170          "movupd  %%xmm11, (%1)"       "\n"
   3171          : /*OUT*/
   3172          : /*IN*/ "r"(src), "r"(dst)
   3173          : /*TRASH*/ "xmm11"
   3174       );
   3175    } else {
   3176       __asm__ __volatile__(
   3177          "movupd  (%1), %%xmm11"         "\n\t"
   3178          "movupd  (%0), %%xmm2"          "\n\t"
   3179          "roundps $1, %%xmm2, %%xmm11"   "\n\t"
   3180          "movupd  %%xmm11, (%1)"         "\n"
   3181          : /*OUT*/
   3182          : /*IN*/ "r"(src), "r"(dst)
   3183          : /*TRASH*/ "xmm11","xmm2"
   3184       );
   3185    }
   3186 }
   3187 
   3188 void do_ROUNDPS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
   3189 {
   3190    if (mem) {
   3191       __asm__ __volatile__(
   3192          "movupd  (%1), %%xmm11"       "\n\t"
   3193          "roundps $2, (%0), %%xmm11"   "\n\t"
   3194          "movupd  %%xmm11, (%1)"       "\n"
   3195          : /*OUT*/
   3196          : /*IN*/ "r"(src), "r"(dst)
   3197          : /*TRASH*/ "xmm11"
   3198       );
   3199    } else {
   3200       __asm__ __volatile__(
   3201          "movupd  (%1), %%xmm11"         "\n\t"
   3202          "movupd  (%0), %%xmm2"          "\n\t"
   3203          "roundps $2, %%xmm2, %%xmm11"   "\n\t"
   3204          "movupd  %%xmm11, (%1)"         "\n"
   3205          : /*OUT*/
   3206          : /*IN*/ "r"(src), "r"(dst)
   3207          : /*TRASH*/ "xmm11","xmm2"
   3208       );
   3209    }
   3210 }
   3211 
   3212 void do_ROUNDPS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
   3213 {
   3214    if (mem) {
   3215       __asm__ __volatile__(
   3216          "movupd  (%1), %%xmm11"       "\n\t"
   3217          "roundps $3, (%0), %%xmm11"   "\n\t"
   3218          "movupd  %%xmm11, (%1)"       "\n"
   3219          : /*OUT*/
   3220          : /*IN*/ "r"(src), "r"(dst)
   3221          : /*TRASH*/ "xmm11"
   3222       );
   3223    } else {
   3224       __asm__ __volatile__(
   3225          "movupd  (%1), %%xmm11"         "\n\t"
   3226          "movupd  (%0), %%xmm2"          "\n\t"
   3227          "roundps $3, %%xmm2, %%xmm11"   "\n\t"
   3228          "movupd  %%xmm11, (%1)"         "\n"
   3229          : /*OUT*/
   3230          : /*IN*/ "r"(src), "r"(dst)
   3231          : /*TRASH*/ "xmm11","xmm2"
   3232       );
   3233    }
   3234 }
   3235 
   3236 void do_ROUNDPS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
   3237 {
   3238    if (mem) {
   3239       __asm__ __volatile__(
   3240          "movupd  (%1), %%xmm11"       "\n\t"
   3241          "roundps $4, (%0), %%xmm11"   "\n\t"
   3242          "movupd  %%xmm11, (%1)"       "\n"
   3243          : /*OUT*/
   3244          : /*IN*/ "r"(src), "r"(dst)
   3245          : /*TRASH*/ "xmm11"
   3246       );
   3247    } else {
   3248       __asm__ __volatile__(
   3249          "movupd  (%1), %%xmm11"         "\n\t"
   3250          "movupd  (%0), %%xmm2"          "\n\t"
   3251          "roundps $4, %%xmm2, %%xmm11"   "\n\t"
   3252          "movupd  %%xmm11, (%1)"         "\n"
   3253          : /*OUT*/
   3254          : /*IN*/ "r"(src), "r"(dst)
   3255          : /*TRASH*/ "xmm11","xmm2"
   3256       );
   3257    }
   3258 }
   3259 
   3260 void test_ROUNDPS_w_immediate_rounding ( void )
   3261 {
   3262    float vals[22];
   3263    Int i = 0;
   3264    vals[i++] = 0.0;
   3265    vals[i++] = -0.0;
   3266    vals[i++] = mkPosInf();
   3267    vals[i++] = mkNegInf();
   3268    vals[i++] = mkPosNan();
   3269    vals[i++] = mkNegNan();
   3270    vals[i++] = -1.3;
   3271    vals[i++] = -1.1;
   3272    vals[i++] = -0.9;
   3273    vals[i++] = -0.7;
   3274    vals[i++] = -0.50001;
   3275    vals[i++] = -0.49999;
   3276    vals[i++] = -0.3;
   3277    vals[i++] = -0.1;
   3278    vals[i++] = 0.1;
   3279    vals[i++] = 0.3;
   3280    vals[i++] = 0.49999;
   3281    vals[i++] = 0.50001;
   3282    vals[i++] = 0.7;
   3283    vals[i++] = 0.9;
   3284    vals[i++] = 1.1;
   3285    vals[i++] = 1.3;
   3286    assert(i == 22);
   3287 
   3288    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   3289       V128 src, dst;
   3290 
   3291       randV128(&src);
   3292       randV128(&dst);
   3293       memcpy(&src[0], &vals[i], 4);
   3294       memcpy(&src[4], &vals[(i+5)%22], 4);
   3295       memcpy(&src[8], &vals[(i+11)%22], 4);
   3296       memcpy(&src[12], &vals[(i+17)%22], 4);
   3297       do_ROUNDPS_000(False/*reg*/, &src, &dst);
   3298       printf("r roundps_000  ");
   3299       showV128(&src);
   3300       printf(" ");
   3301       showV128(&dst);
   3302       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3303       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3304       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3305       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3306       printf("\n");
   3307 
   3308       randV128(&src);
   3309       randV128(&dst);
   3310       memcpy(&src[0], &vals[i], 4);
   3311       memcpy(&src[4], &vals[(i+5)%22], 4);
   3312       memcpy(&src[8], &vals[(i+11)%22], 4);
   3313       memcpy(&src[12], &vals[(i+17)%22], 4);
   3314       do_ROUNDPS_000(True/*mem*/, &src, &dst);
   3315       printf("m roundps_000  ");
   3316       showV128(&src);
   3317       printf(" ");
   3318       showV128(&dst);
   3319       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3320       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3321       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3322       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3323       printf("\n");
   3324 
   3325 
   3326       randV128(&src);
   3327       randV128(&dst);
   3328       memcpy(&src[0], &vals[i], 4);
   3329       memcpy(&src[4], &vals[(i+5)%22], 4);
   3330       memcpy(&src[8], &vals[(i+11)%22], 4);
   3331       memcpy(&src[12], &vals[(i+17)%22], 4);
   3332       do_ROUNDPS_001(False/*reg*/, &src, &dst);
   3333       printf("r roundps_001  ");
   3334       showV128(&src);
   3335       printf(" ");
   3336       showV128(&dst);
   3337       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3338       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3339       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3340       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3341       printf("\n");
   3342 
   3343       randV128(&src);
   3344       randV128(&dst);
   3345       memcpy(&src[0], &vals[i], 4);
   3346       memcpy(&src[4], &vals[(i+5)%22], 4);
   3347       memcpy(&src[8], &vals[(i+11)%22], 4);
   3348       memcpy(&src[12], &vals[(i+17)%22], 4);
   3349       do_ROUNDPS_001(True/*mem*/, &src, &dst);
   3350       printf("m roundps_001  ");
   3351       showV128(&src);
   3352       printf(" ");
   3353       showV128(&dst);
   3354       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3355       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3356       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3357       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3358       printf("\n");
   3359 
   3360 
   3361       randV128(&src);
   3362       randV128(&dst);
   3363       memcpy(&src[0], &vals[i], 4);
   3364       memcpy(&src[4], &vals[(i+5)%22], 4);
   3365       memcpy(&src[8], &vals[(i+11)%22], 4);
   3366       memcpy(&src[12], &vals[(i+17)%22], 4);
   3367       do_ROUNDPS_010(False/*reg*/, &src, &dst);
   3368       printf("r roundps_010  ");
   3369       showV128(&src);
   3370       printf(" ");
   3371       showV128(&dst);
   3372       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3373       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3374       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3375       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3376       printf("\n");
   3377 
   3378       randV128(&src);
   3379       randV128(&dst);
   3380       memcpy(&src[0], &vals[i], 4);
   3381       memcpy(&src[4], &vals[(i+5)%22], 4);
   3382       memcpy(&src[8], &vals[(i+11)%22], 4);
   3383       memcpy(&src[12], &vals[(i+17)%22], 4);
   3384       do_ROUNDPS_010(True/*mem*/, &src, &dst);
   3385       printf("m roundps_010  ");
   3386       showV128(&src);
   3387       printf(" ");
   3388       showV128(&dst);
   3389       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3390       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3391       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3392       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3393       printf("\n");
   3394 
   3395 
   3396       randV128(&src);
   3397       randV128(&dst);
   3398       memcpy(&src[0], &vals[i], 4);
   3399       memcpy(&src[4], &vals[(i+5)%22], 4);
   3400       memcpy(&src[8], &vals[(i+11)%22], 4);
   3401       memcpy(&src[12], &vals[(i+17)%22], 4);
   3402       do_ROUNDPS_011(False/*reg*/, &src, &dst);
   3403       printf("r roundps_011  ");
   3404       showV128(&src);
   3405       printf(" ");
   3406       showV128(&dst);
   3407       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3408       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3409       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3410       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3411       printf("\n");
   3412 
   3413       randV128(&src);
   3414       randV128(&dst);
   3415       memcpy(&src[0], &vals[i], 4);
   3416       memcpy(&src[4], &vals[(i+5)%22], 4);
   3417       memcpy(&src[8], &vals[(i+11)%22], 4);
   3418       memcpy(&src[12], &vals[(i+17)%22], 4);
   3419       do_ROUNDPS_011(True/*mem*/, &src, &dst);
   3420       printf("m roundps_011  ");
   3421       showV128(&src);
   3422       printf(" ");
   3423       showV128(&dst);
   3424       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3425       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3426       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3427       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3428       printf("\n");
   3429    }
   3430 }
   3431 
   3432 void test_ROUNDPS_w_mxcsr_rounding ( void )
   3433 {
   3434    UInt rm;
   3435    float vals[22];
   3436    Int i = 0;
   3437    vals[i++] = 0.0;
   3438    vals[i++] = -0.0;
   3439    vals[i++] = mkPosInf();
   3440    vals[i++] = mkNegInf();
   3441    vals[i++] = mkPosNan();
   3442    vals[i++] = mkNegNan();
   3443    vals[i++] = -1.3;
   3444    vals[i++] = -1.1;
   3445    vals[i++] = -0.9;
   3446    vals[i++] = -0.7;
   3447    vals[i++] = -0.50001;
   3448    vals[i++] = -0.49999;
   3449    vals[i++] = -0.3;
   3450    vals[i++] = -0.1;
   3451    vals[i++] = 0.1;
   3452    vals[i++] = 0.3;
   3453    vals[i++] = 0.49999;
   3454    vals[i++] = 0.50001;
   3455    vals[i++] = 0.7;
   3456    vals[i++] = 0.9;
   3457    vals[i++] = 1.1;
   3458    vals[i++] = 1.3;
   3459    assert(i == 22);
   3460 
   3461    rm = get_sse_roundingmode();
   3462    assert(rm == 0); // 0 == RN == default
   3463 
   3464    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   3465       V128 src, dst;
   3466 
   3467       for (rm = 0; rm <= 3; rm++) {
   3468          set_sse_roundingmode(rm);
   3469 
   3470          randV128(&src);
   3471          randV128(&dst);
   3472          memcpy(&src[0], &vals[i], 4);
   3473          memcpy(&src[4], &vals[(i+5)%22], 4);
   3474          memcpy(&src[8], &vals[(i+11)%22], 4);
   3475          memcpy(&src[12], &vals[(i+17)%22], 4);
   3476          do_ROUNDPS_1XX(False/*reg*/, &src, &dst);
   3477          printf("r (rm=%u) roundps_1XX  ", rm);
   3478          showV128(&src);
   3479          printf(" ");
   3480          showV128(&dst);
   3481          printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3482          printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3483          printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3484          printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3485          printf("\n");
   3486 
   3487          randV128(&src);
   3488          randV128(&dst);
   3489          memcpy(&src[0], &vals[i], 4);
   3490          memcpy(&src[4], &vals[(i+5)%22], 4);
   3491          memcpy(&src[8], &vals[(i+11)%22], 4);
   3492          memcpy(&src[12], &vals[(i+17)%22], 4);
   3493          do_ROUNDPS_1XX(True/*mem*/, &src, &dst);
   3494          printf("m (rm=%u) roundps_1XX  ", rm);
   3495          showV128(&src);
   3496          printf(" ");
   3497          showV128(&dst);
   3498          printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3499          printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3500          printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3501          printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3502          printf("\n");
   3503       }
   3504    }
   3505 
   3506    rm = get_sse_roundingmode();
   3507    assert(rm == 3);
   3508    set_sse_roundingmode(0);
   3509    rm = get_sse_roundingmode();
   3510    assert(rm == 0); // 0 == RN == default
   3511 }
   3512 
   3513 /* ------------ PTEST ------------ */
   3514 
   3515 void test_PTEST ( void )
   3516 {
   3517    const Int ntests = 8;
   3518    V128 spec[ntests];
   3519    do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0x0000000000000000ULL );
   3520    do64HLtoV128( &spec[1], 0x0000000000000000ULL, 0x0000000000000001ULL );
   3521    do64HLtoV128( &spec[2], 0x0000000000000001ULL, 0x0000000000000000ULL );
   3522    do64HLtoV128( &spec[3], 0x0000000000000001ULL, 0x0000000000000001ULL );
   3523    do64HLtoV128( &spec[4], 0xffffffffffffffffULL, 0xffffffffffffffffULL );
   3524    do64HLtoV128( &spec[5], 0xffffffffffffffffULL, 0xfffffffffffffffeULL );
   3525    do64HLtoV128( &spec[6], 0xfffffffffffffffeULL, 0xffffffffffffffffULL );
   3526    do64HLtoV128( &spec[7], 0xfffffffffffffffeULL, 0xfffffffffffffffeULL );
   3527    V128 block[2];
   3528    Int i, j;
   3529    ULong flags;
   3530    for (i = 0; i < ntests; i++) {
   3531       for (j = 0; j < ntests; j++) {
   3532          memcpy(&block[0], &spec[i], 16);
   3533          memcpy(&block[1], &spec[j], 16);
   3534          __asm__ __volatile__(
   3535             "subq $256, %%rsp"        "\n\t"
   3536             "movupd 0(%1), %%xmm2"    "\n\t"
   3537             "ptest 16(%1), %%xmm2"    "\n\t"
   3538             "pushfq"                  "\n\t"
   3539             "popq %0"                 "\n\t"
   3540             "addq $256, %%rsp"        "\n\t"
   3541             : /*out*/"=r"(flags) : /*in*/ "r"(&block[0]) :
   3542             "xmm2", "memory", "cc"
   3543          );
   3544          printf("r   ptest ");
   3545          showV128(&block[0]);
   3546          printf(" ");
   3547          showV128(&block[1]);
   3548          printf(" -> eflags %04x\n", (UInt)flags & 0x8D5);
   3549       }
   3550    }
   3551 }
   3552 
   3553 /* ------------ PBLENDVB ------------ */
   3554 
   3555 void do_PBLENDVB ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
   3556 {
   3557    if (mem) {
   3558       __asm__ __volatile__(
   3559          "movupd   (%2), %%xmm0"         "\n\t"
   3560          "movupd   (%1), %%xmm11"        "\n\t"
   3561          "pblendvb (%0), %%xmm11"        "\n\t"
   3562          "movupd   %%xmm11, (%1)"        "\n"
   3563          : /*OUT*/
   3564          : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
   3565          : /*TRASH*/ "xmm11","xmm0"
   3566       );
   3567    } else {
   3568       __asm__ __volatile__(
   3569          "movupd   (%2), %%xmm0"         "\n\t"
   3570          "movupd   (%1), %%xmm11"        "\n\t"
   3571          "movupd   (%0), %%xmm2"         "\n\t"
   3572          "pblendvb %%xmm2, %%xmm11"      "\n\t"
   3573          "movupd   %%xmm11, (%1)"        "\n"
   3574          : /*OUT*/
   3575          : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
   3576          : /*TRASH*/ "xmm11","xmm2","xmm0"
   3577       );
   3578    }
   3579 }
   3580 
   3581 void test_PBLENDVB ( void )
   3582 {
   3583    V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
   3584    Int i;
   3585    for (i = 0; i < 10; i++) {
   3586       randV128(&t_xmm0);
   3587       randV128(&t_src);
   3588       randV128(&t_dst);
   3589 
   3590       memcpy(&xmm0, &t_xmm0, 16);
   3591       memcpy(&src, &t_src, 16);
   3592       memcpy(&dst, &t_dst, 16);
   3593       do_PBLENDVB(False/*reg*/, &xmm0, &src, &dst);
   3594       printf("r pblendvb  ");
   3595       showV128(&t_xmm0);
   3596       printf(" ");
   3597       showV128(&t_src);
   3598       printf(" ");
   3599       showV128(&t_dst);
   3600       printf(" -> ");
   3601       showV128(&dst);
   3602       printf("\n");
   3603 
   3604       memcpy(&xmm0, &t_xmm0, 16);
   3605       memcpy(&src, &t_src, 16);
   3606       memcpy(&dst, &t_dst, 16);
   3607       do_PBLENDVB(True/*mem*/, &xmm0, &src, &dst);
   3608       printf("m pblendvb  ");
   3609       showV128(&t_xmm0);
   3610       printf(" ");
   3611       showV128(&t_src);
   3612       printf(" ");
   3613       showV128(&t_dst);
   3614       printf(" -> ");
   3615       showV128(&dst);
   3616       printf("\n");
   3617    }
   3618 }
   3619 
   3620 /* ------------ BLENDVPD ------------ */
   3621 
   3622 void do_BLENDVPD ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
   3623 {
   3624    if (mem) {
   3625       __asm__ __volatile__(
   3626          "movupd   (%2), %%xmm0"         "\n\t"
   3627          "movupd   (%1), %%xmm11"        "\n\t"
   3628          "blendvpd (%0), %%xmm11"        "\n\t"
   3629          "movupd   %%xmm11, (%1)"        "\n"
   3630          : /*OUT*/
   3631          : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
   3632          : /*TRASH*/ "xmm11","xmm0"
   3633       );
   3634    } else {
   3635       __asm__ __volatile__(
   3636          "movupd   (%2), %%xmm0"         "\n\t"
   3637          "movupd   (%1), %%xmm11"        "\n\t"
   3638          "movupd   (%0), %%xmm2"         "\n\t"
   3639          "blendvpd %%xmm2, %%xmm11"      "\n\t"
   3640          "movupd   %%xmm11, (%1)"        "\n"
   3641          : /*OUT*/
   3642          : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
   3643          : /*TRASH*/ "xmm11","xmm2","xmm0"
   3644       );
   3645    }
   3646 }
   3647 
   3648 void test_BLENDVPD ( void )
   3649 {
   3650    V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
   3651    Int i;
   3652    for (i = 0; i < 10; i++) {
   3653       randV128(&t_xmm0);
   3654       randV128(&t_src);
   3655       randV128(&t_dst);
   3656 
   3657       memcpy(&xmm0, &t_xmm0, 16);
   3658       memcpy(&src, &t_src, 16);
   3659       memcpy(&dst, &t_dst, 16);
   3660       do_BLENDVPD(False/*reg*/, &xmm0, &src, &dst);
   3661       printf("r blendvpd  ");
   3662       showV128(&t_xmm0);
   3663       printf(" ");
   3664       showV128(&t_src);
   3665       printf(" ");
   3666       showV128(&t_dst);
   3667       printf(" -> ");
   3668       showV128(&dst);
   3669       printf("\n");
   3670 
   3671       memcpy(&xmm0, &t_xmm0, 16);
   3672       memcpy(&src, &t_src, 16);
   3673       memcpy(&dst, &t_dst, 16);
   3674       do_BLENDVPD(True/*mem*/, &xmm0, &src, &dst);
   3675       printf("m blendvpd  ");
   3676       showV128(&t_xmm0);
   3677       printf(" ");
   3678       showV128(&t_src);
   3679       printf(" ");
   3680       showV128(&t_dst);
   3681       printf(" -> ");
   3682       showV128(&dst);
   3683       printf("\n");
   3684    }
   3685 }
   3686 
   3687 /* ------------ BLENDVPS ------------ */
   3688 
   3689 void do_BLENDVPS ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
   3690 {
   3691    if (mem) {
   3692       __asm__ __volatile__(
   3693          "movupd   (%2), %%xmm0"         "\n\t"
   3694          "movupd   (%1), %%xmm11"        "\n\t"
   3695          "blendvps (%0), %%xmm11"        "\n\t"
   3696          "movupd   %%xmm11, (%1)"        "\n"
   3697          : /*OUT*/
   3698          : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
   3699          : /*TRASH*/ "xmm11","xmm0"
   3700       );
   3701    } else {
   3702       __asm__ __volatile__(
   3703          "movupd   (%2), %%xmm0"         "\n\t"
   3704          "movupd   (%1), %%xmm11"        "\n\t"
   3705          "movupd   (%0), %%xmm2"         "\n\t"
   3706          "blendvps %%xmm2, %%xmm11"      "\n\t"
   3707          "movupd   %%xmm11, (%1)"        "\n"
   3708          : /*OUT*/
   3709          : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
   3710          : /*TRASH*/ "xmm11","xmm2","xmm0"
   3711       );
   3712    }
   3713 }
   3714 
   3715 void test_BLENDVPS ( void )
   3716 {
   3717    V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
   3718    Int i;
   3719    for (i = 0; i < 10; i++) {
   3720       randV128(&t_xmm0);
   3721       randV128(&t_src);
   3722       randV128(&t_dst);
   3723 
   3724       memcpy(&xmm0, &t_xmm0, 16);
   3725       memcpy(&src, &t_src, 16);
   3726       memcpy(&dst, &t_dst, 16);
   3727       do_BLENDVPS(False/*reg*/, &xmm0, &src, &dst);
   3728       printf("r blendvps  ");
   3729       showV128(&t_xmm0);
   3730       printf(" ");
   3731       showV128(&t_src);
   3732       printf(" ");
   3733       showV128(&t_dst);
   3734       printf(" -> ");
   3735       showV128(&dst);
   3736       printf("\n");
   3737 
   3738       memcpy(&xmm0, &t_xmm0, 16);
   3739       memcpy(&src, &t_src, 16);
   3740       memcpy(&dst, &t_dst, 16);
   3741       do_BLENDVPS(True/*mem*/, &xmm0, &src, &dst);
   3742       printf("m blendvps  ");
   3743       showV128(&t_xmm0);
   3744       printf(" ");
   3745       showV128(&t_src);
   3746       printf(" ");
   3747       showV128(&t_dst);
   3748       printf(" -> ");
   3749       showV128(&dst);
   3750       printf("\n");
   3751    }
   3752 }
   3753 
   3754 void test_MOVNTDQA ( void )
   3755 {
   3756    V128 src, dst;
   3757    Int i;
   3758    for (i = 0; i < 10; i++) {
   3759       randV128(&src);
   3760       /* make sure the load actually happens */
   3761       randV128(&dst);
   3762       DO_m_r("movntdqa", src, dst);
   3763    }
   3764 }
   3765 
   3766 /* ------------ main ------------ */
   3767 
   3768 int main ( int argc, char** argv )
   3769 {
   3770 #if 1
   3771    // ------ SSE 4.1 ------
   3772    test_BLENDPD();        // done Apr.01.2010
   3773    test_BLENDPS();        // done Apr.02.2010
   3774    test_PBLENDW();
   3775    test_PBLENDVB();
   3776    test_BLENDVPD();
   3777    test_BLENDVPS();
   3778    test_DPPD();           // done Apr.08.2010
   3779    test_DPPS();           // done Apr.09.2010
   3780    test_EXTRACTPS();
   3781    test_INSERTPS();       // done Apr.01.2010
   3782    test_PCMPEQQ();
   3783    test_PEXTRB();         // done Apr.15.2010
   3784    test_PEXTRD();         // done Apr.14.2010
   3785    test_PEXTRQ();         // done Apr.14.2010
   3786    test_PEXTRW();         // done Apr.14.2010
   3787    test_PINSRQ();         // done Apr.16.2010
   3788    test_PINSRD();         // todo
   3789    test_PINSRW(); /* Umm, this is SSE2, not SSE4.  Right? */
   3790    test_PINSRB();         // todo
   3791    test_PMAXSB();
   3792    test_PMAXSD();         // done Apr.09.2010
   3793    test_PMAXUD();         // done Apr.16.2010
   3794    test_PMAXUW();
   3795    test_PMINSB();
   3796    test_PMINSD();         // done Apr.09.2010
   3797    test_PMINUD();
   3798    test_PMINUW();
   3799    test_PMOVSXBW();       // done Apr.02.2010
   3800    test_PMOVSXBD();       // done Mar.30.2010
   3801    test_PMOVSXBQ();       // done Mar.30.2010
   3802    test_PMOVSXWD();       // done Mar.31.2010
   3803    test_PMOVSXWQ();       // done Mar.31.2010
   3804    test_PMOVSXDQ();       // done Mar.31.2010
   3805    test_PMOVZXBW();       // done Mar.28.2010
   3806    test_PMOVZXBD();       // done Mar.29.2010
   3807    test_PMOVZXBQ();       // done Mar.29.2010
   3808    test_PMOVZXWD();       // done Mar.28.2010
   3809    test_PMOVZXWQ();       // done Mar.29.2010
   3810    test_PMOVZXDQ();       // done Mar.29.2010
   3811    test_POPCNTW();
   3812    test_POPCNTL();
   3813    test_POPCNTQ();
   3814    test_PMULDQ();
   3815    test_PMULLD();
   3816    test_PTEST();
   3817    test_ROUNDSD_w_immediate_rounding();
   3818    test_ROUNDSS_w_immediate_rounding();
   3819    test_ROUNDPD_w_immediate_rounding();
   3820    test_ROUNDPS_w_immediate_rounding();
   3821    test_ROUNDSD_w_mxcsr_rounding();
   3822    test_ROUNDSS_w_mxcsr_rounding();
   3823    test_ROUNDPD_w_mxcsr_rounding();
   3824    test_ROUNDPS_w_mxcsr_rounding();
   3825    // ------ SSE 4.2 ------
   3826    test_PCMPGTQ();
   3827    // CRC32B,Q
   3828    test_PACKUSDW();
   3829    test_PHMINPOSUW();
   3830    test_MPSADBW();
   3831    test_MOVNTDQA(); /* not sure whether this is 4.1 or 4.2 */
   3832 #else
   3833    test_MPSADBW();
   3834 #endif
   3835 
   3836    return 0;
   3837 }
   3838 
   3839