Home | History | Annotate | Download | only in amd64
      1 
      2 /* A program to test SSE4.1/SSE4.2 instructions.
      3    Revisions:  Nov.208     - wrote this file
      4                Apr.10.2010 - added PEXTR* tests
      5                Apr.16.2010 - added PINS*  tests
      6 */
      7 
      8 /* HOW TO COMPILE:
      9    gcc -m64 -g -O -Wall -o sse4-64 sse4-64.c
     10 */
     11 
     12 #include <stdio.h>
     13 #include <stdlib.h>
     14 #include <assert.h>
     15 //#include "tests/malloc.h" // reenable when reintegrated
     16 #include <string.h>
     17 
     18 
     19 
     20 // rmme when reintegrated
     21 // Allocates a 16-aligned block.  Asserts if the allocation fails.
     22 #ifdef VGO_darwin
     23 #include <stdlib.h>
     24 #else
     25 #include <malloc.h>
     26 #endif
     27 __attribute__((unused))
     28 static void* memalign16(size_t szB)
     29 {
     30    void* x;
     31 #if defined(VGO_darwin)
     32    // Darwin lacks memalign, but its malloc is always 16-aligned anyway.
     33    x = malloc(szB);
     34 #else
     35    x = memalign(16, szB);
     36 #endif
     37    assert(x);
     38    assert(0 == ((16-1) & (unsigned long)x));
     39    return x;
     40 }
     41 
     42 
     43 
/* Basic types used throughout this test program. */

/* A 128-bit value held as 16 raw bytes.  Being an array type, a V128
   decays to a pointer to its first byte when used as an expression. */
typedef  unsigned char           V128[16];
typedef  unsigned int            UInt;
typedef  signed int              Int;
typedef  unsigned char           UChar;
typedef  unsigned long long int  ULong;

/* Minimal boolean: a single byte holding 0 or 1. */
typedef  unsigned char           Bool;
#define False ((Bool)0)
#define True  ((Bool)1)
     53 
     54 
/* Record of one two-operand vector test: both inputs plus the result.
   The DO_* macros in this file fill arg1 from the source operand, arg2
   from the destination operand's value before the instruction, and res
   from its value afterwards, then hand the record to showIAA/showAA. */
typedef
   struct {
      V128 arg1;
      V128 arg2;
      V128 res;
   }
   RRArgs;
     62 
/* One 128-bit argument plus one 128-bit result.
   NOTE(review): no user of this type is visible in this part of the
   file; presumably it records single-operand tests -- confirm. */
typedef
   struct {
      V128 arg1;
      V128 res;
   }
   RMArgs;
     69 
     70 static void do64HLtoV128 ( /*OUT*/V128* res, ULong wHi, ULong wLo )
     71 {
     72    // try to sidestep strict-aliasing snafus by memcpying explicitly
     73    UChar* p = (UChar*)res;
     74    memcpy(&p[8], (UChar*)&wHi, 8);
     75    memcpy(&p[0], (UChar*)&wLo, 8);
     76 }
     77 
     78 static UChar randUChar ( void )
     79 {
     80    static UInt seed = 80021;
     81    seed = 1103515245 * seed + 12345;
     82    return (seed >> 17) & 0xFF;
     83 }
     84 
     85 static ULong randULong ( void )
     86 {
     87    Int i;
     88    ULong r = 0;
     89    for (i = 0; i < 8; i++) {
     90       r = (r << 8) | (ULong)(0xFF & randUChar());
     91    }
     92    return r;
     93 }
     94 
     95 static void randV128 ( V128* v )
     96 {
     97    Int i;
     98    for (i = 0; i < 16; i++)
     99       (*v)[i] = randUChar();
    100 }
    101 
    102 static void showV128 ( V128* v )
    103 {
    104    Int i;
    105    for (i = 15; i >= 0; i--)
    106       printf("%02x", (Int)(*v)[i]);
    107 }
    108 
    109 static void showMaskedV128 ( V128* v, V128* mask )
    110 {
    111    Int i;
    112    for (i = 15; i >= 0; i--)
    113       printf("%02x", (Int)( ((*v)[i]) & ((*mask)[i]) ));
    114 }
    115 
    116 static void showIGVV( char* rOrM, char* op, Int imm,
    117                       ULong src64, V128* dst, V128* res )
    118 {
    119    printf("%s %10s $%d ", rOrM, op, imm);
    120    printf("%016llx", src64);
    121    printf(" ");
    122    showV128(dst);
    123    printf(" ");
    124    showV128(res);
    125    printf("\n");
    126 }
    127 
    128 static void showIAG ( char* rOrM, char* op, Int imm,
    129                       V128* argL, ULong argR, ULong res )
    130 {
    131    printf("%s %10s $%d ", rOrM, op, imm);
    132    showV128(argL);
    133    printf(" ");
    134    printf("%016llx", argR);
    135    printf(" ");
    136    printf("%016llx", res);
    137    printf("\n");
    138 }
    139 
    140 static void showIAA ( char* rOrM, char* op, Int imm, RRArgs* rra, V128* rmask )
    141 {
    142    printf("%s %10s $%d ", rOrM, op, imm);
    143    showV128(&rra->arg1);
    144    printf(" ");
    145    showV128(&rra->arg2);
    146    printf(" ");
    147    showMaskedV128(&rra->res, rmask);
    148    printf("\n");
    149 }
    150 
    151 static void showAA ( char* rOrM, char* op, RRArgs* rra, V128* rmask )
    152 {
    153    printf("%s %10s ", rOrM, op);
    154    showV128(&rra->arg1);
    155    printf(" ");
    156    showV128(&rra->arg2);
    157    printf(" ");
    158    showMaskedV128(&rra->res, rmask);
    159    printf("\n");
    160 }
    161 
    162 /* Note: these are little endian.  Hence first byte is the least
    163    significant byte of lane zero. */
    164 
/* Mask for insns where all result bits are non-approximated: every bit
   is set, so showMaskedV128 prints the result unchanged. */
static V128 AllMask  = { 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
                         0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };
    168 
/* Mask for insns which produce approximated vector short results.
   Byte 0 is the least significant, so in each of the four 32-bit lanes
   only bits 31..23 are kept and the low 23 bits are cleared. */
__attribute__((unused))
static V128 ApproxPS = { 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF,
                         0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF };
    173 
/* Mask for insns which produce approximated scalar short results.
   Byte 0 is the least significant, so in the lowest 32-bit lane only
   bits 31..23 are kept; the upper 96 bits pass through unmasked. */
__attribute__((unused))
static V128 ApproxSS = { 0x00,0x00,0x80,0xFF, 0xFF,0xFF,0xFF,0xFF,
                         0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };
    178 
/* 0x55 in every byte.  Copied into the destination vector before the
   instruction runs in DO_imm_rscalar_to_r and DO_imm_mscalar_to_r, so
   bytes the instruction leaves alone remain recognisable. */
static V128 fives    = { 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55,
                         0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55 };
    181 
/* All-zero vector.  Used by DO_imm_rscalar_to_r and DO_imm_mscalar_to_r
   to pre-clear the buffer that receives the result. */
static V128 zeroes   = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
                         0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
    184 
    185 double mkPosInf ( void ) { return 1.0 / 0.0; }
    186 double mkNegInf ( void ) { return -mkPosInf(); }
    187 double mkPosNan ( void ) { return 0.0 / 0.0; }
    188 double mkNegNan ( void ) { return -mkPosNan(); }
    189 
    190 __attribute__((noinline))
    191 UInt get_mxcsr ( void )
    192 {
    193    ULong w64;
    194    __asm__ __volatile__(
    195       "subq    $8, %%rsp"    "\n\t"
    196       "stmxcsr (%%rsp)"      "\n\t"
    197       "movq    (%%rsp), %0"  "\n"
    198       "addq    $8, %%rsp"
    199       : /*OUT*/"=r"(w64) : /*IN*/ : "memory","cc"
    200    );
    201    if (0) printf("get %08x\n", (UInt)w64);
    202    return (UInt)w64;
    203 }
    204 
    205 __attribute__((noinline))
    206 void set_mxcsr ( UInt w32 )
    207 {
    208    if (0) printf("set %08x\n", w32);
    209    ULong w64 = (ULong)w32;
    210    __asm__ __volatile__(
    211       "subq    $8, %%rsp"    "\n\t"
    212       "movq    %0, (%%rsp)"  "\n\t"
    213       "ldmxcsr (%%rsp)"      "\n\t"
    214       "addq    $8, %%rsp"
    215       : /*OUT*/ : /*IN*/"r"(w64) : "memory",/*"mxcsr",*/"cc"
    216    );
    217 }
    218 
    219 UInt get_sse_roundingmode ( void )
    220 {
    221    UInt w = get_mxcsr();
    222    return (w >> 13) & 3;
    223 }
    224 
    225 void set_sse_roundingmode ( UInt m )
    226 {
    227    UInt w;
    228    assert(0 == (m & ~3));
    229    w = get_mxcsr();
    230    w &= ~(3 << 13);
    231    w |= (m << 13);
    232    set_mxcsr(w);
    233 }
    234 
    235 
/* Run "<_opname> $<_imm>, %xmm2, %xmm11" (register source) and print
   the outcome.

   _opname  string literal holding the mnemonic; it is pasted into the
            asm text by string concatenation.
   _imm     immediate operand.  It is stringified with '#', so it must
            be spelled as something the assembler accepts after '$'.
   _src     V128 lvalue loaded into xmm2 (the source operand).
   _dst     V128 lvalue loaded into xmm11 (the destination operand).
            It is only read; the result goes to a local temporary.

   After the instruction, xmm11 is stored to _tmp, and source,
   original destination and result are copied into an RRArgs and
   printed with showIAA using AllMask.  The macro expands to a braced
   block that declares locals named _tmp and rra. */
#define DO_imm_r_r(_opname, _imm, _src, _dst)  \
   {  \
      V128 _tmp;  \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"  \
         "movupd (%1), %%xmm11"   "\n\t"  \
         _opname " $" #_imm ", %%xmm2, %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%2)" "\n"  \
         : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp))  \
         : "cc", "memory", "xmm2", "xmm11"                            \
      );  \
      RRArgs rra;  \
      memcpy(&rra.arg1, &(_src), sizeof(V128));  \
      memcpy(&rra.arg2, &(_dst), sizeof(V128));  \
      memcpy(&rra.res,  &(_tmp), sizeof(V128));  \
      showIAA("r", (_opname), (_imm), &rra, &AllMask);  \
   }
    253 
/* Memory-source counterpart of DO_imm_r_r: run
   "<_opname> $<_imm>, (mem), %xmm11" and print the outcome.

   _src is first copied into a 16-byte-aligned heap block obtained from
   memalign16, and the instruction reads its source operand from there
   (presumably because the instructions under test require an aligned
   memory operand -- confirm).  _dst is loaded into xmm11; afterwards
   xmm11 is stored to _tmp.  Source, original destination and result
   are printed with showIAA (tagged "m") using AllMask, and the heap
   block is freed.

   Parameter requirements are the same as for DO_imm_r_r.  The macro
   expands to a braced block that declares locals named _tmp, _srcM
   and rra. */
#define DO_imm_m_r(_opname, _imm, _src, _dst)  \
   {  \
      V128 _tmp;  \
      V128* _srcM = memalign16(sizeof(V128));  \
      memcpy(_srcM, &(_src), sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%1), %%xmm11"   "\n\t"  \
         _opname " $" #_imm ", (%0), %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%2)" "\n"  \
         : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp))  \
         : "cc", "memory", "xmm11"  \
      );  \
      RRArgs rra;  \
      memcpy(&rra.arg1, &(_src), sizeof(V128));  \
      memcpy(&rra.arg2, &(_dst), sizeof(V128));  \
      memcpy(&rra.res,  &(_tmp), sizeof(V128));  \
      showIAA("m", (_opname), (_imm), &rra, &AllMask);  \
      free(_srcM);  \
   }
    273 
/* Test an instruction-with-immediate in both operand forms: first with
   a register source (DO_imm_r_r), then with a memory source
   (DO_imm_m_r).  Expands to two consecutive braced blocks, so it prints
   two result lines. */
#define DO_imm_mandr_r(_opname, _imm, _src, _dst)  \
      DO_imm_r_r( _opname, _imm, _src, _dst ) \
      DO_imm_m_r( _opname, _imm, _src, _dst )
    277 
    278 
    279 
    280 
    281 
/* Run "<_opname> %xmm2, %xmm11" (register source, no immediate) and
   print the outcome.

   _opname  string literal holding the mnemonic; it is pasted into the
            asm text by string concatenation.
   _src     V128 lvalue loaded into xmm2 (the source operand).
   _dst     V128 lvalue loaded into xmm11 (the destination operand).
            It is only read; the result goes to a local temporary.

   After the instruction, xmm11 is stored to _tmp, and source,
   original destination and result are copied into an RRArgs and
   printed with showAA using AllMask.  The macro expands to a braced
   block that declares locals named _tmp and rra. */
#define DO_r_r(_opname, _src, _dst)  \
   {  \
      V128 _tmp;  \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"  \
         "movupd (%1), %%xmm11"   "\n\t"  \
         _opname " %%xmm2, %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%2)" "\n"  \
         : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp))  \
         : "cc", "memory", "xmm2", "xmm11"  \
      );  \
      RRArgs rra;  \
      memcpy(&rra.arg1, &(_src), sizeof(V128));  \
      memcpy(&rra.arg2, &(_dst), sizeof(V128));  \
      memcpy(&rra.res,  &(_tmp), sizeof(V128));  \
      showAA("r", (_opname), &rra, &AllMask);  \
   }
    299 
/* Memory-source counterpart of DO_r_r: run "<_opname> (mem), %xmm11"
   and print the outcome.

   _src is first copied into a 16-byte-aligned heap block obtained from
   memalign16, and the instruction reads its source operand from there
   (presumably because the instructions under test require an aligned
   memory operand -- confirm).  _dst is loaded into xmm11; afterwards
   xmm11 is stored to _tmp.  Source, original destination and result
   are printed with showAA (tagged "m") using AllMask, and the heap
   block is freed.

   Parameter requirements are the same as for DO_r_r.  The macro
   expands to a braced block that declares locals named _tmp, _srcM
   and rra. */
#define DO_m_r(_opname, _src, _dst)  \
   {  \
      V128 _tmp;  \
      V128* _srcM = memalign16(sizeof(V128));  \
      memcpy(_srcM, &(_src), sizeof(V128));  \
      __asm__ __volatile__(  \
         "movupd (%1), %%xmm11"   "\n\t"  \
         _opname " (%0), %%xmm11"  "\n\t"  \
         "movupd %%xmm11, (%2)" "\n"  \
         : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp))  \
         : "cc", "memory", "xmm11"  \
      );  \
      RRArgs rra;  \
      memcpy(&rra.arg1, &(_src), sizeof(V128));  \
      memcpy(&rra.arg2, &(_dst), sizeof(V128));  \
      memcpy(&rra.res,  &(_tmp), sizeof(V128));  \
      showAA("m", (_opname), &rra, &AllMask);  \
      free(_srcM);  \
   }
    319 
/* Test an immediate-less instruction in both operand forms: first with
   a register source (DO_r_r), then with a memory source (DO_m_r).
   Expands to two consecutive braced blocks, so it prints two result
   lines. */
#define DO_mandr_r(_opname, _src, _dst)  \
      DO_r_r(_opname, _src, _dst) \
      DO_m_r(_opname, _src, _dst)
    323 
    324 
    325 
    326 
/* Run "<_opname> $<_imm>, %xmm2, %r11<_dstsuffix>": vector register
   source, integer register destination.

   _opname     string literal holding the mnemonic.
   _imm        immediate; stringified with '#' into the asm text.
   _src        V128 lvalue loaded into xmm2.
   _dstsuffix  string literal appended directly after "%r11" in the asm
               text (presumably to pick a narrower view of r11 such as
               r11d -- confirm against the call sites).

   r11 is preloaded with 0x5555555555555555 (_scbefore) so that any
   destination bits the instruction leaves untouched stay visible.  The
   full 64-bit r11 is then stored to _scafter, overwriting its
   0xAAAAAAAAAAAAAAAA initial value.  Source, before and after values
   are printed with showIAG, tagged "r". */
#define DO_imm_r_to_rscalar(_opname, _imm, _src, _dstsuffix)       \
   {  \
      ULong _scbefore = 0x5555555555555555ULL;  \
      ULong _scafter  = 0xAAAAAAAAAAAAAAAAULL; \
      /* This assumes that gcc won't make any of %0, %1, %2 */ \
      /* be r11.  That should be ensured (cough, cough) */ \
      /* by declaring r11 to be clobbered. */ \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"  \
         "movq   (%1), %%r11"   "\n\t"  \
         _opname " $" #_imm ", %%xmm2, %%r11" _dstsuffix  "\n\t"  \
         "movq   %%r11, (%2)" "\n"  \
         : /*out*/ \
         : /*in*/ "r"(&(_src)), "r"(&(_scbefore)), "r"(&(_scafter))  \
         : "cc", "memory", "xmm2", "r11"  \
      );  \
      showIAG("r", (_opname), (_imm), &(_src), (_scbefore), (_scafter));  \
   }
    345 
/* Run "<_opname> $<_imm>, %xmm2, (mem)": vector register source,
   memory destination.

   _opname  string literal holding the mnemonic.
   _imm     immediate; stringified with '#' into the asm text.
   _src     V128 lvalue loaded into xmm2.

   The destination is the 8-byte local _scafter, which starts out equal
   to _scbefore (0x5555555555555555); bytes the instruction does not
   write therefore keep the 0x55 pattern.  Source, before and after
   values are printed with showIAG, tagged "m". */
#define DO_imm_r_to_mscalar(_opname, _imm, _src)   \
   {  \
      ULong _scbefore = 0x5555555555555555ULL;  \
      ULong _scafter = _scbefore; \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"  \
         _opname " $" #_imm ", %%xmm2, (%1)"  "\n\t"  \
         : /*out*/ \
         : /*in*/ "r"(&(_src)), "r"(&(_scafter))  \
         : "cc", "memory", "xmm2"  \
      );  \
      showIAG("m", (_opname), (_imm), &(_src), (_scbefore), (_scafter));  \
   }
    359 
/* Test a vector-to-scalar instruction with both destination forms:
   integer register (DO_imm_r_to_rscalar, which receives _dstsuffix)
   and memory (DO_imm_r_to_mscalar).  Expands to two consecutive braced
   blocks, so it prints two result lines. */
#define DO_imm_r_to_mandrscalar(_opname, _imm, _src, _dstsuffix)   \
      DO_imm_r_to_rscalar( _opname, _imm, _src, _dstsuffix )       \
      DO_imm_r_to_mscalar( _opname, _imm, _src )
    363 
    364 
    365 
    366 
    367 
    368 
    369 
    370 
/* Run "<_opname> $<_imm>, %r11<_srcsuffix>, %xmm2": integer register
   source, vector register destination.

   _opname     string literal holding the mnemonic.
   _imm        immediate; stringified with '#' into the asm text.
   _src        scalar expression, converted to ULong and loaded into r11.
   _srcsuffix  string literal appended directly after "%r11" in the asm
               text (presumably to pick a narrower view of r11 such as
               r11d -- confirm against the call sites).

   xmm2 is preloaded from a copy of 'fives' (0x55 in every byte), and
   the result buffer is pre-cleared from 'zeroes' before xmm2 is stored
   into it.  The scalar source, the initial vector and the result are
   printed with showIGVV, tagged "r".

   NOTE(review): the block declares locals named dstv, res and src64
   with no leading underscore; a _src expression that mentions any of
   those names would refer to these locals instead of the caller's. */
#define DO_imm_rscalar_to_r(_opname, _imm, _src, _srcsuffix)       \
   {  \
      V128  dstv;         \
      V128  res;          \
      ULong src64 = (ULong)(_src); \
      memcpy(dstv, fives, sizeof(dstv)); \
      memcpy(res,  zeroes, sizeof(res)); \
      /* This assumes that gcc won't make any of %0, %1, %2 */ \
      /* be r11.  That should be ensured (cough, cough) */ \
      /* by declaring r11 to be clobbered. */ \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"   /*dstv*/   \
         "movq   (%1), %%r11"     "\n\t"   /*src64*/  \
         _opname " $" #_imm ", %%r11" _srcsuffix ", %%xmm2"   "\n\t"  \
         "movupd  %%xmm2, (%2)" "\n" /*res*/                          \
         : /*out*/ \
         : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res)  \
         : "cc", "memory", "xmm2", "r11"  \
      );  \
      showIGVV("r", (_opname), (_imm), src64, &dstv, &res); \
   }
/* Run "<_opname> $<_imm>, (mem), %xmm2": memory source, vector
   register destination.

   _opname  string literal holding the mnemonic.
   _imm     immediate; stringified with '#' into the asm text.
   _src     scalar expression, converted to ULong and stored in the
            8-byte local src64, whose address is the memory operand.

   xmm2 is preloaded from a copy of 'fives' (0x55 in every byte), and
   the result buffer is pre-cleared from 'zeroes' before xmm2 is stored
   into it.  The scalar source, the initial vector and the result are
   printed with showIGVV, tagged "m".

   NOTE(review): the block declares locals named dstv, res and src64
   with no leading underscore; a _src expression that mentions any of
   those names would refer to these locals instead of the caller's. */
#define DO_imm_mscalar_to_r(_opname, _imm, _src)       \
   {  \
      V128  dstv;         \
      V128  res;          \
      ULong src64 = (ULong)(_src); \
      memcpy(dstv, fives, sizeof(dstv)); \
      memcpy(res,  zeroes, sizeof(res)); \
      __asm__ __volatile__(  \
         "movupd (%0), %%xmm2"    "\n\t"   /*dstv*/   \
         _opname " $" #_imm ", (%1), %%xmm2"   "\n\t"  \
         "movupd  %%xmm2, (%2)" "\n" /*res*/                          \
         : /*out*/ \
         : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res)  \
         : "cc", "memory", "xmm2"  \
      );  \
      showIGVV("m", (_opname), (_imm), src64, &dstv, &res); \
   }
    409 
/* Test a scalar-to-vector instruction with both source forms: integer
   register (DO_imm_rscalar_to_r) and memory (DO_imm_mscalar_to_r).
   Expands to two consecutive braced blocks, so it prints two result
   lines.

   NOTE(review): the fourth parameter is named _dstsuffix here but is
   forwarded as DO_imm_rscalar_to_r's _srcsuffix, i.e. it decorates the
   source register r11, not a destination. */
#define DO_imm_mandrscalar_to_r(_opname, _imm, _src, _dstsuffix)   \
      DO_imm_rscalar_to_r( _opname, _imm, _src, _dstsuffix )       \
      DO_imm_mscalar_to_r( _opname, _imm, _src )
    413 
    414 
    415 
    416 
    417 
    418 void test_BLENDPD ( void )
    419 {
    420    V128 src, dst;
    421    Int i;
    422    for (i = 0; i < 10; i++) {
    423       randV128(&src);
    424       randV128(&dst);
    425       DO_imm_mandr_r("blendpd", 0, src, dst);
    426       DO_imm_mandr_r("blendpd", 1, src, dst);
    427       DO_imm_mandr_r("blendpd", 2, src, dst);
    428       DO_imm_mandr_r("blendpd", 3, src, dst);
    429    }
    430 }
    431 
    432 void test_BLENDPS ( void )
    433 {
    434    V128 src, dst;
    435    Int i;
    436    for (i = 0; i < 10; i++) {
    437       randV128(&src);
    438       randV128(&dst);
    439       DO_imm_mandr_r("blendps", 0, src, dst);
    440       DO_imm_mandr_r("blendps", 1, src, dst);
    441       DO_imm_mandr_r("blendps", 2, src, dst);
    442       DO_imm_mandr_r("blendps", 3, src, dst);
    443       DO_imm_mandr_r("blendps", 4, src, dst);
    444       DO_imm_mandr_r("blendps", 5, src, dst);
    445       DO_imm_mandr_r("blendps", 6, src, dst);
    446       DO_imm_mandr_r("blendps", 7, src, dst);
    447       DO_imm_mandr_r("blendps", 8, src, dst);
    448       DO_imm_mandr_r("blendps", 9, src, dst);
    449       DO_imm_mandr_r("blendps", 10, src, dst);
    450       DO_imm_mandr_r("blendps", 11, src, dst);
    451       DO_imm_mandr_r("blendps", 12, src, dst);
    452       DO_imm_mandr_r("blendps", 13, src, dst);
    453       DO_imm_mandr_r("blendps", 14, src, dst);
    454       DO_imm_mandr_r("blendps", 15, src, dst);
    455    }
    456 }
    457 
    458 void test_DPPD ( void )
    459 {
    460    V128 src, dst;
    461    {
    462       *(double*)(&src[0]) =  1.2345;
    463       *(double*)(&src[8]) = -6.78910;
    464       *(double*)(&dst[0]) = -11.121314;
    465       *(double*)(&dst[8]) =  15.161718;
    466       DO_imm_mandr_r("dppd", 0, src, dst);
    467       DO_imm_mandr_r("dppd", 1, src, dst);
    468       DO_imm_mandr_r("dppd", 2, src, dst);
    469       DO_imm_mandr_r("dppd", 3, src, dst);
    470       DO_imm_mandr_r("dppd", 4, src, dst);
    471       DO_imm_mandr_r("dppd", 5, src, dst);
    472       DO_imm_mandr_r("dppd", 6, src, dst);
    473       DO_imm_mandr_r("dppd", 7, src, dst);
    474       DO_imm_mandr_r("dppd", 8, src, dst);
    475       DO_imm_mandr_r("dppd", 9, src, dst);
    476       DO_imm_mandr_r("dppd", 10, src, dst);
    477       DO_imm_mandr_r("dppd", 11, src, dst);
    478       DO_imm_mandr_r("dppd", 12, src, dst);
    479       DO_imm_mandr_r("dppd", 13, src, dst);
    480       DO_imm_mandr_r("dppd", 14, src, dst);
    481       DO_imm_mandr_r("dppd", 15, src, dst);
    482       DO_imm_mandr_r("dppd", 16, src, dst);
    483       DO_imm_mandr_r("dppd", 17, src, dst);
    484       DO_imm_mandr_r("dppd", 18, src, dst);
    485       DO_imm_mandr_r("dppd", 19, src, dst);
    486       DO_imm_mandr_r("dppd", 20, src, dst);
    487       DO_imm_mandr_r("dppd", 21, src, dst);
    488       DO_imm_mandr_r("dppd", 22, src, dst);
    489       DO_imm_mandr_r("dppd", 23, src, dst);
    490       DO_imm_mandr_r("dppd", 24, src, dst);
    491       DO_imm_mandr_r("dppd", 25, src, dst);
    492       DO_imm_mandr_r("dppd", 26, src, dst);
    493       DO_imm_mandr_r("dppd", 27, src, dst);
    494       DO_imm_mandr_r("dppd", 28, src, dst);
    495       DO_imm_mandr_r("dppd", 29, src, dst);
    496       DO_imm_mandr_r("dppd", 30, src, dst);
    497       DO_imm_mandr_r("dppd", 31, src, dst);
    498       DO_imm_mandr_r("dppd", 32, src, dst);
    499       DO_imm_mandr_r("dppd", 33, src, dst);
    500       DO_imm_mandr_r("dppd", 34, src, dst);
    501       DO_imm_mandr_r("dppd", 35, src, dst);
    502       DO_imm_mandr_r("dppd", 36, src, dst);
    503       DO_imm_mandr_r("dppd", 37, src, dst);
    504       DO_imm_mandr_r("dppd", 38, src, dst);
    505       DO_imm_mandr_r("dppd", 39, src, dst);
    506       DO_imm_mandr_r("dppd", 40, src, dst);
    507       DO_imm_mandr_r("dppd", 41, src, dst);
    508       DO_imm_mandr_r("dppd", 42, src, dst);
    509       DO_imm_mandr_r("dppd", 43, src, dst);
    510       DO_imm_mandr_r("dppd", 44, src, dst);
    511       DO_imm_mandr_r("dppd", 45, src, dst);
    512       DO_imm_mandr_r("dppd", 46, src, dst);
    513       DO_imm_mandr_r("dppd", 47, src, dst);
    514       DO_imm_mandr_r("dppd", 48, src, dst);
    515       DO_imm_mandr_r("dppd", 49, src, dst);
    516       DO_imm_mandr_r("dppd", 50, src, dst);
    517       DO_imm_mandr_r("dppd", 51, src, dst);
    518       DO_imm_mandr_r("dppd", 52, src, dst);
    519       DO_imm_mandr_r("dppd", 53, src, dst);
    520       DO_imm_mandr_r("dppd", 54, src, dst);
    521       DO_imm_mandr_r("dppd", 55, src, dst);
    522       DO_imm_mandr_r("dppd", 56, src, dst);
    523       DO_imm_mandr_r("dppd", 57, src, dst);
    524       DO_imm_mandr_r("dppd", 58, src, dst);
    525       DO_imm_mandr_r("dppd", 59, src, dst);
    526       DO_imm_mandr_r("dppd", 60, src, dst);
    527       DO_imm_mandr_r("dppd", 61, src, dst);
    528       DO_imm_mandr_r("dppd", 62, src, dst);
    529       DO_imm_mandr_r("dppd", 63, src, dst);
    530       DO_imm_mandr_r("dppd", 64, src, dst);
    531       DO_imm_mandr_r("dppd", 65, src, dst);
    532       DO_imm_mandr_r("dppd", 66, src, dst);
    533       DO_imm_mandr_r("dppd", 67, src, dst);
    534       DO_imm_mandr_r("dppd", 68, src, dst);
    535       DO_imm_mandr_r("dppd", 69, src, dst);
    536       DO_imm_mandr_r("dppd", 70, src, dst);
    537       DO_imm_mandr_r("dppd", 71, src, dst);
    538       DO_imm_mandr_r("dppd", 72, src, dst);
    539       DO_imm_mandr_r("dppd", 73, src, dst);
    540       DO_imm_mandr_r("dppd", 74, src, dst);
    541       DO_imm_mandr_r("dppd", 75, src, dst);
    542       DO_imm_mandr_r("dppd", 76, src, dst);
    543       DO_imm_mandr_r("dppd", 77, src, dst);
    544       DO_imm_mandr_r("dppd", 78, src, dst);
    545       DO_imm_mandr_r("dppd", 79, src, dst);
    546       DO_imm_mandr_r("dppd", 80, src, dst);
    547       DO_imm_mandr_r("dppd", 81, src, dst);
    548       DO_imm_mandr_r("dppd", 82, src, dst);
    549       DO_imm_mandr_r("dppd", 83, src, dst);
    550       DO_imm_mandr_r("dppd", 84, src, dst);
    551       DO_imm_mandr_r("dppd", 85, src, dst);
    552       DO_imm_mandr_r("dppd", 86, src, dst);
    553       DO_imm_mandr_r("dppd", 87, src, dst);
    554       DO_imm_mandr_r("dppd", 88, src, dst);
    555       DO_imm_mandr_r("dppd", 89, src, dst);
    556       DO_imm_mandr_r("dppd", 90, src, dst);
    557       DO_imm_mandr_r("dppd", 91, src, dst);
    558       DO_imm_mandr_r("dppd", 92, src, dst);
    559       DO_imm_mandr_r("dppd", 93, src, dst);
    560       DO_imm_mandr_r("dppd", 94, src, dst);
    561       DO_imm_mandr_r("dppd", 95, src, dst);
    562       DO_imm_mandr_r("dppd", 96, src, dst);
    563       DO_imm_mandr_r("dppd", 97, src, dst);
    564       DO_imm_mandr_r("dppd", 98, src, dst);
    565       DO_imm_mandr_r("dppd", 99, src, dst);
    566       DO_imm_mandr_r("dppd", 100, src, dst);
    567       DO_imm_mandr_r("dppd", 101, src, dst);
    568       DO_imm_mandr_r("dppd", 102, src, dst);
    569       DO_imm_mandr_r("dppd", 103, src, dst);
    570       DO_imm_mandr_r("dppd", 104, src, dst);
    571       DO_imm_mandr_r("dppd", 105, src, dst);
    572       DO_imm_mandr_r("dppd", 106, src, dst);
    573       DO_imm_mandr_r("dppd", 107, src, dst);
    574       DO_imm_mandr_r("dppd", 108, src, dst);
    575       DO_imm_mandr_r("dppd", 109, src, dst);
    576       DO_imm_mandr_r("dppd", 110, src, dst);
    577       DO_imm_mandr_r("dppd", 111, src, dst);
    578       DO_imm_mandr_r("dppd", 112, src, dst);
    579       DO_imm_mandr_r("dppd", 113, src, dst);
    580       DO_imm_mandr_r("dppd", 114, src, dst);
    581       DO_imm_mandr_r("dppd", 115, src, dst);
    582       DO_imm_mandr_r("dppd", 116, src, dst);
    583       DO_imm_mandr_r("dppd", 117, src, dst);
    584       DO_imm_mandr_r("dppd", 118, src, dst);
    585       DO_imm_mandr_r("dppd", 119, src, dst);
    586       DO_imm_mandr_r("dppd", 120, src, dst);
    587       DO_imm_mandr_r("dppd", 121, src, dst);
    588       DO_imm_mandr_r("dppd", 122, src, dst);
    589       DO_imm_mandr_r("dppd", 123, src, dst);
    590       DO_imm_mandr_r("dppd", 124, src, dst);
    591       DO_imm_mandr_r("dppd", 125, src, dst);
    592       DO_imm_mandr_r("dppd", 126, src, dst);
    593       DO_imm_mandr_r("dppd", 127, src, dst);
    594       DO_imm_mandr_r("dppd", 128, src, dst);
    595       DO_imm_mandr_r("dppd", 129, src, dst);
    596       DO_imm_mandr_r("dppd", 130, src, dst);
    597       DO_imm_mandr_r("dppd", 131, src, dst);
    598       DO_imm_mandr_r("dppd", 132, src, dst);
    599       DO_imm_mandr_r("dppd", 133, src, dst);
    600       DO_imm_mandr_r("dppd", 134, src, dst);
    601       DO_imm_mandr_r("dppd", 135, src, dst);
    602       DO_imm_mandr_r("dppd", 136, src, dst);
    603       DO_imm_mandr_r("dppd", 137, src, dst);
    604       DO_imm_mandr_r("dppd", 138, src, dst);
    605       DO_imm_mandr_r("dppd", 139, src, dst);
    606       DO_imm_mandr_r("dppd", 140, src, dst);
    607       DO_imm_mandr_r("dppd", 141, src, dst);
    608       DO_imm_mandr_r("dppd", 142, src, dst);
    609       DO_imm_mandr_r("dppd", 143, src, dst);
    610       DO_imm_mandr_r("dppd", 144, src, dst);
    611       DO_imm_mandr_r("dppd", 145, src, dst);
    612       DO_imm_mandr_r("dppd", 146, src, dst);
    613       DO_imm_mandr_r("dppd", 147, src, dst);
    614       DO_imm_mandr_r("dppd", 148, src, dst);
    615       DO_imm_mandr_r("dppd", 149, src, dst);
    616       DO_imm_mandr_r("dppd", 150, src, dst);
    617       DO_imm_mandr_r("dppd", 151, src, dst);
    618       DO_imm_mandr_r("dppd", 152, src, dst);
    619       DO_imm_mandr_r("dppd", 153, src, dst);
    620       DO_imm_mandr_r("dppd", 154, src, dst);
    621       DO_imm_mandr_r("dppd", 155, src, dst);
    622       DO_imm_mandr_r("dppd", 156, src, dst);
    623       DO_imm_mandr_r("dppd", 157, src, dst);
    624       DO_imm_mandr_r("dppd", 158, src, dst);
    625       DO_imm_mandr_r("dppd", 159, src, dst);
    626       DO_imm_mandr_r("dppd", 160, src, dst);
    627       DO_imm_mandr_r("dppd", 161, src, dst);
    628       DO_imm_mandr_r("dppd", 162, src, dst);
    629       DO_imm_mandr_r("dppd", 163, src, dst);
    630       DO_imm_mandr_r("dppd", 164, src, dst);
    631       DO_imm_mandr_r("dppd", 165, src, dst);
    632       DO_imm_mandr_r("dppd", 166, src, dst);
    633       DO_imm_mandr_r("dppd", 167, src, dst);
    634       DO_imm_mandr_r("dppd", 168, src, dst);
    635       DO_imm_mandr_r("dppd", 169, src, dst);
    636       DO_imm_mandr_r("dppd", 170, src, dst);
    637       DO_imm_mandr_r("dppd", 171, src, dst);
    638       DO_imm_mandr_r("dppd", 172, src, dst);
    639       DO_imm_mandr_r("dppd", 173, src, dst);
    640       DO_imm_mandr_r("dppd", 174, src, dst);
    641       DO_imm_mandr_r("dppd", 175, src, dst);
    642       DO_imm_mandr_r("dppd", 176, src, dst);
    643       DO_imm_mandr_r("dppd", 177, src, dst);
    644       DO_imm_mandr_r("dppd", 178, src, dst);
    645       DO_imm_mandr_r("dppd", 179, src, dst);
    646       DO_imm_mandr_r("dppd", 180, src, dst);
    647       DO_imm_mandr_r("dppd", 181, src, dst);
    648       DO_imm_mandr_r("dppd", 182, src, dst);
    649       DO_imm_mandr_r("dppd", 183, src, dst);
    650       DO_imm_mandr_r("dppd", 184, src, dst);
    651       DO_imm_mandr_r("dppd", 185, src, dst);
    652       DO_imm_mandr_r("dppd", 186, src, dst);
    653       DO_imm_mandr_r("dppd", 187, src, dst);
    654       DO_imm_mandr_r("dppd", 188, src, dst);
    655       DO_imm_mandr_r("dppd", 189, src, dst);
    656       DO_imm_mandr_r("dppd", 190, src, dst);
    657       DO_imm_mandr_r("dppd", 191, src, dst);
    658       DO_imm_mandr_r("dppd", 192, src, dst);
    659       DO_imm_mandr_r("dppd", 193, src, dst);
    660       DO_imm_mandr_r("dppd", 194, src, dst);
    661       DO_imm_mandr_r("dppd", 195, src, dst);
    662       DO_imm_mandr_r("dppd", 196, src, dst);
    663       DO_imm_mandr_r("dppd", 197, src, dst);
    664       DO_imm_mandr_r("dppd", 198, src, dst);
    665       DO_imm_mandr_r("dppd", 199, src, dst);
    666       DO_imm_mandr_r("dppd", 200, src, dst);
    667       DO_imm_mandr_r("dppd", 201, src, dst);
    668       DO_imm_mandr_r("dppd", 202, src, dst);
    669       DO_imm_mandr_r("dppd", 203, src, dst);
    670       DO_imm_mandr_r("dppd", 204, src, dst);
    671       DO_imm_mandr_r("dppd", 205, src, dst);
    672       DO_imm_mandr_r("dppd", 206, src, dst);
    673       DO_imm_mandr_r("dppd", 207, src, dst);
    674       DO_imm_mandr_r("dppd", 208, src, dst);
    675       DO_imm_mandr_r("dppd", 209, src, dst);
    676       DO_imm_mandr_r("dppd", 210, src, dst);
    677       DO_imm_mandr_r("dppd", 211, src, dst);
    678       DO_imm_mandr_r("dppd", 212, src, dst);
    679       DO_imm_mandr_r("dppd", 213, src, dst);
    680       DO_imm_mandr_r("dppd", 214, src, dst);
    681       DO_imm_mandr_r("dppd", 215, src, dst);
    682       DO_imm_mandr_r("dppd", 216, src, dst);
    683       DO_imm_mandr_r("dppd", 217, src, dst);
    684       DO_imm_mandr_r("dppd", 218, src, dst);
    685       DO_imm_mandr_r("dppd", 219, src, dst);
    686       DO_imm_mandr_r("dppd", 220, src, dst);
    687       DO_imm_mandr_r("dppd", 221, src, dst);
    688       DO_imm_mandr_r("dppd", 222, src, dst);
    689       DO_imm_mandr_r("dppd", 223, src, dst);
    690       DO_imm_mandr_r("dppd", 224, src, dst);
    691       DO_imm_mandr_r("dppd", 225, src, dst);
    692       DO_imm_mandr_r("dppd", 226, src, dst);
    693       DO_imm_mandr_r("dppd", 227, src, dst);
    694       DO_imm_mandr_r("dppd", 228, src, dst);
    695       DO_imm_mandr_r("dppd", 229, src, dst);
    696       DO_imm_mandr_r("dppd", 230, src, dst);
    697       DO_imm_mandr_r("dppd", 231, src, dst);
    698       DO_imm_mandr_r("dppd", 232, src, dst);
    699       DO_imm_mandr_r("dppd", 233, src, dst);
    700       DO_imm_mandr_r("dppd", 234, src, dst);
    701       DO_imm_mandr_r("dppd", 235, src, dst);
    702       DO_imm_mandr_r("dppd", 236, src, dst);
    703       DO_imm_mandr_r("dppd", 237, src, dst);
    704       DO_imm_mandr_r("dppd", 238, src, dst);
    705       DO_imm_mandr_r("dppd", 239, src, dst);
    706       DO_imm_mandr_r("dppd", 240, src, dst);
    707       DO_imm_mandr_r("dppd", 241, src, dst);
    708       DO_imm_mandr_r("dppd", 242, src, dst);
    709       DO_imm_mandr_r("dppd", 243, src, dst);
    710       DO_imm_mandr_r("dppd", 244, src, dst);
    711       DO_imm_mandr_r("dppd", 245, src, dst);
    712       DO_imm_mandr_r("dppd", 246, src, dst);
    713       DO_imm_mandr_r("dppd", 247, src, dst);
    714       DO_imm_mandr_r("dppd", 248, src, dst);
    715       DO_imm_mandr_r("dppd", 249, src, dst);
    716       DO_imm_mandr_r("dppd", 250, src, dst);
    717       DO_imm_mandr_r("dppd", 251, src, dst);
    718       DO_imm_mandr_r("dppd", 252, src, dst);
    719       DO_imm_mandr_r("dppd", 253, src, dst);
    720       DO_imm_mandr_r("dppd", 254, src, dst);
    721       DO_imm_mandr_r("dppd", 255, src, dst);
    722    }
    723 }
    724 
    725 void test_DPPS ( void )
    726 {
    727    V128 src, dst;
    728    {
    729       *(float*)(&src[0])  =   1.2;
    730       *(float*)(&src[4])  =  -3.4;
    731       *(float*)(&src[8])  =  -6.7;
    732       *(float*)(&src[12]) =   8.9;
    733       *(float*)(&dst[0])  = -10.11;
    734       *(float*)(&dst[4])  =  12.13;
    735       *(float*)(&dst[8])  =  14.15;
    736       *(float*)(&dst[12]) = -16.17;
    737       DO_imm_mandr_r("dpps", 0, src, dst);
    738       DO_imm_mandr_r("dpps", 1, src, dst);
    739       DO_imm_mandr_r("dpps", 2, src, dst);
    740       DO_imm_mandr_r("dpps", 3, src, dst);
    741       DO_imm_mandr_r("dpps", 4, src, dst);
    742       DO_imm_mandr_r("dpps", 5, src, dst);
    743       DO_imm_mandr_r("dpps", 6, src, dst);
    744       DO_imm_mandr_r("dpps", 7, src, dst);
    745       DO_imm_mandr_r("dpps", 8, src, dst);
    746       DO_imm_mandr_r("dpps", 9, src, dst);
    747       DO_imm_mandr_r("dpps", 10, src, dst);
    748       DO_imm_mandr_r("dpps", 11, src, dst);
    749       DO_imm_mandr_r("dpps", 12, src, dst);
    750       DO_imm_mandr_r("dpps", 13, src, dst);
    751       DO_imm_mandr_r("dpps", 14, src, dst);
    752       DO_imm_mandr_r("dpps", 15, src, dst);
    753       DO_imm_mandr_r("dpps", 16, src, dst);
    754       DO_imm_mandr_r("dpps", 17, src, dst);
    755       DO_imm_mandr_r("dpps", 18, src, dst);
    756       DO_imm_mandr_r("dpps", 19, src, dst);
    757       DO_imm_mandr_r("dpps", 20, src, dst);
    758       DO_imm_mandr_r("dpps", 21, src, dst);
    759       DO_imm_mandr_r("dpps", 22, src, dst);
    760       DO_imm_mandr_r("dpps", 23, src, dst);
    761       DO_imm_mandr_r("dpps", 24, src, dst);
    762       DO_imm_mandr_r("dpps", 25, src, dst);
    763       DO_imm_mandr_r("dpps", 26, src, dst);
    764       DO_imm_mandr_r("dpps", 27, src, dst);
    765       DO_imm_mandr_r("dpps", 28, src, dst);
    766       DO_imm_mandr_r("dpps", 29, src, dst);
    767       DO_imm_mandr_r("dpps", 30, src, dst);
    768       DO_imm_mandr_r("dpps", 31, src, dst);
    769       DO_imm_mandr_r("dpps", 32, src, dst);
    770       DO_imm_mandr_r("dpps", 33, src, dst);
    771       DO_imm_mandr_r("dpps", 34, src, dst);
    772       DO_imm_mandr_r("dpps", 35, src, dst);
    773       DO_imm_mandr_r("dpps", 36, src, dst);
    774       DO_imm_mandr_r("dpps", 37, src, dst);
    775       DO_imm_mandr_r("dpps", 38, src, dst);
    776       DO_imm_mandr_r("dpps", 39, src, dst);
    777       DO_imm_mandr_r("dpps", 40, src, dst);
    778       DO_imm_mandr_r("dpps", 41, src, dst);
    779       DO_imm_mandr_r("dpps", 42, src, dst);
    780       DO_imm_mandr_r("dpps", 43, src, dst);
    781       DO_imm_mandr_r("dpps", 44, src, dst);
    782       DO_imm_mandr_r("dpps", 45, src, dst);
    783       DO_imm_mandr_r("dpps", 46, src, dst);
    784       DO_imm_mandr_r("dpps", 47, src, dst);
    785       DO_imm_mandr_r("dpps", 48, src, dst);
    786       DO_imm_mandr_r("dpps", 49, src, dst);
    787       DO_imm_mandr_r("dpps", 50, src, dst);
    788       DO_imm_mandr_r("dpps", 51, src, dst);
    789       DO_imm_mandr_r("dpps", 52, src, dst);
    790       DO_imm_mandr_r("dpps", 53, src, dst);
    791       DO_imm_mandr_r("dpps", 54, src, dst);
    792       DO_imm_mandr_r("dpps", 55, src, dst);
    793       DO_imm_mandr_r("dpps", 56, src, dst);
    794       DO_imm_mandr_r("dpps", 57, src, dst);
    795       DO_imm_mandr_r("dpps", 58, src, dst);
    796       DO_imm_mandr_r("dpps", 59, src, dst);
    797       DO_imm_mandr_r("dpps", 60, src, dst);
    798       DO_imm_mandr_r("dpps", 61, src, dst);
    799       DO_imm_mandr_r("dpps", 62, src, dst);
    800       DO_imm_mandr_r("dpps", 63, src, dst);
    801       DO_imm_mandr_r("dpps", 64, src, dst);
    802       DO_imm_mandr_r("dpps", 65, src, dst);
    803       DO_imm_mandr_r("dpps", 66, src, dst);
    804       DO_imm_mandr_r("dpps", 67, src, dst);
    805       DO_imm_mandr_r("dpps", 68, src, dst);
    806       DO_imm_mandr_r("dpps", 69, src, dst);
    807       DO_imm_mandr_r("dpps", 70, src, dst);
    808       DO_imm_mandr_r("dpps", 71, src, dst);
    809       DO_imm_mandr_r("dpps", 72, src, dst);
    810       DO_imm_mandr_r("dpps", 73, src, dst);
    811       DO_imm_mandr_r("dpps", 74, src, dst);
    812       DO_imm_mandr_r("dpps", 75, src, dst);
    813       DO_imm_mandr_r("dpps", 76, src, dst);
    814       DO_imm_mandr_r("dpps", 77, src, dst);
    815       DO_imm_mandr_r("dpps", 78, src, dst);
    816       DO_imm_mandr_r("dpps", 79, src, dst);
    817       DO_imm_mandr_r("dpps", 80, src, dst);
    818       DO_imm_mandr_r("dpps", 81, src, dst);
    819       DO_imm_mandr_r("dpps", 82, src, dst);
    820       DO_imm_mandr_r("dpps", 83, src, dst);
    821       DO_imm_mandr_r("dpps", 84, src, dst);
    822       DO_imm_mandr_r("dpps", 85, src, dst);
    823       DO_imm_mandr_r("dpps", 86, src, dst);
    824       DO_imm_mandr_r("dpps", 87, src, dst);
    825       DO_imm_mandr_r("dpps", 88, src, dst);
    826       DO_imm_mandr_r("dpps", 89, src, dst);
    827       DO_imm_mandr_r("dpps", 90, src, dst);
    828       DO_imm_mandr_r("dpps", 91, src, dst);
    829       DO_imm_mandr_r("dpps", 92, src, dst);
    830       DO_imm_mandr_r("dpps", 93, src, dst);
    831       DO_imm_mandr_r("dpps", 94, src, dst);
    832       DO_imm_mandr_r("dpps", 95, src, dst);
    833       DO_imm_mandr_r("dpps", 96, src, dst);
    834       DO_imm_mandr_r("dpps", 97, src, dst);
    835       DO_imm_mandr_r("dpps", 98, src, dst);
    836       DO_imm_mandr_r("dpps", 99, src, dst);
    837       DO_imm_mandr_r("dpps", 100, src, dst);
    838       DO_imm_mandr_r("dpps", 101, src, dst);
    839       DO_imm_mandr_r("dpps", 102, src, dst);
    840       DO_imm_mandr_r("dpps", 103, src, dst);
    841       DO_imm_mandr_r("dpps", 104, src, dst);
    842       DO_imm_mandr_r("dpps", 105, src, dst);
    843       DO_imm_mandr_r("dpps", 106, src, dst);
    844       DO_imm_mandr_r("dpps", 107, src, dst);
    845       DO_imm_mandr_r("dpps", 108, src, dst);
    846       DO_imm_mandr_r("dpps", 109, src, dst);
    847       DO_imm_mandr_r("dpps", 110, src, dst);
    848       DO_imm_mandr_r("dpps", 111, src, dst);
    849       DO_imm_mandr_r("dpps", 112, src, dst);
    850       DO_imm_mandr_r("dpps", 113, src, dst);
    851       DO_imm_mandr_r("dpps", 114, src, dst);
    852       DO_imm_mandr_r("dpps", 115, src, dst);
    853       DO_imm_mandr_r("dpps", 116, src, dst);
    854       DO_imm_mandr_r("dpps", 117, src, dst);
    855       DO_imm_mandr_r("dpps", 118, src, dst);
    856       DO_imm_mandr_r("dpps", 119, src, dst);
    857       DO_imm_mandr_r("dpps", 120, src, dst);
    858       DO_imm_mandr_r("dpps", 121, src, dst);
    859       DO_imm_mandr_r("dpps", 122, src, dst);
    860       DO_imm_mandr_r("dpps", 123, src, dst);
    861       DO_imm_mandr_r("dpps", 124, src, dst);
    862       DO_imm_mandr_r("dpps", 125, src, dst);
    863       DO_imm_mandr_r("dpps", 126, src, dst);
    864       DO_imm_mandr_r("dpps", 127, src, dst);
    865       DO_imm_mandr_r("dpps", 128, src, dst);
    866       DO_imm_mandr_r("dpps", 129, src, dst);
    867       DO_imm_mandr_r("dpps", 130, src, dst);
    868       DO_imm_mandr_r("dpps", 131, src, dst);
    869       DO_imm_mandr_r("dpps", 132, src, dst);
    870       DO_imm_mandr_r("dpps", 133, src, dst);
    871       DO_imm_mandr_r("dpps", 134, src, dst);
    872       DO_imm_mandr_r("dpps", 135, src, dst);
    873       DO_imm_mandr_r("dpps", 136, src, dst);
    874       DO_imm_mandr_r("dpps", 137, src, dst);
    875       DO_imm_mandr_r("dpps", 138, src, dst);
    876       DO_imm_mandr_r("dpps", 139, src, dst);
    877       DO_imm_mandr_r("dpps", 140, src, dst);
    878       DO_imm_mandr_r("dpps", 141, src, dst);
    879       DO_imm_mandr_r("dpps", 142, src, dst);
    880       DO_imm_mandr_r("dpps", 143, src, dst);
    881       DO_imm_mandr_r("dpps", 144, src, dst);
    882       DO_imm_mandr_r("dpps", 145, src, dst);
    883       DO_imm_mandr_r("dpps", 146, src, dst);
    884       DO_imm_mandr_r("dpps", 147, src, dst);
    885       DO_imm_mandr_r("dpps", 148, src, dst);
    886       DO_imm_mandr_r("dpps", 149, src, dst);
    887       DO_imm_mandr_r("dpps", 150, src, dst);
    888       DO_imm_mandr_r("dpps", 151, src, dst);
    889       DO_imm_mandr_r("dpps", 152, src, dst);
    890       DO_imm_mandr_r("dpps", 153, src, dst);
    891       DO_imm_mandr_r("dpps", 154, src, dst);
    892       DO_imm_mandr_r("dpps", 155, src, dst);
    893       DO_imm_mandr_r("dpps", 156, src, dst);
    894       DO_imm_mandr_r("dpps", 157, src, dst);
    895       DO_imm_mandr_r("dpps", 158, src, dst);
    896       DO_imm_mandr_r("dpps", 159, src, dst);
    897       DO_imm_mandr_r("dpps", 160, src, dst);
    898       DO_imm_mandr_r("dpps", 161, src, dst);
    899       DO_imm_mandr_r("dpps", 162, src, dst);
    900       DO_imm_mandr_r("dpps", 163, src, dst);
    901       DO_imm_mandr_r("dpps", 164, src, dst);
    902       DO_imm_mandr_r("dpps", 165, src, dst);
    903       DO_imm_mandr_r("dpps", 166, src, dst);
    904       DO_imm_mandr_r("dpps", 167, src, dst);
    905       DO_imm_mandr_r("dpps", 168, src, dst);
    906       DO_imm_mandr_r("dpps", 169, src, dst);
    907       DO_imm_mandr_r("dpps", 170, src, dst);
    908       DO_imm_mandr_r("dpps", 171, src, dst);
    909       DO_imm_mandr_r("dpps", 172, src, dst);
    910       DO_imm_mandr_r("dpps", 173, src, dst);
    911       DO_imm_mandr_r("dpps", 174, src, dst);
    912       DO_imm_mandr_r("dpps", 175, src, dst);
    913       DO_imm_mandr_r("dpps", 176, src, dst);
    914       DO_imm_mandr_r("dpps", 177, src, dst);
    915       DO_imm_mandr_r("dpps", 178, src, dst);
    916       DO_imm_mandr_r("dpps", 179, src, dst);
    917       DO_imm_mandr_r("dpps", 180, src, dst);
    918       DO_imm_mandr_r("dpps", 181, src, dst);
    919       DO_imm_mandr_r("dpps", 182, src, dst);
    920       DO_imm_mandr_r("dpps", 183, src, dst);
    921       DO_imm_mandr_r("dpps", 184, src, dst);
    922       DO_imm_mandr_r("dpps", 185, src, dst);
    923       DO_imm_mandr_r("dpps", 186, src, dst);
    924       DO_imm_mandr_r("dpps", 187, src, dst);
    925       DO_imm_mandr_r("dpps", 188, src, dst);
    926       DO_imm_mandr_r("dpps", 189, src, dst);
    927       DO_imm_mandr_r("dpps", 190, src, dst);
    928       DO_imm_mandr_r("dpps", 191, src, dst);
    929       DO_imm_mandr_r("dpps", 192, src, dst);
    930       DO_imm_mandr_r("dpps", 193, src, dst);
    931       DO_imm_mandr_r("dpps", 194, src, dst);
    932       DO_imm_mandr_r("dpps", 195, src, dst);
    933       DO_imm_mandr_r("dpps", 196, src, dst);
    934       DO_imm_mandr_r("dpps", 197, src, dst);
    935       DO_imm_mandr_r("dpps", 198, src, dst);
    936       DO_imm_mandr_r("dpps", 199, src, dst);
    937       DO_imm_mandr_r("dpps", 200, src, dst);
    938       DO_imm_mandr_r("dpps", 201, src, dst);
    939       DO_imm_mandr_r("dpps", 202, src, dst);
    940       DO_imm_mandr_r("dpps", 203, src, dst);
    941       DO_imm_mandr_r("dpps", 204, src, dst);
    942       DO_imm_mandr_r("dpps", 205, src, dst);
    943       DO_imm_mandr_r("dpps", 206, src, dst);
    944       DO_imm_mandr_r("dpps", 207, src, dst);
    945       DO_imm_mandr_r("dpps", 208, src, dst);
    946       DO_imm_mandr_r("dpps", 209, src, dst);
    947       DO_imm_mandr_r("dpps", 210, src, dst);
    948       DO_imm_mandr_r("dpps", 211, src, dst);
    949       DO_imm_mandr_r("dpps", 212, src, dst);
    950       DO_imm_mandr_r("dpps", 213, src, dst);
    951       DO_imm_mandr_r("dpps", 214, src, dst);
    952       DO_imm_mandr_r("dpps", 215, src, dst);
    953       DO_imm_mandr_r("dpps", 216, src, dst);
    954       DO_imm_mandr_r("dpps", 217, src, dst);
    955       DO_imm_mandr_r("dpps", 218, src, dst);
    956       DO_imm_mandr_r("dpps", 219, src, dst);
    957       DO_imm_mandr_r("dpps", 220, src, dst);
    958       DO_imm_mandr_r("dpps", 221, src, dst);
    959       DO_imm_mandr_r("dpps", 222, src, dst);
    960       DO_imm_mandr_r("dpps", 223, src, dst);
    961       DO_imm_mandr_r("dpps", 224, src, dst);
    962       DO_imm_mandr_r("dpps", 225, src, dst);
    963       DO_imm_mandr_r("dpps", 226, src, dst);
    964       DO_imm_mandr_r("dpps", 227, src, dst);
    965       DO_imm_mandr_r("dpps", 228, src, dst);
    966       DO_imm_mandr_r("dpps", 229, src, dst);
    967       DO_imm_mandr_r("dpps", 230, src, dst);
    968       DO_imm_mandr_r("dpps", 231, src, dst);
    969       DO_imm_mandr_r("dpps", 232, src, dst);
    970       DO_imm_mandr_r("dpps", 233, src, dst);
    971       DO_imm_mandr_r("dpps", 234, src, dst);
    972       DO_imm_mandr_r("dpps", 235, src, dst);
    973       DO_imm_mandr_r("dpps", 236, src, dst);
    974       DO_imm_mandr_r("dpps", 237, src, dst);
    975       DO_imm_mandr_r("dpps", 238, src, dst);
    976       DO_imm_mandr_r("dpps", 239, src, dst);
    977       DO_imm_mandr_r("dpps", 240, src, dst);
    978       DO_imm_mandr_r("dpps", 241, src, dst);
    979       DO_imm_mandr_r("dpps", 242, src, dst);
    980       DO_imm_mandr_r("dpps", 243, src, dst);
    981       DO_imm_mandr_r("dpps", 244, src, dst);
    982       DO_imm_mandr_r("dpps", 245, src, dst);
    983       DO_imm_mandr_r("dpps", 246, src, dst);
    984       DO_imm_mandr_r("dpps", 247, src, dst);
    985       DO_imm_mandr_r("dpps", 248, src, dst);
    986       DO_imm_mandr_r("dpps", 249, src, dst);
    987       DO_imm_mandr_r("dpps", 250, src, dst);
    988       DO_imm_mandr_r("dpps", 251, src, dst);
    989       DO_imm_mandr_r("dpps", 252, src, dst);
    990       DO_imm_mandr_r("dpps", 253, src, dst);
    991       DO_imm_mandr_r("dpps", 254, src, dst);
    992       DO_imm_mandr_r("dpps", 255, src, dst);
    993    }
    994 }
    995 
    996 void test_INSERTPS ( void )
    997 {
    998    V128 src, dst;
    999    {
   1000       *(float*)(&src[0])  =   1.2;
   1001       *(float*)(&src[4])  =  -3.4;
   1002       *(float*)(&src[8])  =  -6.7;
   1003       *(float*)(&src[12]) =   8.9;
   1004       *(float*)(&dst[0])  = -10.11;
   1005       *(float*)(&dst[4])  =  12.13;
   1006       *(float*)(&dst[8])  =  14.15;
   1007       *(float*)(&dst[12]) = -16.17;
   1008       DO_imm_mandr_r("insertps", 0, src, dst);
   1009       DO_imm_mandr_r("insertps", 1, src, dst);
   1010       DO_imm_mandr_r("insertps", 2, src, dst);
   1011       DO_imm_mandr_r("insertps", 3, src, dst);
   1012       DO_imm_mandr_r("insertps", 4, src, dst);
   1013       DO_imm_mandr_r("insertps", 5, src, dst);
   1014       DO_imm_mandr_r("insertps", 6, src, dst);
   1015       DO_imm_mandr_r("insertps", 7, src, dst);
   1016       DO_imm_mandr_r("insertps", 8, src, dst);
   1017       DO_imm_mandr_r("insertps", 9, src, dst);
   1018       DO_imm_mandr_r("insertps", 10, src, dst);
   1019       DO_imm_mandr_r("insertps", 11, src, dst);
   1020       DO_imm_mandr_r("insertps", 12, src, dst);
   1021       DO_imm_mandr_r("insertps", 13, src, dst);
   1022       DO_imm_mandr_r("insertps", 14, src, dst);
   1023       DO_imm_mandr_r("insertps", 15, src, dst);
   1024       DO_imm_mandr_r("insertps", 16, src, dst);
   1025       DO_imm_mandr_r("insertps", 17, src, dst);
   1026       DO_imm_mandr_r("insertps", 18, src, dst);
   1027       DO_imm_mandr_r("insertps", 19, src, dst);
   1028       DO_imm_mandr_r("insertps", 20, src, dst);
   1029       DO_imm_mandr_r("insertps", 21, src, dst);
   1030       DO_imm_mandr_r("insertps", 22, src, dst);
   1031       DO_imm_mandr_r("insertps", 23, src, dst);
   1032       DO_imm_mandr_r("insertps", 24, src, dst);
   1033       DO_imm_mandr_r("insertps", 25, src, dst);
   1034       DO_imm_mandr_r("insertps", 26, src, dst);
   1035       DO_imm_mandr_r("insertps", 27, src, dst);
   1036       DO_imm_mandr_r("insertps", 28, src, dst);
   1037       DO_imm_mandr_r("insertps", 29, src, dst);
   1038       DO_imm_mandr_r("insertps", 30, src, dst);
   1039       DO_imm_mandr_r("insertps", 31, src, dst);
   1040       DO_imm_mandr_r("insertps", 32, src, dst);
   1041       DO_imm_mandr_r("insertps", 33, src, dst);
   1042       DO_imm_mandr_r("insertps", 34, src, dst);
   1043       DO_imm_mandr_r("insertps", 35, src, dst);
   1044       DO_imm_mandr_r("insertps", 36, src, dst);
   1045       DO_imm_mandr_r("insertps", 37, src, dst);
   1046       DO_imm_mandr_r("insertps", 38, src, dst);
   1047       DO_imm_mandr_r("insertps", 39, src, dst);
   1048       DO_imm_mandr_r("insertps", 40, src, dst);
   1049       DO_imm_mandr_r("insertps", 41, src, dst);
   1050       DO_imm_mandr_r("insertps", 42, src, dst);
   1051       DO_imm_mandr_r("insertps", 43, src, dst);
   1052       DO_imm_mandr_r("insertps", 44, src, dst);
   1053       DO_imm_mandr_r("insertps", 45, src, dst);
   1054       DO_imm_mandr_r("insertps", 46, src, dst);
   1055       DO_imm_mandr_r("insertps", 47, src, dst);
   1056       DO_imm_mandr_r("insertps", 48, src, dst);
   1057       DO_imm_mandr_r("insertps", 49, src, dst);
   1058       DO_imm_mandr_r("insertps", 50, src, dst);
   1059       DO_imm_mandr_r("insertps", 51, src, dst);
   1060       DO_imm_mandr_r("insertps", 52, src, dst);
   1061       DO_imm_mandr_r("insertps", 53, src, dst);
   1062       DO_imm_mandr_r("insertps", 54, src, dst);
   1063       DO_imm_mandr_r("insertps", 55, src, dst);
   1064       DO_imm_mandr_r("insertps", 56, src, dst);
   1065       DO_imm_mandr_r("insertps", 57, src, dst);
   1066       DO_imm_mandr_r("insertps", 58, src, dst);
   1067       DO_imm_mandr_r("insertps", 59, src, dst);
   1068       DO_imm_mandr_r("insertps", 60, src, dst);
   1069       DO_imm_mandr_r("insertps", 61, src, dst);
   1070       DO_imm_mandr_r("insertps", 62, src, dst);
   1071       DO_imm_mandr_r("insertps", 63, src, dst);
   1072       DO_imm_mandr_r("insertps", 64, src, dst);
   1073       DO_imm_mandr_r("insertps", 65, src, dst);
   1074       DO_imm_mandr_r("insertps", 66, src, dst);
   1075       DO_imm_mandr_r("insertps", 67, src, dst);
   1076       DO_imm_mandr_r("insertps", 68, src, dst);
   1077       DO_imm_mandr_r("insertps", 69, src, dst);
   1078       DO_imm_mandr_r("insertps", 70, src, dst);
   1079       DO_imm_mandr_r("insertps", 71, src, dst);
   1080       DO_imm_mandr_r("insertps", 72, src, dst);
   1081       DO_imm_mandr_r("insertps", 73, src, dst);
   1082       DO_imm_mandr_r("insertps", 74, src, dst);
   1083       DO_imm_mandr_r("insertps", 75, src, dst);
   1084       DO_imm_mandr_r("insertps", 76, src, dst);
   1085       DO_imm_mandr_r("insertps", 77, src, dst);
   1086       DO_imm_mandr_r("insertps", 78, src, dst);
   1087       DO_imm_mandr_r("insertps", 79, src, dst);
   1088       DO_imm_mandr_r("insertps", 80, src, dst);
   1089       DO_imm_mandr_r("insertps", 81, src, dst);
   1090       DO_imm_mandr_r("insertps", 82, src, dst);
   1091       DO_imm_mandr_r("insertps", 83, src, dst);
   1092       DO_imm_mandr_r("insertps", 84, src, dst);
   1093       DO_imm_mandr_r("insertps", 85, src, dst);
   1094       DO_imm_mandr_r("insertps", 86, src, dst);
   1095       DO_imm_mandr_r("insertps", 87, src, dst);
   1096       DO_imm_mandr_r("insertps", 88, src, dst);
   1097       DO_imm_mandr_r("insertps", 89, src, dst);
   1098       DO_imm_mandr_r("insertps", 90, src, dst);
   1099       DO_imm_mandr_r("insertps", 91, src, dst);
   1100       DO_imm_mandr_r("insertps", 92, src, dst);
   1101       DO_imm_mandr_r("insertps", 93, src, dst);
   1102       DO_imm_mandr_r("insertps", 94, src, dst);
   1103       DO_imm_mandr_r("insertps", 95, src, dst);
   1104       DO_imm_mandr_r("insertps", 96, src, dst);
   1105       DO_imm_mandr_r("insertps", 97, src, dst);
   1106       DO_imm_mandr_r("insertps", 98, src, dst);
   1107       DO_imm_mandr_r("insertps", 99, src, dst);
   1108       DO_imm_mandr_r("insertps", 100, src, dst);
   1109       DO_imm_mandr_r("insertps", 101, src, dst);
   1110       DO_imm_mandr_r("insertps", 102, src, dst);
   1111       DO_imm_mandr_r("insertps", 103, src, dst);
   1112       DO_imm_mandr_r("insertps", 104, src, dst);
   1113       DO_imm_mandr_r("insertps", 105, src, dst);
   1114       DO_imm_mandr_r("insertps", 106, src, dst);
   1115       DO_imm_mandr_r("insertps", 107, src, dst);
   1116       DO_imm_mandr_r("insertps", 108, src, dst);
   1117       DO_imm_mandr_r("insertps", 109, src, dst);
   1118       DO_imm_mandr_r("insertps", 110, src, dst);
   1119       DO_imm_mandr_r("insertps", 111, src, dst);
   1120       DO_imm_mandr_r("insertps", 112, src, dst);
   1121       DO_imm_mandr_r("insertps", 113, src, dst);
   1122       DO_imm_mandr_r("insertps", 114, src, dst);
   1123       DO_imm_mandr_r("insertps", 115, src, dst);
   1124       DO_imm_mandr_r("insertps", 116, src, dst);
   1125       DO_imm_mandr_r("insertps", 117, src, dst);
   1126       DO_imm_mandr_r("insertps", 118, src, dst);
   1127       DO_imm_mandr_r("insertps", 119, src, dst);
   1128       DO_imm_mandr_r("insertps", 120, src, dst);
   1129       DO_imm_mandr_r("insertps", 121, src, dst);
   1130       DO_imm_mandr_r("insertps", 122, src, dst);
   1131       DO_imm_mandr_r("insertps", 123, src, dst);
   1132       DO_imm_mandr_r("insertps", 124, src, dst);
   1133       DO_imm_mandr_r("insertps", 125, src, dst);
   1134       DO_imm_mandr_r("insertps", 126, src, dst);
   1135       DO_imm_mandr_r("insertps", 127, src, dst);
   1136       DO_imm_mandr_r("insertps", 128, src, dst);
   1137       DO_imm_mandr_r("insertps", 129, src, dst);
   1138       DO_imm_mandr_r("insertps", 130, src, dst);
   1139       DO_imm_mandr_r("insertps", 131, src, dst);
   1140       DO_imm_mandr_r("insertps", 132, src, dst);
   1141       DO_imm_mandr_r("insertps", 133, src, dst);
   1142       DO_imm_mandr_r("insertps", 134, src, dst);
   1143       DO_imm_mandr_r("insertps", 135, src, dst);
   1144       DO_imm_mandr_r("insertps", 136, src, dst);
   1145       DO_imm_mandr_r("insertps", 137, src, dst);
   1146       DO_imm_mandr_r("insertps", 138, src, dst);
   1147       DO_imm_mandr_r("insertps", 139, src, dst);
   1148       DO_imm_mandr_r("insertps", 140, src, dst);
   1149       DO_imm_mandr_r("insertps", 141, src, dst);
   1150       DO_imm_mandr_r("insertps", 142, src, dst);
   1151       DO_imm_mandr_r("insertps", 143, src, dst);
   1152       DO_imm_mandr_r("insertps", 144, src, dst);
   1153       DO_imm_mandr_r("insertps", 145, src, dst);
   1154       DO_imm_mandr_r("insertps", 146, src, dst);
   1155       DO_imm_mandr_r("insertps", 147, src, dst);
   1156       DO_imm_mandr_r("insertps", 148, src, dst);
   1157       DO_imm_mandr_r("insertps", 149, src, dst);
   1158       DO_imm_mandr_r("insertps", 150, src, dst);
   1159       DO_imm_mandr_r("insertps", 151, src, dst);
   1160       DO_imm_mandr_r("insertps", 152, src, dst);
   1161       DO_imm_mandr_r("insertps", 153, src, dst);
   1162       DO_imm_mandr_r("insertps", 154, src, dst);
   1163       DO_imm_mandr_r("insertps", 155, src, dst);
   1164       DO_imm_mandr_r("insertps", 156, src, dst);
   1165       DO_imm_mandr_r("insertps", 157, src, dst);
   1166       DO_imm_mandr_r("insertps", 158, src, dst);
   1167       DO_imm_mandr_r("insertps", 159, src, dst);
   1168       DO_imm_mandr_r("insertps", 160, src, dst);
   1169       DO_imm_mandr_r("insertps", 161, src, dst);
   1170       DO_imm_mandr_r("insertps", 162, src, dst);
   1171       DO_imm_mandr_r("insertps", 163, src, dst);
   1172       DO_imm_mandr_r("insertps", 164, src, dst);
   1173       DO_imm_mandr_r("insertps", 165, src, dst);
   1174       DO_imm_mandr_r("insertps", 166, src, dst);
   1175       DO_imm_mandr_r("insertps", 167, src, dst);
   1176       DO_imm_mandr_r("insertps", 168, src, dst);
   1177       DO_imm_mandr_r("insertps", 169, src, dst);
   1178       DO_imm_mandr_r("insertps", 170, src, dst);
   1179       DO_imm_mandr_r("insertps", 171, src, dst);
   1180       DO_imm_mandr_r("insertps", 172, src, dst);
   1181       DO_imm_mandr_r("insertps", 173, src, dst);
   1182       DO_imm_mandr_r("insertps", 174, src, dst);
   1183       DO_imm_mandr_r("insertps", 175, src, dst);
   1184       DO_imm_mandr_r("insertps", 176, src, dst);
   1185       DO_imm_mandr_r("insertps", 177, src, dst);
   1186       DO_imm_mandr_r("insertps", 178, src, dst);
   1187       DO_imm_mandr_r("insertps", 179, src, dst);
   1188       DO_imm_mandr_r("insertps", 180, src, dst);
   1189       DO_imm_mandr_r("insertps", 181, src, dst);
   1190       DO_imm_mandr_r("insertps", 182, src, dst);
   1191       DO_imm_mandr_r("insertps", 183, src, dst);
   1192       DO_imm_mandr_r("insertps", 184, src, dst);
   1193       DO_imm_mandr_r("insertps", 185, src, dst);
   1194       DO_imm_mandr_r("insertps", 186, src, dst);
   1195       DO_imm_mandr_r("insertps", 187, src, dst);
   1196       DO_imm_mandr_r("insertps", 188, src, dst);
   1197       DO_imm_mandr_r("insertps", 189, src, dst);
   1198       DO_imm_mandr_r("insertps", 190, src, dst);
   1199       DO_imm_mandr_r("insertps", 191, src, dst);
   1200       DO_imm_mandr_r("insertps", 192, src, dst);
   1201       DO_imm_mandr_r("insertps", 193, src, dst);
   1202       DO_imm_mandr_r("insertps", 194, src, dst);
   1203       DO_imm_mandr_r("insertps", 195, src, dst);
   1204       DO_imm_mandr_r("insertps", 196, src, dst);
   1205       DO_imm_mandr_r("insertps", 197, src, dst);
   1206       DO_imm_mandr_r("insertps", 198, src, dst);
   1207       DO_imm_mandr_r("insertps", 199, src, dst);
   1208       DO_imm_mandr_r("insertps", 200, src, dst);
   1209       DO_imm_mandr_r("insertps", 201, src, dst);
   1210       DO_imm_mandr_r("insertps", 202, src, dst);
   1211       DO_imm_mandr_r("insertps", 203, src, dst);
   1212       DO_imm_mandr_r("insertps", 204, src, dst);
   1213       DO_imm_mandr_r("insertps", 205, src, dst);
   1214       DO_imm_mandr_r("insertps", 206, src, dst);
   1215       DO_imm_mandr_r("insertps", 207, src, dst);
   1216       DO_imm_mandr_r("insertps", 208, src, dst);
   1217       DO_imm_mandr_r("insertps", 209, src, dst);
   1218       DO_imm_mandr_r("insertps", 210, src, dst);
   1219       DO_imm_mandr_r("insertps", 211, src, dst);
   1220       DO_imm_mandr_r("insertps", 212, src, dst);
   1221       DO_imm_mandr_r("insertps", 213, src, dst);
   1222       DO_imm_mandr_r("insertps", 214, src, dst);
   1223       DO_imm_mandr_r("insertps", 215, src, dst);
   1224       DO_imm_mandr_r("insertps", 216, src, dst);
   1225       DO_imm_mandr_r("insertps", 217, src, dst);
   1226       DO_imm_mandr_r("insertps", 218, src, dst);
   1227       DO_imm_mandr_r("insertps", 219, src, dst);
   1228       DO_imm_mandr_r("insertps", 220, src, dst);
   1229       DO_imm_mandr_r("insertps", 221, src, dst);
   1230       DO_imm_mandr_r("insertps", 222, src, dst);
   1231       DO_imm_mandr_r("insertps", 223, src, dst);
   1232       DO_imm_mandr_r("insertps", 224, src, dst);
   1233       DO_imm_mandr_r("insertps", 225, src, dst);
   1234       DO_imm_mandr_r("insertps", 226, src, dst);
   1235       DO_imm_mandr_r("insertps", 227, src, dst);
   1236       DO_imm_mandr_r("insertps", 228, src, dst);
   1237       DO_imm_mandr_r("insertps", 229, src, dst);
   1238       DO_imm_mandr_r("insertps", 230, src, dst);
   1239       DO_imm_mandr_r("insertps", 231, src, dst);
   1240       DO_imm_mandr_r("insertps", 232, src, dst);
   1241       DO_imm_mandr_r("insertps", 233, src, dst);
   1242       DO_imm_mandr_r("insertps", 234, src, dst);
   1243       DO_imm_mandr_r("insertps", 235, src, dst);
   1244       DO_imm_mandr_r("insertps", 236, src, dst);
   1245       DO_imm_mandr_r("insertps", 237, src, dst);
   1246       DO_imm_mandr_r("insertps", 238, src, dst);
   1247       DO_imm_mandr_r("insertps", 239, src, dst);
   1248       DO_imm_mandr_r("insertps", 240, src, dst);
   1249       DO_imm_mandr_r("insertps", 241, src, dst);
   1250       DO_imm_mandr_r("insertps", 242, src, dst);
   1251       DO_imm_mandr_r("insertps", 243, src, dst);
   1252       DO_imm_mandr_r("insertps", 244, src, dst);
   1253       DO_imm_mandr_r("insertps", 245, src, dst);
   1254       DO_imm_mandr_r("insertps", 246, src, dst);
   1255       DO_imm_mandr_r("insertps", 247, src, dst);
   1256       DO_imm_mandr_r("insertps", 248, src, dst);
   1257       DO_imm_mandr_r("insertps", 249, src, dst);
   1258       DO_imm_mandr_r("insertps", 250, src, dst);
   1259       DO_imm_mandr_r("insertps", 251, src, dst);
   1260       DO_imm_mandr_r("insertps", 252, src, dst);
   1261       DO_imm_mandr_r("insertps", 253, src, dst);
   1262       DO_imm_mandr_r("insertps", 254, src, dst);
   1263       DO_imm_mandr_r("insertps", 255, src, dst);
   1264    }
   1265 }
   1266 
   1267 void test_MPSADBW ( void )
   1268 {
   1269    V128 src, dst;
   1270    Int i;
   1271    for (i = 0; i < 50; i++) {
   1272       randV128(&src);
   1273       randV128(&dst);
   1274       DO_imm_mandr_r("mpsadbw", 0, src, dst);
   1275       DO_imm_mandr_r("mpsadbw", 1, src, dst);
   1276       DO_imm_mandr_r("mpsadbw", 2, src, dst);
   1277       DO_imm_mandr_r("mpsadbw", 3, src, dst);
   1278       DO_imm_mandr_r("mpsadbw", 4, src, dst);
   1279       DO_imm_mandr_r("mpsadbw", 5, src, dst);
   1280       DO_imm_mandr_r("mpsadbw", 6, src, dst);
   1281       DO_imm_mandr_r("mpsadbw", 7, src, dst);
   1282    }
   1283 }
   1284 
   1285 void test_PACKUSDW ( void )
   1286 {
   1287    V128 src, dst;
   1288    Int i;
   1289    for (i = 0; i < 10; i++) {
   1290       if (i < 9) {
   1291          randV128(&src);
   1292          randV128(&dst);
   1293       } else {
   1294          memset(&src, 0, sizeof(src));
   1295          memset(&dst, 0, sizeof(src));
   1296          src[0] = 0x11; src[1] = 0x22;
   1297          src[4] = 0x33; src[5] = 0x44;
   1298          src[8] = 0x55; src[9] = 0x66;
   1299          src[12] = 0x77; src[13] = 0x88;
   1300          dst[0] = 0xaa; dst[1] = 0xbb;
   1301          dst[4] = 0xcc; dst[5] = 0xdd;
   1302          dst[8] = 0xee; dst[9] = 0xff;
   1303          dst[12] = 0xa1; dst[13] = 0xb2;
   1304       }
   1305       DO_mandr_r("packusdw", src, dst);
   1306    }
   1307 }
   1308 
   1309 void test_PBLENDW ( void )
   1310 {
   1311    V128 src, dst;
   1312    randV128(&src);
   1313    randV128(&dst);
   1314    {
   1315       DO_imm_mandr_r("pblendw", 0, src, dst);
   1316       DO_imm_mandr_r("pblendw", 1, src, dst);
   1317       DO_imm_mandr_r("pblendw", 2, src, dst);
   1318       DO_imm_mandr_r("pblendw", 3, src, dst);
   1319       DO_imm_mandr_r("pblendw", 4, src, dst);
   1320       DO_imm_mandr_r("pblendw", 5, src, dst);
   1321       DO_imm_mandr_r("pblendw", 6, src, dst);
   1322       DO_imm_mandr_r("pblendw", 7, src, dst);
   1323       DO_imm_mandr_r("pblendw", 8, src, dst);
   1324       DO_imm_mandr_r("pblendw", 9, src, dst);
   1325       DO_imm_mandr_r("pblendw", 10, src, dst);
   1326       DO_imm_mandr_r("pblendw", 11, src, dst);
   1327       DO_imm_mandr_r("pblendw", 12, src, dst);
   1328       DO_imm_mandr_r("pblendw", 13, src, dst);
   1329       DO_imm_mandr_r("pblendw", 14, src, dst);
   1330       DO_imm_mandr_r("pblendw", 15, src, dst);
   1331       DO_imm_mandr_r("pblendw", 16, src, dst);
   1332       DO_imm_mandr_r("pblendw", 17, src, dst);
   1333       DO_imm_mandr_r("pblendw", 18, src, dst);
   1334       DO_imm_mandr_r("pblendw", 19, src, dst);
   1335       DO_imm_mandr_r("pblendw", 20, src, dst);
   1336       DO_imm_mandr_r("pblendw", 21, src, dst);
   1337       DO_imm_mandr_r("pblendw", 22, src, dst);
   1338       DO_imm_mandr_r("pblendw", 23, src, dst);
   1339       DO_imm_mandr_r("pblendw", 24, src, dst);
   1340       DO_imm_mandr_r("pblendw", 25, src, dst);
   1341       DO_imm_mandr_r("pblendw", 26, src, dst);
   1342       DO_imm_mandr_r("pblendw", 27, src, dst);
   1343       DO_imm_mandr_r("pblendw", 28, src, dst);
   1344       DO_imm_mandr_r("pblendw", 29, src, dst);
   1345       DO_imm_mandr_r("pblendw", 30, src, dst);
   1346       DO_imm_mandr_r("pblendw", 31, src, dst);
   1347       DO_imm_mandr_r("pblendw", 32, src, dst);
   1348       DO_imm_mandr_r("pblendw", 33, src, dst);
   1349       DO_imm_mandr_r("pblendw", 34, src, dst);
   1350       DO_imm_mandr_r("pblendw", 35, src, dst);
   1351       DO_imm_mandr_r("pblendw", 36, src, dst);
   1352       DO_imm_mandr_r("pblendw", 37, src, dst);
   1353       DO_imm_mandr_r("pblendw", 38, src, dst);
   1354       DO_imm_mandr_r("pblendw", 39, src, dst);
   1355       DO_imm_mandr_r("pblendw", 40, src, dst);
   1356       DO_imm_mandr_r("pblendw", 41, src, dst);
   1357       DO_imm_mandr_r("pblendw", 42, src, dst);
   1358       DO_imm_mandr_r("pblendw", 43, src, dst);
   1359       DO_imm_mandr_r("pblendw", 44, src, dst);
   1360       DO_imm_mandr_r("pblendw", 45, src, dst);
   1361       DO_imm_mandr_r("pblendw", 46, src, dst);
   1362       DO_imm_mandr_r("pblendw", 47, src, dst);
   1363       DO_imm_mandr_r("pblendw", 48, src, dst);
   1364       DO_imm_mandr_r("pblendw", 49, src, dst);
   1365       DO_imm_mandr_r("pblendw", 50, src, dst);
   1366       DO_imm_mandr_r("pblendw", 51, src, dst);
   1367       DO_imm_mandr_r("pblendw", 52, src, dst);
   1368       DO_imm_mandr_r("pblendw", 53, src, dst);
   1369       DO_imm_mandr_r("pblendw", 54, src, dst);
   1370       DO_imm_mandr_r("pblendw", 55, src, dst);
   1371       DO_imm_mandr_r("pblendw", 56, src, dst);
   1372       DO_imm_mandr_r("pblendw", 57, src, dst);
   1373       DO_imm_mandr_r("pblendw", 58, src, dst);
   1374       DO_imm_mandr_r("pblendw", 59, src, dst);
   1375       DO_imm_mandr_r("pblendw", 60, src, dst);
   1376       DO_imm_mandr_r("pblendw", 61, src, dst);
   1377       DO_imm_mandr_r("pblendw", 62, src, dst);
   1378       DO_imm_mandr_r("pblendw", 63, src, dst);
   1379       DO_imm_mandr_r("pblendw", 64, src, dst);
   1380       DO_imm_mandr_r("pblendw", 65, src, dst);
   1381       DO_imm_mandr_r("pblendw", 66, src, dst);
   1382       DO_imm_mandr_r("pblendw", 67, src, dst);
   1383       DO_imm_mandr_r("pblendw", 68, src, dst);
   1384       DO_imm_mandr_r("pblendw", 69, src, dst);
   1385       DO_imm_mandr_r("pblendw", 70, src, dst);
   1386       DO_imm_mandr_r("pblendw", 71, src, dst);
   1387       DO_imm_mandr_r("pblendw", 72, src, dst);
   1388       DO_imm_mandr_r("pblendw", 73, src, dst);
   1389       DO_imm_mandr_r("pblendw", 74, src, dst);
   1390       DO_imm_mandr_r("pblendw", 75, src, dst);
   1391       DO_imm_mandr_r("pblendw", 76, src, dst);
   1392       DO_imm_mandr_r("pblendw", 77, src, dst);
   1393       DO_imm_mandr_r("pblendw", 78, src, dst);
   1394       DO_imm_mandr_r("pblendw", 79, src, dst);
   1395       DO_imm_mandr_r("pblendw", 80, src, dst);
   1396       DO_imm_mandr_r("pblendw", 81, src, dst);
   1397       DO_imm_mandr_r("pblendw", 82, src, dst);
   1398       DO_imm_mandr_r("pblendw", 83, src, dst);
   1399       DO_imm_mandr_r("pblendw", 84, src, dst);
   1400       DO_imm_mandr_r("pblendw", 85, src, dst);
   1401       DO_imm_mandr_r("pblendw", 86, src, dst);
   1402       DO_imm_mandr_r("pblendw", 87, src, dst);
   1403       DO_imm_mandr_r("pblendw", 88, src, dst);
   1404       DO_imm_mandr_r("pblendw", 89, src, dst);
   1405       DO_imm_mandr_r("pblendw", 90, src, dst);
   1406       DO_imm_mandr_r("pblendw", 91, src, dst);
   1407       DO_imm_mandr_r("pblendw", 92, src, dst);
   1408       DO_imm_mandr_r("pblendw", 93, src, dst);
   1409       DO_imm_mandr_r("pblendw", 94, src, dst);
   1410       DO_imm_mandr_r("pblendw", 95, src, dst);
   1411       DO_imm_mandr_r("pblendw", 96, src, dst);
   1412       DO_imm_mandr_r("pblendw", 97, src, dst);
   1413       DO_imm_mandr_r("pblendw", 98, src, dst);
   1414       DO_imm_mandr_r("pblendw", 99, src, dst);
   1415       DO_imm_mandr_r("pblendw", 100, src, dst);
   1416       DO_imm_mandr_r("pblendw", 101, src, dst);
   1417       DO_imm_mandr_r("pblendw", 102, src, dst);
   1418       DO_imm_mandr_r("pblendw", 103, src, dst);
   1419       DO_imm_mandr_r("pblendw", 104, src, dst);
   1420       DO_imm_mandr_r("pblendw", 105, src, dst);
   1421       DO_imm_mandr_r("pblendw", 106, src, dst);
   1422       DO_imm_mandr_r("pblendw", 107, src, dst);
   1423       DO_imm_mandr_r("pblendw", 108, src, dst);
   1424       DO_imm_mandr_r("pblendw", 109, src, dst);
   1425       DO_imm_mandr_r("pblendw", 110, src, dst);
   1426       DO_imm_mandr_r("pblendw", 111, src, dst);
   1427       DO_imm_mandr_r("pblendw", 112, src, dst);
   1428       DO_imm_mandr_r("pblendw", 113, src, dst);
   1429       DO_imm_mandr_r("pblendw", 114, src, dst);
   1430       DO_imm_mandr_r("pblendw", 115, src, dst);
   1431       DO_imm_mandr_r("pblendw", 116, src, dst);
   1432       DO_imm_mandr_r("pblendw", 117, src, dst);
   1433       DO_imm_mandr_r("pblendw", 118, src, dst);
   1434       DO_imm_mandr_r("pblendw", 119, src, dst);
   1435       DO_imm_mandr_r("pblendw", 120, src, dst);
   1436       DO_imm_mandr_r("pblendw", 121, src, dst);
   1437       DO_imm_mandr_r("pblendw", 122, src, dst);
   1438       DO_imm_mandr_r("pblendw", 123, src, dst);
   1439       DO_imm_mandr_r("pblendw", 124, src, dst);
   1440       DO_imm_mandr_r("pblendw", 125, src, dst);
   1441       DO_imm_mandr_r("pblendw", 126, src, dst);
   1442       DO_imm_mandr_r("pblendw", 127, src, dst);
   1443       DO_imm_mandr_r("pblendw", 128, src, dst);
   1444       DO_imm_mandr_r("pblendw", 129, src, dst);
   1445       DO_imm_mandr_r("pblendw", 130, src, dst);
   1446       DO_imm_mandr_r("pblendw", 131, src, dst);
   1447       DO_imm_mandr_r("pblendw", 132, src, dst);
   1448       DO_imm_mandr_r("pblendw", 133, src, dst);
   1449       DO_imm_mandr_r("pblendw", 134, src, dst);
   1450       DO_imm_mandr_r("pblendw", 135, src, dst);
   1451       DO_imm_mandr_r("pblendw", 136, src, dst);
   1452       DO_imm_mandr_r("pblendw", 137, src, dst);
   1453       DO_imm_mandr_r("pblendw", 138, src, dst);
   1454       DO_imm_mandr_r("pblendw", 139, src, dst);
   1455       DO_imm_mandr_r("pblendw", 140, src, dst);
   1456       DO_imm_mandr_r("pblendw", 141, src, dst);
   1457       DO_imm_mandr_r("pblendw", 142, src, dst);
   1458       DO_imm_mandr_r("pblendw", 143, src, dst);
   1459       DO_imm_mandr_r("pblendw", 144, src, dst);
   1460       DO_imm_mandr_r("pblendw", 145, src, dst);
   1461       DO_imm_mandr_r("pblendw", 146, src, dst);
   1462       DO_imm_mandr_r("pblendw", 147, src, dst);
   1463       DO_imm_mandr_r("pblendw", 148, src, dst);
   1464       DO_imm_mandr_r("pblendw", 149, src, dst);
   1465       DO_imm_mandr_r("pblendw", 150, src, dst);
   1466       DO_imm_mandr_r("pblendw", 151, src, dst);
   1467       DO_imm_mandr_r("pblendw", 152, src, dst);
   1468       DO_imm_mandr_r("pblendw", 153, src, dst);
   1469       DO_imm_mandr_r("pblendw", 154, src, dst);
   1470       DO_imm_mandr_r("pblendw", 155, src, dst);
   1471       DO_imm_mandr_r("pblendw", 156, src, dst);
   1472       DO_imm_mandr_r("pblendw", 157, src, dst);
   1473       DO_imm_mandr_r("pblendw", 158, src, dst);
   1474       DO_imm_mandr_r("pblendw", 159, src, dst);
   1475       DO_imm_mandr_r("pblendw", 160, src, dst);
   1476       DO_imm_mandr_r("pblendw", 161, src, dst);
   1477       DO_imm_mandr_r("pblendw", 162, src, dst);
   1478       DO_imm_mandr_r("pblendw", 163, src, dst);
   1479       DO_imm_mandr_r("pblendw", 164, src, dst);
   1480       DO_imm_mandr_r("pblendw", 165, src, dst);
   1481       DO_imm_mandr_r("pblendw", 166, src, dst);
   1482       DO_imm_mandr_r("pblendw", 167, src, dst);
   1483       DO_imm_mandr_r("pblendw", 168, src, dst);
   1484       DO_imm_mandr_r("pblendw", 169, src, dst);
   1485       DO_imm_mandr_r("pblendw", 170, src, dst);
   1486       DO_imm_mandr_r("pblendw", 171, src, dst);
   1487       DO_imm_mandr_r("pblendw", 172, src, dst);
   1488       DO_imm_mandr_r("pblendw", 173, src, dst);
   1489       DO_imm_mandr_r("pblendw", 174, src, dst);
   1490       DO_imm_mandr_r("pblendw", 175, src, dst);
   1491       DO_imm_mandr_r("pblendw", 176, src, dst);
   1492       DO_imm_mandr_r("pblendw", 177, src, dst);
   1493       DO_imm_mandr_r("pblendw", 178, src, dst);
   1494       DO_imm_mandr_r("pblendw", 179, src, dst);
   1495       DO_imm_mandr_r("pblendw", 180, src, dst);
   1496       DO_imm_mandr_r("pblendw", 181, src, dst);
   1497       DO_imm_mandr_r("pblendw", 182, src, dst);
   1498       DO_imm_mandr_r("pblendw", 183, src, dst);
   1499       DO_imm_mandr_r("pblendw", 184, src, dst);
   1500       DO_imm_mandr_r("pblendw", 185, src, dst);
   1501       DO_imm_mandr_r("pblendw", 186, src, dst);
   1502       DO_imm_mandr_r("pblendw", 187, src, dst);
   1503       DO_imm_mandr_r("pblendw", 188, src, dst);
   1504       DO_imm_mandr_r("pblendw", 189, src, dst);
   1505       DO_imm_mandr_r("pblendw", 190, src, dst);
   1506       DO_imm_mandr_r("pblendw", 191, src, dst);
   1507       DO_imm_mandr_r("pblendw", 192, src, dst);
   1508       DO_imm_mandr_r("pblendw", 193, src, dst);
   1509       DO_imm_mandr_r("pblendw", 194, src, dst);
   1510       DO_imm_mandr_r("pblendw", 195, src, dst);
   1511       DO_imm_mandr_r("pblendw", 196, src, dst);
   1512       DO_imm_mandr_r("pblendw", 197, src, dst);
   1513       DO_imm_mandr_r("pblendw", 198, src, dst);
   1514       DO_imm_mandr_r("pblendw", 199, src, dst);
   1515       DO_imm_mandr_r("pblendw", 200, src, dst);
   1516       DO_imm_mandr_r("pblendw", 201, src, dst);
   1517       DO_imm_mandr_r("pblendw", 202, src, dst);
   1518       DO_imm_mandr_r("pblendw", 203, src, dst);
   1519       DO_imm_mandr_r("pblendw", 204, src, dst);
   1520       DO_imm_mandr_r("pblendw", 205, src, dst);
   1521       DO_imm_mandr_r("pblendw", 206, src, dst);
   1522       DO_imm_mandr_r("pblendw", 207, src, dst);
   1523       DO_imm_mandr_r("pblendw", 208, src, dst);
   1524       DO_imm_mandr_r("pblendw", 209, src, dst);
   1525       DO_imm_mandr_r("pblendw", 210, src, dst);
   1526       DO_imm_mandr_r("pblendw", 211, src, dst);
   1527       DO_imm_mandr_r("pblendw", 212, src, dst);
   1528       DO_imm_mandr_r("pblendw", 213, src, dst);
   1529       DO_imm_mandr_r("pblendw", 214, src, dst);
   1530       DO_imm_mandr_r("pblendw", 215, src, dst);
   1531       DO_imm_mandr_r("pblendw", 216, src, dst);
   1532       DO_imm_mandr_r("pblendw", 217, src, dst);
   1533       DO_imm_mandr_r("pblendw", 218, src, dst);
   1534       DO_imm_mandr_r("pblendw", 219, src, dst);
   1535       DO_imm_mandr_r("pblendw", 220, src, dst);
   1536       DO_imm_mandr_r("pblendw", 221, src, dst);
   1537       DO_imm_mandr_r("pblendw", 222, src, dst);
   1538       DO_imm_mandr_r("pblendw", 223, src, dst);
   1539       DO_imm_mandr_r("pblendw", 224, src, dst);
   1540       DO_imm_mandr_r("pblendw", 225, src, dst);
   1541       DO_imm_mandr_r("pblendw", 226, src, dst);
   1542       DO_imm_mandr_r("pblendw", 227, src, dst);
   1543       DO_imm_mandr_r("pblendw", 228, src, dst);
   1544       DO_imm_mandr_r("pblendw", 229, src, dst);
   1545       DO_imm_mandr_r("pblendw", 230, src, dst);
   1546       DO_imm_mandr_r("pblendw", 231, src, dst);
   1547       DO_imm_mandr_r("pblendw", 232, src, dst);
   1548       DO_imm_mandr_r("pblendw", 233, src, dst);
   1549       DO_imm_mandr_r("pblendw", 234, src, dst);
   1550       DO_imm_mandr_r("pblendw", 235, src, dst);
   1551       DO_imm_mandr_r("pblendw", 236, src, dst);
   1552       DO_imm_mandr_r("pblendw", 237, src, dst);
   1553       DO_imm_mandr_r("pblendw", 238, src, dst);
   1554       DO_imm_mandr_r("pblendw", 239, src, dst);
   1555       DO_imm_mandr_r("pblendw", 240, src, dst);
   1556       DO_imm_mandr_r("pblendw", 241, src, dst);
   1557       DO_imm_mandr_r("pblendw", 242, src, dst);
   1558       DO_imm_mandr_r("pblendw", 243, src, dst);
   1559       DO_imm_mandr_r("pblendw", 244, src, dst);
   1560       DO_imm_mandr_r("pblendw", 245, src, dst);
   1561       DO_imm_mandr_r("pblendw", 246, src, dst);
   1562       DO_imm_mandr_r("pblendw", 247, src, dst);
   1563       DO_imm_mandr_r("pblendw", 248, src, dst);
   1564       DO_imm_mandr_r("pblendw", 249, src, dst);
   1565       DO_imm_mandr_r("pblendw", 250, src, dst);
   1566       DO_imm_mandr_r("pblendw", 251, src, dst);
   1567       DO_imm_mandr_r("pblendw", 252, src, dst);
   1568       DO_imm_mandr_r("pblendw", 253, src, dst);
   1569       DO_imm_mandr_r("pblendw", 254, src, dst);
   1570       DO_imm_mandr_r("pblendw", 255, src, dst);
   1571    }
   1572 }
   1573 
   1574 
   1575 void test_PCMPEQQ ( void )
   1576 {
   1577    V128 src, dst;
   1578    Int i;
   1579    for (i = 0; i < 10; i++) {
   1580       randV128(&src);
   1581       randV128(&dst);
   1582       switch (i - 6) {
   1583          case 0: memset(&src[0], 0x55, 8);
   1584                  memset(&dst[0], 0x55, 8); break;
   1585          case 1: memset(&src[8], 0x55, 8);
   1586                  memset(&dst[8], 0x55, 8); break;
   1587          default:
   1588             break;
   1589       }
   1590       DO_mandr_r("pcmpeqq", src, dst);
   1591    }
   1592 }
   1593 
   1594 
   1595 void test_PEXTRB ( void )
   1596 {
   1597    V128 src;
   1598    randV128(&src);
   1599    DO_imm_r_to_mandrscalar("pextrb", 0, src, "d");
   1600    DO_imm_r_to_mandrscalar("pextrb", 1, src, "d");
   1601    DO_imm_r_to_mandrscalar("pextrb", 2, src, "d");
   1602    DO_imm_r_to_mandrscalar("pextrb", 3, src, "d");
   1603    DO_imm_r_to_mandrscalar("pextrb", 4, src, "d");
   1604    DO_imm_r_to_mandrscalar("pextrb", 5, src, "d");
   1605    DO_imm_r_to_mandrscalar("pextrb", 6, src, "d");
   1606    DO_imm_r_to_mandrscalar("pextrb", 7, src, "d");
   1607    DO_imm_r_to_mandrscalar("pextrb", 8, src, "d");
   1608    DO_imm_r_to_mandrscalar("pextrb", 9, src, "d");
   1609    DO_imm_r_to_mandrscalar("pextrb", 10, src, "d");
   1610    DO_imm_r_to_mandrscalar("pextrb", 11, src, "d");
   1611    DO_imm_r_to_mandrscalar("pextrb", 12, src, "d");
   1612    DO_imm_r_to_mandrscalar("pextrb", 13, src, "d");
   1613    DO_imm_r_to_mandrscalar("pextrb", 14, src, "d");
   1614    DO_imm_r_to_mandrscalar("pextrb", 15, src, "d");
   1615 }
   1616 
   1617 void test_PINSRB ( void )
   1618 {
   1619    ULong src;
   1620    src = randULong();
   1621    DO_imm_mandrscalar_to_r("pinsrb", 0, src, "d");
   1622    src = randULong();
   1623    DO_imm_mandrscalar_to_r("pinsrb", 1, src, "d");
   1624    src = randULong();
   1625    DO_imm_mandrscalar_to_r("pinsrb", 2, src, "d");
   1626    src = randULong();
   1627    DO_imm_mandrscalar_to_r("pinsrb", 3, src, "d");
   1628    src = randULong();
   1629    DO_imm_mandrscalar_to_r("pinsrb", 4, src, "d");
   1630    src = randULong();
   1631    DO_imm_mandrscalar_to_r("pinsrb", 5, src, "d");
   1632    src = randULong();
   1633    DO_imm_mandrscalar_to_r("pinsrb", 6, src, "d");
   1634    src = randULong();
   1635    DO_imm_mandrscalar_to_r("pinsrb", 7, src, "d");
   1636    src = randULong();
   1637    DO_imm_mandrscalar_to_r("pinsrb", 8, src, "d");
   1638    src = randULong();
   1639    DO_imm_mandrscalar_to_r("pinsrb", 9, src, "d");
   1640    src = randULong();
   1641    DO_imm_mandrscalar_to_r("pinsrb", 10, src, "d");
   1642    src = randULong();
   1643    DO_imm_mandrscalar_to_r("pinsrb", 11, src, "d");
   1644    src = randULong();
   1645    DO_imm_mandrscalar_to_r("pinsrb", 12, src, "d");
   1646    src = randULong();
   1647    DO_imm_mandrscalar_to_r("pinsrb", 13, src, "d");
   1648    src = randULong();
   1649    DO_imm_mandrscalar_to_r("pinsrb", 14, src, "d");
   1650    src = randULong();
   1651    DO_imm_mandrscalar_to_r("pinsrb", 15, src, "d");
   1652 }
   1653 
   1654 
   1655 void test_PEXTRW ( void )
   1656 {
   1657    V128 src;
   1658    randV128(&src);
   1659    DO_imm_r_to_mandrscalar("pextrw", 0, src, "d");
   1660    DO_imm_r_to_mandrscalar("pextrw", 1, src, "d");
   1661    DO_imm_r_to_mandrscalar("pextrw", 2, src, "d");
   1662    DO_imm_r_to_mandrscalar("pextrw", 3, src, "d");
   1663    DO_imm_r_to_mandrscalar("pextrw", 4, src, "d");
   1664    DO_imm_r_to_mandrscalar("pextrw", 5, src, "d");
   1665    DO_imm_r_to_mandrscalar("pextrw", 6, src, "d");
   1666    DO_imm_r_to_mandrscalar("pextrw", 7, src, "d");
   1667 }
   1668 
   1669 void test_PINSRW ( void )
   1670 {
   1671    ULong src;
   1672    src = randULong();
   1673    DO_imm_mandrscalar_to_r("pinsrw", 0, src, "d");
   1674    src = randULong();
   1675    DO_imm_mandrscalar_to_r("pinsrw", 1, src, "d");
   1676    src = randULong();
   1677    DO_imm_mandrscalar_to_r("pinsrw", 2, src, "d");
   1678    src = randULong();
   1679    DO_imm_mandrscalar_to_r("pinsrw", 3, src, "d");
   1680    src = randULong();
   1681    DO_imm_mandrscalar_to_r("pinsrw", 4, src, "d");
   1682    src = randULong();
   1683    DO_imm_mandrscalar_to_r("pinsrw", 5, src, "d");
   1684    src = randULong();
   1685    DO_imm_mandrscalar_to_r("pinsrw", 6, src, "d");
   1686    src = randULong();
   1687    DO_imm_mandrscalar_to_r("pinsrw", 7, src, "d");
   1688 }
   1689 
   1690 
   1691 void test_PEXTRD ( void )
   1692 {
   1693    V128 src;
   1694    randV128(&src);
   1695    DO_imm_r_to_mandrscalar("pextrd", 0, src, "d");
   1696    DO_imm_r_to_mandrscalar("pextrd", 1, src, "d");
   1697    DO_imm_r_to_mandrscalar("pextrd", 2, src, "d");
   1698    DO_imm_r_to_mandrscalar("pextrd", 3, src, "d");
   1699 }
   1700 
   1701 void test_PINSRD ( void )
   1702 {
   1703    ULong src;
   1704    src = randULong();
   1705    DO_imm_mandrscalar_to_r("pinsrd", 0, src, "d");
   1706    src = randULong();
   1707    DO_imm_mandrscalar_to_r("pinsrd", 1, src, "d");
   1708    src = randULong();
   1709    DO_imm_mandrscalar_to_r("pinsrd", 2, src, "d");
   1710    src = randULong();
   1711    DO_imm_mandrscalar_to_r("pinsrd", 3, src, "d");
   1712 }
   1713 
   1714 
   1715 void test_PEXTRQ ( void )
   1716 {
   1717    V128 src;
   1718    randV128(&src);
   1719    DO_imm_r_to_mandrscalar("pextrq", 0, src, "");
   1720    DO_imm_r_to_mandrscalar("pextrq", 1, src, "");
   1721 }
   1722 
   1723 void test_PINSRQ ( void )
   1724 {
   1725    ULong src;
   1726    src = randULong();
   1727    DO_imm_mandrscalar_to_r("pinsrq", 0, src, "");
   1728    src = randULong();
   1729    DO_imm_mandrscalar_to_r("pinsrq", 1, src, "");
   1730 }
   1731 
   1732 
   1733 void test_EXTRACTPS ( void )
   1734 {
   1735    V128 src;
   1736    randV128(&src);
   1737    DO_imm_r_to_mandrscalar("extractps", 0, src, "d");
   1738    DO_imm_r_to_mandrscalar("extractps", 1, src, "d");
   1739    DO_imm_r_to_mandrscalar("extractps", 2, src, "d");
   1740    DO_imm_r_to_mandrscalar("extractps", 3, src, "d");
   1741 }
   1742 
   1743 
   1744 void test_PHMINPOSUW ( void )
   1745 {
   1746    V128 src, dst;
   1747    Int i;
   1748    for (i = 0; i < 20; i++) {
   1749       randV128(&src);
   1750       randV128(&dst);
   1751       DO_mandr_r("phminposuw", src, dst);
   1752    }
   1753    memset(src, 0x55, sizeof(src));
   1754    memset(dst, 0xAA, sizeof(dst));
   1755    DO_mandr_r("phminposuw", src, dst);
   1756 }
   1757 
   1758 void test_PMAXSB ( void )
   1759 {
   1760    V128 src, dst;
   1761    Int i;
   1762    for (i = 0; i < 10; i++) {
   1763       randV128(&src);
   1764       randV128(&dst);
   1765       DO_mandr_r("pmaxsb", src, dst);
   1766    }
   1767 }
   1768 
   1769 void test_PMAXSD ( void )
   1770 {
   1771    V128 src, dst;
   1772    Int i;
   1773    for (i = 0; i < 10; i++) {
   1774       randV128(&src);
   1775       randV128(&dst);
   1776       DO_mandr_r("pmaxsd", src, dst);
   1777    }
   1778 }
   1779 
   1780 void test_PMAXUD ( void )
   1781 {
   1782    V128 src, dst;
   1783    Int i;
   1784    for (i = 0; i < 10; i++) {
   1785       randV128(&src);
   1786       randV128(&dst);
   1787       DO_mandr_r("pmaxud", src, dst);
   1788    }
   1789 }
   1790 
   1791 void test_PMAXUW ( void )
   1792 {
   1793    V128 src, dst;
   1794    Int i;
   1795    for (i = 0; i < 10; i++) {
   1796       randV128(&src);
   1797       randV128(&dst);
   1798       DO_mandr_r("pmaxuw", src, dst);
   1799    }
   1800 }
   1801 
   1802 void test_PMINSB ( void )
   1803 {
   1804    V128 src, dst;
   1805    Int i;
   1806    for (i = 0; i < 10; i++) {
   1807       randV128(&src);
   1808       randV128(&dst);
   1809       DO_mandr_r("pminsb", src, dst);
   1810    }
   1811 }
   1812 
   1813 void test_PMINSD ( void )
   1814 {
   1815    V128 src, dst;
   1816    Int i;
   1817    for (i = 0; i < 10; i++) {
   1818       randV128(&src);
   1819       randV128(&dst);
   1820       DO_mandr_r("pminsd", src, dst);
   1821    }
   1822 }
   1823 
   1824 void test_PMINUD ( void )
   1825 {
   1826    V128 src, dst;
   1827    Int i;
   1828    for (i = 0; i < 10; i++) {
   1829       randV128(&src);
   1830       randV128(&dst);
   1831       DO_mandr_r("pminud", src, dst);
   1832    }
   1833 }
   1834 
   1835 void test_PMINUW ( void )
   1836 {
   1837    V128 src, dst;
   1838    Int i;
   1839    for (i = 0; i < 10; i++) {
   1840       randV128(&src);
   1841       randV128(&dst);
   1842       DO_mandr_r("pminuw", src, dst);
   1843    }
   1844 }
   1845 
   1846 void test_PMOVSXBW ( void )
   1847 {
   1848    V128 src, dst;
   1849    Int i;
   1850    for (i = 0; i < 10; i++) {
   1851       randV128(&src);
   1852       randV128(&dst);
   1853       DO_mandr_r("pmovsxbw", src, dst);
   1854    }
   1855 }
   1856 
   1857 void test_PMOVSXBD ( void )
   1858 {
   1859    V128 src, dst;
   1860    Int i;
   1861    for (i = 0; i < 10; i++) {
   1862       randV128(&src);
   1863       randV128(&dst);
   1864       DO_mandr_r("pmovsxbd", src, dst);
   1865    }
   1866 }
   1867 
   1868 void test_PMOVSXBQ ( void )
   1869 {
   1870    V128 src, dst;
   1871    Int i;
   1872    for (i = 0; i < 10; i++) {
   1873       randV128(&src);
   1874       randV128(&dst);
   1875       DO_mandr_r("pmovsxbq", src, dst);
   1876    }
   1877 }
   1878 
   1879 void test_PMOVSXWD ( void )
   1880 {
   1881    V128 src, dst;
   1882    Int i;
   1883    for (i = 0; i < 10; i++) {
   1884       randV128(&src);
   1885       randV128(&dst);
   1886       DO_mandr_r("pmovsxwd", src, dst);
   1887    }
   1888 }
   1889 
   1890 void test_PMOVSXWQ ( void )
   1891 {
   1892    V128 src, dst;
   1893    Int i;
   1894    for (i = 0; i < 10; i++) {
   1895       randV128(&src);
   1896       randV128(&dst);
   1897       DO_mandr_r("pmovsxwq", src, dst);
   1898    }
   1899 }
   1900 
   1901 void test_PMOVSXDQ ( void )
   1902 {
   1903    V128 src, dst;
   1904    Int i;
   1905    for (i = 0; i < 10; i++) {
   1906       randV128(&src);
   1907       randV128(&dst);
   1908       DO_mandr_r("pmovsxdq", src, dst);
   1909    }
   1910 }
   1911 
   1912 void test_PMOVZXBW ( void )
   1913 {
   1914    V128 src, dst;
   1915    Int i;
   1916    for (i = 0; i < 10; i++) {
   1917       randV128(&src);
   1918       randV128(&dst);
   1919       DO_mandr_r("pmovzxbw", src, dst);
   1920    }
   1921 }
   1922 
   1923 void test_PMOVZXBD ( void )
   1924 {
   1925    V128 src, dst;
   1926    Int i;
   1927    for (i = 0; i < 10; i++) {
   1928       randV128(&src);
   1929       randV128(&dst);
   1930       DO_mandr_r("pmovzxbd", src, dst);
   1931    }
   1932 }
   1933 
   1934 void test_PMOVZXBQ ( void )
   1935 {
   1936    V128 src, dst;
   1937    Int i;
   1938    for (i = 0; i < 10; i++) {
   1939       randV128(&src);
   1940       randV128(&dst);
   1941       DO_mandr_r("pmovzxbq", src, dst);
   1942    }
   1943 }
   1944 
   1945 void test_PMOVZXWD ( void )
   1946 {
   1947    V128 src, dst;
   1948    Int i;
   1949    for (i = 0; i < 10; i++) {
   1950       randV128(&src);
   1951       randV128(&dst);
   1952       DO_mandr_r("pmovzxwd", src, dst);
   1953    }
   1954 }
   1955 
   1956 void test_PMOVZXWQ ( void )
   1957 {
   1958    V128 src, dst;
   1959    Int i;
   1960    for (i = 0; i < 10; i++) {
   1961       randV128(&src);
   1962       randV128(&dst);
   1963       DO_mandr_r("pmovzxwq", src, dst);
   1964    }
   1965 }
   1966 
   1967 void test_PMOVZXDQ ( void )
   1968 {
   1969    V128 src, dst;
   1970    Int i;
   1971    for (i = 0; i < 10; i++) {
   1972       randV128(&src);
   1973       randV128(&dst);
   1974       DO_mandr_r("pmovzxdq", src, dst);
   1975    }
   1976 }
   1977 
   1978 void test_PMULDQ ( void )
   1979 {
   1980    V128 src, dst;
   1981    Int i;
   1982    for (i = 0; i < 10; i++) {
   1983       randV128(&src);
   1984       randV128(&dst);
   1985       DO_mandr_r("pmuldq", src, dst);
   1986    }
   1987 }
   1988 
   1989 
   1990 void test_PMULLD ( void )
   1991 {
   1992    V128 src, dst;
   1993    Int i;
   1994    for (i = 0; i < 10; i++) {
   1995       randV128(&src);
   1996       randV128(&dst);
   1997       DO_mandr_r("pmulld", src, dst);
   1998    }
   1999 }
   2000 
   2001 
   2002 void test_POPCNTQ ( void )
   2003 {
   2004    ULong block[4];
   2005    Int i;
   2006    ULong oszacp_mask = 0x8D5;
   2007    for (i = 0; i < 10; i++) {
   2008       block[0] = i == 0 ? 0 : randULong();
   2009       block[1] = randULong();
   2010       block[2] = randULong();
   2011       block[3] = randULong();
   2012       __asm__ __volatile__(
   2013          "movq %0,       %%rax"  "\n\t"
   2014          "movq 0(%%rax), %%rdi"  "\n\t"
   2015          "movq 8(%%rax), %%r11"  "\n\t"
   2016 #ifndef VGP_amd64_darwin
   2017          "popcntq %%rdi, %%r11"  "\n\t"
   2018 #else
   2019          "popcnt  %%rdi, %%r11"  "\n\t"
   2020 #endif
   2021          "movq %%r11, 16(%%rax)"  "\n\t"
   2022          "pushfq"                 "\n\t"
   2023          "popq %%r12"             "\n\t"
   2024          "movq %%r12, 24(%%rax)"  "\n"
   2025          : /*out*/
   2026          : /*in*/"r"(&block[0])
   2027          : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
   2028       );
   2029       printf("r popcntq  %016llx %016llx  %016llx %016llx\n",
   2030              block[0], block[1], block[2], block[3] & oszacp_mask);
   2031 
   2032       block[0] = i == 0 ? 0 : randULong();
   2033       block[1] = randULong();
   2034       block[2] = randULong();
   2035       block[3] = randULong();
   2036       __asm__ __volatile__(
   2037          "movq %0,       %%rax"  "\n\t"
   2038          "movq 8(%%rax), %%r11"  "\n\t"
   2039 #ifndef VGP_amd64_darwin
   2040          "popcntq 0(%%rax), %%r11"  "\n\t"
   2041 #else
   2042          "popcnt  0(%%rax), %%r11"  "\n\t"
   2043 #endif
   2044          "movq %%r11, 16(%%rax)"  "\n\t"
   2045          "pushfq"                 "\n\t"
   2046          "popq %%r12"             "\n\t"
   2047          "movq %%r12, 24(%%rax)"  "\n"
   2048          : /*out*/
   2049          : /*in*/"r"(&block[0])
   2050          : /*trash*/ "cc", "memory", "r11", "r12"
   2051       );
   2052       printf("m popcntq  %016llx %016llx  %016llx %016llx\n",
   2053              block[0], block[1], block[2], block[3] & oszacp_mask);
   2054    }
   2055 }
   2056 
   2057 
   2058 void test_POPCNTL ( void )
   2059 {
   2060    ULong block[4];
   2061    Int i;
   2062    ULong oszacp_mask = 0x8D5;
   2063    for (i = 0; i < 10; i++) {
   2064       block[0] = i == 0 ? 0 : randULong();
   2065       block[1] = randULong();
   2066       block[2] = randULong();
   2067       block[3] = randULong();
   2068       __asm__ __volatile__(
   2069          "movq %0,       %%rax"  "\n\t"
   2070          "movq 0(%%rax), %%rdi"  "\n\t"
   2071          "movq 8(%%rax), %%r11"  "\n\t"
   2072 #ifndef VGP_amd64_darwin
   2073          "popcntl %%edi, %%r11d"  "\n\t"
   2074 #else
   2075          "popcnt  %%edi, %%r11d"  "\n\t"
   2076 #endif
   2077          "movq %%r11, 16(%%rax)"  "\n\t"
   2078          "pushfq"                 "\n\t"
   2079          "popq %%r12"             "\n\t"
   2080          "movq %%r12, 24(%%rax)"  "\n"
   2081          : /*out*/
   2082          : /*in*/"r"(&block[0])
   2083          : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
   2084       );
   2085       printf("r popcntl  %016llx %016llx  %016llx %016llx\n",
   2086              block[0], block[1], block[2], block[3] & oszacp_mask);
   2087 
   2088       block[0] = i == 0 ? 0 : randULong();
   2089       block[1] = randULong();
   2090       block[2] = randULong();
   2091       block[3] = randULong();
   2092       __asm__ __volatile__(
   2093          "movq %0,       %%rax"  "\n\t"
   2094          "movq 8(%%rax), %%r11"  "\n\t"
   2095 #ifndef VGP_amd64_darwin
   2096          "popcntl 0(%%rax), %%r11d"  "\n\t"
   2097 #else
   2098          "popcnt  0(%%rax), %%r11d"  "\n\t"
   2099 #endif
   2100          "movq %%r11, 16(%%rax)"  "\n\t"
   2101          "pushfq"                 "\n\t"
   2102          "popq %%r12"             "\n\t"
   2103          "movq %%r12, 24(%%rax)"  "\n"
   2104          : /*out*/
   2105          : /*in*/"r"(&block[0])
   2106          : /*trash*/ "cc", "memory", "r11", "r12"
   2107       );
   2108       printf("m popcntl  %016llx %016llx  %016llx %016llx\n",
   2109              block[0], block[1], block[2], block[3] & oszacp_mask);
   2110    }
   2111 }
   2112 
   2113 
   2114 void test_POPCNTW ( void )
   2115 {
   2116    ULong block[4];
   2117    Int i;
   2118    ULong oszacp_mask = 0x8D5;
   2119    for (i = 0; i < 10; i++) {
   2120       block[0] = i == 0 ? 0 : randULong();
   2121       block[1] = randULong();
   2122       block[2] = randULong();
   2123       block[3] = randULong();
   2124       __asm__ __volatile__(
   2125          "movq %0,       %%rax"  "\n\t"
   2126          "movq 0(%%rax), %%rdi"  "\n\t"
   2127          "movq 8(%%rax), %%r11"  "\n\t"
   2128 #ifndef VGP_amd64_darwin
   2129          "popcntw %%di,  %%r11w"  "\n\t"
   2130 #else
   2131          "popcnt  %%di,  %%r11w"  "\n\t"
   2132 #endif
   2133          "movq %%r11, 16(%%rax)"  "\n\t"
   2134          "pushfq"                 "\n\t"
   2135          "popq %%r12"             "\n\t"
   2136          "movq %%r12, 24(%%rax)"  "\n"
   2137          : /*out*/
   2138          : /*in*/"r"(&block[0])
   2139          : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
   2140       );
   2141       printf("r popcntw  %016llx %016llx  %016llx %016llx\n",
   2142              block[0], block[1], block[2], block[3] & oszacp_mask);
   2143 
   2144       block[0] = i == 0 ? 0 : randULong();
   2145       block[1] = randULong();
   2146       block[2] = randULong();
   2147       block[3] = randULong();
   2148       __asm__ __volatile__(
   2149          "movq %0,       %%rax"  "\n\t"
   2150          "movq 8(%%rax), %%r11"  "\n\t"
   2151 #ifndef VGP_amd64_darwin
   2152          "popcntw 0(%%rax), %%r11w"  "\n\t"
   2153 #else
   2154          "popcnt  0(%%rax), %%r11w"  "\n\t"
   2155 #endif
   2156          "movq %%r11, 16(%%rax)"  "\n\t"
   2157          "pushfq"                 "\n\t"
   2158          "popq %%r12"             "\n\t"
   2159          "movq %%r12, 24(%%rax)"  "\n"
   2160          : /*out*/
   2161          : /*in*/"r"(&block[0])
   2162          : /*trash*/ "cc", "memory", "r11", "r12"
   2163       );
   2164       printf("m popcntw  %016llx %016llx  %016llx %016llx\n",
   2165              block[0], block[1], block[2], block[3] & oszacp_mask);
   2166    }
   2167 }
   2168 
   2169 
   2170 void test_PCMPGTQ ( void )
   2171 {
   2172    V128 spec[7];
   2173    do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0xffffffffffffffffULL );
   2174    do64HLtoV128( &spec[1], 0x0000000000000001ULL, 0xfffffffffffffffeULL );
   2175    do64HLtoV128( &spec[2], 0x7fffffffffffffffULL, 0x8000000000000001ULL );
   2176    do64HLtoV128( &spec[3], 0x8000000000000000ULL, 0x8000000000000000ULL );
   2177    do64HLtoV128( &spec[4], 0x8000000000000001ULL, 0x7fffffffffffffffULL );
   2178    do64HLtoV128( &spec[5], 0xfffffffffffffffeULL, 0x0000000000000001ULL );
   2179    do64HLtoV128( &spec[6], 0xffffffffffffffffULL, 0x0000000000000000ULL );
   2180 
   2181    V128 src, dst;
   2182    Int i, j;
   2183    for (i = 0; i < 10; i++) {
   2184       randV128(&src);
   2185       randV128(&dst);
   2186       DO_mandr_r("pcmpgtq", src, dst);
   2187    }
   2188    for (i = 0; i < 7; i++) {
   2189       for (j = 0; j < 7; j++) {
   2190          memcpy(&src, &spec[i], 16);
   2191          memcpy(&dst, &spec[j], 16);
   2192          DO_mandr_r("pcmpgtq", src, dst);
   2193       }
   2194    }
   2195 }
   2196 
   2197 /* ------------ ROUNDSD ------------ */
   2198 
   2199 void do_ROUNDSD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2200 {
   2201    if (mem) {
   2202       __asm__ __volatile__(
   2203          "movupd  (%1), %%xmm11"       "\n\t"
   2204          "roundsd $0, (%0), %%xmm11"   "\n\t"
   2205          "movupd  %%xmm11, (%1)"       "\n"
   2206          : /*OUT*/
   2207          : /*IN*/ "r"(src), "r"(dst)
   2208          : /*TRASH*/ "xmm11"
   2209       );
   2210    } else {
   2211       __asm__ __volatile__(
   2212          "movupd  (%1), %%xmm11"         "\n\t"
   2213          "movupd  (%0), %%xmm2"          "\n\t"
   2214          "roundsd $0, %%xmm2, %%xmm11"   "\n\t"
   2215          "movupd  %%xmm11, (%1)"         "\n"
   2216          : /*OUT*/
   2217          : /*IN*/ "r"(src), "r"(dst)
   2218          : /*TRASH*/ "xmm11","xmm2"
   2219       );
   2220    }
   2221 }
   2222 
   2223 void do_ROUNDSD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2224 {
   2225    if (mem) {
   2226       __asm__ __volatile__(
   2227          "movupd  (%1), %%xmm11"       "\n\t"
   2228          "roundsd $1, (%0), %%xmm11"   "\n\t"
   2229          "movupd  %%xmm11, (%1)"       "\n"
   2230          : /*OUT*/
   2231          : /*IN*/ "r"(src), "r"(dst)
   2232          : /*TRASH*/ "xmm11"
   2233       );
   2234    } else {
   2235       __asm__ __volatile__(
   2236          "movupd  (%1), %%xmm11"         "\n\t"
   2237          "movupd  (%0), %%xmm2"          "\n\t"
   2238          "roundsd $1, %%xmm2, %%xmm11"   "\n\t"
   2239          "movupd  %%xmm11, (%1)"         "\n"
   2240          : /*OUT*/
   2241          : /*IN*/ "r"(src), "r"(dst)
   2242          : /*TRASH*/ "xmm11","xmm2"
   2243       );
   2244    }
   2245 }
   2246 
   2247 void do_ROUNDSD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2248 {
   2249    if (mem) {
   2250       __asm__ __volatile__(
   2251          "movupd  (%1), %%xmm11"       "\n\t"
   2252          "roundsd $2, (%0), %%xmm11"   "\n\t"
   2253          "movupd  %%xmm11, (%1)"       "\n"
   2254          : /*OUT*/
   2255          : /*IN*/ "r"(src), "r"(dst)
   2256          : /*TRASH*/ "xmm11"
   2257       );
   2258    } else {
   2259       __asm__ __volatile__(
   2260          "movupd  (%1), %%xmm11"         "\n\t"
   2261          "movupd  (%0), %%xmm2"          "\n\t"
   2262          "roundsd $2, %%xmm2, %%xmm11"   "\n\t"
   2263          "movupd  %%xmm11, (%1)"         "\n"
   2264          : /*OUT*/
   2265          : /*IN*/ "r"(src), "r"(dst)
   2266          : /*TRASH*/ "xmm11","xmm2"
   2267       );
   2268    }
   2269 }
   2270 
   2271 void do_ROUNDSD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2272 {
   2273    if (mem) {
   2274       __asm__ __volatile__(
   2275          "movupd  (%1), %%xmm11"       "\n\t"
   2276          "roundsd $3, (%0), %%xmm11"   "\n\t"
   2277          "movupd  %%xmm11, (%1)"       "\n"
   2278          : /*OUT*/
   2279          : /*IN*/ "r"(src), "r"(dst)
   2280          : /*TRASH*/ "xmm11"
   2281       );
   2282    } else {
   2283       __asm__ __volatile__(
   2284          "movupd  (%1), %%xmm11"         "\n\t"
   2285          "movupd  (%0), %%xmm2"          "\n\t"
   2286          "roundsd $3, %%xmm2, %%xmm11"   "\n\t"
   2287          "movupd  %%xmm11, (%1)"         "\n"
   2288          : /*OUT*/
   2289          : /*IN*/ "r"(src), "r"(dst)
   2290          : /*TRASH*/ "xmm11","xmm2"
   2291       );
   2292    }
   2293 }
   2294 
   2295 void do_ROUNDSD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
   2296 {
   2297    if (mem) {
   2298       __asm__ __volatile__(
   2299          "movupd  (%1), %%xmm11"       "\n\t"
   2300          "roundsd $4, (%0), %%xmm11"   "\n\t"
   2301          "movupd  %%xmm11, (%1)"       "\n"
   2302          : /*OUT*/
   2303          : /*IN*/ "r"(src), "r"(dst)
   2304          : /*TRASH*/ "xmm11"
   2305       );
   2306    } else {
   2307       __asm__ __volatile__(
   2308          "movupd  (%1), %%xmm11"         "\n\t"
   2309          "movupd  (%0), %%xmm2"          "\n\t"
   2310          "roundsd $4, %%xmm2, %%xmm11"   "\n\t"
   2311          "movupd  %%xmm11, (%1)"         "\n"
   2312          : /*OUT*/
   2313          : /*IN*/ "r"(src), "r"(dst)
   2314          : /*TRASH*/ "xmm11","xmm2"
   2315       );
   2316    }
   2317 }
   2318 
   2319 void test_ROUNDSD_w_immediate_rounding ( void )
   2320 {
   2321    double vals[22];
   2322    Int i = 0;
   2323    vals[i++] = 0.0;
   2324    vals[i++] = -0.0;
   2325    vals[i++] = mkPosInf();
   2326    vals[i++] = mkNegInf();
   2327    vals[i++] = mkPosNan();
   2328    vals[i++] = mkNegNan();
   2329    vals[i++] = -1.3;
   2330    vals[i++] = -1.1;
   2331    vals[i++] = -0.9;
   2332    vals[i++] = -0.7;
   2333    vals[i++] = -0.50001;
   2334    vals[i++] = -0.49999;
   2335    vals[i++] = -0.3;
   2336    vals[i++] = -0.1;
   2337    vals[i++] = 0.1;
   2338    vals[i++] = 0.3;
   2339    vals[i++] = 0.49999;
   2340    vals[i++] = 0.50001;
   2341    vals[i++] = 0.7;
   2342    vals[i++] = 0.9;
   2343    vals[i++] = 1.1;
   2344    vals[i++] = 1.3;
   2345    assert(i == 22);
   2346 
   2347    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   2348       V128 src, dst;
   2349 
   2350       randV128(&src);
   2351       randV128(&dst);
   2352       memcpy(&src[0], &vals[i], 8);
   2353       do_ROUNDSD_000(False/*reg*/, &src, &dst);
   2354       printf("r roundsd_000  ");
   2355       showV128(&src);
   2356       printf(" ");
   2357       showV128(&dst);
   2358       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2359       printf("\n");
   2360 
   2361       randV128(&src);
   2362       randV128(&dst);
   2363       memcpy(&src[0], &vals[i], 8);
   2364       do_ROUNDSD_000(True/*mem*/, &src, &dst);
   2365       printf("m roundsd_000  ");
   2366       showV128(&src);
   2367       printf(" ");
   2368       showV128(&dst);
   2369       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2370       printf("\n");
   2371 
   2372 
   2373       randV128(&src);
   2374       randV128(&dst);
   2375       memcpy(&src[0], &vals[i], 8);
   2376       do_ROUNDSD_001(False/*reg*/, &src, &dst);
   2377       printf("r roundsd_001  ");
   2378       showV128(&src);
   2379       printf(" ");
   2380       showV128(&dst);
   2381       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2382       printf("\n");
   2383 
   2384       randV128(&src);
   2385       randV128(&dst);
   2386       memcpy(&src[0], &vals[i], 8);
   2387       do_ROUNDSD_001(True/*mem*/, &src, &dst);
   2388       printf("m roundsd_001  ");
   2389       showV128(&src);
   2390       printf(" ");
   2391       showV128(&dst);
   2392       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2393       printf("\n");
   2394 
   2395 
   2396       randV128(&src);
   2397       randV128(&dst);
   2398       memcpy(&src[0], &vals[i], 8);
   2399       do_ROUNDSD_010(False/*reg*/, &src, &dst);
   2400       printf("r roundsd_010  ");
   2401       showV128(&src);
   2402       printf(" ");
   2403       showV128(&dst);
   2404       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2405       printf("\n");
   2406 
   2407       randV128(&src);
   2408       randV128(&dst);
   2409       memcpy(&src[0], &vals[i], 8);
   2410       do_ROUNDSD_010(True/*mem*/, &src, &dst);
   2411       printf("m roundsd_010  ");
   2412       showV128(&src);
   2413       printf(" ");
   2414       showV128(&dst);
   2415       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2416       printf("\n");
   2417 
   2418 
   2419       randV128(&src);
   2420       randV128(&dst);
   2421       memcpy(&src[0], &vals[i], 8);
   2422       do_ROUNDSD_011(False/*reg*/, &src, &dst);
   2423       printf("r roundsd_011  ");
   2424       showV128(&src);
   2425       printf(" ");
   2426       showV128(&dst);
   2427       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2428       printf("\n");
   2429 
   2430       randV128(&src);
   2431       randV128(&dst);
   2432       memcpy(&src[0], &vals[i], 8);
   2433       do_ROUNDSD_011(True/*mem*/, &src, &dst);
   2434       printf("m roundsd_011  ");
   2435       showV128(&src);
   2436       printf(" ");
   2437       showV128(&dst);
   2438       printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2439       printf("\n");
   2440    }
   2441 }
   2442 
   2443 void test_ROUNDSD_w_mxcsr_rounding ( void )
   2444 {
   2445    UInt rm;
   2446    double vals[22];
   2447    Int i = 0;
   2448    vals[i++] = 0.0;
   2449    vals[i++] = -0.0;
   2450    vals[i++] = mkPosInf();
   2451    vals[i++] = mkNegInf();
   2452    vals[i++] = mkPosNan();
   2453    vals[i++] = mkNegNan();
   2454    vals[i++] = -1.3;
   2455    vals[i++] = -1.1;
   2456    vals[i++] = -0.9;
   2457    vals[i++] = -0.7;
   2458    vals[i++] = -0.50001;
   2459    vals[i++] = -0.49999;
   2460    vals[i++] = -0.3;
   2461    vals[i++] = -0.1;
   2462    vals[i++] = 0.1;
   2463    vals[i++] = 0.3;
   2464    vals[i++] = 0.49999;
   2465    vals[i++] = 0.50001;
   2466    vals[i++] = 0.7;
   2467    vals[i++] = 0.9;
   2468    vals[i++] = 1.1;
   2469    vals[i++] = 1.3;
   2470    assert(i == 22);
   2471 
   2472    rm = get_sse_roundingmode();
   2473    assert(rm == 0); // 0 == RN == default
   2474 
   2475    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   2476       V128 src, dst;
   2477 
   2478       for (rm = 0; rm <= 3; rm++) {
   2479          set_sse_roundingmode(rm);
   2480 
   2481          randV128(&src);
   2482          randV128(&dst);
   2483          memcpy(&src[0], &vals[i], 8);
   2484          do_ROUNDSD_1XX(False/*reg*/, &src, &dst);
   2485          printf("r (rm=%u) roundsd_1XX  ", rm);
   2486          showV128(&src);
   2487          printf(" ");
   2488          showV128(&dst);
   2489          printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2490          printf("\n");
   2491 
   2492          randV128(&src);
   2493          randV128(&dst);
   2494          memcpy(&src[0], &vals[i], 8);
   2495          do_ROUNDSD_1XX(True/*mem*/, &src, &dst);
   2496          printf("m (rm=%u) roundsd_1XX  ", rm);
   2497          showV128(&src);
   2498          printf(" ");
   2499          showV128(&dst);
   2500          printf("  %10f %10f", vals[i], *(double*)(&dst[0]));
   2501          printf("\n");
   2502       }
   2503    }
   2504 
   2505    rm = get_sse_roundingmode();
   2506    assert(rm == 3);
   2507    set_sse_roundingmode(0);
   2508    rm = get_sse_roundingmode();
   2509    assert(rm == 0); // 0 == RN == default
   2510 }
   2511 
   2512 
   2513 /* ------------ ROUNDSS ------------ */
   2514 
   2515 void do_ROUNDSS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2516 {
   2517    if (mem) {
   2518       __asm__ __volatile__(
   2519          "movupd  (%1), %%xmm11"       "\n\t"
   2520          "roundss $0, (%0), %%xmm11"   "\n\t"
   2521          "movupd  %%xmm11, (%1)"       "\n"
   2522          : /*OUT*/
   2523          : /*IN*/ "r"(src), "r"(dst)
   2524          : /*TRASH*/ "xmm11"
   2525       );
   2526    } else {
   2527       __asm__ __volatile__(
   2528          "movupd  (%1), %%xmm11"         "\n\t"
   2529          "movupd  (%0), %%xmm2"          "\n\t"
   2530          "roundss $0, %%xmm2, %%xmm11"   "\n\t"
   2531          "movupd  %%xmm11, (%1)"         "\n"
   2532          : /*OUT*/
   2533          : /*IN*/ "r"(src), "r"(dst)
   2534          : /*TRASH*/ "xmm11","xmm2"
   2535       );
   2536    }
   2537 }
   2538 
   2539 void do_ROUNDSS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2540 {
   2541    if (mem) {
   2542       __asm__ __volatile__(
   2543          "movupd  (%1), %%xmm11"       "\n\t"
   2544          "roundss $1, (%0), %%xmm11"   "\n\t"
   2545          "movupd  %%xmm11, (%1)"       "\n"
   2546          : /*OUT*/
   2547          : /*IN*/ "r"(src), "r"(dst)
   2548          : /*TRASH*/ "xmm11"
   2549       );
   2550    } else {
   2551       __asm__ __volatile__(
   2552          "movupd  (%1), %%xmm11"         "\n\t"
   2553          "movupd  (%0), %%xmm2"          "\n\t"
   2554          "roundss $1, %%xmm2, %%xmm11"   "\n\t"
   2555          "movupd  %%xmm11, (%1)"         "\n"
   2556          : /*OUT*/
   2557          : /*IN*/ "r"(src), "r"(dst)
   2558          : /*TRASH*/ "xmm11","xmm2"
   2559       );
   2560    }
   2561 }
   2562 
   2563 void do_ROUNDSS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2564 {
   2565    if (mem) {
   2566       __asm__ __volatile__(
   2567          "movupd  (%1), %%xmm11"       "\n\t"
   2568          "roundss $2, (%0), %%xmm11"   "\n\t"
   2569          "movupd  %%xmm11, (%1)"       "\n"
   2570          : /*OUT*/
   2571          : /*IN*/ "r"(src), "r"(dst)
   2572          : /*TRASH*/ "xmm11"
   2573       );
   2574    } else {
   2575       __asm__ __volatile__(
   2576          "movupd  (%1), %%xmm11"         "\n\t"
   2577          "movupd  (%0), %%xmm2"          "\n\t"
   2578          "roundss $2, %%xmm2, %%xmm11"   "\n\t"
   2579          "movupd  %%xmm11, (%1)"         "\n"
   2580          : /*OUT*/
   2581          : /*IN*/ "r"(src), "r"(dst)
   2582          : /*TRASH*/ "xmm11","xmm2"
   2583       );
   2584    }
   2585 }
   2586 
   2587 void do_ROUNDSS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2588 {
   2589    if (mem) {
   2590       __asm__ __volatile__(
   2591          "movupd  (%1), %%xmm11"       "\n\t"
   2592          "roundss $3, (%0), %%xmm11"   "\n\t"
   2593          "movupd  %%xmm11, (%1)"       "\n"
   2594          : /*OUT*/
   2595          : /*IN*/ "r"(src), "r"(dst)
   2596          : /*TRASH*/ "xmm11"
   2597       );
   2598    } else {
   2599       __asm__ __volatile__(
   2600          "movupd  (%1), %%xmm11"         "\n\t"
   2601          "movupd  (%0), %%xmm2"          "\n\t"
   2602          "roundss $3, %%xmm2, %%xmm11"   "\n\t"
   2603          "movupd  %%xmm11, (%1)"         "\n"
   2604          : /*OUT*/
   2605          : /*IN*/ "r"(src), "r"(dst)
   2606          : /*TRASH*/ "xmm11","xmm2"
   2607       );
   2608    }
   2609 }
   2610 
   2611 void do_ROUNDSS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
   2612 {
   2613    if (mem) {
   2614       __asm__ __volatile__(
   2615          "movupd  (%1), %%xmm11"       "\n\t"
   2616          "roundss $4, (%0), %%xmm11"   "\n\t"
   2617          "movupd  %%xmm11, (%1)"       "\n"
   2618          : /*OUT*/
   2619          : /*IN*/ "r"(src), "r"(dst)
   2620          : /*TRASH*/ "xmm11"
   2621       );
   2622    } else {
   2623       __asm__ __volatile__(
   2624          "movupd  (%1), %%xmm11"         "\n\t"
   2625          "movupd  (%0), %%xmm2"          "\n\t"
   2626          "roundss $4, %%xmm2, %%xmm11"   "\n\t"
   2627          "movupd  %%xmm11, (%1)"         "\n"
   2628          : /*OUT*/
   2629          : /*IN*/ "r"(src), "r"(dst)
   2630          : /*TRASH*/ "xmm11","xmm2"
   2631       );
   2632    }
   2633 }
   2634 
   2635 void test_ROUNDSS_w_immediate_rounding ( void )
   2636 {
   2637    float vals[22];
   2638    Int i = 0;
   2639    vals[i++] = 0.0;
   2640    vals[i++] = -0.0;
   2641    vals[i++] = mkPosInf();
   2642    vals[i++] = mkNegInf();
   2643    vals[i++] = mkPosNan();
   2644    vals[i++] = mkNegNan();
   2645    vals[i++] = -1.3;
   2646    vals[i++] = -1.1;
   2647    vals[i++] = -0.9;
   2648    vals[i++] = -0.7;
   2649    vals[i++] = -0.50001;
   2650    vals[i++] = -0.49999;
   2651    vals[i++] = -0.3;
   2652    vals[i++] = -0.1;
   2653    vals[i++] = 0.1;
   2654    vals[i++] = 0.3;
   2655    vals[i++] = 0.49999;
   2656    vals[i++] = 0.50001;
   2657    vals[i++] = 0.7;
   2658    vals[i++] = 0.9;
   2659    vals[i++] = 1.1;
   2660    vals[i++] = 1.3;
   2661    assert(i == 22);
   2662 
   2663    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   2664       V128 src, dst;
   2665 
   2666       randV128(&src);
   2667       randV128(&dst);
   2668       memcpy(&src[0], &vals[i], 4);
   2669       do_ROUNDSS_000(False/*reg*/, &src, &dst);
   2670       printf("r roundss_000  ");
   2671       showV128(&src);
   2672       printf(" ");
   2673       showV128(&dst);
   2674       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2675       printf("\n");
   2676 
   2677       randV128(&src);
   2678       randV128(&dst);
   2679       memcpy(&src[0], &vals[i], 4);
   2680       do_ROUNDSS_000(True/*mem*/, &src, &dst);
   2681       printf("m roundss_000  ");
   2682       showV128(&src);
   2683       printf(" ");
   2684       showV128(&dst);
   2685       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2686       printf("\n");
   2687 
   2688 
   2689       randV128(&src);
   2690       randV128(&dst);
   2691       memcpy(&src[0], &vals[i], 4);
   2692       do_ROUNDSS_001(False/*reg*/, &src, &dst);
   2693       printf("r roundss_001  ");
   2694       showV128(&src);
   2695       printf(" ");
   2696       showV128(&dst);
   2697       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2698       printf("\n");
   2699 
   2700       randV128(&src);
   2701       randV128(&dst);
   2702       memcpy(&src[0], &vals[i], 4);
   2703       do_ROUNDSS_001(True/*mem*/, &src, &dst);
   2704       printf("m roundss_001  ");
   2705       showV128(&src);
   2706       printf(" ");
   2707       showV128(&dst);
   2708       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2709       printf("\n");
   2710 
   2711 
   2712       randV128(&src);
   2713       randV128(&dst);
   2714       memcpy(&src[0], &vals[i], 4);
   2715       do_ROUNDSS_010(False/*reg*/, &src, &dst);
   2716       printf("r roundss_010  ");
   2717       showV128(&src);
   2718       printf(" ");
   2719       showV128(&dst);
   2720       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2721       printf("\n");
   2722 
   2723       randV128(&src);
   2724       randV128(&dst);
   2725       memcpy(&src[0], &vals[i], 4);
   2726       do_ROUNDSS_010(True/*mem*/, &src, &dst);
   2727       printf("m roundss_010  ");
   2728       showV128(&src);
   2729       printf(" ");
   2730       showV128(&dst);
   2731       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2732       printf("\n");
   2733 
   2734 
   2735       randV128(&src);
   2736       randV128(&dst);
   2737       memcpy(&src[0], &vals[i], 4);
   2738       do_ROUNDSS_011(False/*reg*/, &src, &dst);
   2739       printf("r roundss_011  ");
   2740       showV128(&src);
   2741       printf(" ");
   2742       showV128(&dst);
   2743       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2744       printf("\n");
   2745 
   2746       randV128(&src);
   2747       randV128(&dst);
   2748       memcpy(&src[0], &vals[i], 4);
   2749       do_ROUNDSS_011(True/*mem*/, &src, &dst);
   2750       printf("m roundss_011  ");
   2751       showV128(&src);
   2752       printf(" ");
   2753       showV128(&dst);
   2754       printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2755       printf("\n");
   2756    }
   2757 }
   2758 
   2759 void test_ROUNDSS_w_mxcsr_rounding ( void )
   2760 {
   2761    UInt rm;
   2762    float vals[22];
   2763    Int i = 0;
   2764    vals[i++] = 0.0;
   2765    vals[i++] = -0.0;
   2766    vals[i++] = mkPosInf();
   2767    vals[i++] = mkNegInf();
   2768    vals[i++] = mkPosNan();
   2769    vals[i++] = mkNegNan();
   2770    vals[i++] = -1.3;
   2771    vals[i++] = -1.1;
   2772    vals[i++] = -0.9;
   2773    vals[i++] = -0.7;
   2774    vals[i++] = -0.50001;
   2775    vals[i++] = -0.49999;
   2776    vals[i++] = -0.3;
   2777    vals[i++] = -0.1;
   2778    vals[i++] = 0.1;
   2779    vals[i++] = 0.3;
   2780    vals[i++] = 0.49999;
   2781    vals[i++] = 0.50001;
   2782    vals[i++] = 0.7;
   2783    vals[i++] = 0.9;
   2784    vals[i++] = 1.1;
   2785    vals[i++] = 1.3;
   2786    assert(i == 22);
   2787 
   2788    rm = get_sse_roundingmode();
   2789    assert(rm == 0); // 0 == RN == default
   2790 
   2791    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   2792       V128 src, dst;
   2793 
   2794       for (rm = 0; rm <= 3; rm++) {
   2795          set_sse_roundingmode(rm);
   2796 
   2797          randV128(&src);
   2798          randV128(&dst);
   2799          memcpy(&src[0], &vals[i], 4);
   2800          do_ROUNDSS_1XX(False/*reg*/, &src, &dst);
   2801          printf("r (rm=%u) roundss_1XX  ", rm);
   2802          showV128(&src);
   2803          printf(" ");
   2804          showV128(&dst);
   2805          printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2806          printf("\n");
   2807 
   2808          randV128(&src);
   2809          randV128(&dst);
   2810          memcpy(&src[0], &vals[i], 4);
   2811          do_ROUNDSS_1XX(True/*mem*/, &src, &dst);
   2812          printf("m (rm=%u) roundss_1XX  ", rm);
   2813          showV128(&src);
   2814          printf(" ");
   2815          showV128(&dst);
   2816          printf("  %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
   2817          printf("\n");
   2818       }
   2819    }
   2820 
   2821    rm = get_sse_roundingmode();
   2822    assert(rm == 3);
   2823    set_sse_roundingmode(0);
   2824    rm = get_sse_roundingmode();
   2825    assert(rm == 0); // 0 == RN == default
   2826 }
   2827 
   2828 /* ------------ ROUNDPD ------------ */
   2829 
   2830 void do_ROUNDPD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2831 {
   2832    if (mem) {
   2833       __asm__ __volatile__(
   2834          "movupd  (%1), %%xmm11"       "\n\t"
   2835          "roundpd $0, (%0), %%xmm11"   "\n\t"
   2836          "movupd  %%xmm11, (%1)"       "\n"
   2837          : /*OUT*/
   2838          : /*IN*/ "r"(src), "r"(dst)
   2839          : /*TRASH*/ "xmm11"
   2840       );
   2841    } else {
   2842       __asm__ __volatile__(
   2843          "movupd  (%1), %%xmm11"         "\n\t"
   2844          "movupd  (%0), %%xmm2"          "\n\t"
   2845          "roundpd $0, %%xmm2, %%xmm11"   "\n\t"
   2846          "movupd  %%xmm11, (%1)"         "\n"
   2847          : /*OUT*/
   2848          : /*IN*/ "r"(src), "r"(dst)
   2849          : /*TRASH*/ "xmm11","xmm2"
   2850       );
   2851    }
   2852 }
   2853 
   2854 void do_ROUNDPD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2855 {
   2856    if (mem) {
   2857       __asm__ __volatile__(
   2858          "movupd  (%1), %%xmm11"       "\n\t"
   2859          "roundpd $1, (%0), %%xmm11"   "\n\t"
   2860          "movupd  %%xmm11, (%1)"       "\n"
   2861          : /*OUT*/
   2862          : /*IN*/ "r"(src), "r"(dst)
   2863          : /*TRASH*/ "xmm11"
   2864       );
   2865    } else {
   2866       __asm__ __volatile__(
   2867          "movupd  (%1), %%xmm11"         "\n\t"
   2868          "movupd  (%0), %%xmm2"          "\n\t"
   2869          "roundpd $1, %%xmm2, %%xmm11"   "\n\t"
   2870          "movupd  %%xmm11, (%1)"         "\n"
   2871          : /*OUT*/
   2872          : /*IN*/ "r"(src), "r"(dst)
   2873          : /*TRASH*/ "xmm11","xmm2"
   2874       );
   2875    }
   2876 }
   2877 
   2878 void do_ROUNDPD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2879 {
   2880    if (mem) {
   2881       __asm__ __volatile__(
   2882          "movupd  (%1), %%xmm11"       "\n\t"
   2883          "roundpd $2, (%0), %%xmm11"   "\n\t"
   2884          "movupd  %%xmm11, (%1)"       "\n"
   2885          : /*OUT*/
   2886          : /*IN*/ "r"(src), "r"(dst)
   2887          : /*TRASH*/ "xmm11"
   2888       );
   2889    } else {
   2890       __asm__ __volatile__(
   2891          "movupd  (%1), %%xmm11"         "\n\t"
   2892          "movupd  (%0), %%xmm2"          "\n\t"
   2893          "roundpd $2, %%xmm2, %%xmm11"   "\n\t"
   2894          "movupd  %%xmm11, (%1)"         "\n"
   2895          : /*OUT*/
   2896          : /*IN*/ "r"(src), "r"(dst)
   2897          : /*TRASH*/ "xmm11","xmm2"
   2898       );
   2899    }
   2900 }
   2901 
   2902 void do_ROUNDPD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
   2903 {
   2904    if (mem) {
   2905       __asm__ __volatile__(
   2906          "movupd  (%1), %%xmm11"       "\n\t"
   2907          "roundpd $3, (%0), %%xmm11"   "\n\t"
   2908          "movupd  %%xmm11, (%1)"       "\n"
   2909          : /*OUT*/
   2910          : /*IN*/ "r"(src), "r"(dst)
   2911          : /*TRASH*/ "xmm11"
   2912       );
   2913    } else {
   2914       __asm__ __volatile__(
   2915          "movupd  (%1), %%xmm11"         "\n\t"
   2916          "movupd  (%0), %%xmm2"          "\n\t"
   2917          "roundpd $3, %%xmm2, %%xmm11"   "\n\t"
   2918          "movupd  %%xmm11, (%1)"         "\n"
   2919          : /*OUT*/
   2920          : /*IN*/ "r"(src), "r"(dst)
   2921          : /*TRASH*/ "xmm11","xmm2"
   2922       );
   2923    }
   2924 }
   2925 
   2926 void do_ROUNDPD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
   2927 {
   2928    if (mem) {
   2929       __asm__ __volatile__(
   2930          "movupd  (%1), %%xmm11"       "\n\t"
   2931          "roundpd $4, (%0), %%xmm11"   "\n\t"
   2932          "movupd  %%xmm11, (%1)"       "\n"
   2933          : /*OUT*/
   2934          : /*IN*/ "r"(src), "r"(dst)
   2935          : /*TRASH*/ "xmm11"
   2936       );
   2937    } else {
   2938       __asm__ __volatile__(
   2939          "movupd  (%1), %%xmm11"         "\n\t"
   2940          "movupd  (%0), %%xmm2"          "\n\t"
   2941          "roundpd $4, %%xmm2, %%xmm11"   "\n\t"
   2942          "movupd  %%xmm11, (%1)"         "\n"
   2943          : /*OUT*/
   2944          : /*IN*/ "r"(src), "r"(dst)
   2945          : /*TRASH*/ "xmm11","xmm2"
   2946       );
   2947    }
   2948 }
   2949 
   2950 void test_ROUNDPD_w_immediate_rounding ( void )
   2951 {
   2952    double vals[22];
   2953    Int i = 0;
   2954    vals[i++] = 0.0;
   2955    vals[i++] = -0.0;
   2956    vals[i++] = mkPosInf();
   2957    vals[i++] = mkNegInf();
   2958    vals[i++] = mkPosNan();
   2959    vals[i++] = mkNegNan();
   2960    vals[i++] = -1.3;
   2961    vals[i++] = -1.1;
   2962    vals[i++] = -0.9;
   2963    vals[i++] = -0.7;
   2964    vals[i++] = -0.50001;
   2965    vals[i++] = -0.49999;
   2966    vals[i++] = -0.3;
   2967    vals[i++] = -0.1;
   2968    vals[i++] = 0.1;
   2969    vals[i++] = 0.3;
   2970    vals[i++] = 0.49999;
   2971    vals[i++] = 0.50001;
   2972    vals[i++] = 0.7;
   2973    vals[i++] = 0.9;
   2974    vals[i++] = 1.1;
   2975    vals[i++] = 1.3;
   2976    assert(i == 22);
   2977 
   2978    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   2979       V128 src, dst;
   2980 
   2981       randV128(&src);
   2982       randV128(&dst);
   2983       memcpy(&src[0], &vals[i], 8);
   2984       memcpy(&src[8], &vals[(i+11)%22], 8);
   2985       do_ROUNDPD_000(False/*reg*/, &src, &dst);
   2986       printf("r roundpd_000  ");
   2987       showV128(&src);
   2988       printf(" ");
   2989       showV128(&dst);
   2990       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   2991       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   2992       printf("\n");
   2993 
   2994       randV128(&src);
   2995       randV128(&dst);
   2996       memcpy(&src[0], &vals[i], 8);
   2997       memcpy(&src[8], &vals[(i+11)%22], 8);
   2998       do_ROUNDPD_000(True/*mem*/, &src, &dst);
   2999       printf("m roundpd_000  ");
   3000       showV128(&src);
   3001       printf(" ");
   3002       showV128(&dst);
   3003       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3004       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3005       printf("\n");
   3006 
   3007 
   3008       randV128(&src);
   3009       randV128(&dst);
   3010       memcpy(&src[0], &vals[i], 8);
   3011       memcpy(&src[8], &vals[(i+11)%22], 8);
   3012       do_ROUNDPD_001(False/*reg*/, &src, &dst);
   3013       printf("r roundpd_001  ");
   3014       showV128(&src);
   3015       printf(" ");
   3016       showV128(&dst);
   3017       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3018       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3019       printf("\n");
   3020 
   3021       randV128(&src);
   3022       randV128(&dst);
   3023       memcpy(&src[0], &vals[i], 8);
   3024       memcpy(&src[8], &vals[(i+11)%22], 8);
   3025       do_ROUNDPD_001(True/*mem*/, &src, &dst);
   3026       printf("m roundpd_001  ");
   3027       showV128(&src);
   3028       printf(" ");
   3029       showV128(&dst);
   3030       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3031       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3032       printf("\n");
   3033 
   3034 
   3035       randV128(&src);
   3036       randV128(&dst);
   3037       memcpy(&src[0], &vals[i], 8);
   3038       memcpy(&src[8], &vals[(i+11)%22], 8);
   3039       do_ROUNDPD_010(False/*reg*/, &src, &dst);
   3040       printf("r roundpd_010  ");
   3041       showV128(&src);
   3042       printf(" ");
   3043       showV128(&dst);
   3044       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3045       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3046       printf("\n");
   3047 
   3048       randV128(&src);
   3049       randV128(&dst);
   3050       memcpy(&src[0], &vals[i], 8);
   3051       memcpy(&src[8], &vals[(i+11)%22], 8);
   3052       do_ROUNDPD_010(True/*mem*/, &src, &dst);
   3053       printf("m roundpd_010  ");
   3054       showV128(&src);
   3055       printf(" ");
   3056       showV128(&dst);
   3057       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3058       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3059       printf("\n");
   3060 
   3061 
   3062       randV128(&src);
   3063       randV128(&dst);
   3064       memcpy(&src[0], &vals[i], 8);
   3065       memcpy(&src[8], &vals[(i+11)%22], 8);
   3066       do_ROUNDPD_011(False/*reg*/, &src, &dst);
   3067       printf("r roundpd_011  ");
   3068       showV128(&src);
   3069       printf(" ");
   3070       showV128(&dst);
   3071       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3072       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3073       printf("\n");
   3074 
   3075       randV128(&src);
   3076       randV128(&dst);
   3077       memcpy(&src[0], &vals[i], 8);
   3078       memcpy(&src[8], &vals[(i+11)%22], 8);
   3079       do_ROUNDPD_011(True/*mem*/, &src, &dst);
   3080       printf("m roundpd_011  ");
   3081       showV128(&src);
   3082       printf(" ");
   3083       showV128(&dst);
   3084       printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3085       printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3086       printf("\n");
   3087    }
   3088 }
   3089 
   3090 void test_ROUNDPD_w_mxcsr_rounding ( void )
   3091 {
   3092    UInt rm;
   3093    double vals[22];
   3094    Int i = 0;
   3095    vals[i++] = 0.0;
   3096    vals[i++] = -0.0;
   3097    vals[i++] = mkPosInf();
   3098    vals[i++] = mkNegInf();
   3099    vals[i++] = mkPosNan();
   3100    vals[i++] = mkNegNan();
   3101    vals[i++] = -1.3;
   3102    vals[i++] = -1.1;
   3103    vals[i++] = -0.9;
   3104    vals[i++] = -0.7;
   3105    vals[i++] = -0.50001;
   3106    vals[i++] = -0.49999;
   3107    vals[i++] = -0.3;
   3108    vals[i++] = -0.1;
   3109    vals[i++] = 0.1;
   3110    vals[i++] = 0.3;
   3111    vals[i++] = 0.49999;
   3112    vals[i++] = 0.50001;
   3113    vals[i++] = 0.7;
   3114    vals[i++] = 0.9;
   3115    vals[i++] = 1.1;
   3116    vals[i++] = 1.3;
   3117    assert(i == 22);
   3118 
   3119    rm = get_sse_roundingmode();
   3120    assert(rm == 0); // 0 == RN == default
   3121 
   3122    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   3123       V128 src, dst;
   3124 
   3125       for (rm = 0; rm <= 3; rm++) {
   3126          set_sse_roundingmode(rm);
   3127 
   3128          randV128(&src);
   3129          randV128(&dst);
   3130          memcpy(&src[0], &vals[i], 8);
   3131          memcpy(&src[8], &vals[(i+11)%22], 8);
   3132          do_ROUNDPD_1XX(False/*reg*/, &src, &dst);
   3133          printf("r (rm=%u) roundpd_1XX  ", rm);
   3134          showV128(&src);
   3135          printf(" ");
   3136          showV128(&dst);
   3137          printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3138          printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3139          printf("\n");
   3140 
   3141          randV128(&src);
   3142          randV128(&dst);
   3143          memcpy(&src[0], &vals[i], 8);
   3144          memcpy(&src[8], &vals[(i+11)%22], 8);
   3145          do_ROUNDPD_1XX(True/*mem*/, &src, &dst);
   3146          printf("m (rm=%u) roundpd_1XX  ", rm);
   3147          showV128(&src);
   3148          printf(" ");
   3149          showV128(&dst);
   3150          printf("  %10f -> %10f", vals[i], *(double*)(&dst[0]));
   3151          printf("   %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
   3152          printf("\n");
   3153       }
   3154    }
   3155 
   3156    rm = get_sse_roundingmode();
   3157    assert(rm == 3);
   3158    set_sse_roundingmode(0);
   3159    rm = get_sse_roundingmode();
   3160    assert(rm == 0); // 0 == RN == default
   3161 }
   3162 
   3163 /* ------------ ROUNDPS ------------ */
   3164 
   3165 void do_ROUNDPS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
   3166 {
   3167    if (mem) {
   3168       __asm__ __volatile__(
   3169          "movupd  (%1), %%xmm11"       "\n\t"
   3170          "roundps $0, (%0), %%xmm11"   "\n\t"
   3171          "movupd  %%xmm11, (%1)"       "\n"
   3172          : /*OUT*/
   3173          : /*IN*/ "r"(src), "r"(dst)
   3174          : /*TRASH*/ "xmm11"
   3175       );
   3176    } else {
   3177       __asm__ __volatile__(
   3178          "movupd  (%1), %%xmm11"         "\n\t"
   3179          "movupd  (%0), %%xmm2"          "\n\t"
   3180          "roundps $0, %%xmm2, %%xmm11"   "\n\t"
   3181          "movupd  %%xmm11, (%1)"         "\n"
   3182          : /*OUT*/
   3183          : /*IN*/ "r"(src), "r"(dst)
   3184          : /*TRASH*/ "xmm11","xmm2"
   3185       );
   3186    }
   3187 }
   3188 
   3189 void do_ROUNDPS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
   3190 {
   3191    if (mem) {
   3192       __asm__ __volatile__(
   3193          "movupd  (%1), %%xmm11"       "\n\t"
   3194          "roundps $1, (%0), %%xmm11"   "\n\t"
   3195          "movupd  %%xmm11, (%1)"       "\n"
   3196          : /*OUT*/
   3197          : /*IN*/ "r"(src), "r"(dst)
   3198          : /*TRASH*/ "xmm11"
   3199       );
   3200    } else {
   3201       __asm__ __volatile__(
   3202          "movupd  (%1), %%xmm11"         "\n\t"
   3203          "movupd  (%0), %%xmm2"          "\n\t"
   3204          "roundps $1, %%xmm2, %%xmm11"   "\n\t"
   3205          "movupd  %%xmm11, (%1)"         "\n"
   3206          : /*OUT*/
   3207          : /*IN*/ "r"(src), "r"(dst)
   3208          : /*TRASH*/ "xmm11","xmm2"
   3209       );
   3210    }
   3211 }
   3212 
   3213 void do_ROUNDPS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
   3214 {
   3215    if (mem) {
   3216       __asm__ __volatile__(
   3217          "movupd  (%1), %%xmm11"       "\n\t"
   3218          "roundps $2, (%0), %%xmm11"   "\n\t"
   3219          "movupd  %%xmm11, (%1)"       "\n"
   3220          : /*OUT*/
   3221          : /*IN*/ "r"(src), "r"(dst)
   3222          : /*TRASH*/ "xmm11"
   3223       );
   3224    } else {
   3225       __asm__ __volatile__(
   3226          "movupd  (%1), %%xmm11"         "\n\t"
   3227          "movupd  (%0), %%xmm2"          "\n\t"
   3228          "roundps $2, %%xmm2, %%xmm11"   "\n\t"
   3229          "movupd  %%xmm11, (%1)"         "\n"
   3230          : /*OUT*/
   3231          : /*IN*/ "r"(src), "r"(dst)
   3232          : /*TRASH*/ "xmm11","xmm2"
   3233       );
   3234    }
   3235 }
   3236 
   3237 void do_ROUNDPS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
   3238 {
   3239    if (mem) {
   3240       __asm__ __volatile__(
   3241          "movupd  (%1), %%xmm11"       "\n\t"
   3242          "roundps $3, (%0), %%xmm11"   "\n\t"
   3243          "movupd  %%xmm11, (%1)"       "\n"
   3244          : /*OUT*/
   3245          : /*IN*/ "r"(src), "r"(dst)
   3246          : /*TRASH*/ "xmm11"
   3247       );
   3248    } else {
   3249       __asm__ __volatile__(
   3250          "movupd  (%1), %%xmm11"         "\n\t"
   3251          "movupd  (%0), %%xmm2"          "\n\t"
   3252          "roundps $3, %%xmm2, %%xmm11"   "\n\t"
   3253          "movupd  %%xmm11, (%1)"         "\n"
   3254          : /*OUT*/
   3255          : /*IN*/ "r"(src), "r"(dst)
   3256          : /*TRASH*/ "xmm11","xmm2"
   3257       );
   3258    }
   3259 }
   3260 
   3261 void do_ROUNDPS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
   3262 {
   3263    if (mem) {
   3264       __asm__ __volatile__(
   3265          "movupd  (%1), %%xmm11"       "\n\t"
   3266          "roundps $4, (%0), %%xmm11"   "\n\t"
   3267          "movupd  %%xmm11, (%1)"       "\n"
   3268          : /*OUT*/
   3269          : /*IN*/ "r"(src), "r"(dst)
   3270          : /*TRASH*/ "xmm11"
   3271       );
   3272    } else {
   3273       __asm__ __volatile__(
   3274          "movupd  (%1), %%xmm11"         "\n\t"
   3275          "movupd  (%0), %%xmm2"          "\n\t"
   3276          "roundps $4, %%xmm2, %%xmm11"   "\n\t"
   3277          "movupd  %%xmm11, (%1)"         "\n"
   3278          : /*OUT*/
   3279          : /*IN*/ "r"(src), "r"(dst)
   3280          : /*TRASH*/ "xmm11","xmm2"
   3281       );
   3282    }
   3283 }
   3284 
   3285 void test_ROUNDPS_w_immediate_rounding ( void )
   3286 {
   3287    float vals[22];
   3288    Int i = 0;
   3289    vals[i++] = 0.0;
   3290    vals[i++] = -0.0;
   3291    vals[i++] = mkPosInf();
   3292    vals[i++] = mkNegInf();
   3293    vals[i++] = mkPosNan();
   3294    vals[i++] = mkNegNan();
   3295    vals[i++] = -1.3;
   3296    vals[i++] = -1.1;
   3297    vals[i++] = -0.9;
   3298    vals[i++] = -0.7;
   3299    vals[i++] = -0.50001;
   3300    vals[i++] = -0.49999;
   3301    vals[i++] = -0.3;
   3302    vals[i++] = -0.1;
   3303    vals[i++] = 0.1;
   3304    vals[i++] = 0.3;
   3305    vals[i++] = 0.49999;
   3306    vals[i++] = 0.50001;
   3307    vals[i++] = 0.7;
   3308    vals[i++] = 0.9;
   3309    vals[i++] = 1.1;
   3310    vals[i++] = 1.3;
   3311    assert(i == 22);
   3312 
   3313    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   3314       V128 src, dst;
   3315 
   3316       randV128(&src);
   3317       randV128(&dst);
   3318       memcpy(&src[0], &vals[i], 4);
   3319       memcpy(&src[4], &vals[(i+5)%22], 4);
   3320       memcpy(&src[8], &vals[(i+11)%22], 4);
   3321       memcpy(&src[12], &vals[(i+17)%22], 4);
   3322       do_ROUNDPS_000(False/*reg*/, &src, &dst);
   3323       printf("r roundps_000  ");
   3324       showV128(&src);
   3325       printf(" ");
   3326       showV128(&dst);
   3327       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3328       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3329       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3330       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3331       printf("\n");
   3332 
   3333       randV128(&src);
   3334       randV128(&dst);
   3335       memcpy(&src[0], &vals[i], 4);
   3336       memcpy(&src[4], &vals[(i+5)%22], 4);
   3337       memcpy(&src[8], &vals[(i+11)%22], 4);
   3338       memcpy(&src[12], &vals[(i+17)%22], 4);
   3339       do_ROUNDPS_000(True/*mem*/, &src, &dst);
   3340       printf("m roundps_000  ");
   3341       showV128(&src);
   3342       printf(" ");
   3343       showV128(&dst);
   3344       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3345       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3346       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3347       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3348       printf("\n");
   3349 
   3350 
   3351       randV128(&src);
   3352       randV128(&dst);
   3353       memcpy(&src[0], &vals[i], 4);
   3354       memcpy(&src[4], &vals[(i+5)%22], 4);
   3355       memcpy(&src[8], &vals[(i+11)%22], 4);
   3356       memcpy(&src[12], &vals[(i+17)%22], 4);
   3357       do_ROUNDPS_001(False/*reg*/, &src, &dst);
   3358       printf("r roundps_001  ");
   3359       showV128(&src);
   3360       printf(" ");
   3361       showV128(&dst);
   3362       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3363       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3364       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3365       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3366       printf("\n");
   3367 
   3368       randV128(&src);
   3369       randV128(&dst);
   3370       memcpy(&src[0], &vals[i], 4);
   3371       memcpy(&src[4], &vals[(i+5)%22], 4);
   3372       memcpy(&src[8], &vals[(i+11)%22], 4);
   3373       memcpy(&src[12], &vals[(i+17)%22], 4);
   3374       do_ROUNDPS_001(True/*mem*/, &src, &dst);
   3375       printf("m roundps_001  ");
   3376       showV128(&src);
   3377       printf(" ");
   3378       showV128(&dst);
   3379       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3380       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3381       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3382       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3383       printf("\n");
   3384 
   3385 
   3386       randV128(&src);
   3387       randV128(&dst);
   3388       memcpy(&src[0], &vals[i], 4);
   3389       memcpy(&src[4], &vals[(i+5)%22], 4);
   3390       memcpy(&src[8], &vals[(i+11)%22], 4);
   3391       memcpy(&src[12], &vals[(i+17)%22], 4);
   3392       do_ROUNDPS_010(False/*reg*/, &src, &dst);
   3393       printf("r roundps_010  ");
   3394       showV128(&src);
   3395       printf(" ");
   3396       showV128(&dst);
   3397       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3398       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3399       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3400       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3401       printf("\n");
   3402 
   3403       randV128(&src);
   3404       randV128(&dst);
   3405       memcpy(&src[0], &vals[i], 4);
   3406       memcpy(&src[4], &vals[(i+5)%22], 4);
   3407       memcpy(&src[8], &vals[(i+11)%22], 4);
   3408       memcpy(&src[12], &vals[(i+17)%22], 4);
   3409       do_ROUNDPS_010(True/*mem*/, &src, &dst);
   3410       printf("m roundps_010  ");
   3411       showV128(&src);
   3412       printf(" ");
   3413       showV128(&dst);
   3414       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3415       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3416       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3417       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3418       printf("\n");
   3419 
   3420 
   3421       randV128(&src);
   3422       randV128(&dst);
   3423       memcpy(&src[0], &vals[i], 4);
   3424       memcpy(&src[4], &vals[(i+5)%22], 4);
   3425       memcpy(&src[8], &vals[(i+11)%22], 4);
   3426       memcpy(&src[12], &vals[(i+17)%22], 4);
   3427       do_ROUNDPS_011(False/*reg*/, &src, &dst);
   3428       printf("r roundps_011  ");
   3429       showV128(&src);
   3430       printf(" ");
   3431       showV128(&dst);
   3432       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3433       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3434       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3435       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3436       printf("\n");
   3437 
   3438       randV128(&src);
   3439       randV128(&dst);
   3440       memcpy(&src[0], &vals[i], 4);
   3441       memcpy(&src[4], &vals[(i+5)%22], 4);
   3442       memcpy(&src[8], &vals[(i+11)%22], 4);
   3443       memcpy(&src[12], &vals[(i+17)%22], 4);
   3444       do_ROUNDPS_011(True/*mem*/, &src, &dst);
   3445       printf("m roundps_011  ");
   3446       showV128(&src);
   3447       printf(" ");
   3448       showV128(&dst);
   3449       printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3450       printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3451       printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3452       printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3453       printf("\n");
   3454    }
   3455 }
   3456 
   3457 void test_ROUNDPS_w_mxcsr_rounding ( void )
   3458 {
   3459    UInt rm;
   3460    float vals[22];
   3461    Int i = 0;
   3462    vals[i++] = 0.0;
   3463    vals[i++] = -0.0;
   3464    vals[i++] = mkPosInf();
   3465    vals[i++] = mkNegInf();
   3466    vals[i++] = mkPosNan();
   3467    vals[i++] = mkNegNan();
   3468    vals[i++] = -1.3;
   3469    vals[i++] = -1.1;
   3470    vals[i++] = -0.9;
   3471    vals[i++] = -0.7;
   3472    vals[i++] = -0.50001;
   3473    vals[i++] = -0.49999;
   3474    vals[i++] = -0.3;
   3475    vals[i++] = -0.1;
   3476    vals[i++] = 0.1;
   3477    vals[i++] = 0.3;
   3478    vals[i++] = 0.49999;
   3479    vals[i++] = 0.50001;
   3480    vals[i++] = 0.7;
   3481    vals[i++] = 0.9;
   3482    vals[i++] = 1.1;
   3483    vals[i++] = 1.3;
   3484    assert(i == 22);
   3485 
   3486    rm = get_sse_roundingmode();
   3487    assert(rm == 0); // 0 == RN == default
   3488 
   3489    for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
   3490       V128 src, dst;
   3491 
   3492       for (rm = 0; rm <= 3; rm++) {
   3493          set_sse_roundingmode(rm);
   3494 
   3495          randV128(&src);
   3496          randV128(&dst);
   3497          memcpy(&src[0], &vals[i], 4);
   3498          memcpy(&src[4], &vals[(i+5)%22], 4);
   3499          memcpy(&src[8], &vals[(i+11)%22], 4);
   3500          memcpy(&src[12], &vals[(i+17)%22], 4);
   3501          do_ROUNDPS_1XX(False/*reg*/, &src, &dst);
   3502          printf("r (rm=%u) roundps_1XX  ", rm);
   3503          showV128(&src);
   3504          printf(" ");
   3505          showV128(&dst);
   3506          printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3507          printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3508          printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3509          printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3510          printf("\n");
   3511 
   3512          randV128(&src);
   3513          randV128(&dst);
   3514          memcpy(&src[0], &vals[i], 4);
   3515          memcpy(&src[4], &vals[(i+5)%22], 4);
   3516          memcpy(&src[8], &vals[(i+11)%22], 4);
   3517          memcpy(&src[12], &vals[(i+17)%22], 4);
   3518          do_ROUNDPS_1XX(True/*mem*/, &src, &dst);
   3519          printf("m (rm=%u) roundps_1XX  ", rm);
   3520          showV128(&src);
   3521          printf(" ");
   3522          showV128(&dst);
   3523          printf("  %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
   3524          printf("  %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
   3525          printf("  %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
   3526          printf("  %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
   3527          printf("\n");
   3528       }
   3529    }
   3530 
   3531    rm = get_sse_roundingmode();
   3532    assert(rm == 3);
   3533    set_sse_roundingmode(0);
   3534    rm = get_sse_roundingmode();
   3535    assert(rm == 0); // 0 == RN == default
   3536 }
   3537 
   3538 /* ------------ PTEST ------------ */
   3539 
   3540 void test_PTEST ( void )
   3541 {
   3542    const Int ntests = 8;
   3543    V128 spec[ntests];
   3544    do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0x0000000000000000ULL );
   3545    do64HLtoV128( &spec[1], 0x0000000000000000ULL, 0x0000000000000001ULL );
   3546    do64HLtoV128( &spec[2], 0x0000000000000001ULL, 0x0000000000000000ULL );
   3547    do64HLtoV128( &spec[3], 0x0000000000000001ULL, 0x0000000000000001ULL );
   3548    do64HLtoV128( &spec[4], 0xffffffffffffffffULL, 0xffffffffffffffffULL );
   3549    do64HLtoV128( &spec[5], 0xffffffffffffffffULL, 0xfffffffffffffffeULL );
   3550    do64HLtoV128( &spec[6], 0xfffffffffffffffeULL, 0xffffffffffffffffULL );
   3551    do64HLtoV128( &spec[7], 0xfffffffffffffffeULL, 0xfffffffffffffffeULL );
   3552    V128 block[2];
   3553    Int i, j;
   3554    ULong flags;
   3555    for (i = 0; i < ntests; i++) {
   3556       for (j = 0; j < ntests; j++) {
   3557          memcpy(&block[0], &spec[i], 16);
   3558          memcpy(&block[1], &spec[j], 16);
   3559          __asm__ __volatile__(
   3560             "subq $256, %%rsp"        "\n\t"
   3561             "movupd 0(%1), %%xmm2"    "\n\t"
   3562             "ptest 16(%1), %%xmm2"    "\n\t"
   3563             "pushfq"                  "\n\t"
   3564             "popq %0"                 "\n\t"
   3565             "addq $256, %%rsp"        "\n\t"
   3566             : /*out*/"=r"(flags) : /*in*/ "r"(&block[0]) :
   3567             "xmm2", "memory", "cc"
   3568          );
   3569          printf("r   ptest ");
   3570          showV128(&block[0]);
   3571          printf(" ");
   3572          showV128(&block[1]);
   3573          printf(" -> eflags %04x\n", (UInt)flags & 0x8D5);
   3574       }
   3575    }
   3576 }
   3577 
   3578 /* ------------ PBLENDVB ------------ */
   3579 
   3580 void do_PBLENDVB ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
   3581 {
   3582    if (mem) {
   3583       __asm__ __volatile__(
   3584          "movupd   (%2), %%xmm0"         "\n\t"
   3585          "movupd   (%1), %%xmm11"        "\n\t"
   3586          "pblendvb (%0), %%xmm11"        "\n\t"
   3587          "movupd   %%xmm11, (%1)"        "\n"
   3588          : /*OUT*/
   3589          : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
   3590          : /*TRASH*/ "xmm11","xmm0"
   3591       );
   3592    } else {
   3593       __asm__ __volatile__(
   3594          "movupd   (%2), %%xmm0"         "\n\t"
   3595          "movupd   (%1), %%xmm11"        "\n\t"
   3596          "movupd   (%0), %%xmm2"         "\n\t"
   3597          "pblendvb %%xmm2, %%xmm11"      "\n\t"
   3598          "movupd   %%xmm11, (%1)"        "\n"
   3599          : /*OUT*/
   3600          : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
   3601          : /*TRASH*/ "xmm11","xmm2","xmm0"
   3602       );
   3603    }
   3604 }
   3605 
   3606 void test_PBLENDVB ( void )
   3607 {
   3608    V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
   3609    Int i;
   3610    for (i = 0; i < 10; i++) {
   3611       randV128(&t_xmm0);
   3612       randV128(&t_src);
   3613       randV128(&t_dst);
   3614 
   3615       memcpy(&xmm0, &t_xmm0, 16);
   3616       memcpy(&src, &t_src, 16);
   3617       memcpy(&dst, &t_dst, 16);
   3618       do_PBLENDVB(False/*reg*/, &xmm0, &src, &dst);
   3619       printf("r pblendvb  ");
   3620       showV128(&t_xmm0);
   3621       printf(" ");
   3622       showV128(&t_src);
   3623       printf(" ");
   3624       showV128(&t_dst);
   3625       printf(" -> ");
   3626       showV128(&dst);
   3627       printf("\n");
   3628 
   3629       memcpy(&xmm0, &t_xmm0, 16);
   3630       memcpy(&src, &t_src, 16);
   3631       memcpy(&dst, &t_dst, 16);
   3632       do_PBLENDVB(True/*mem*/, &xmm0, &src, &dst);
   3633       printf("m pblendvb  ");
   3634       showV128(&t_xmm0);
   3635       printf(" ");
   3636       showV128(&t_src);
   3637       printf(" ");
   3638       showV128(&t_dst);
   3639       printf(" -> ");
   3640       showV128(&dst);
   3641       printf("\n");
   3642    }
   3643 }
   3644 
   3645 /* ------------ BLENDVPD ------------ */
   3646 
   3647 void do_BLENDVPD ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
   3648 {
   3649    if (mem) {
   3650       __asm__ __volatile__(
   3651          "movupd   (%2), %%xmm0"         "\n\t"
   3652          "movupd   (%1), %%xmm11"        "\n\t"
   3653          "blendvpd (%0), %%xmm11"        "\n\t"
   3654          "movupd   %%xmm11, (%1)"        "\n"
   3655          : /*OUT*/
   3656          : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
   3657          : /*TRASH*/ "xmm11","xmm0"
   3658       );
   3659    } else {
   3660       __asm__ __volatile__(
   3661          "movupd   (%2), %%xmm0"         "\n\t"
   3662          "movupd   (%1), %%xmm11"        "\n\t"
   3663          "movupd   (%0), %%xmm2"         "\n\t"
   3664          "blendvpd %%xmm2, %%xmm11"      "\n\t"
   3665          "movupd   %%xmm11, (%1)"        "\n"
   3666          : /*OUT*/
   3667          : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
   3668          : /*TRASH*/ "xmm11","xmm2","xmm0"
   3669       );
   3670    }
   3671 }
   3672 
   3673 void test_BLENDVPD ( void )
   3674 {
   3675    V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
   3676    Int i;
   3677    for (i = 0; i < 10; i++) {
   3678       randV128(&t_xmm0);
   3679       randV128(&t_src);
   3680       randV128(&t_dst);
   3681 
   3682       memcpy(&xmm0, &t_xmm0, 16);
   3683       memcpy(&src, &t_src, 16);
   3684       memcpy(&dst, &t_dst, 16);
   3685       do_BLENDVPD(False/*reg*/, &xmm0, &src, &dst);
   3686       printf("r blendvpd  ");
   3687       showV128(&t_xmm0);
   3688       printf(" ");
   3689       showV128(&t_src);
   3690       printf(" ");
   3691       showV128(&t_dst);
   3692       printf(" -> ");
   3693       showV128(&dst);
   3694       printf("\n");
   3695 
   3696       memcpy(&xmm0, &t_xmm0, 16);
   3697       memcpy(&src, &t_src, 16);
   3698       memcpy(&dst, &t_dst, 16);
   3699       do_BLENDVPD(True/*mem*/, &xmm0, &src, &dst);
   3700       printf("m blendvpd  ");
   3701       showV128(&t_xmm0);
   3702       printf(" ");
   3703       showV128(&t_src);
   3704       printf(" ");
   3705       showV128(&t_dst);
   3706       printf(" -> ");
   3707       showV128(&dst);
   3708       printf("\n");
   3709    }
   3710 }
   3711 
   3712 /* ------------ BLENDVPS ------------ */
   3713 
   3714 void do_BLENDVPS ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
   3715 {
   3716    if (mem) {
   3717       __asm__ __volatile__(
   3718          "movupd   (%2), %%xmm0"         "\n\t"
   3719          "movupd   (%1), %%xmm11"        "\n\t"
   3720          "blendvps (%0), %%xmm11"        "\n\t"
   3721          "movupd   %%xmm11, (%1)"        "\n"
   3722          : /*OUT*/
   3723          : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
   3724          : /*TRASH*/ "xmm11","xmm0"
   3725       );
   3726    } else {
   3727       __asm__ __volatile__(
   3728          "movupd   (%2), %%xmm0"         "\n\t"
   3729          "movupd   (%1), %%xmm11"        "\n\t"
   3730          "movupd   (%0), %%xmm2"         "\n\t"
   3731          "blendvps %%xmm2, %%xmm11"      "\n\t"
   3732          "movupd   %%xmm11, (%1)"        "\n"
   3733          : /*OUT*/
   3734          : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
   3735          : /*TRASH*/ "xmm11","xmm2","xmm0"
   3736       );
   3737    }
   3738 }
   3739 
   3740 void test_BLENDVPS ( void )
   3741 {
   3742    V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
   3743    Int i;
   3744    for (i = 0; i < 10; i++) {
   3745       randV128(&t_xmm0);
   3746       randV128(&t_src);
   3747       randV128(&t_dst);
   3748 
   3749       memcpy(&xmm0, &t_xmm0, 16);
   3750       memcpy(&src, &t_src, 16);
   3751       memcpy(&dst, &t_dst, 16);
   3752       do_BLENDVPS(False/*reg*/, &xmm0, &src, &dst);
   3753       printf("r blendvps  ");
   3754       showV128(&t_xmm0);
   3755       printf(" ");
   3756       showV128(&t_src);
   3757       printf(" ");
   3758       showV128(&t_dst);
   3759       printf(" -> ");
   3760       showV128(&dst);
   3761       printf("\n");
   3762 
   3763       memcpy(&xmm0, &t_xmm0, 16);
   3764       memcpy(&src, &t_src, 16);
   3765       memcpy(&dst, &t_dst, 16);
   3766       do_BLENDVPS(True/*mem*/, &xmm0, &src, &dst);
   3767       printf("m blendvps  ");
   3768       showV128(&t_xmm0);
   3769       printf(" ");
   3770       showV128(&t_src);
   3771       printf(" ");
   3772       showV128(&t_dst);
   3773       printf(" -> ");
   3774       showV128(&dst);
   3775       printf("\n");
   3776    }
   3777 }
   3778 
   3779 /* ------------ main ------------ */
   3780 
   3781 int main ( int argc, char** argv )
   3782 {
   3783 #if 1
   3784    // ------ SSE 4.1 ------
   3785    test_BLENDPD();        // done Apr.01.2010
   3786    test_BLENDPS();        // done Apr.02.2010
   3787    test_PBLENDW();
   3788    test_PBLENDVB();
   3789    test_BLENDVPD();
   3790    test_BLENDVPS();
   3791    test_DPPD();           // done Apr.08.2010
   3792    test_DPPS();           // done Apr.09.2010
   3793    test_EXTRACTPS();
   3794    test_INSERTPS();       // done Apr.01.2010
   3795    // MOVNTDQA  ***
   3796    test_PCMPEQQ();
   3797    test_PEXTRB();         // done Apr.15.2010
   3798    test_PEXTRD();         // done Apr.14.2010
   3799    test_PEXTRQ();         // done Apr.14.2010
   3800    test_PEXTRW();         // done Apr.14.2010
   3801    test_PINSRQ();         // done Apr.16.2010
   3802    test_PINSRD();         // todo
   3803    test_PINSRW(); /* Umm, this is SSE2, not SSE4.  Right? */
   3804    test_PINSRB();         // todo
   3805    test_PMAXSB();
   3806    test_PMAXSD();         // done Apr.09.2010
   3807    test_PMAXUD();         // done Apr.16.2010
   3808    test_PMAXUW();
   3809    test_PMINSB();
   3810    test_PMINSD();         // done Apr.09.2010
   3811    test_PMINUD();
   3812    test_PMINUW();
   3813    test_PMOVSXBW();       // done Apr.02.2010
   3814    test_PMOVSXBD();       // done Mar.30.2010
   3815    test_PMOVSXBQ();       // done Mar.30.2010
   3816    test_PMOVSXWD();       // done Mar.31.2010
   3817    test_PMOVSXWQ();       // done Mar.31.2010
   3818    test_PMOVSXDQ();       // done Mar.31.2010
   3819    test_PMOVZXBW();       // done Mar.28.2010
   3820    test_PMOVZXBD();       // done Mar.29.2010
   3821    test_PMOVZXBQ();       // done Mar.29.2010
   3822    test_PMOVZXWD();       // done Mar.28.2010
   3823    test_PMOVZXWQ();       // done Mar.29.2010
   3824    test_PMOVZXDQ();       // done Mar.29.2010
   3825    test_POPCNTW();
   3826    test_POPCNTL();
   3827    test_POPCNTQ();
   3828    test_PMULDQ();
   3829    test_PMULLD();
   3830    test_PTEST();
   3831    test_ROUNDSD_w_immediate_rounding();
   3832    test_ROUNDSS_w_immediate_rounding();
   3833    test_ROUNDPD_w_immediate_rounding();
   3834    test_ROUNDPS_w_immediate_rounding();
   3835    test_ROUNDSD_w_mxcsr_rounding();
   3836    test_ROUNDSS_w_mxcsr_rounding();
   3837    test_ROUNDPD_w_mxcsr_rounding();
   3838    test_ROUNDPS_w_mxcsr_rounding();
   3839    // ------ SSE 4.2 ------
   3840    test_PCMPGTQ();
   3841    // CRC32B,Q
   3842    test_PACKUSDW();
   3843    test_PHMINPOSUW();
   3844    test_MPSADBW();
   3845 #else
   3846    test_MPSADBW();
   3847 #endif
   3848 
   3849    return 0;
   3850 }
   3851 
   3852