Home | History | Annotate | Download | only in target-i386
      1 /*
      2  *  MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
      3  *
      4  *  Copyright (c) 2005 Fabrice Bellard
      5  *  Copyright (c) 2008 Intel Corporation  <andrew.zaborowski (at) intel.com>
      6  *
      7  * This library is free software; you can redistribute it and/or
      8  * modify it under the terms of the GNU Lesser General Public
      9  * License as published by the Free Software Foundation; either
     10  * version 2 of the License, or (at your option) any later version.
     11  *
     12  * This library is distributed in the hope that it will be useful,
     13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15  * Lesser General Public License for more details.
     16  *
     17  * You should have received a copy of the GNU Lesser General Public
     18  * License along with this library; if not, write to the Free Software
     19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA  02110-1301 USA
     20  */
/*
 * Per-instantiation configuration.  SHIFT is not defined in this file;
 * presumably the including file sets it before each inclusion (TODO confirm).
 * SHIFT == 0 selects the MMXReg accessors, any other value the XMMReg ones.
 */
#if SHIFT == 0
#define Reg MMXReg
#define XMM_ONLY(...)   /* drops its arguments in the MMX instantiation */
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
#define L(n) MMX_L(n)
#define Q(n) q          /* single 64-bit member: the index is ignored */
#define SUFFIX _mmx
#else
#define Reg XMMReg
#define XMM_ONLY(...) __VA_ARGS__   /* keeps its arguments in the XMM instantiation */
#define B(n) XMM_B(n)
#define W(n) XMM_W(n)
#define L(n) XMM_L(n)
#define Q(n) XMM_Q(n)
#define SUFFIX _xmm
#endif
     38 
     39 void glue(helper_psrlw, SUFFIX)(Reg *d, Reg *s)
     40 {
     41     int shift;
     42 
     43     if (s->Q(0) > 15) {
     44         d->Q(0) = 0;
     45 #if SHIFT == 1
     46         d->Q(1) = 0;
     47 #endif
     48     } else {
     49         shift = s->B(0);
     50         d->W(0) >>= shift;
     51         d->W(1) >>= shift;
     52         d->W(2) >>= shift;
     53         d->W(3) >>= shift;
     54 #if SHIFT == 1
     55         d->W(4) >>= shift;
     56         d->W(5) >>= shift;
     57         d->W(6) >>= shift;
     58         d->W(7) >>= shift;
     59 #endif
     60     }
     61 }
     62 
     63 void glue(helper_psraw, SUFFIX)(Reg *d, Reg *s)
     64 {
     65     int shift;
     66 
     67     if (s->Q(0) > 15) {
     68         shift = 15;
     69     } else {
     70         shift = s->B(0);
     71     }
     72     d->W(0) = (int16_t)d->W(0) >> shift;
     73     d->W(1) = (int16_t)d->W(1) >> shift;
     74     d->W(2) = (int16_t)d->W(2) >> shift;
     75     d->W(3) = (int16_t)d->W(3) >> shift;
     76 #if SHIFT == 1
     77     d->W(4) = (int16_t)d->W(4) >> shift;
     78     d->W(5) = (int16_t)d->W(5) >> shift;
     79     d->W(6) = (int16_t)d->W(6) >> shift;
     80     d->W(7) = (int16_t)d->W(7) >> shift;
     81 #endif
     82 }
     83 
     84 void glue(helper_psllw, SUFFIX)(Reg *d, Reg *s)
     85 {
     86     int shift;
     87 
     88     if (s->Q(0) > 15) {
     89         d->Q(0) = 0;
     90 #if SHIFT == 1
     91         d->Q(1) = 0;
     92 #endif
     93     } else {
     94         shift = s->B(0);
     95         d->W(0) <<= shift;
     96         d->W(1) <<= shift;
     97         d->W(2) <<= shift;
     98         d->W(3) <<= shift;
     99 #if SHIFT == 1
    100         d->W(4) <<= shift;
    101         d->W(5) <<= shift;
    102         d->W(6) <<= shift;
    103         d->W(7) <<= shift;
    104 #endif
    105     }
    106 }
    107 
    108 void glue(helper_psrld, SUFFIX)(Reg *d, Reg *s)
    109 {
    110     int shift;
    111 
    112     if (s->Q(0) > 31) {
    113         d->Q(0) = 0;
    114 #if SHIFT == 1
    115         d->Q(1) = 0;
    116 #endif
    117     } else {
    118         shift = s->B(0);
    119         d->L(0) >>= shift;
    120         d->L(1) >>= shift;
    121 #if SHIFT == 1
    122         d->L(2) >>= shift;
    123         d->L(3) >>= shift;
    124 #endif
    125     }
    126 }
    127 
    128 void glue(helper_psrad, SUFFIX)(Reg *d, Reg *s)
    129 {
    130     int shift;
    131 
    132     if (s->Q(0) > 31) {
    133         shift = 31;
    134     } else {
    135         shift = s->B(0);
    136     }
    137     d->L(0) = (int32_t)d->L(0) >> shift;
    138     d->L(1) = (int32_t)d->L(1) >> shift;
    139 #if SHIFT == 1
    140     d->L(2) = (int32_t)d->L(2) >> shift;
    141     d->L(3) = (int32_t)d->L(3) >> shift;
    142 #endif
    143 }
    144 
    145 void glue(helper_pslld, SUFFIX)(Reg *d, Reg *s)
    146 {
    147     int shift;
    148 
    149     if (s->Q(0) > 31) {
    150         d->Q(0) = 0;
    151 #if SHIFT == 1
    152         d->Q(1) = 0;
    153 #endif
    154     } else {
    155         shift = s->B(0);
    156         d->L(0) <<= shift;
    157         d->L(1) <<= shift;
    158 #if SHIFT == 1
    159         d->L(2) <<= shift;
    160         d->L(3) <<= shift;
    161 #endif
    162     }
    163 }
    164 
    165 void glue(helper_psrlq, SUFFIX)(Reg *d, Reg *s)
    166 {
    167     int shift;
    168 
    169     if (s->Q(0) > 63) {
    170         d->Q(0) = 0;
    171 #if SHIFT == 1
    172         d->Q(1) = 0;
    173 #endif
    174     } else {
    175         shift = s->B(0);
    176         d->Q(0) >>= shift;
    177 #if SHIFT == 1
    178         d->Q(1) >>= shift;
    179 #endif
    180     }
    181 }
    182 
    183 void glue(helper_psllq, SUFFIX)(Reg *d, Reg *s)
    184 {
    185     int shift;
    186 
    187     if (s->Q(0) > 63) {
    188         d->Q(0) = 0;
    189 #if SHIFT == 1
    190         d->Q(1) = 0;
    191 #endif
    192     } else {
    193         shift = s->B(0);
    194         d->Q(0) <<= shift;
    195 #if SHIFT == 1
    196         d->Q(1) <<= shift;
    197 #endif
    198     }
    199 }
    200 
    201 #if SHIFT == 1
    202 void glue(helper_psrldq, SUFFIX)(Reg *d, Reg *s)
    203 {
    204     int shift, i;
    205 
    206     shift = s->L(0);
    207     if (shift > 16)
    208         shift = 16;
    209     for(i = 0; i < 16 - shift; i++)
    210         d->B(i) = d->B(i + shift);
    211     for(i = 16 - shift; i < 16; i++)
    212         d->B(i) = 0;
    213 }
    214 
    215 void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s)
    216 {
    217     int shift, i;
    218 
    219     shift = s->L(0);
    220     if (shift > 16)
    221         shift = 16;
    222     for(i = 15; i >= shift; i--)
    223         d->B(i) = d->B(i - shift);
    224     for(i = 0; i < shift; i++)
    225         d->B(i) = 0;
    226 }
    227 #endif
    228 
    229 #define SSE_HELPER_B(name, F)\
    230 void glue(name, SUFFIX) (Reg *d, Reg *s)\
    231 {\
    232     d->B(0) = F(d->B(0), s->B(0));\
    233     d->B(1) = F(d->B(1), s->B(1));\
    234     d->B(2) = F(d->B(2), s->B(2));\
    235     d->B(3) = F(d->B(3), s->B(3));\
    236     d->B(4) = F(d->B(4), s->B(4));\
    237     d->B(5) = F(d->B(5), s->B(5));\
    238     d->B(6) = F(d->B(6), s->B(6));\
    239     d->B(7) = F(d->B(7), s->B(7));\
    240     XMM_ONLY(\
    241     d->B(8) = F(d->B(8), s->B(8));\
    242     d->B(9) = F(d->B(9), s->B(9));\
    243     d->B(10) = F(d->B(10), s->B(10));\
    244     d->B(11) = F(d->B(11), s->B(11));\
    245     d->B(12) = F(d->B(12), s->B(12));\
    246     d->B(13) = F(d->B(13), s->B(13));\
    247     d->B(14) = F(d->B(14), s->B(14));\
    248     d->B(15) = F(d->B(15), s->B(15));\
    249     )\
    250 }
    251 
    252 #define SSE_HELPER_W(name, F)\
    253 void glue(name, SUFFIX) (Reg *d, Reg *s)\
    254 {\
    255     d->W(0) = F(d->W(0), s->W(0));\
    256     d->W(1) = F(d->W(1), s->W(1));\
    257     d->W(2) = F(d->W(2), s->W(2));\
    258     d->W(3) = F(d->W(3), s->W(3));\
    259     XMM_ONLY(\
    260     d->W(4) = F(d->W(4), s->W(4));\
    261     d->W(5) = F(d->W(5), s->W(5));\
    262     d->W(6) = F(d->W(6), s->W(6));\
    263     d->W(7) = F(d->W(7), s->W(7));\
    264     )\
    265 }
    266 
    267 #define SSE_HELPER_L(name, F)\
    268 void glue(name, SUFFIX) (Reg *d, Reg *s)\
    269 {\
    270     d->L(0) = F(d->L(0), s->L(0));\
    271     d->L(1) = F(d->L(1), s->L(1));\
    272     XMM_ONLY(\
    273     d->L(2) = F(d->L(2), s->L(2));\
    274     d->L(3) = F(d->L(3), s->L(3));\
    275     )\
    276 }
    277 
    278 #define SSE_HELPER_Q(name, F)\
    279 void glue(name, SUFFIX) (Reg *d, Reg *s)\
    280 {\
    281     d->Q(0) = F(d->Q(0), s->Q(0));\
    282     XMM_ONLY(\
    283     d->Q(1) = F(d->Q(1), s->Q(1));\
    284     )\
    285 }
    286 
    287 #if SHIFT == 0
    288 static inline int satub(int x)
    289 {
    290     if (x < 0)
    291         return 0;
    292     else if (x > 255)
    293         return 255;
    294     else
    295         return x;
    296 }
    297 
    298 static inline int satuw(int x)
    299 {
    300     if (x < 0)
    301         return 0;
    302     else if (x > 65535)
    303         return 65535;
    304     else
    305         return x;
    306 }
    307 
    308 static inline int satsb(int x)
    309 {
    310     if (x < -128)
    311         return -128;
    312     else if (x > 127)
    313         return 127;
    314     else
    315         return x;
    316 }
    317 
    318 static inline int satsw(int x)
    319 {
    320     if (x < -32768)
    321         return -32768;
    322     else if (x > 32767)
    323         return 32767;
    324     else
    325         return x;
    326 }
    327 
    328 #define FADD(a, b) ((a) + (b))
    329 #define FADDUB(a, b) satub((a) + (b))
    330 #define FADDUW(a, b) satuw((a) + (b))
    331 #define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
    332 #define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))
    333 
    334 #define FSUB(a, b) ((a) - (b))
    335 #define FSUBUB(a, b) satub((a) - (b))
    336 #define FSUBUW(a, b) satuw((a) - (b))
    337 #define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
    338 #define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))
    339 #define FMINUB(a, b) ((a) < (b)) ? (a) : (b)
    340 #define FMINSW(a, b) ((int16_t)(a) < (int16_t)(b)) ? (a) : (b)
    341 #define FMAXUB(a, b) ((a) > (b)) ? (a) : (b)
    342 #define FMAXSW(a, b) ((int16_t)(a) > (int16_t)(b)) ? (a) : (b)
    343 
    344 #define FAND(a, b) (a) & (b)
    345 #define FANDN(a, b) ((~(a)) & (b))
    346 #define FOR(a, b) (a) | (b)
    347 #define FXOR(a, b) (a) ^ (b)
    348 
    349 #define FCMPGTB(a, b) (int8_t)(a) > (int8_t)(b) ? -1 : 0
    350 #define FCMPGTW(a, b) (int16_t)(a) > (int16_t)(b) ? -1 : 0
    351 #define FCMPGTL(a, b) (int32_t)(a) > (int32_t)(b) ? -1 : 0
    352 #define FCMPEQ(a, b) (a) == (b) ? -1 : 0
    353 
    354 #define FMULLW(a, b) (a) * (b)
    355 #define FMULHRW(a, b) ((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16
    356 #define FMULHUW(a, b) (a) * (b) >> 16
    357 #define FMULHW(a, b) (int16_t)(a) * (int16_t)(b) >> 16
    358 
    359 #define FAVG(a, b) ((a) + (b) + 1) >> 1
    360 #endif
    361 
/*
 * Instantiate the integer helpers.  Each line defines one function named
 * <name><SUFFIX> that applies the given F* operation lane by lane.
 */

/* plain add, by lane width */
SSE_HELPER_B(helper_paddb, FADD)
SSE_HELPER_W(helper_paddw, FADD)
SSE_HELPER_L(helper_paddl, FADD)
SSE_HELPER_Q(helper_paddq, FADD)

/* plain subtract, by lane width */
SSE_HELPER_B(helper_psubb, FSUB)
SSE_HELPER_W(helper_psubw, FSUB)
SSE_HELPER_L(helper_psubl, FSUB)
SSE_HELPER_Q(helper_psubq, FSUB)

/* saturating byte add / subtract (results pass through satub / satsb) */
SSE_HELPER_B(helper_paddusb, FADDUB)
SSE_HELPER_B(helper_paddsb, FADDSB)
SSE_HELPER_B(helper_psubusb, FSUBUB)
SSE_HELPER_B(helper_psubsb, FSUBSB)

/* saturating word add / subtract (results pass through satuw / satsw) */
SSE_HELPER_W(helper_paddusw, FADDUW)
SSE_HELPER_W(helper_paddsw, FADDSW)
SSE_HELPER_W(helper_psubusw, FSUBUW)
SSE_HELPER_W(helper_psubsw, FSUBSW)

/* unsigned byte min / max */
SSE_HELPER_B(helper_pminub, FMINUB)
SSE_HELPER_B(helper_pmaxub, FMAXUB)

/* signed word min / max */
SSE_HELPER_W(helper_pminsw, FMINSW)
SSE_HELPER_W(helper_pmaxsw, FMAXSW)

/* bitwise logic, done 64 bits at a time */
SSE_HELPER_Q(helper_pand, FAND)
SSE_HELPER_Q(helper_pandn, FANDN)
SSE_HELPER_Q(helper_por, FOR)
SSE_HELPER_Q(helper_pxor, FXOR)

/* signed greater-than compare: lane becomes -1 or 0 */
SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)

/* equality compare: lane becomes -1 or 0 */
SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
SSE_HELPER_L(helper_pcmpeql, FCMPEQ)

/* word multiplies; pmulhrw is only generated for the MMX instantiation */
SSE_HELPER_W(helper_pmullw, FMULLW)
#if SHIFT == 0
SSE_HELPER_W(helper_pmulhrw, FMULHRW)
#endif
SSE_HELPER_W(helper_pmulhuw, FMULHUW)
SSE_HELPER_W(helper_pmulhw, FMULHW)

/* rounded averages */
SSE_HELPER_B(helper_pavgb, FAVG)
SSE_HELPER_W(helper_pavgw, FAVG)
    410 
    411 void glue(helper_pmuludq, SUFFIX) (Reg *d, Reg *s)
    412 {
    413     d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0);
    414 #if SHIFT == 1
    415     d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2);
    416 #endif
    417 }
    418 
    419 void glue(helper_pmaddwd, SUFFIX) (Reg *d, Reg *s)
    420 {
    421     int i;
    422 
    423     for(i = 0; i < (2 << SHIFT); i++) {
    424         d->L(i) = (int16_t)s->W(2*i) * (int16_t)d->W(2*i) +
    425             (int16_t)s->W(2*i+1) * (int16_t)d->W(2*i+1);
    426     }
    427 }
    428 
    429 #if SHIFT == 0
    430 static inline int abs1(int a)
    431 {
    432     if (a < 0)
    433         return -a;
    434     else
    435         return a;
    436 }
    437 #endif
    438 void glue(helper_psadbw, SUFFIX) (Reg *d, Reg *s)
    439 {
    440     unsigned int val;
    441 
    442     val = 0;
    443     val += abs1(d->B(0) - s->B(0));
    444     val += abs1(d->B(1) - s->B(1));
    445     val += abs1(d->B(2) - s->B(2));
    446     val += abs1(d->B(3) - s->B(3));
    447     val += abs1(d->B(4) - s->B(4));
    448     val += abs1(d->B(5) - s->B(5));
    449     val += abs1(d->B(6) - s->B(6));
    450     val += abs1(d->B(7) - s->B(7));
    451     d->Q(0) = val;
    452 #if SHIFT == 1
    453     val = 0;
    454     val += abs1(d->B(8) - s->B(8));
    455     val += abs1(d->B(9) - s->B(9));
    456     val += abs1(d->B(10) - s->B(10));
    457     val += abs1(d->B(11) - s->B(11));
    458     val += abs1(d->B(12) - s->B(12));
    459     val += abs1(d->B(13) - s->B(13));
    460     val += abs1(d->B(14) - s->B(14));
    461     val += abs1(d->B(15) - s->B(15));
    462     d->Q(1) = val;
    463 #endif
    464 }
    465 
    466 void glue(helper_maskmov, SUFFIX) (Reg *d, Reg *s, target_ulong a0)
    467 {
    468     int i;
    469     for(i = 0; i < (8 << SHIFT); i++) {
    470         if (s->B(i) & 0x80)
    471             stb(a0 + i, d->B(i));
    472     }
    473 }
    474 
    475 void glue(helper_movl_mm_T0, SUFFIX) (Reg *d, uint32_t val)
    476 {
    477     d->L(0) = val;
    478     d->L(1) = 0;
    479 #if SHIFT == 1
    480     d->Q(1) = 0;
    481 #endif
    482 }
    483 
    484 #ifdef TARGET_X86_64
    485 void glue(helper_movq_mm_T0, SUFFIX) (Reg *d, uint64_t val)
    486 {
    487     d->Q(0) = val;
    488 #if SHIFT == 1
    489     d->Q(1) = 0;
    490 #endif
    491 }
    492 #endif
    493 
    494 #if SHIFT == 0
    495 void glue(helper_pshufw, SUFFIX) (Reg *d, Reg *s, int order)
    496 {
    497     Reg r;
    498     r.W(0) = s->W(order & 3);
    499     r.W(1) = s->W((order >> 2) & 3);
    500     r.W(2) = s->W((order >> 4) & 3);
    501     r.W(3) = s->W((order >> 6) & 3);
    502     *d = r;
    503 }
    504 #else
    505 void helper_shufps(Reg *d, Reg *s, int order)
    506 {
    507     Reg r;
    508     r.L(0) = d->L(order & 3);
    509     r.L(1) = d->L((order >> 2) & 3);
    510     r.L(2) = s->L((order >> 4) & 3);
    511     r.L(3) = s->L((order >> 6) & 3);
    512     *d = r;
    513 }
    514 
    515 void helper_shufpd(Reg *d, Reg *s, int order)
    516 {
    517     Reg r;
    518     r.Q(0) = d->Q(order & 1);
    519     r.Q(1) = s->Q((order >> 1) & 1);
    520     *d = r;
    521 }
    522 
    523 void glue(helper_pshufd, SUFFIX) (Reg *d, Reg *s, int order)
    524 {
    525     Reg r;
    526     r.L(0) = s->L(order & 3);
    527     r.L(1) = s->L((order >> 2) & 3);
    528     r.L(2) = s->L((order >> 4) & 3);
    529     r.L(3) = s->L((order >> 6) & 3);
    530     *d = r;
    531 }
    532 
    533 void glue(helper_pshuflw, SUFFIX) (Reg *d, Reg *s, int order)
    534 {
    535     Reg r;
    536     r.W(0) = s->W(order & 3);
    537     r.W(1) = s->W((order >> 2) & 3);
    538     r.W(2) = s->W((order >> 4) & 3);
    539     r.W(3) = s->W((order >> 6) & 3);
    540     r.Q(1) = s->Q(1);
    541     *d = r;
    542 }
    543 
    544 void glue(helper_pshufhw, SUFFIX) (Reg *d, Reg *s, int order)
    545 {
    546     Reg r;
    547     r.Q(0) = s->Q(0);
    548     r.W(4) = s->W(4 + (order & 3));
    549     r.W(5) = s->W(4 + ((order >> 2) & 3));
    550     r.W(6) = s->W(4 + ((order >> 4) & 3));
    551     r.W(7) = s->W(4 + ((order >> 6) & 3));
    552     *d = r;
    553 }
    554 #endif
    555 
    556 #if SHIFT == 1
    557 /* FPU ops */
    558 /* XXX: not accurate */
    559 
    560 #define SSE_HELPER_S(name, F)\
    561 void helper_ ## name ## ps (Reg *d, Reg *s)\
    562 {\
    563     d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
    564     d->XMM_S(1) = F(32, d->XMM_S(1), s->XMM_S(1));\
    565     d->XMM_S(2) = F(32, d->XMM_S(2), s->XMM_S(2));\
    566     d->XMM_S(3) = F(32, d->XMM_S(3), s->XMM_S(3));\
    567 }\
    568 \
    569 void helper_ ## name ## ss (Reg *d, Reg *s)\
    570 {\
    571     d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
    572 }\
    573 void helper_ ## name ## pd (Reg *d, Reg *s)\
    574 {\
    575     d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
    576     d->XMM_D(1) = F(64, d->XMM_D(1), s->XMM_D(1));\
    577 }\
    578 \
    579 void helper_ ## name ## sd (Reg *d, Reg *s)\
    580 {\
    581     d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
    582 }
    583 
    584 #define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
    585 #define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
    586 #define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
    587 #define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
    588 #define FPU_MIN(size, a, b) (a) < (b) ? (a) : (b)
    589 #define FPU_MAX(size, a, b) (a) > (b) ? (a) : (b)
    590 #define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status)
    591 
/*
 * Instantiate helper_<op>{ps,ss,pd,sd} for each arithmetic operation,
 * e.g. helper_addps, helper_addss, helper_addpd, helper_addsd.
 */
SSE_HELPER_S(add, FPU_ADD)
SSE_HELPER_S(sub, FPU_SUB)
SSE_HELPER_S(mul, FPU_MUL)
SSE_HELPER_S(div, FPU_DIV)
SSE_HELPER_S(min, FPU_MIN)
SSE_HELPER_S(max, FPU_MAX)
SSE_HELPER_S(sqrt, FPU_SQRT)
    599 
    600 
    601 /* float to float conversions */
    602 void helper_cvtps2pd(Reg *d, Reg *s)
    603 {
    604     float32 s0, s1;
    605     s0 = s->XMM_S(0);
    606     s1 = s->XMM_S(1);
    607     d->XMM_D(0) = float32_to_float64(s0, &env->sse_status);
    608     d->XMM_D(1) = float32_to_float64(s1, &env->sse_status);
    609 }
    610 
    611 void helper_cvtpd2ps(Reg *d, Reg *s)
    612 {
    613     d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
    614     d->XMM_S(1) = float64_to_float32(s->XMM_D(1), &env->sse_status);
    615     d->Q(1) = 0;
    616 }
    617 
    618 void helper_cvtss2sd(Reg *d, Reg *s)
    619 {
    620     d->XMM_D(0) = float32_to_float64(s->XMM_S(0), &env->sse_status);
    621 }
    622 
    623 void helper_cvtsd2ss(Reg *d, Reg *s)
    624 {
    625     d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
    626 }
    627 
    628 /* integer to float */
    629 void helper_cvtdq2ps(Reg *d, Reg *s)
    630 {
    631     d->XMM_S(0) = int32_to_float32(s->XMM_L(0), &env->sse_status);
    632     d->XMM_S(1) = int32_to_float32(s->XMM_L(1), &env->sse_status);
    633     d->XMM_S(2) = int32_to_float32(s->XMM_L(2), &env->sse_status);
    634     d->XMM_S(3) = int32_to_float32(s->XMM_L(3), &env->sse_status);
    635 }
    636 
    637 void helper_cvtdq2pd(Reg *d, Reg *s)
    638 {
    639     int32_t l0, l1;
    640     l0 = (int32_t)s->XMM_L(0);
    641     l1 = (int32_t)s->XMM_L(1);
    642     d->XMM_D(0) = int32_to_float64(l0, &env->sse_status);
    643     d->XMM_D(1) = int32_to_float64(l1, &env->sse_status);
    644 }
    645 
    646 void helper_cvtpi2ps(XMMReg *d, MMXReg *s)
    647 {
    648     d->XMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
    649     d->XMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
    650 }
    651 
    652 void helper_cvtpi2pd(XMMReg *d, MMXReg *s)
    653 {
    654     d->XMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
    655     d->XMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
    656 }
    657 
    658 void helper_cvtsi2ss(XMMReg *d, uint32_t val)
    659 {
    660     d->XMM_S(0) = int32_to_float32(val, &env->sse_status);
    661 }
    662 
    663 void helper_cvtsi2sd(XMMReg *d, uint32_t val)
    664 {
    665     d->XMM_D(0) = int32_to_float64(val, &env->sse_status);
    666 }
    667 
    668 #ifdef TARGET_X86_64
    669 void helper_cvtsq2ss(XMMReg *d, uint64_t val)
    670 {
    671     d->XMM_S(0) = int64_to_float32(val, &env->sse_status);
    672 }
    673 
    674 void helper_cvtsq2sd(XMMReg *d, uint64_t val)
    675 {
    676     d->XMM_D(0) = int64_to_float64(val, &env->sse_status);
    677 }
    678 #endif
    679 
    680 /* float to integer */
    681 void helper_cvtps2dq(XMMReg *d, XMMReg *s)
    682 {
    683     d->XMM_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
    684     d->XMM_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
    685     d->XMM_L(2) = float32_to_int32(s->XMM_S(2), &env->sse_status);
    686     d->XMM_L(3) = float32_to_int32(s->XMM_S(3), &env->sse_status);
    687 }
    688 
    689 void helper_cvtpd2dq(XMMReg *d, XMMReg *s)
    690 {
    691     d->XMM_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
    692     d->XMM_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
    693     d->XMM_Q(1) = 0;
    694 }
    695 
    696 void helper_cvtps2pi(MMXReg *d, XMMReg *s)
    697 {
    698     d->MMX_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
    699     d->MMX_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
    700 }
    701 
    702 void helper_cvtpd2pi(MMXReg *d, XMMReg *s)
    703 {
    704     d->MMX_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
    705     d->MMX_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
    706 }
    707 
    708 int32_t helper_cvtss2si(XMMReg *s)
    709 {
    710     return float32_to_int32(s->XMM_S(0), &env->sse_status);
    711 }
    712 
    713 int32_t helper_cvtsd2si(XMMReg *s)
    714 {
    715     return float64_to_int32(s->XMM_D(0), &env->sse_status);
    716 }
    717 
    718 #ifdef TARGET_X86_64
    719 int64_t helper_cvtss2sq(XMMReg *s)
    720 {
    721     return float32_to_int64(s->XMM_S(0), &env->sse_status);
    722 }
    723 
    724 int64_t helper_cvtsd2sq(XMMReg *s)
    725 {
    726     return float64_to_int64(s->XMM_D(0), &env->sse_status);
    727 }
    728 #endif
    729 
    730 /* float to integer truncated */
    731 void helper_cvttps2dq(XMMReg *d, XMMReg *s)
    732 {
    733     d->XMM_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
    734     d->XMM_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
    735     d->XMM_L(2) = float32_to_int32_round_to_zero(s->XMM_S(2), &env->sse_status);
    736     d->XMM_L(3) = float32_to_int32_round_to_zero(s->XMM_S(3), &env->sse_status);
    737 }
    738 
    739 void helper_cvttpd2dq(XMMReg *d, XMMReg *s)
    740 {
    741     d->XMM_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
    742     d->XMM_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
    743     d->XMM_Q(1) = 0;
    744 }
    745 
    746 void helper_cvttps2pi(MMXReg *d, XMMReg *s)
    747 {
    748     d->MMX_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
    749     d->MMX_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
    750 }
    751 
    752 void helper_cvttpd2pi(MMXReg *d, XMMReg *s)
    753 {
    754     d->MMX_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
    755     d->MMX_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
    756 }
    757 
    758 int32_t helper_cvttss2si(XMMReg *s)
    759 {
    760     return float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
    761 }
    762 
    763 int32_t helper_cvttsd2si(XMMReg *s)
    764 {
    765     return float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
    766 }
    767 
    768 #ifdef TARGET_X86_64
    769 int64_t helper_cvttss2sq(XMMReg *s)
    770 {
    771     return float32_to_int64_round_to_zero(s->XMM_S(0), &env->sse_status);
    772 }
    773 
    774 int64_t helper_cvttsd2sq(XMMReg *s)
    775 {
    776     return float64_to_int64_round_to_zero(s->XMM_D(0), &env->sse_status);
    777 }
    778 #endif
    779 
    780 void helper_rsqrtps(XMMReg *d, XMMReg *s)
    781 {
    782     d->XMM_S(0) = approx_rsqrt(s->XMM_S(0));
    783     d->XMM_S(1) = approx_rsqrt(s->XMM_S(1));
    784     d->XMM_S(2) = approx_rsqrt(s->XMM_S(2));
    785     d->XMM_S(3) = approx_rsqrt(s->XMM_S(3));
    786 }
    787 
    788 void helper_rsqrtss(XMMReg *d, XMMReg *s)
    789 {
    790     d->XMM_S(0) = approx_rsqrt(s->XMM_S(0));
    791 }
    792 
    793 void helper_rcpps(XMMReg *d, XMMReg *s)
    794 {
    795     d->XMM_S(0) = approx_rcp(s->XMM_S(0));
    796     d->XMM_S(1) = approx_rcp(s->XMM_S(1));
    797     d->XMM_S(2) = approx_rcp(s->XMM_S(2));
    798     d->XMM_S(3) = approx_rcp(s->XMM_S(3));
    799 }
    800 
    801 void helper_rcpss(XMMReg *d, XMMReg *s)
    802 {
    803     d->XMM_S(0) = approx_rcp(s->XMM_S(0));
    804 }
    805 
    806 void helper_haddps(XMMReg *d, XMMReg *s)
    807 {
    808     XMMReg r;
    809     r.XMM_S(0) = d->XMM_S(0) + d->XMM_S(1);
    810     r.XMM_S(1) = d->XMM_S(2) + d->XMM_S(3);
    811     r.XMM_S(2) = s->XMM_S(0) + s->XMM_S(1);
    812     r.XMM_S(3) = s->XMM_S(2) + s->XMM_S(3);
    813     *d = r;
    814 }
    815 
    816 void helper_haddpd(XMMReg *d, XMMReg *s)
    817 {
    818     XMMReg r;
    819     r.XMM_D(0) = d->XMM_D(0) + d->XMM_D(1);
    820     r.XMM_D(1) = s->XMM_D(0) + s->XMM_D(1);
    821     *d = r;
    822 }
    823 
    824 void helper_hsubps(XMMReg *d, XMMReg *s)
    825 {
    826     XMMReg r;
    827     r.XMM_S(0) = d->XMM_S(0) - d->XMM_S(1);
    828     r.XMM_S(1) = d->XMM_S(2) - d->XMM_S(3);
    829     r.XMM_S(2) = s->XMM_S(0) - s->XMM_S(1);
    830     r.XMM_S(3) = s->XMM_S(2) - s->XMM_S(3);
    831     *d = r;
    832 }
    833 
    834 void helper_hsubpd(XMMReg *d, XMMReg *s)
    835 {
    836     XMMReg r;
    837     r.XMM_D(0) = d->XMM_D(0) - d->XMM_D(1);
    838     r.XMM_D(1) = s->XMM_D(0) - s->XMM_D(1);
    839     *d = r;
    840 }
    841 
    842 void helper_addsubps(XMMReg *d, XMMReg *s)
    843 {
    844     d->XMM_S(0) = d->XMM_S(0) - s->XMM_S(0);
    845     d->XMM_S(1) = d->XMM_S(1) + s->XMM_S(1);
    846     d->XMM_S(2) = d->XMM_S(2) - s->XMM_S(2);
    847     d->XMM_S(3) = d->XMM_S(3) + s->XMM_S(3);
    848 }
    849 
    850 void helper_addsubpd(XMMReg *d, XMMReg *s)
    851 {
    852     d->XMM_D(0) = d->XMM_D(0) - s->XMM_D(0);
    853     d->XMM_D(1) = d->XMM_D(1) + s->XMM_D(1);
    854 }
    855 
    856 /* XXX: unordered */
    857 #define SSE_HELPER_CMP(name, F)\
    858 void helper_ ## name ## ps (Reg *d, Reg *s)\
    859 {\
    860     d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
    861     d->XMM_L(1) = F(32, d->XMM_S(1), s->XMM_S(1));\
    862     d->XMM_L(2) = F(32, d->XMM_S(2), s->XMM_S(2));\
    863     d->XMM_L(3) = F(32, d->XMM_S(3), s->XMM_S(3));\
    864 }\
    865 \
    866 void helper_ ## name ## ss (Reg *d, Reg *s)\
    867 {\
    868     d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
    869 }\
    870 void helper_ ## name ## pd (Reg *d, Reg *s)\
    871 {\
    872     d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
    873     d->XMM_Q(1) = F(64, d->XMM_D(1), s->XMM_D(1));\
    874 }\
    875 \
    876 void helper_ ## name ## sd (Reg *d, Reg *s)\
    877 {\
    878     d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
    879 }
    880 
    881 #define FPU_CMPEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? -1 : 0
    882 #define FPU_CMPLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0
    883 #define FPU_CMPLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? -1 : 0
    884 #define FPU_CMPUNORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? - 1 : 0
    885 #define FPU_CMPNEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? 0 : -1
    886 #define FPU_CMPNLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1
    887 #define FPU_CMPNLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? 0 : -1
    888 #define FPU_CMPORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? 0 : -1
    889 
/*
 * Instantiate helper_<pred>{ps,ss,pd,sd} for each compare predicate,
 * e.g. helper_cmpeqps, helper_cmpeqss, helper_cmpeqpd, helper_cmpeqsd.
 */
SSE_HELPER_CMP(cmpeq, FPU_CMPEQ)
SSE_HELPER_CMP(cmplt, FPU_CMPLT)
SSE_HELPER_CMP(cmple, FPU_CMPLE)
SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD)
SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ)
SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT)
SSE_HELPER_CMP(cmpnle, FPU_CMPNLE)
SSE_HELPER_CMP(cmpord, FPU_CMPORD)
    898 
/*
 * Flag values for the (u)comis* helpers, indexed by "compare result + 1".
 * NOTE(review): presumably the compare routines return -1 (less), 0 (equal),
 * 1 (greater) and 2 (unordered) -- confirm against the softfloat headers.
 */
const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
    900 
    901 void helper_ucomiss(Reg *d, Reg *s)
    902 {
    903     int ret;
    904     float32 s0, s1;
    905 
    906     s0 = d->XMM_S(0);
    907     s1 = s->XMM_S(0);
    908     ret = float32_compare_quiet(s0, s1, &env->sse_status);
    909     CC_SRC = comis_eflags[ret + 1];
    910 }
    911 
    912 void helper_comiss(Reg *d, Reg *s)
    913 {
    914     int ret;
    915     float32 s0, s1;
    916 
    917     s0 = d->XMM_S(0);
    918     s1 = s->XMM_S(0);
    919     ret = float32_compare(s0, s1, &env->sse_status);
    920     CC_SRC = comis_eflags[ret + 1];
    921 }
    922 
    923 void helper_ucomisd(Reg *d, Reg *s)
    924 {
    925     int ret;
    926     float64 d0, d1;
    927 
    928     d0 = d->XMM_D(0);
    929     d1 = s->XMM_D(0);
    930     ret = float64_compare_quiet(d0, d1, &env->sse_status);
    931     CC_SRC = comis_eflags[ret + 1];
    932 }
    933 
    934 void helper_comisd(Reg *d, Reg *s)
    935 {
    936     int ret;
    937     float64 d0, d1;
    938 
    939     d0 = d->XMM_D(0);
    940     d1 = s->XMM_D(0);
    941     ret = float64_compare(d0, d1, &env->sse_status);
    942     CC_SRC = comis_eflags[ret + 1];
    943 }
    944 
    945 uint32_t helper_movmskps(Reg *s)
    946 {
    947     int b0, b1, b2, b3;
    948     b0 = s->XMM_L(0) >> 31;
    949     b1 = s->XMM_L(1) >> 31;
    950     b2 = s->XMM_L(2) >> 31;
    951     b3 = s->XMM_L(3) >> 31;
    952     return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3);
    953 }
    954 
    955 uint32_t helper_movmskpd(Reg *s)
    956 {
    957     int b0, b1;
    958     b0 = s->XMM_L(1) >> 31;
    959     b1 = s->XMM_L(3) >> 31;
    960     return b0 | (b1 << 1);
    961 }
    962 
    963 #endif
    964 
    965 uint32_t glue(helper_pmovmskb, SUFFIX)(Reg *s)
    966 {
    967     uint32_t val;
    968     val = 0;
    969     val |= (s->B(0) >> 7);
    970     val |= (s->B(1) >> 6) & 0x02;
    971     val |= (s->B(2) >> 5) & 0x04;
    972     val |= (s->B(3) >> 4) & 0x08;
    973     val |= (s->B(4) >> 3) & 0x10;
    974     val |= (s->B(5) >> 2) & 0x20;
    975     val |= (s->B(6) >> 1) & 0x40;
    976     val |= (s->B(7)) & 0x80;
    977 #if SHIFT == 1
    978     val |= (s->B(8) << 1) & 0x0100;
    979     val |= (s->B(9) << 2) & 0x0200;
    980     val |= (s->B(10) << 3) & 0x0400;
    981     val |= (s->B(11) << 4) & 0x0800;
    982     val |= (s->B(12) << 5) & 0x1000;
    983     val |= (s->B(13) << 6) & 0x2000;
    984     val |= (s->B(14) << 7) & 0x4000;
    985     val |= (s->B(15) << 8) & 0x8000;
    986 #endif
    987     return val;
    988 }
    989 
    990 void glue(helper_packsswb, SUFFIX) (Reg *d, Reg *s)
    991 {
    992     Reg r;
    993 
    994     r.B(0) = satsb((int16_t)d->W(0));
    995     r.B(1) = satsb((int16_t)d->W(1));
    996     r.B(2) = satsb((int16_t)d->W(2));
    997     r.B(3) = satsb((int16_t)d->W(3));
    998 #if SHIFT == 1
    999     r.B(4) = satsb((int16_t)d->W(4));
   1000     r.B(5) = satsb((int16_t)d->W(5));
   1001     r.B(6) = satsb((int16_t)d->W(6));
   1002     r.B(7) = satsb((int16_t)d->W(7));
   1003 #endif
   1004     r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
   1005     r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
   1006     r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
   1007     r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
   1008 #if SHIFT == 1
   1009     r.B(12) = satsb((int16_t)s->W(4));
   1010     r.B(13) = satsb((int16_t)s->W(5));
   1011     r.B(14) = satsb((int16_t)s->W(6));
   1012     r.B(15) = satsb((int16_t)s->W(7));
   1013 #endif
   1014     *d = r;
   1015 }
   1016 
   1017 void glue(helper_packuswb, SUFFIX) (Reg *d, Reg *s)
   1018 {
   1019     Reg r;
   1020 
   1021     r.B(0) = satub((int16_t)d->W(0));
   1022     r.B(1) = satub((int16_t)d->W(1));
   1023     r.B(2) = satub((int16_t)d->W(2));
   1024     r.B(3) = satub((int16_t)d->W(3));
   1025 #if SHIFT == 1
   1026     r.B(4) = satub((int16_t)d->W(4));
   1027     r.B(5) = satub((int16_t)d->W(5));
   1028     r.B(6) = satub((int16_t)d->W(6));
   1029     r.B(7) = satub((int16_t)d->W(7));
   1030 #endif
   1031     r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
   1032     r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
   1033     r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
   1034     r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
   1035 #if SHIFT == 1
   1036     r.B(12) = satub((int16_t)s->W(4));
   1037     r.B(13) = satub((int16_t)s->W(5));
   1038     r.B(14) = satub((int16_t)s->W(6));
   1039     r.B(15) = satub((int16_t)s->W(7));
   1040 #endif
   1041     *d = r;
   1042 }
   1043 
   1044 void glue(helper_packssdw, SUFFIX) (Reg *d, Reg *s)
   1045 {
   1046     Reg r;
   1047 
   1048     r.W(0) = satsw(d->L(0));
   1049     r.W(1) = satsw(d->L(1));
   1050 #if SHIFT == 1
   1051     r.W(2) = satsw(d->L(2));
   1052     r.W(3) = satsw(d->L(3));
   1053 #endif
   1054     r.W((2 << SHIFT) + 0) = satsw(s->L(0));
   1055     r.W((2 << SHIFT) + 1) = satsw(s->L(1));
   1056 #if SHIFT == 1
   1057     r.W(6) = satsw(s->L(2));
   1058     r.W(7) = satsw(s->L(3));
   1059 #endif
   1060     *d = r;
   1061 }
   1062 
   1063 #define UNPCK_OP(base_name, base)                               \
   1064                                                                 \
   1065 void glue(helper_punpck ## base_name ## bw, SUFFIX) (Reg *d, Reg *s)   \
   1066 {                                                               \
   1067     Reg r;                                              \
   1068                                                                 \
   1069     r.B(0) = d->B((base << (SHIFT + 2)) + 0);                   \
   1070     r.B(1) = s->B((base << (SHIFT + 2)) + 0);                   \
   1071     r.B(2) = d->B((base << (SHIFT + 2)) + 1);                   \
   1072     r.B(3) = s->B((base << (SHIFT + 2)) + 1);                   \
   1073     r.B(4) = d->B((base << (SHIFT + 2)) + 2);                   \
   1074     r.B(5) = s->B((base << (SHIFT + 2)) + 2);                   \
   1075     r.B(6) = d->B((base << (SHIFT + 2)) + 3);                   \
   1076     r.B(7) = s->B((base << (SHIFT + 2)) + 3);                   \
   1077 XMM_ONLY(                                                       \
   1078     r.B(8) = d->B((base << (SHIFT + 2)) + 4);                   \
   1079     r.B(9) = s->B((base << (SHIFT + 2)) + 4);                   \
   1080     r.B(10) = d->B((base << (SHIFT + 2)) + 5);                  \
   1081     r.B(11) = s->B((base << (SHIFT + 2)) + 5);                  \
   1082     r.B(12) = d->B((base << (SHIFT + 2)) + 6);                  \
   1083     r.B(13) = s->B((base << (SHIFT + 2)) + 6);                  \
   1084     r.B(14) = d->B((base << (SHIFT + 2)) + 7);                  \
   1085     r.B(15) = s->B((base << (SHIFT + 2)) + 7);                  \
   1086 )                                                               \
   1087     *d = r;                                                     \
   1088 }                                                               \
   1089                                                                 \
   1090 void glue(helper_punpck ## base_name ## wd, SUFFIX) (Reg *d, Reg *s)   \
   1091 {                                                               \
   1092     Reg r;                                              \
   1093                                                                 \
   1094     r.W(0) = d->W((base << (SHIFT + 1)) + 0);                   \
   1095     r.W(1) = s->W((base << (SHIFT + 1)) + 0);                   \
   1096     r.W(2) = d->W((base << (SHIFT + 1)) + 1);                   \
   1097     r.W(3) = s->W((base << (SHIFT + 1)) + 1);                   \
   1098 XMM_ONLY(                                                       \
   1099     r.W(4) = d->W((base << (SHIFT + 1)) + 2);                   \
   1100     r.W(5) = s->W((base << (SHIFT + 1)) + 2);                   \
   1101     r.W(6) = d->W((base << (SHIFT + 1)) + 3);                   \
   1102     r.W(7) = s->W((base << (SHIFT + 1)) + 3);                   \
   1103 )                                                               \
   1104     *d = r;                                                     \
   1105 }                                                               \
   1106                                                                 \
   1107 void glue(helper_punpck ## base_name ## dq, SUFFIX) (Reg *d, Reg *s)   \
   1108 {                                                               \
   1109     Reg r;                                              \
   1110                                                                 \
   1111     r.L(0) = d->L((base << SHIFT) + 0);                         \
   1112     r.L(1) = s->L((base << SHIFT) + 0);                         \
   1113 XMM_ONLY(                                                       \
   1114     r.L(2) = d->L((base << SHIFT) + 1);                         \
   1115     r.L(3) = s->L((base << SHIFT) + 1);                         \
   1116 )                                                               \
   1117     *d = r;                                                     \
   1118 }                                                               \
   1119                                                                 \
   1120 XMM_ONLY(                                                       \
   1121 void glue(helper_punpck ## base_name ## qdq, SUFFIX) (Reg *d, Reg *s)  \
   1122 {                                                               \
   1123     Reg r;                                              \
   1124                                                                 \
   1125     r.Q(0) = d->Q(base);                                        \
   1126     r.Q(1) = s->Q(base);                                        \
   1127     *d = r;                                                     \
   1128 }                                                               \
   1129 )
   1130 
   1131 UNPCK_OP(l, 0)
   1132 UNPCK_OP(h, 1)
   1133 
   1134 /* 3DNow! float ops */
   1135 #if SHIFT == 0
   1136 void helper_pi2fd(MMXReg *d, MMXReg *s)
   1137 {
   1138     d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
   1139     d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
   1140 }
   1141 
   1142 void helper_pi2fw(MMXReg *d, MMXReg *s)
   1143 {
   1144     d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
   1145     d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
   1146 }
   1147 
   1148 void helper_pf2id(MMXReg *d, MMXReg *s)
   1149 {
   1150     d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
   1151     d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
   1152 }
   1153 
   1154 void helper_pf2iw(MMXReg *d, MMXReg *s)
   1155 {
   1156     d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status));
   1157     d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status));
   1158 }
   1159 
   1160 void helper_pfacc(MMXReg *d, MMXReg *s)
   1161 {
   1162     MMXReg r;
   1163     r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
   1164     r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
   1165     *d = r;
   1166 }
   1167 
   1168 void helper_pfadd(MMXReg *d, MMXReg *s)
   1169 {
   1170     d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
   1171     d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
   1172 }
   1173 
   1174 void helper_pfcmpeq(MMXReg *d, MMXReg *s)
   1175 {
   1176     d->MMX_L(0) = float32_eq(d->MMX_S(0), s->MMX_S(0), &env->mmx_status) ? -1 : 0;
   1177     d->MMX_L(1) = float32_eq(d->MMX_S(1), s->MMX_S(1), &env->mmx_status) ? -1 : 0;
   1178 }
   1179 
   1180 void helper_pfcmpge(MMXReg *d, MMXReg *s)
   1181 {
   1182     d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0;
   1183     d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0;
   1184 }
   1185 
   1186 void helper_pfcmpgt(MMXReg *d, MMXReg *s)
   1187 {
   1188     d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0;
   1189     d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0;
   1190 }
   1191 
   1192 void helper_pfmax(MMXReg *d, MMXReg *s)
   1193 {
   1194     if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status))
   1195         d->MMX_S(0) = s->MMX_S(0);
   1196     if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status))
   1197         d->MMX_S(1) = s->MMX_S(1);
   1198 }
   1199 
   1200 void helper_pfmin(MMXReg *d, MMXReg *s)
   1201 {
   1202     if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status))
   1203         d->MMX_S(0) = s->MMX_S(0);
   1204     if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status))
   1205         d->MMX_S(1) = s->MMX_S(1);
   1206 }
   1207 
   1208 void helper_pfmul(MMXReg *d, MMXReg *s)
   1209 {
   1210     d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
   1211     d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
   1212 }
   1213 
   1214 void helper_pfnacc(MMXReg *d, MMXReg *s)
   1215 {
   1216     MMXReg r;
   1217     r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
   1218     r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
   1219     *d = r;
   1220 }
   1221 
   1222 void helper_pfpnacc(MMXReg *d, MMXReg *s)
   1223 {
   1224     MMXReg r;
   1225     r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
   1226     r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
   1227     *d = r;
   1228 }
   1229 
   1230 void helper_pfrcp(MMXReg *d, MMXReg *s)
   1231 {
   1232     d->MMX_S(0) = approx_rcp(s->MMX_S(0));
   1233     d->MMX_S(1) = d->MMX_S(0);
   1234 }
   1235 
   1236 void helper_pfrsqrt(MMXReg *d, MMXReg *s)
   1237 {
   1238     d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
   1239     d->MMX_S(1) = approx_rsqrt(d->MMX_S(1));
   1240     d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
   1241     d->MMX_L(0) = d->MMX_L(1);
   1242 }
   1243 
   1244 void helper_pfsub(MMXReg *d, MMXReg *s)
   1245 {
   1246     d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
   1247     d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
   1248 }
   1249 
   1250 void helper_pfsubr(MMXReg *d, MMXReg *s)
   1251 {
   1252     d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
   1253     d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
   1254 }
   1255 
   1256 void helper_pswapd(MMXReg *d, MMXReg *s)
   1257 {
   1258     MMXReg r;
   1259     r.MMX_L(0) = s->MMX_L(1);
   1260     r.MMX_L(1) = s->MMX_L(0);
   1261     *d = r;
   1262 }
   1263 #endif
   1264 
   1265 /* SSSE3 op helpers */
   1266 void glue(helper_pshufb, SUFFIX) (Reg *d, Reg *s)
   1267 {
   1268     int i;
   1269     Reg r;
   1270 
   1271     for (i = 0; i < (8 << SHIFT); i++)
   1272         r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1)));
   1273 
   1274     *d = r;
   1275 }
   1276 
   1277 void glue(helper_phaddw, SUFFIX) (Reg *d, Reg *s)
   1278 {
   1279     d->W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
   1280     d->W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
   1281     XMM_ONLY(d->W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
   1282     XMM_ONLY(d->W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
   1283     d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
   1284     d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
   1285     XMM_ONLY(d->W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
   1286     XMM_ONLY(d->W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
   1287 }
   1288 
   1289 void glue(helper_phaddd, SUFFIX) (Reg *d, Reg *s)
   1290 {
   1291     d->L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
   1292     XMM_ONLY(d->L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
   1293     d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
   1294     XMM_ONLY(d->L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
   1295 }
   1296 
   1297 void glue(helper_phaddsw, SUFFIX) (Reg *d, Reg *s)
   1298 {
   1299     d->W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
   1300     d->W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
   1301     XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
   1302     XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
   1303     d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
   1304     d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
   1305     XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
   1306     XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
   1307 }
   1308 
   1309 void glue(helper_pmaddubsw, SUFFIX) (Reg *d, Reg *s)
   1310 {
   1311     d->W(0) = satsw((int8_t)s->B( 0) * (uint8_t)d->B( 0) +
   1312                     (int8_t)s->B( 1) * (uint8_t)d->B( 1));
   1313     d->W(1) = satsw((int8_t)s->B( 2) * (uint8_t)d->B( 2) +
   1314                     (int8_t)s->B( 3) * (uint8_t)d->B( 3));
   1315     d->W(2) = satsw((int8_t)s->B( 4) * (uint8_t)d->B( 4) +
   1316                     (int8_t)s->B( 5) * (uint8_t)d->B( 5));
   1317     d->W(3) = satsw((int8_t)s->B( 6) * (uint8_t)d->B( 6) +
   1318                     (int8_t)s->B( 7) * (uint8_t)d->B( 7));
   1319 #if SHIFT == 1
   1320     d->W(4) = satsw((int8_t)s->B( 8) * (uint8_t)d->B( 8) +
   1321                     (int8_t)s->B( 9) * (uint8_t)d->B( 9));
   1322     d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) +
   1323                     (int8_t)s->B(11) * (uint8_t)d->B(11));
   1324     d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) +
   1325                     (int8_t)s->B(13) * (uint8_t)d->B(13));
   1326     d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) +
   1327                     (int8_t)s->B(15) * (uint8_t)d->B(15));
   1328 #endif
   1329 }
   1330 
   1331 void glue(helper_phsubw, SUFFIX) (Reg *d, Reg *s)
   1332 {
   1333     d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1);
   1334     d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3);
   1335     XMM_ONLY(d->W(2) = (int16_t)d->W(4) - (int16_t)d->W(5));
   1336     XMM_ONLY(d->W(3) = (int16_t)d->W(6) - (int16_t)d->W(7));
   1337     d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1);
   1338     d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3);
   1339     XMM_ONLY(d->W(6) = (int16_t)s->W(4) - (int16_t)s->W(5));
   1340     XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7));
   1341 }
   1342 
   1343 void glue(helper_phsubd, SUFFIX) (Reg *d, Reg *s)
   1344 {
   1345     d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1);
   1346     XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3));
   1347     d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1);
   1348     XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3));
   1349 }
   1350 
   1351 void glue(helper_phsubsw, SUFFIX) (Reg *d, Reg *s)
   1352 {
   1353     d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1));
   1354     d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3));
   1355     XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
   1356     XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7)));
   1357     d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1));
   1358     d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3));
   1359     XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5)));
   1360     XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7)));
   1361 }
   1362 
   1363 #define FABSB(_, x) x > INT8_MAX  ? -(int8_t ) x : x
   1364 #define FABSW(_, x) x > INT16_MAX ? -(int16_t) x : x
   1365 #define FABSL(_, x) x > INT32_MAX ? -(int32_t) x : x
   1366 SSE_HELPER_B(helper_pabsb, FABSB)
   1367 SSE_HELPER_W(helper_pabsw, FABSW)
   1368 SSE_HELPER_L(helper_pabsd, FABSL)
   1369 
   1370 #define FMULHRSW(d, s) ((int16_t) d * (int16_t) s + 0x4000) >> 15
   1371 SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
   1372 
   1373 #define FSIGNB(d, s) s <= INT8_MAX  ? s ? d : 0 : -(int8_t ) d
   1374 #define FSIGNW(d, s) s <= INT16_MAX ? s ? d : 0 : -(int16_t) d
   1375 #define FSIGNL(d, s) s <= INT32_MAX ? s ? d : 0 : -(int32_t) d
   1376 SSE_HELPER_B(helper_psignb, FSIGNB)
   1377 SSE_HELPER_W(helper_psignw, FSIGNW)
   1378 SSE_HELPER_L(helper_psignd, FSIGNL)
   1379 
   1380 void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift)
   1381 {
   1382     Reg r;
   1383 
   1384     /* XXX could be checked during translation */
   1385     if (shift >= (16 << SHIFT)) {
   1386         r.Q(0) = 0;
   1387         XMM_ONLY(r.Q(1) = 0);
   1388     } else {
   1389         shift <<= 3;
   1390 #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
   1391 #if SHIFT == 0
   1392         r.Q(0) = SHR(s->Q(0), shift -   0) |
   1393                  SHR(d->Q(0), shift -  64);
   1394 #else
   1395         r.Q(0) = SHR(s->Q(0), shift -   0) |
   1396                  SHR(s->Q(1), shift -  64) |
   1397                  SHR(d->Q(0), shift - 128) |
   1398                  SHR(d->Q(1), shift - 192);
   1399         r.Q(1) = SHR(s->Q(0), shift +  64) |
   1400                  SHR(s->Q(1), shift -   0) |
   1401                  SHR(d->Q(0), shift -  64) |
   1402                  SHR(d->Q(1), shift - 128);
   1403 #endif
   1404 #undef SHR
   1405     }
   1406 
   1407     *d = r;
   1408 }
   1409 
   1410 #define XMM0 env->xmm_regs[0]
   1411 
   1412 #if SHIFT == 1
   1413 #define SSE_HELPER_V(name, elem, num, F)\
   1414 void glue(name, SUFFIX) (Reg *d, Reg *s)\
   1415 {\
   1416     d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0));\
   1417     d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1));\
   1418     if (num > 2) {\
   1419         d->elem(2) = F(d->elem(2), s->elem(2), XMM0.elem(2));\
   1420         d->elem(3) = F(d->elem(3), s->elem(3), XMM0.elem(3));\
   1421         if (num > 4) {\
   1422             d->elem(4) = F(d->elem(4), s->elem(4), XMM0.elem(4));\
   1423             d->elem(5) = F(d->elem(5), s->elem(5), XMM0.elem(5));\
   1424             d->elem(6) = F(d->elem(6), s->elem(6), XMM0.elem(6));\
   1425             d->elem(7) = F(d->elem(7), s->elem(7), XMM0.elem(7));\
   1426             if (num > 8) {\
   1427                 d->elem(8) = F(d->elem(8), s->elem(8), XMM0.elem(8));\
   1428                 d->elem(9) = F(d->elem(9), s->elem(9), XMM0.elem(9));\
   1429                 d->elem(10) = F(d->elem(10), s->elem(10), XMM0.elem(10));\
   1430                 d->elem(11) = F(d->elem(11), s->elem(11), XMM0.elem(11));\
   1431                 d->elem(12) = F(d->elem(12), s->elem(12), XMM0.elem(12));\
   1432                 d->elem(13) = F(d->elem(13), s->elem(13), XMM0.elem(13));\
   1433                 d->elem(14) = F(d->elem(14), s->elem(14), XMM0.elem(14));\
   1434                 d->elem(15) = F(d->elem(15), s->elem(15), XMM0.elem(15));\
   1435             }\
   1436         }\
   1437     }\
   1438 }
   1439 
   1440 #define SSE_HELPER_I(name, elem, num, F)\
   1441 void glue(name, SUFFIX) (Reg *d, Reg *s, uint32_t imm)\
   1442 {\
   1443     d->elem(0) = F(d->elem(0), s->elem(0), ((imm >> 0) & 1));\
   1444     d->elem(1) = F(d->elem(1), s->elem(1), ((imm >> 1) & 1));\
   1445     if (num > 2) {\
   1446         d->elem(2) = F(d->elem(2), s->elem(2), ((imm >> 2) & 1));\
   1447         d->elem(3) = F(d->elem(3), s->elem(3), ((imm >> 3) & 1));\
   1448         if (num > 4) {\
   1449             d->elem(4) = F(d->elem(4), s->elem(4), ((imm >> 4) & 1));\
   1450             d->elem(5) = F(d->elem(5), s->elem(5), ((imm >> 5) & 1));\
   1451             d->elem(6) = F(d->elem(6), s->elem(6), ((imm >> 6) & 1));\
   1452             d->elem(7) = F(d->elem(7), s->elem(7), ((imm >> 7) & 1));\
   1453             if (num > 8) {\
   1454                 d->elem(8) = F(d->elem(8), s->elem(8), ((imm >> 8) & 1));\
   1455                 d->elem(9) = F(d->elem(9), s->elem(9), ((imm >> 9) & 1));\
   1456                 d->elem(10) = F(d->elem(10), s->elem(10), ((imm >> 10) & 1));\
   1457                 d->elem(11) = F(d->elem(11), s->elem(11), ((imm >> 11) & 1));\
   1458                 d->elem(12) = F(d->elem(12), s->elem(12), ((imm >> 12) & 1));\
   1459                 d->elem(13) = F(d->elem(13), s->elem(13), ((imm >> 13) & 1));\
   1460                 d->elem(14) = F(d->elem(14), s->elem(14), ((imm >> 14) & 1));\
   1461                 d->elem(15) = F(d->elem(15), s->elem(15), ((imm >> 15) & 1));\
   1462             }\
   1463         }\
   1464     }\
   1465 }
   1466 
   1467 /* SSE4.1 op helpers */
   1468 #define FBLENDVB(d, s, m) (m & 0x80) ? s : d
   1469 #define FBLENDVPS(d, s, m) (m & 0x80000000) ? s : d
   1470 #define FBLENDVPD(d, s, m) (m & 0x8000000000000000LL) ? s : d
   1471 SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
   1472 SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
   1473 SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
   1474 
   1475 void glue(helper_ptest, SUFFIX) (Reg *d, Reg *s)
   1476 {
   1477     uint64_t zf = (s->Q(0) &  d->Q(0)) | (s->Q(1) &  d->Q(1));
   1478     uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1));
   1479 
   1480     CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
   1481 }
   1482 
   1483 #define SSE_HELPER_F(name, elem, num, F)\
   1484 void glue(name, SUFFIX) (Reg *d, Reg *s)\
   1485 {\
   1486     d->elem(0) = F(0);\
   1487     d->elem(1) = F(1);\
   1488     if (num > 2) {\
   1489         d->elem(2) = F(2);\
   1490         d->elem(3) = F(3);\
   1491         if (num > 4) {\
   1492             d->elem(4) = F(4);\
   1493             d->elem(5) = F(5);\
   1494             d->elem(6) = F(6);\
   1495             d->elem(7) = F(7);\
   1496         }\
   1497     }\
   1498 }
   1499 
   1500 SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B)
   1501 SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B)
   1502 SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B)
   1503 SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W)
   1504 SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W)
   1505 SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L)
   1506 SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B)
   1507 SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B)
   1508 SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B)
   1509 SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W)
   1510 SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W)
   1511 SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L)
   1512 
   1513 void glue(helper_pmuldq, SUFFIX) (Reg *d, Reg *s)
   1514 {
   1515     d->Q(0) = (int64_t) (int32_t) d->L(0) * (int32_t) s->L(0);
   1516     d->Q(1) = (int64_t) (int32_t) d->L(2) * (int32_t) s->L(2);
   1517 }
   1518 
   1519 #define FCMPEQQ(d, s) d == s ? -1 : 0
   1520 SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
   1521 
   1522 void glue(helper_packusdw, SUFFIX) (Reg *d, Reg *s)
   1523 {
   1524     d->W(0) = satuw((int32_t) d->L(0));
   1525     d->W(1) = satuw((int32_t) d->L(1));
   1526     d->W(2) = satuw((int32_t) d->L(2));
   1527     d->W(3) = satuw((int32_t) d->L(3));
   1528     d->W(4) = satuw((int32_t) s->L(0));
   1529     d->W(5) = satuw((int32_t) s->L(1));
   1530     d->W(6) = satuw((int32_t) s->L(2));
   1531     d->W(7) = satuw((int32_t) s->L(3));
   1532 }
   1533 
   1534 #define FMINSB(d, s) MIN((int8_t) d, (int8_t) s)
   1535 #define FMINSD(d, s) MIN((int32_t) d, (int32_t) s)
   1536 #define FMAXSB(d, s) MAX((int8_t) d, (int8_t) s)
   1537 #define FMAXSD(d, s) MAX((int32_t) d, (int32_t) s)
   1538 SSE_HELPER_B(helper_pminsb, FMINSB)
   1539 SSE_HELPER_L(helper_pminsd, FMINSD)
   1540 SSE_HELPER_W(helper_pminuw, MIN)
   1541 SSE_HELPER_L(helper_pminud, MIN)
   1542 SSE_HELPER_B(helper_pmaxsb, FMAXSB)
   1543 SSE_HELPER_L(helper_pmaxsd, FMAXSD)
   1544 SSE_HELPER_W(helper_pmaxuw, MAX)
   1545 SSE_HELPER_L(helper_pmaxud, MAX)
   1546 
   1547 #define FMULLD(d, s) (int32_t) d * (int32_t) s
   1548 SSE_HELPER_L(helper_pmulld, FMULLD)
   1549 
   1550 void glue(helper_phminposuw, SUFFIX) (Reg *d, Reg *s)
   1551 {
   1552     int idx = 0;
   1553 
   1554     if (s->W(1) < s->W(idx))
   1555         idx = 1;
   1556     if (s->W(2) < s->W(idx))
   1557         idx = 2;
   1558     if (s->W(3) < s->W(idx))
   1559         idx = 3;
   1560     if (s->W(4) < s->W(idx))
   1561         idx = 4;
   1562     if (s->W(5) < s->W(idx))
   1563         idx = 5;
   1564     if (s->W(6) < s->W(idx))
   1565         idx = 6;
   1566     if (s->W(7) < s->W(idx))
   1567         idx = 7;
   1568 
   1569     d->Q(1) = 0;
   1570     d->L(1) = 0;
   1571     d->W(1) = idx;
   1572     d->W(0) = s->W(idx);
   1573 }
   1574 
   1575 void glue(helper_roundps, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
   1576 {
   1577     signed char prev_rounding_mode;
   1578 
   1579     prev_rounding_mode = env->sse_status.float_rounding_mode;
   1580     if (!(mode & (1 << 2)))
   1581         switch (mode & 3) {
   1582         case 0:
   1583             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
   1584             break;
   1585         case 1:
   1586             set_float_rounding_mode(float_round_down, &env->sse_status);
   1587             break;
   1588         case 2:
   1589             set_float_rounding_mode(float_round_up, &env->sse_status);
   1590             break;
   1591         case 3:
   1592             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
   1593             break;
   1594         }
   1595 
   1596     d->L(0) = float64_round_to_int(s->L(0), &env->sse_status);
   1597     d->L(1) = float64_round_to_int(s->L(1), &env->sse_status);
   1598     d->L(2) = float64_round_to_int(s->L(2), &env->sse_status);
   1599     d->L(3) = float64_round_to_int(s->L(3), &env->sse_status);
   1600 
   1601 #if 0 /* TODO */
   1602     if (mode & (1 << 3))
   1603         set_float_exception_flags(
   1604                         get_float_exception_flags(&env->sse_status) &
   1605                         ~float_flag_inexact,
   1606                         &env->sse_status);
   1607 #endif
   1608     env->sse_status.float_rounding_mode = prev_rounding_mode;
   1609 }
   1610 
   1611 void glue(helper_roundpd, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
   1612 {
   1613     signed char prev_rounding_mode;
   1614 
   1615     prev_rounding_mode = env->sse_status.float_rounding_mode;
   1616     if (!(mode & (1 << 2)))
   1617         switch (mode & 3) {
   1618         case 0:
   1619             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
   1620             break;
   1621         case 1:
   1622             set_float_rounding_mode(float_round_down, &env->sse_status);
   1623             break;
   1624         case 2:
   1625             set_float_rounding_mode(float_round_up, &env->sse_status);
   1626             break;
   1627         case 3:
   1628             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
   1629             break;
   1630         }
   1631 
   1632     d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status);
   1633     d->Q(1) = float64_round_to_int(s->Q(1), &env->sse_status);
   1634 
   1635 #if 0 /* TODO */
   1636     if (mode & (1 << 3))
   1637         set_float_exception_flags(
   1638                         get_float_exception_flags(&env->sse_status) &
   1639                         ~float_flag_inexact,
   1640                         &env->sse_status);
   1641 #endif
   1642     env->sse_status.float_rounding_mode = prev_rounding_mode;
   1643 }
   1644 
   1645 void glue(helper_roundss, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
   1646 {
   1647     signed char prev_rounding_mode;
   1648 
   1649     prev_rounding_mode = env->sse_status.float_rounding_mode;
   1650     if (!(mode & (1 << 2)))
   1651         switch (mode & 3) {
   1652         case 0:
   1653             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
   1654             break;
   1655         case 1:
   1656             set_float_rounding_mode(float_round_down, &env->sse_status);
   1657             break;
   1658         case 2:
   1659             set_float_rounding_mode(float_round_up, &env->sse_status);
   1660             break;
   1661         case 3:
   1662             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
   1663             break;
   1664         }
   1665 
   1666     d->L(0) = float64_round_to_int(s->L(0), &env->sse_status);
   1667 
   1668 #if 0 /* TODO */
   1669     if (mode & (1 << 3))
   1670         set_float_exception_flags(
   1671                         get_float_exception_flags(&env->sse_status) &
   1672                         ~float_flag_inexact,
   1673                         &env->sse_status);
   1674 #endif
   1675     env->sse_status.float_rounding_mode = prev_rounding_mode;
   1676 }
   1677 
   1678 void glue(helper_roundsd, SUFFIX) (Reg *d, Reg *s, uint32_t mode)
   1679 {
   1680     signed char prev_rounding_mode;
   1681 
   1682     prev_rounding_mode = env->sse_status.float_rounding_mode;
   1683     if (!(mode & (1 << 2)))
   1684         switch (mode & 3) {
   1685         case 0:
   1686             set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
   1687             break;
   1688         case 1:
   1689             set_float_rounding_mode(float_round_down, &env->sse_status);
   1690             break;
   1691         case 2:
   1692             set_float_rounding_mode(float_round_up, &env->sse_status);
   1693             break;
   1694         case 3:
   1695             set_float_rounding_mode(float_round_to_zero, &env->sse_status);
   1696             break;
   1697         }
   1698 
   1699     d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status);
   1700 
   1701 #if 0 /* TODO */
   1702     if (mode & (1 << 3))
   1703         set_float_exception_flags(
   1704                         get_float_exception_flags(&env->sse_status) &
   1705                         ~float_flag_inexact,
   1706                         &env->sse_status);
   1707 #endif
   1708     env->sse_status.float_rounding_mode = prev_rounding_mode;
   1709 }
   1710 
   1711 #define FBLENDP(d, s, m) m ? s : d
   1712 SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
   1713 SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
   1714 SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
   1715 
   1716 void glue(helper_dpps, SUFFIX) (Reg *d, Reg *s, uint32_t mask)
   1717 {
   1718     float32 iresult = 0 /*float32_zero*/;
   1719 
   1720     if (mask & (1 << 4))
   1721         iresult = float32_add(iresult,
   1722                         float32_mul(d->L(0), s->L(0), &env->sse_status),
   1723                         &env->sse_status);
   1724     if (mask & (1 << 5))
   1725         iresult = float32_add(iresult,
   1726                         float32_mul(d->L(1), s->L(1), &env->sse_status),
   1727                         &env->sse_status);
   1728     if (mask & (1 << 6))
   1729         iresult = float32_add(iresult,
   1730                         float32_mul(d->L(2), s->L(2), &env->sse_status),
   1731                         &env->sse_status);
   1732     if (mask & (1 << 7))
   1733         iresult = float32_add(iresult,
   1734                         float32_mul(d->L(3), s->L(3), &env->sse_status),
   1735                         &env->sse_status);
   1736     d->L(0) = (mask & (1 << 0)) ? iresult : 0 /*float32_zero*/;
   1737     d->L(1) = (mask & (1 << 1)) ? iresult : 0 /*float32_zero*/;
   1738     d->L(2) = (mask & (1 << 2)) ? iresult : 0 /*float32_zero*/;
   1739     d->L(3) = (mask & (1 << 3)) ? iresult : 0 /*float32_zero*/;
   1740 }
   1741 
   1742 void glue(helper_dppd, SUFFIX) (Reg *d, Reg *s, uint32_t mask)
   1743 {
   1744     float64 iresult = 0 /*float64_zero*/;
   1745 
   1746     if (mask & (1 << 4))
   1747         iresult = float64_add(iresult,
   1748                         float64_mul(d->Q(0), s->Q(0), &env->sse_status),
   1749                         &env->sse_status);
   1750     if (mask & (1 << 5))
   1751         iresult = float64_add(iresult,
   1752                         float64_mul(d->Q(1), s->Q(1), &env->sse_status),
   1753                         &env->sse_status);
   1754     d->Q(0) = (mask & (1 << 0)) ? iresult : 0 /*float64_zero*/;
   1755     d->Q(1) = (mask & (1 << 1)) ? iresult : 0 /*float64_zero*/;
   1756 }
   1757 
   1758 void glue(helper_mpsadbw, SUFFIX) (Reg *d, Reg *s, uint32_t offset)
   1759 {
   1760     int s0 = (offset & 3) << 2;
   1761     int d0 = (offset & 4) << 0;
   1762     int i;
   1763     Reg r;
   1764 
   1765     for (i = 0; i < 8; i++, d0++) {
   1766         r.W(i) = 0;
   1767         r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0));
   1768         r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1));
   1769         r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2));
   1770         r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3));
   1771     }
   1772 
   1773     *d = r;
   1774 }
   1775 
   1776 /* SSE4.2 op helpers */
   1777 /* it's unclear whether signed or unsigned */
   1778 #define FCMPGTQ(d, s) d > s ? -1 : 0
   1779 SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
   1780 
   1781 static inline int pcmp_elen(int reg, uint32_t ctrl)
   1782 {
   1783     int val;
   1784 
   1785     /* Presence of REX.W is indicated by a bit higher than 7 set */
   1786     if (ctrl >> 8)
   1787         val = abs1((int64_t) env->regs[reg]);
   1788     else
   1789         val = abs1((int32_t) env->regs[reg]);
   1790 
   1791     if (ctrl & 1) {
   1792         if (val > 8)
   1793             return 8;
   1794     } else
   1795         if (val > 16)
   1796             return 16;
   1797 
   1798     return val;
   1799 }
   1800 
   1801 static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
   1802 {
   1803     int val = 0;
   1804 
   1805     if (ctrl & 1) {
   1806         while (val < 8 && r->W(val))
   1807             val++;
   1808     } else
   1809         while (val < 16 && r->B(val))
   1810             val++;
   1811 
   1812     return val;
   1813 }
   1814 
   1815 static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
   1816 {
   1817     switch ((ctrl >> 0) & 3) {
   1818     case 0:
   1819         return r->B(i);
   1820     case 1:
   1821         return r->W(i);
   1822     case 2:
   1823         return (int8_t) r->B(i);
   1824     case 3:
   1825     default:
   1826         return (int16_t) r->W(i);
   1827     }
   1828 }
   1829 
   1830 static inline unsigned pcmpxstrx(Reg *d, Reg *s,
   1831                 int8_t ctrl, int valids, int validd)
   1832 {
   1833     unsigned int res = 0;
   1834     int v;
   1835     int j, i;
   1836     int upper = (ctrl & 1) ? 7 : 15;
   1837 
   1838     valids--;
   1839     validd--;
   1840 
   1841     CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
   1842 
   1843     switch ((ctrl >> 2) & 3) {
   1844     case 0:
   1845         for (j = valids; j >= 0; j--) {
   1846             res <<= 1;
   1847             v = pcmp_val(s, ctrl, j);
   1848             for (i = validd; i >= 0; i--)
   1849                 res |= (v == pcmp_val(d, ctrl, i));
   1850         }
   1851         break;
   1852     case 1:
   1853         for (j = valids; j >= 0; j--) {
   1854             res <<= 1;
   1855             v = pcmp_val(s, ctrl, j);
   1856             for (i = ((validd - 1) | 1); i >= 0; i -= 2)
   1857                 res |= (pcmp_val(d, ctrl, i - 0) <= v &&
   1858                         pcmp_val(d, ctrl, i - 1) >= v);
   1859         }
   1860         break;
   1861     case 2:
   1862         res = (2 << (upper - MAX(valids, validd))) - 1;
   1863         res <<= MAX(valids, validd) - MIN(valids, validd);
   1864         for (i = MIN(valids, validd); i >= 0; i--) {
   1865             res <<= 1;
   1866             v = pcmp_val(s, ctrl, i);
   1867             res |= (v == pcmp_val(d, ctrl, i));
   1868         }
   1869         break;
   1870     case 3:
   1871         for (j = valids - validd; j >= 0; j--) {
   1872             res <<= 1;
   1873             res |= 1;
   1874             for (i = MIN(upper - j, validd); i >= 0; i--)
   1875                 res &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
   1876         }
   1877         break;
   1878     }
   1879 
   1880     switch ((ctrl >> 4) & 3) {
   1881     case 1:
   1882         res ^= (2 << upper) - 1;
   1883         break;
   1884     case 3:
   1885         res ^= (2 << valids) - 1;
   1886         break;
   1887     }
   1888 
   1889     if (res)
   1890        CC_SRC |= CC_C;
   1891     if (res & 1)
   1892        CC_SRC |= CC_O;
   1893 
   1894     return res;
   1895 }
   1896 
   1897 static inline int rffs1(unsigned int val)
   1898 {
   1899     int ret = 1, hi;
   1900 
   1901     for (hi = sizeof(val) * 4; hi; hi /= 2)
   1902         if (val >> hi) {
   1903             val >>= hi;
   1904             ret += hi;
   1905         }
   1906 
   1907     return ret;
   1908 }
   1909 
   1910 static inline int ffs1(unsigned int val)
   1911 {
   1912     int ret = 1, hi;
   1913 
   1914     for (hi = sizeof(val) * 4; hi; hi /= 2)
   1915         if (val << hi) {
   1916             val <<= hi;
   1917             ret += hi;
   1918         }
   1919 
   1920     return ret;
   1921 }
   1922 
   1923 void glue(helper_pcmpestri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
   1924 {
   1925     unsigned int res = pcmpxstrx(d, s, ctrl,
   1926                     pcmp_elen(R_EDX, ctrl),
   1927                     pcmp_elen(R_EAX, ctrl));
   1928 
   1929     if (res)
   1930         env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1;
   1931     else
   1932         env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
   1933 }
   1934 
   1935 void glue(helper_pcmpestrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
   1936 {
   1937     int i;
   1938     unsigned int res = pcmpxstrx(d, s, ctrl,
   1939                     pcmp_elen(R_EDX, ctrl),
   1940                     pcmp_elen(R_EAX, ctrl));
   1941 
   1942     if ((ctrl >> 6) & 1) {
   1943         if (ctrl & 1)
   1944             for (i = 0; i <= 8; i--, res >>= 1)
   1945                 d->W(i) = (res & 1) ? ~0 : 0;
   1946         else
   1947             for (i = 0; i <= 16; i--, res >>= 1)
   1948                 d->B(i) = (res & 1) ? ~0 : 0;
   1949     } else {
   1950         d->Q(1) = 0;
   1951         d->Q(0) = res;
   1952     }
   1953 }
   1954 
   1955 void glue(helper_pcmpistri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
   1956 {
   1957     unsigned int res = pcmpxstrx(d, s, ctrl,
   1958                     pcmp_ilen(s, ctrl),
   1959                     pcmp_ilen(d, ctrl));
   1960 
   1961     if (res)
   1962         env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1;
   1963     else
   1964         env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
   1965 }
   1966 
   1967 void glue(helper_pcmpistrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl)
   1968 {
   1969     int i;
   1970     unsigned int res = pcmpxstrx(d, s, ctrl,
   1971                     pcmp_ilen(s, ctrl),
   1972                     pcmp_ilen(d, ctrl));
   1973 
   1974     if ((ctrl >> 6) & 1) {
   1975         if (ctrl & 1)
   1976             for (i = 0; i <= 8; i--, res >>= 1)
   1977                 d->W(i) = (res & 1) ? ~0 : 0;
   1978         else
   1979             for (i = 0; i <= 16; i--, res >>= 1)
   1980                 d->B(i) = (res & 1) ? ~0 : 0;
   1981     } else {
   1982         d->Q(1) = 0;
   1983         d->Q(0) = res;
   1984     }
   1985 }
   1986 
   1987 #define CRCPOLY        0x1edc6f41
   1988 #define CRCPOLY_BITREV 0x82f63b78
   1989 target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
   1990 {
   1991     target_ulong crc = (msg & ((target_ulong) -1 >>
   1992                             (TARGET_LONG_BITS - len))) ^ crc1;
   1993 
   1994     while (len--)
   1995         crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
   1996 
   1997     return crc;
   1998 }
   1999 
   2000 #define POPMASK(i)     ((target_ulong) -1 / ((1LL << (1 << i)) + 1))
   2001 #define POPCOUNT(n, i) (n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i))
   2002 target_ulong helper_popcnt(target_ulong n, uint32_t type)
   2003 {
   2004     CC_SRC = n ? 0 : CC_Z;
   2005 
   2006     n = POPCOUNT(n, 0);
   2007     n = POPCOUNT(n, 1);
   2008     n = POPCOUNT(n, 2);
   2009     n = POPCOUNT(n, 3);
   2010     if (type == 1)
   2011         return n & 0xff;
   2012 
   2013     n = POPCOUNT(n, 4);
   2014 #ifndef TARGET_X86_64
   2015     return n;
   2016 #else
   2017     if (type == 2)
   2018         return n & 0xff;
   2019 
   2020     return POPCOUNT(n, 5);
   2021 #endif
   2022 }
   2023 #endif
   2024 
   2025 #undef SHIFT
   2026 #undef XMM_ONLY
   2027 #undef Reg
   2028 #undef B
   2029 #undef W
   2030 #undef L
   2031 #undef Q
   2032 #undef SUFFIX
   2033