
/*---------------------------------------------------------------*/
/*--- begin                                   host_x86_isel.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2010 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "ir_match.h"
#include "main_util.h"
#include "main_globals.h"
#include "host_generic_regs.h"
#include "host_generic_simd64.h"
#include "host_x86_defs.h"

/* TODO 21 Apr 2005:

   -- (Really an assembler issue) don't emit CMov32 as a cmov
      insn, since that's expensive on P4 and conditional branch
      is cheaper if (as we expect) the condition is highly predictable

   -- preserve xmm registers across function calls (by declaring them
      as trashed by call insns)

   -- preserve x87 ST stack discipline across function calls.  Sigh.

   -- Check doHelperCall: if a call is conditional, we cannot safely
      compute any regparm args directly to registers.  Hence, the
      fast-regparm marshalling should be restricted to unconditional
      calls only.
*/

/*---------------------------------------------------------*/
/*--- x87 control word stuff                            ---*/
/*---------------------------------------------------------*/

/* Vex-generated code expects to run with the FPU set as follows: all
   exceptions masked, round-to-nearest, precision = 53 bits.  This
   corresponds to a FPU control word value of 0x027F.

   Similarly the SSE control word (%mxcsr) should be 0x1F80.

   %fpucw and %mxcsr should have these values on entry to
   Vex-generated code, and those values should be
   unchanged at exit.
*/

#define DEFAULT_FPUCW 0x027F

/* debugging only, do not use */
/* define DEFAULT_FPUCW 0x037F */
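
/* For reference (assuming the standard x87/SSE layouts): 0x027F sets
   CW bits 5:0, masking all six FP exceptions; PC (bits 9:8) = 10b,
   giving 53-bit precision; RC (bits 11:10) = 00b, round to nearest.
   Likewise 0x1F80 sets %mxcsr bits 12:7 (all exception masks), with
   RC (bits 14:13) = 00b and FZ/DAZ clear. */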


/*---------------------------------------------------------*/
/*--- misc helpers                                      ---*/
/*---------------------------------------------------------*/

/* These are duplicated in guest-x86/toIR.c */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* bind ( Int binder )
{
   return IRExpr_Binder(binder);
}

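/* These build IR pattern trees for matchIRExpr (see ir_match.h);
   bind(n) is a pattern wildcard whose matched subexpression is
   returned in MatchInfo.bindee[n].  E.g. the pattern
   unop(Iop_1Uto8,unop(Iop_32to1,bind(0))) used later in this file
   matches 1Uto8(32to1(e)) for any expression e. */
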
static Bool isZeroU8 ( IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U8
          && e->Iex.Const.con->Ico.U8 == 0;
}

static Bool isZeroU32 ( IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U32
          && e->Iex.Const.con->Ico.U32 == 0;
}

static Bool isZeroU64 ( IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U64
          && e->Iex.Const.con->Ico.U64 == 0ULL;
}


/*---------------------------------------------------------*/
/*--- ISelEnv                                           ---*/
/*---------------------------------------------------------*/

/* This carries around:

   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
     might encounter.  This is computed before insn selection starts,
     and does not change.

   - A mapping from IRTemp to HReg.  This tells the insn selector
     which virtual register(s) are associated with each IRTemp
     temporary.  This is computed before insn selection starts, and
     does not change.  We expect this mapping to map precisely the
     same set of IRTemps as the type mapping does.

        - vregmap   holds the primary register for the IRTemp.
        - vregmapHI is only used for 64-bit integer-typed
             IRTemps.  It holds the identity of a second
             32-bit virtual HReg, which holds the high half
             of the value.

   - The code array, that is, the insns selected so far.

   - A counter, for generating new virtual registers.

   - The host subarchitecture we are selecting insns for.
     This is set at the start and does not change.

   Note, this is all host-independent.  */

typedef
   struct {
      IRTypeEnv*   type_env;

      HReg*        vregmap;
      HReg*        vregmapHI;
      Int          n_vregmap;

      HInstrArray* code;

      Int          vreg_ctr;

      UInt         hwcaps;
   }
   ISelEnv;


static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
{
   vassert(tmp >= 0);
   vassert(tmp < env->n_vregmap);
   return env->vregmap[tmp];
}

static void lookupIRTemp64 ( HReg* vrHI, HReg* vrLO, ISelEnv* env, IRTemp tmp )
{
   vassert(tmp >= 0);
   vassert(tmp < env->n_vregmap);
   vassert(env->vregmapHI[tmp] != INVALID_HREG);
   *vrLO = env->vregmap[tmp];
   *vrHI = env->vregmapHI[tmp];
}

static void addInstr ( ISelEnv* env, X86Instr* instr )
{
   addHInstr(env->code, instr);
   if (vex_traceflags & VEX_TRACE_VCODE) {
      ppX86Instr(instr, False);
      vex_printf("\n");
   }
}

static HReg newVRegI ( ISelEnv* env )
{
   HReg reg = mkHReg(env->vreg_ctr, HRcInt32, True/*virtual reg*/);
   env->vreg_ctr++;
   return reg;
}

static HReg newVRegF ( ISelEnv* env )
{
   HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/);
   env->vreg_ctr++;
   return reg;
}

static HReg newVRegV ( ISelEnv* env )
{
   HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
   env->vreg_ctr++;
   return reg;
}


/*---------------------------------------------------------*/
/*--- ISEL: Forward declarations                        ---*/
/*---------------------------------------------------------*/

/* These are organised as iselXXX and iselXXX_wrk pairs.  The
   iselXXX_wrk do the real work, but are not to be called directly.
   For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
   checks that all returned registers are virtual.  You should not
   call the _wrk version directly.
*/
static X86RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
static X86RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );

static X86RI*      iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e );
static X86RI*      iselIntExpr_RI     ( ISelEnv* env, IRExpr* e );

static X86RM*      iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e );
static X86RM*      iselIntExpr_RM     ( ISelEnv* env, IRExpr* e );

static HReg        iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselIntExpr_R     ( ISelEnv* env, IRExpr* e );

static X86AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
static X86AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );

static void        iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
                                       ISelEnv* env, IRExpr* e );
static void        iselInt64Expr     ( HReg* rHi, HReg* rLo,
                                       ISelEnv* env, IRExpr* e );

static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e );
static X86CondCode iselCondCode     ( ISelEnv* env, IRExpr* e );

static HReg        iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselDblExpr     ( ISelEnv* env, IRExpr* e );

static HReg        iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselFltExpr     ( ISelEnv* env, IRExpr* e );

static HReg        iselVecExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselVecExpr     ( ISelEnv* env, IRExpr* e );


/*---------------------------------------------------------*/
/*--- ISEL: Misc helpers                                ---*/
/*---------------------------------------------------------*/

/* Make an int reg-reg move. */

static X86Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcInt32);
   vassert(hregClass(dst) == HRcInt32);
   return X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst);
}


/* Make a vector reg-reg move. */

static X86Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcVec128);
   vassert(hregClass(dst) == HRcVec128);
   return X86Instr_SseReRg(Xsse_MOV, src, dst);
}

/* Advance/retreat %esp by n. */

static void add_to_esp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%4) == 0);
   addInstr(env,
            X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(n), hregX86_ESP()));
}

static void sub_from_esp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%4) == 0);
   addInstr(env,
            X86Instr_Alu32R(Xalu_SUB, X86RMI_Imm(n), hregX86_ESP()));
}


/* Given an amode, return one which references 4 bytes further
   along. */

static X86AMode* advance4 ( X86AMode* am )
{
   X86AMode* am4 = dopyX86AMode(am);
   switch (am4->tag) {
      case Xam_IRRS:
         am4->Xam.IRRS.imm += 4; break;
      case Xam_IR:
         am4->Xam.IR.imm += 4; break;
      default:
         vpanic("advance4(x86,host)");
   }
   return am4;
}

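/* E.g. applied to the amode 0(%esp) this yields 4(%esp); handy when
   addressing the two 32-bit halves of a 64-bit value in memory. */
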

/* Push an arg onto the host stack, in preparation for a call to a
   helper function of some kind.  Returns the number of 32-bit words
   pushed. */

static Int pushArg ( ISelEnv* env, IRExpr* arg )
{
   IRType arg_ty = typeOfIRExpr(env->type_env, arg);
   if (arg_ty == Ity_I32) {
      addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg)));
      return 1;
   } else
   if (arg_ty == Ity_I64) {
      HReg rHi, rLo;
      iselInt64Expr(&rHi, &rLo, env, arg);
      addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
      addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
      return 2;
   }
   ppIRExpr(arg);
   vpanic("pushArg(x86): can't handle arg of this type");
}

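/* Note the I64 case pushes rHi first, then rLo, so the low word ends
   up at the lower address.  That gives the little-endian layout a
   32-bit x86 C calling convention expects for a 64-bit stack arg. */
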

/* Complete the call to a helper function, by calling the
   helper and clearing the args off the stack. */

static
void callHelperAndClearArgs ( ISelEnv* env, X86CondCode cc,
                              IRCallee* cee, Int n_arg_ws )
{
   /* Complication.  Need to decide which reg to use as the fn address
      pointer, in a way that doesn't trash regparm-passed
      parameters. */
   vassert(sizeof(void*) == 4);

   addInstr(env, X86Instr_Call( cc, toUInt(Ptr_to_ULong(cee->addr)),
                                    cee->regparms));
   if (n_arg_ws > 0)
      add_to_esp(env, 4*n_arg_ws);
}


/* Used only in doHelperCall.  See big comment in doHelperCall re
   handling of regparm args.  This function figures out whether
   evaluation of an expression might require use of a fixed register.
   If in doubt return True (safe but suboptimal).
*/
static
Bool mightRequireFixedRegs ( IRExpr* e )
{
   switch (e->tag) {
      case Iex_RdTmp: case Iex_Const: case Iex_Get:
         return False;
      default:
         return True;
   }
}


/* Do a complete function call.  guard is a Ity_Bit expression
   indicating whether or not the call happens.  If guard==NULL, the
   call is unconditional. */

static
void doHelperCall ( ISelEnv* env,
                    Bool passBBP,
                    IRExpr* guard, IRCallee* cee, IRExpr** args )
{
   X86CondCode cc;
   HReg        argregs[3];
   HReg        tmpregs[3];
   Bool        danger;
   Int         not_done_yet, n_args, n_arg_ws, stack_limit,
               i, argreg, argregX;

   /* Marshal args for a call, do the call, and clear the stack.
      Complexities to consider:

      * if passBBP is True, %ebp (the baseblock pointer) is to be
        passed as the first arg.

      * If the callee claims regparmness of 1, 2 or 3, we must pass the
        first 1, 2 or 3 args in registers (EAX, EDX, and ECX
        respectively).  To keep things relatively simple, only args of
        type I32 may be passed as regparms -- just bomb out if anything
        else turns up.  Clearly this depends on the front ends not
        trying to pass any other types as regparms.
   */

   /* 16 Nov 2004: the regparm handling is complicated by the
      following problem.

      Consider a call to a function with two regparm parameters:
      f(e1,e2).  We need to compute e1 into %eax and e2 into %edx.
      Suppose code is first generated to compute e1 into %eax.  Then,
      code is generated to compute e2 into %edx.  Unfortunately, if
      the latter code sequence uses %eax, it will trash the value of
      e1 computed by the former sequence.  This could happen if (for
      example) e2 itself involved a function call.  In the code below,
      args are evaluated right-to-left, not left-to-right, but the
      principle and the problem are the same.

      One solution is to compute all regparm-bound args into vregs
      first, and once they are all done, move them to the relevant
      real regs.  This always gives correct code, but it also gives
      a bunch of vreg-to-rreg moves which are usually redundant but
      are hard for the register allocator to get rid of.

      A compromise is to first examine all regparm'd argument
      expressions.  If they are all so simple that it is clear
      they will be evaluated without use of any fixed registers,
      use the old compute-directly-to-fixed-target scheme.  If not,
      be safe and use the via-vregs scheme.

      Note this requires being able to examine an expression and
      determine whether or not evaluation of it might use a fixed
      register.  That requires knowledge of how the rest of this
      insn selector works.  Currently just the following 3 are
      regarded as safe -- hopefully they cover the majority of
      arguments in practice: IRExpr_RdTmp IRExpr_Const IRExpr_Get.
   */
   vassert(cee->regparms >= 0 && cee->regparms <= 3);

   n_args = n_arg_ws = 0;
   while (args[n_args]) n_args++;

   not_done_yet = n_args;
   if (passBBP)
      not_done_yet++;

   stack_limit = cee->regparms;
   if (cee->regparms > 0 && passBBP) stack_limit--;
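
   /* Worked example (hypothetical numbers): n_args == 4,
      regparms == 2, passBBP == False gives stack_limit == 2, so
      args[3] and args[2] go on the stack (pushed in that order) and
      args[1] and args[0] go to %edx and %eax respectively. */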

   /* ------ BEGIN marshall all arguments ------ */

   /* Push (R to L) the stack-passed args, [n_args-1 .. stack_limit] */
   for (i = n_args-1; i >= stack_limit; i--) {
      n_arg_ws += pushArg(env, args[i]);
      not_done_yet--;
   }

   /* args [stack_limit-1 .. 0] and possibly %ebp are to be passed in
      registers. */

   if (cee->regparms > 0) {

      /* ------ BEGIN deal with regparms ------ */

      /* deal with regparms, not forgetting %ebp if needed. */
      argregs[0] = hregX86_EAX();
      argregs[1] = hregX86_EDX();
      argregs[2] = hregX86_ECX();
      tmpregs[0] = tmpregs[1] = tmpregs[2] = INVALID_HREG;

      argreg = cee->regparms;

      /* In keeping with big comment above, detect potential danger
         and use the via-vregs scheme if needed. */
      danger = False;
      for (i = stack_limit-1; i >= 0; i--) {
         if (mightRequireFixedRegs(args[i])) {
            danger = True;
            break;
         }
      }

      if (danger) {

         /* Move via temporaries */
         argregX = argreg;
         for (i = stack_limit-1; i >= 0; i--) {

            if (0) {
               vex_printf("x86 host: register param is complex: ");
               ppIRExpr(args[i]);
               vex_printf("\n");
            }

            argreg--;
            vassert(argreg >= 0);
            vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32);
            tmpregs[argreg] = iselIntExpr_R(env, args[i]);
            not_done_yet--;
         }
         for (i = stack_limit-1; i >= 0; i--) {
            argregX--;
            vassert(argregX >= 0);
            addInstr( env, mk_iMOVsd_RR( tmpregs[argregX], argregs[argregX] ) );
         }

      } else {
         /* It's safe to compute all regparm args directly into their
            target registers. */
         for (i = stack_limit-1; i >= 0; i--) {
            argreg--;
            vassert(argreg >= 0);
            vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32);
            addInstr(env, X86Instr_Alu32R(Xalu_MOV,
                                          iselIntExpr_RMI(env, args[i]),
                                          argregs[argreg]));
            not_done_yet--;
         }

      }

      /* Not forgetting %ebp if needed. */
      if (passBBP) {
         vassert(argreg == 1);
         addInstr(env, mk_iMOVsd_RR( hregX86_EBP(), argregs[0]));
         not_done_yet--;
      }

      /* ------ END deal with regparms ------ */

   } else {

      /* No regparms.  Heave %ebp on the stack if needed. */
      if (passBBP) {
         addInstr(env, X86Instr_Push(X86RMI_Reg(hregX86_EBP())));
         n_arg_ws++;
         not_done_yet--;
      }

   }

   vassert(not_done_yet == 0);

   /* ------ END marshall all arguments ------ */

   /* Now we can compute the condition.  We can't do it earlier
      because the argument computations could trash the condition
      codes.  Be a bit clever to handle the common case where the
      guard is 1:Bit. */
   cc = Xcc_ALWAYS;
   if (guard) {
      if (guard->tag == Iex_Const
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional -- do nothing */
      } else {
         cc = iselCondCode( env, guard );
      }
   }

   /* call the helper, and get the args off the stack afterwards. */
   callHelperAndClearArgs( env, cc, cee, n_arg_ws );
}


/* Given a guest-state array descriptor, an index expression and a
   bias, generate an X86AMode holding the relevant guest state
   offset. */

static
X86AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
                                IRExpr* off, Int bias )
{
   HReg tmp, roff;
   Int  elemSz = sizeofIRType(descr->elemTy);
   Int  nElems = descr->nElems;
   Int  shift  = 0;

   /* throw out any cases not generated by an x86 front end.  In
      theory there might be a day where we need to handle them -- if
      we ever run non-x86-guest on x86 host. */

   if (nElems != 8)
      vpanic("genGuestArrayOffset(x86 host)(1)");

   switch (elemSz) {
      case 1:  shift = 0; break;
      case 4:  shift = 2; break;
      case 8:  shift = 3; break;
      default: vpanic("genGuestArrayOffset(x86 host)(2)");
   }

   /* Compute off into a reg, %off.  Then return:

         movl %off, %tmp
         addl $bias, %tmp  (if bias != 0)
         andl $7, %tmp
         ... base(%ebp, %tmp, shift) ...
   */
   tmp  = newVRegI(env);
   roff = iselIntExpr_R(env, off);
   addInstr(env, mk_iMOVsd_RR(roff, tmp));
   if (bias != 0) {
      addInstr(env,
               X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(bias), tmp));
   }
   addInstr(env,
            X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(7), tmp));
   return
      X86AMode_IRRS( descr->base, hregX86_EBP(), tmp, shift );
}

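/* Example (assuming an 8-entry F64 guest array, as for the x86
   guest's FP register file): elemSz == 8 gives shift == 3, so an
   access with index expression ix and bias b becomes the amode
   base(%ebp, %tmp, 8), where %tmp holds (ix + b) & 7. */
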

/* Mess with the FPU's rounding mode: set to the default rounding mode
   (DEFAULT_FPUCW). */
static
void set_FPU_rounding_default ( ISelEnv* env )
{
   /* pushl $DEFAULT_FPUCW
      fldcw 0(%esp)
      addl $4, %esp
   */
   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   addInstr(env, X86Instr_Push(X86RMI_Imm(DEFAULT_FPUCW)));
   addInstr(env, X86Instr_FpLdCW(zero_esp));
   add_to_esp(env, 4);
}


/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
   expression denoting a value in the range 0 .. 3, indicating a round
   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
   the same rounding.
*/
static
void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   HReg rrm  = iselIntExpr_R(env, mode);
   HReg rrm2 = newVRegI(env);
   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());

   /* movl  %rrm, %rrm2
      andl  $3, %rrm2   -- shouldn't be needed; paranoia
      shll  $10, %rrm2
      orl   $DEFAULT_FPUCW, %rrm2
      pushl %rrm2
      fldcw 0(%esp)
      addl  $4, %esp
   */
   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
   addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(3), rrm2));
   addInstr(env, X86Instr_Sh32(Xsh_SHL, 10, rrm2));
   addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Imm(DEFAULT_FPUCW), rrm2));
   addInstr(env, X86Instr_Push(X86RMI_Reg(rrm2)));
   addInstr(env, X86Instr_FpLdCW(zero_esp));
   add_to_esp(env, 4);
}

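/* This works because the IRRoundingMode encoding (0 = nearest,
   1 = -inf, 2 = +inf, 3 = zero) coincides with the x87 RC field
   encoding in control word bits 11:10.  E.g. mode == 3 (round to
   zero) gives rrm2 == (3 << 10) | 0x027F == 0x0E7F. */
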

/* Generate !src into a new vector register, and be sure that the code
   is SSE1 compatible.  Amazing that Intel doesn't offer a less crappy
   way to do this.
*/
static HReg do_sse_Not128 ( ISelEnv* env, HReg src )
{
   HReg dst = newVRegV(env);
   /* Set dst to zero.  If dst contains a NaN then all hell might
      break loose after the comparison.  So, first zero it. */
   addInstr(env, X86Instr_SseReRg(Xsse_XOR, dst, dst));
   /* And now make it all 1s ... */
   addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, dst, dst));
   /* Finally, xor 'src' into it. */
   addInstr(env, X86Instr_SseReRg(Xsse_XOR, src, dst));
   /* Doesn't that just totally suck? */
   return dst;
}


/* Round an x87 FPU value to 53-bit-mantissa precision, to be used
   after most non-simple FPU operations (simple = +, -, *, / and
   sqrt).

   This could be done a lot more efficiently if needed, by loading
   zero and adding it to the value to be rounded (fldz ; faddp?).
*/
static void roundToF64 ( ISelEnv* env, HReg reg )
{
   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   sub_from_esp(env, 8);
   addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp));
   addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp));
   add_to_esp(env, 8);
}

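/* Roughly:  subl $8,%esp ; fstpl 0(%esp) ; fldl 0(%esp) ;
   addl $8,%esp.  The store narrows the value to a 64-bit double in
   memory, and the reload discards any extra precision the register
   may have carried. */
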

/*---------------------------------------------------------*/
/*--- ISEL: Integer expressions (32/16/8 bit)           ---*/
/*---------------------------------------------------------*/

/* Select insns for an integer-typed expression, and add them to the
   code list.  Return a reg holding the result.  This reg will be a
   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
   want to modify it, ask for a new vreg, copy it in there, and modify
   the copy.  The register allocator will do its best to map both
   vregs to the same real register, so the copies will often disappear
   later in the game.

   This should handle expressions of 32, 16 and 8-bit type.  All
   results are returned in a 32-bit register.  For 16- and 8-bit
   expressions, the upper 16/24 bits are arbitrary, so you should mask
   or sign extend partial values if necessary.
*/

static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
{
   HReg r = iselIntExpr_R_wrk(env, e);
   /* sanity checks ... */
#  if 0
   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
#  endif
   vassert(hregClass(r) == HRcInt32);
   vassert(hregIsVirtual(r));
   return r;
}

/* DO NOT CALL THIS DIRECTLY ! */
static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
{
   MatchInfo mi;

   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);

   switch (e->tag) {

   /* --------- TEMP --------- */
   case Iex_RdTmp: {
      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   }

   /* --------- LOAD --------- */
   case Iex_Load: {
      HReg dst = newVRegI(env);
      X86AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );

      /* We can't handle big-endian loads, nor load-linked. */
      if (e->Iex.Load.end != Iend_LE)
         goto irreducible;

      if (ty == Ity_I32) {
         addInstr(env, X86Instr_Alu32R(Xalu_MOV,
                                       X86RMI_Mem(amode), dst) );
         return dst;
      }
      if (ty == Ity_I16) {
         addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
         return dst;
      }
      if (ty == Ity_I8) {
         addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
         return dst;
      }
      break;
   }

   /* --------- TERNARY OP --------- */
   case Iex_Triop: {
      /* C3210 flags following FPU partial remainder (fprem), both
         IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
      if (e->Iex.Triop.op == Iop_PRemC3210F64
          || e->Iex.Triop.op == Iop_PRem1C3210F64) {
         HReg junk = newVRegF(env);
         HReg dst  = newVRegI(env);
         HReg srcL = iselDblExpr(env, e->Iex.Triop.arg2);
         HReg srcR = iselDblExpr(env, e->Iex.Triop.arg3);
         /* XXXROUNDINGFIXME */
         /* set roundingmode here */
         addInstr(env, X86Instr_FpBinary(
                           e->Iex.Triop.op==Iop_PRemC3210F64
                              ? Xfp_PREM : Xfp_PREM1,
                           srcL,srcR,junk
                 ));
         /* The previous pseudo-insn will have left the FPU's C3210
            flags set correctly.  So bag them. */
         addInstr(env, X86Instr_FpStSW_AX());
         addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
         return dst;
      }

      break;
   }

   /* --------- BINARY OP --------- */
   case Iex_Binop: {
      X86AluOp   aluOp;
      X86ShiftOp shOp;

      /* Pattern: Sub32(0,x) */
      if (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1)) {
         HReg dst = newVRegI(env);
         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(reg,dst));
         addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
         return dst;
      }

      /* Is it an addition or logical style op? */
      switch (e->Iex.Binop.op) {
         case Iop_Add8: case Iop_Add16: case Iop_Add32:
            aluOp = Xalu_ADD; break;
         case Iop_Sub8: case Iop_Sub16: case Iop_Sub32:
            aluOp = Xalu_SUB; break;
         case Iop_And8: case Iop_And16: case Iop_And32:
            aluOp = Xalu_AND; break;
         case Iop_Or8: case Iop_Or16: case Iop_Or32:
            aluOp = Xalu_OR; break;
         case Iop_Xor8: case Iop_Xor16: case Iop_Xor32:
            aluOp = Xalu_XOR; break;
         case Iop_Mul16: case Iop_Mul32:
            aluOp = Xalu_MUL; break;
         default:
            aluOp = Xalu_INVALID; break;
      }
      /* For commutative ops we assume any literal
         values are on the second operand. */
      if (aluOp != Xalu_INVALID) {
         HReg dst    = newVRegI(env);
         HReg reg    = iselIntExpr_R(env, e->Iex.Binop.arg1);
         X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(reg,dst));
         addInstr(env, X86Instr_Alu32R(aluOp, rmi, dst));
         return dst;
      }
      /* Could do better here; forcing the first arg into a reg
         isn't always clever.
         -- t70 = Xor32(And32(Xor32(LDle:I32(Add32(t41,0xFFFFFFA0:I32)),
                        LDle:I32(Add32(t41,0xFFFFFFA4:I32))),LDle:I32(Add32(
                        t41,0xFFFFFFA8:I32))),LDle:I32(Add32(t41,0xFFFFFFA0:I32)))
            movl 0xFFFFFFA0(%vr41),%vr107
            movl 0xFFFFFFA4(%vr41),%vr108
            movl %vr107,%vr106
            xorl %vr108,%vr106
            movl 0xFFFFFFA8(%vr41),%vr109
            movl %vr106,%vr105
            andl %vr109,%vr105
            movl 0xFFFFFFA0(%vr41),%vr110
            movl %vr105,%vr104
            xorl %vr110,%vr104
            movl %vr104,%vr70
      */

      /* Perhaps a shift op? */
      switch (e->Iex.Binop.op) {
         case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
            shOp = Xsh_SHL; break;
         case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
            shOp = Xsh_SHR; break;
         case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
            shOp = Xsh_SAR; break;
         default:
            shOp = Xsh_INVALID; break;
      }
      if (shOp != Xsh_INVALID) {
         HReg dst = newVRegI(env);

         /* regL = the value to be shifted */
         HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
         addInstr(env, mk_iMOVsd_RR(regL,dst));

         /* Do any necessary widening for 16/8 bit operands */
         switch (e->Iex.Binop.op) {
            case Iop_Shr8:
               addInstr(env, X86Instr_Alu32R(
                                Xalu_AND, X86RMI_Imm(0xFF), dst));
               break;
            case Iop_Shr16:
               addInstr(env, X86Instr_Alu32R(
                                Xalu_AND, X86RMI_Imm(0xFFFF), dst));
               break;
            case Iop_Sar8:
               addInstr(env, X86Instr_Sh32(Xsh_SHL, 24, dst));
               addInstr(env, X86Instr_Sh32(Xsh_SAR, 24, dst));
               break;
            case Iop_Sar16:
               addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, dst));
               addInstr(env, X86Instr_Sh32(Xsh_SAR, 16, dst));
               break;
            default: break;
         }

         /* Now consider the shift amount.  If it's a literal, we
            can do a much better job than the general case. */
         if (e->Iex.Binop.arg2->tag == Iex_Const) {
            /* assert that the IR is well-typed */
            Int nshift;
            vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
            nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
            vassert(nshift >= 0);
            if (nshift > 0)
               /* Can't allow nshift==0 since that means %cl */
               addInstr(env, X86Instr_Sh32( shOp, nshift, dst ));
         } else {
            /* General case; we have to force the amount into %cl. */
            HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(regR,hregX86_ECX()));
            addInstr(env, X86Instr_Sh32(shOp, 0/* %cl */, dst));
         }
         return dst;
      }

      /* Handle misc other ops. */

      if (e->Iex.Binop.op == Iop_Max32U) {
         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg dst  = newVRegI(env);
         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(src1,dst));
         addInstr(env, X86Instr_Alu32R(Xalu_CMP, X86RMI_Reg(src2), dst));
         addInstr(env, X86Instr_CMov32(Xcc_B, X86RM_Reg(src2), dst));
         return dst;
      }

      if (e->Iex.Binop.op == Iop_8HLto16) {
         HReg hi8  = newVRegI(env);
         HReg lo8  = newVRegI(env);
         HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
         addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, 8, hi8));
         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFF), lo8));
         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo8), hi8));
         return hi8;
      }

      if (e->Iex.Binop.op == Iop_16HLto32) {
         HReg hi16  = newVRegI(env);
         HReg lo16  = newVRegI(env);
         HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
         addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, hi16));
         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFFFF), lo16));
         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo16), hi16));
         return hi16;
      }

      if (e->Iex.Binop.op == Iop_MullS16 || e->Iex.Binop.op == Iop_MullS8
          || e->Iex.Binop.op == Iop_MullU16 || e->Iex.Binop.op == Iop_MullU8) {
         HReg a16   = newVRegI(env);
         HReg b16   = newVRegI(env);
         HReg a16s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg b16s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
         Int  shift = (e->Iex.Binop.op == Iop_MullS8
                       || e->Iex.Binop.op == Iop_MullU8)
                         ? 24 : 16;
         X86ShiftOp shr_op = (e->Iex.Binop.op == Iop_MullS8
                              || e->Iex.Binop.op == Iop_MullS16)
                                ? Xsh_SAR : Xsh_SHR;

         addInstr(env, mk_iMOVsd_RR(a16s, a16));
         addInstr(env, mk_iMOVsd_RR(b16s, b16));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, a16));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, b16));
         addInstr(env, X86Instr_Sh32(shr_op,  shift, a16));
         addInstr(env, X86Instr_Sh32(shr_op,  shift, b16));
         addInstr(env, X86Instr_Alu32R(Xalu_MUL, X86RMI_Reg(a16), b16));
         return b16;
      }

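      /* The Mull cases above first widen both 8/16-bit operands to
         32 bits, e.g. for MullU8 via shl 24 then shr 24 (sar for the
         signed variants), and then use a single 32-bit MUL: the low
         half of the 32-bit product is the required result. */
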
      if (e->Iex.Binop.op == Iop_CmpF64) {
         HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
         HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegI(env);
         addInstr(env, X86Instr_FpCmp(fL,fR,dst));
         /* shift this right 8 bits so as to conform to CmpF64
            definition. */
         addInstr(env, X86Instr_Sh32(Xsh_SHR, 8, dst));
         return dst;
      }

      if (e->Iex.Binop.op == Iop_F64toI32S
          || e->Iex.Binop.op == Iop_F64toI16S) {
         Int  sz  = e->Iex.Binop.op == Iop_F64toI16S ? 2 : 4;
         HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegI(env);

         /* Used several times ... */
         X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());

         /* rf now holds the value to be converted, and arg1 gives
            the rounding mode, encoded as per the IRRoundingMode
            enum.  The first thing to do is set the FPU's rounding
            mode accordingly. */

         /* Create a space for the format conversion. */
         /* subl $4, %esp */
         sub_from_esp(env, 4);

         /* Set host rounding mode */
         set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );

         /* gistw/l %rf, 0(%esp) */
         addInstr(env, X86Instr_FpLdStI(False/*store*/,
                                        toUChar(sz), rf, zero_esp));

         if (sz == 2) {
            /* movzwl 0(%esp), %dst */
            addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst));
         } else {
            /* movl 0(%esp), %dst */
            vassert(sz == 4);
            addInstr(env, X86Instr_Alu32R(
                             Xalu_MOV, X86RMI_Mem(zero_esp), dst));
         }

         /* Restore default FPU rounding. */
         set_FPU_rounding_default( env );

         /* addl $4, %esp */
         add_to_esp(env, 4);
         return dst;
      }

      break;
   }

   /* --------- UNARY OP --------- */
   case Iex_Unop: {

      /* 1Uto8(32to1(expr32)) */
      if (e->Iex.Unop.op == Iop_1Uto8) {
         DECLARE_PATTERN(p_32to1_then_1Uto8);
         DEFINE_PATTERN(p_32to1_then_1Uto8,
                        unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
         if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
            IRExpr* expr32 = mi.bindee[0];
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, expr32);
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Alu32R(Xalu_AND,
                                          X86RMI_Imm(1), dst));
            return dst;
         }
      }

      /* 8Uto32(LDle(expr32)) */
      if (e->Iex.Unop.op == Iop_8Uto32) {
         DECLARE_PATTERN(p_LDle8_then_8Uto32);
         DEFINE_PATTERN(p_LDle8_then_8Uto32,
                        unop(Iop_8Uto32,
                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
         if (matchIRExpr(&mi,p_LDle8_then_8Uto32,e)) {
            HReg dst = newVRegI(env);
            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
            return dst;
         }
      }

      /* 8Sto32(LDle(expr32)) */
      if (e->Iex.Unop.op == Iop_8Sto32) {
         DECLARE_PATTERN(p_LDle8_then_8Sto32);
         DEFINE_PATTERN(p_LDle8_then_8Sto32,
                        unop(Iop_8Sto32,
                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
         if (matchIRExpr(&mi,p_LDle8_then_8Sto32,e)) {
            HReg dst = newVRegI(env);
            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, X86Instr_LoadEX(1,True,amode,dst));
            return dst;
         }
      }

      /* 16Uto32(LDle(expr32)) */
      if (e->Iex.Unop.op == Iop_16Uto32) {
         DECLARE_PATTERN(p_LDle16_then_16Uto32);
         DEFINE_PATTERN(p_LDle16_then_16Uto32,
                        unop(Iop_16Uto32,
                             IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
         if (matchIRExpr(&mi,p_LDle16_then_16Uto32,e)) {
            HReg dst = newVRegI(env);
            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
            return dst;
         }
      }

      /* 8Uto32(GET:I8) */
      if (e->Iex.Unop.op == Iop_8Uto32) {
         if (e->Iex.Unop.arg->tag == Iex_Get) {
            HReg      dst;
            X86AMode* amode;
            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I8);
            dst = newVRegI(env);
            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
                                hregX86_EBP());
            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
            return dst;
         }
      }
      /* 16Uto32(GET:I16) */
      if (e->Iex.Unop.op == Iop_16Uto32) {
         if (e->Iex.Unop.arg->tag == Iex_Get) {
            HReg      dst;
            X86AMode* amode;
            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I16);
            dst = newVRegI(env);
            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
                                hregX86_EBP());
            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
            return dst;
         }
      }

      switch (e->Iex.Unop.op) {
         case Iop_8Uto16:
         case Iop_8Uto32:
         case Iop_16Uto32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            UInt mask = e->Iex.Unop.op==Iop_16Uto32 ? 0xFFFF : 0xFF;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Alu32R(Xalu_AND,
                                          X86RMI_Imm(mask), dst));
            return dst;
         }
         case Iop_8Sto16:
         case Iop_8Sto32:
         case Iop_16Sto32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            UInt amt = e->Iex.Unop.op==Iop_16Sto32 ? 16 : 24;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Sh32(Xsh_SHL, amt, dst));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, amt, dst));
            return dst;
         }
         case Iop_Not8:
         case Iop_Not16:
         case Iop_Not32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Unary32(Xun_NOT,dst));
            return dst;
         }
         case Iop_64HIto32: {
            HReg rHi, rLo;
            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
            return rHi; /* and abandon rLo .. poor wee thing :-) */
         }
         case Iop_64to32: {
            HReg rHi, rLo;
            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
            return rLo; /* similar stupid comment to the above ... */
         }
         case Iop_16HIto8:
         case Iop_32HIto16: {
            HReg dst  = newVRegI(env);
            HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
            Int shift = e->Iex.Unop.op == Iop_16HIto8 ? 8 : 16;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Sh32(Xsh_SHR, shift, dst));
            return dst;
         }
         case Iop_1Uto32:
         case Iop_1Uto8: {
            HReg dst         = newVRegI(env);
            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Set32(cond,dst));
            return dst;
         }
         case Iop_1Sto8:
         case Iop_1Sto16:
         case Iop_1Sto32: {
            /* could do better than this, but for now ... */
            HReg dst         = newVRegI(env);
            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Set32(cond,dst));
            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, dst));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
            return dst;
         }
         case Iop_Ctz32: {
            /* Count trailing zeroes, implemented by x86 'bsfl' */
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Bsfr32(True,src,dst));
            return dst;
         }
         case Iop_Clz32: {
            /* Count leading zeroes.  Do 'bsrl' to establish the index
               of the highest set bit, and subtract that value from
               31. */
            HReg tmp = newVRegI(env);
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Bsfr32(False,src,tmp));
            addInstr(env, X86Instr_Alu32R(Xalu_MOV,
                                          X86RMI_Imm(31), dst));
            addInstr(env, X86Instr_Alu32R(Xalu_SUB,
                                          X86RMI_Reg(tmp), dst));
            return dst;
         }

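         /* CmpwNEZ32(x) is all-ones if x != 0 and zero otherwise.
            The sequence below computes x | -x, whose sign bit is set
            exactly when x != 0, then broadcasts that bit through the
            register with sar $31. */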
   1216          case Iop_CmpwNEZ32: {
   1217             HReg dst = newVRegI(env);
   1218             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1219             addInstr(env, mk_iMOVsd_RR(src,dst));
   1220             addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
   1221             addInstr(env, X86Instr_Alu32R(Xalu_OR,
   1222                                           X86RMI_Reg(src), dst));
   1223             addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
   1224             return dst;
   1225          }
   1226          case Iop_Left8:
   1227          case Iop_Left16:
   1228          case Iop_Left32: {
   1229             HReg dst = newVRegI(env);
   1230             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1231             addInstr(env, mk_iMOVsd_RR(src, dst));
   1232             addInstr(env, X86Instr_Unary32(Xun_NEG, dst));
   1233             addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(src), dst));
   1234             return dst;
   1235          }
   1236 
   1237          case Iop_V128to32: {
   1238             HReg      dst  = newVRegI(env);
   1239             HReg      vec  = iselVecExpr(env, e->Iex.Unop.arg);
   1240             X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   1241             sub_from_esp(env, 16);
   1242             addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
   1243             addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(esp0), dst ));
   1244             add_to_esp(env, 16);
   1245             return dst;
   1246          }
   1247 
   1248          /* ReinterpF32asI32(e) */
   1249          /* Given an IEEE754 single, produce an I32 with the same bit
   1250             pattern.  Keep stack 8-aligned even though only using 4
   1251             bytes. */
   1252          case Iop_ReinterpF32asI32: {
   1253             HReg rf   = iselFltExpr(env, e->Iex.Unop.arg);
   1254             HReg dst  = newVRegI(env);
   1255             X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   1256             /* paranoia */
   1257             set_FPU_rounding_default(env);
   1258             /* subl $8, %esp */
   1259             sub_from_esp(env, 8);
   1260             /* gstF %rf, 0(%esp) */
   1261             addInstr(env,
   1262                      X86Instr_FpLdSt(False/*store*/, 4, rf, zero_esp));
   1263             /* movl 0(%esp), %dst */
   1264             addInstr(env,
   1265                      X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), dst));
   1266             /* addl $8, %esp */
   1267             add_to_esp(env, 8);
   1268             return dst;
   1269          }
   1270 
   1271          case Iop_16to8:
   1272          case Iop_32to8:
   1273          case Iop_32to16:
   1274             /* These are no-ops. */
   1275             return iselIntExpr_R(env, e->Iex.Unop.arg);
   1276 
   1277          default:
   1278             break;
   1279       }
   1280       break;
   1281    }
   1282 
   1283    /* --------- GET --------- */
   1284    case Iex_Get: {
   1285       if (ty == Ity_I32) {
   1286          HReg dst = newVRegI(env);
   1287          addInstr(env, X86Instr_Alu32R(
   1288                           Xalu_MOV,
   1289                           X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
   1290                                                  hregX86_EBP())),
   1291                           dst));
   1292          return dst;
   1293       }
   1294       if (ty == Ity_I8 || ty == Ity_I16) {
   1295          HReg dst = newVRegI(env);
   1296          addInstr(env, X86Instr_LoadEX(
   1297                           toUChar(ty==Ity_I8 ? 1 : 2),
   1298                           False,
   1299                           X86AMode_IR(e->Iex.Get.offset,hregX86_EBP()),
   1300                           dst));
   1301          return dst;
   1302       }
   1303       break;
   1304    }
   1305 
   1306    case Iex_GetI: {
   1307       X86AMode* am
   1308          = genGuestArrayOffset(
   1309               env, e->Iex.GetI.descr,
   1310                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   1311       HReg dst = newVRegI(env);
   1312       if (ty == Ity_I8) {
   1313          addInstr(env, X86Instr_LoadEX( 1, False, am, dst ));
   1314          return dst;
   1315       }
   1316       if (ty == Ity_I32) {
   1317          addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), dst));
   1318          return dst;
   1319       }
   1320       break;
   1321    }
   1322 
   1323    /* --------- CCALL --------- */
   1324    case Iex_CCall: {
   1325       HReg    dst = newVRegI(env);
   1326       vassert(ty == e->Iex.CCall.retty);
   1327 
   1328       /* be very restrictive for now.  Only 32/64-bit ints allowed
   1329          for args, and 32 bits for return type. */
   1330       if (e->Iex.CCall.retty != Ity_I32)
   1331          goto irreducible;
   1332 
   1333       /* Marshal args, do the call, clear stack. */
   1334       doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
   1335 
   1336       addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
   1337       return dst;
   1338    }

   /* --------- LITERAL --------- */
   /* 32/16/8-bit literals */
   case Iex_Const: {
      X86RMI* rmi = iselIntExpr_RMI ( env, e );
      HReg    r   = newVRegI(env);
      addInstr(env, X86Instr_Alu32R(Xalu_MOV, rmi, r));
      return r;
   }

   /* --------- MULTIPLEX --------- */
   case Iex_Mux0X: {
      if ((ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
          && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
         X86RM* r8;
         HReg   rX  = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
         X86RM* r0  = iselIntExpr_RM(env, e->Iex.Mux0X.expr0);
         HReg   dst = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR(rX,dst));
         r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
         addInstr(env, X86Instr_Test32(0xFF, r8));
         addInstr(env, X86Instr_CMov32(Xcc_Z,r0,dst));
         return dst;
      }
      break;
   }
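
   /* Roughly, the Mux0X case above generates
         mov   rX, dst
         testl $0xFF, r8
         cmovz r0, dst
      i.e. dst = (cond == 0) ? expr0 : exprX, with no branches. */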

   default:
      break;
   } /* switch (e->tag) */

   /* We get here if no pattern matched. */
  irreducible:
   ppIRExpr(e);
   vpanic("iselIntExpr_R: cannot reduce tree");
}


/*---------------------------------------------------------*/
/*--- ISEL: Integer expression auxiliaries              ---*/
/*---------------------------------------------------------*/

/* --------------------- AMODEs --------------------- */

/* Return an AMode which computes the value of the specified
   expression, possibly also adding insns to the code list as a
   result.  The expression may only be a 32-bit one.
*/

static Bool sane_AMode ( X86AMode* am )
{
   switch (am->tag) {
      case Xam_IR:
         return
            toBool( hregClass(am->Xam.IR.reg) == HRcInt32
                    && (hregIsVirtual(am->Xam.IR.reg)
                        || am->Xam.IR.reg == hregX86_EBP()) );
      case Xam_IRRS:
         return
            toBool( hregClass(am->Xam.IRRS.base) == HRcInt32
                    && hregIsVirtual(am->Xam.IRRS.base)
                    && hregClass(am->Xam.IRRS.index) == HRcInt32
                    && hregIsVirtual(am->Xam.IRRS.index) );
      default:
         vpanic("sane_AMode: unknown x86 amode tag");
   }
}

static X86AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
{
   X86AMode* am = iselIntExpr_AMode_wrk(env, e);
   vassert(sane_AMode(am));
   return am;
}

/* DO NOT CALL THIS DIRECTLY ! */
static X86AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32);

   /* Add32( Add32(expr1, Shl32(expr2, simm)), imm32 ) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add32
       && e->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32
       && e->Iex.Binop.arg1->tag == Iex_Binop
       && e->Iex.Binop.arg1->Iex.Binop.op == Iop_Add32
       && e->Iex.Binop.arg1->Iex.Binop.arg2->tag == Iex_Binop
       && e->Iex.Binop.arg1->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
       && e->Iex.Binop.arg1
           ->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg1
           ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
      UInt shift = e->Iex.Binop.arg1
                    ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
      UInt imm32 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
      if (shift == 1 || shift == 2 || shift == 3) {
         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1->Iex.Binop.arg1);
         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg1
                                       ->Iex.Binop.arg2->Iex.Binop.arg1 );
         return X86AMode_IRRS(imm32, r1, r2, shift);
      }
   }

   /* Add32(expr1, Shl32(expr2, imm)) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add32
       && e->Iex.Binop.arg2->tag == Iex_Binop
       && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
       && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
      UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
      if (shift == 1 || shift == 2 || shift == 3) {
         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
         return X86AMode_IRRS(0, r1, r2, shift);
      }
   }

   /* Add32(expr,i) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add32
       && e->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) {
      HReg r1 = iselIntExpr_R(env,  e->Iex.Binop.arg1);
      return X86AMode_IR(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32, r1);
   }

   /* Doesn't match anything in particular.  Generate it into
      a register and use that. */
   {
      HReg r1 = iselIntExpr_R(env, e);
      return X86AMode_IR(0, r1);
   }
}
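
/* For example (assuming suitably typed IR temps t1, t2), the tree
   Add32(Add32(t1, Shl32(t2, 2)), 0x18) matches the first pattern
   above and folds into the single amode 0x18(t1,t2,4), rather than
   being computed with explicit adds and shifts. */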


/* --------------------- RMIs --------------------- */

/* Similarly, calculate an expression into an X86RMI operand.  As with
   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */

static X86RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
{
   X86RMI* rmi = iselIntExpr_RMI_wrk(env, e);
   /* sanity checks ... */
   switch (rmi->tag) {
      case Xrmi_Imm:
         return rmi;
      case Xrmi_Reg:
         vassert(hregClass(rmi->Xrmi.Reg.reg) == HRcInt32);
         vassert(hregIsVirtual(rmi->Xrmi.Reg.reg));
         return rmi;
      case Xrmi_Mem:
         vassert(sane_AMode(rmi->Xrmi.Mem.am));
         return rmi;
      default:
         vpanic("iselIntExpr_RMI: unknown x86 RMI tag");
   }
}

/* DO NOT CALL THIS DIRECTLY ! */
static X86RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);

   /* special case: immediate */
   if (e->tag == Iex_Const) {
      UInt u;
      switch (e->Iex.Const.con->tag) {
         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
         case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
         case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
         default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
      }
      return X86RMI_Imm(u);
   }

   /* special case: 32-bit GET */
   if (e->tag == Iex_Get && ty == Ity_I32) {
      return X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
                                    hregX86_EBP()));
   }

   /* special case: 32-bit load from memory */
   if (e->tag == Iex_Load && ty == Ity_I32
       && e->Iex.Load.end == Iend_LE) {
      X86AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
      return X86RMI_Mem(am);
   }

   /* default case: calculate into a register and return that */
   {
      HReg r = iselIntExpr_R ( env, e );
      return X86RMI_Reg(r);
   }
}


/* --------------------- RIs --------------------- */

/* Calculate an expression into an X86RI operand.  As with
   iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */

static X86RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
{
   X86RI* ri = iselIntExpr_RI_wrk(env, e);
   /* sanity checks ... */
   switch (ri->tag) {
      case Xri_Imm:
         return ri;
      case Xri_Reg:
         vassert(hregClass(ri->Xri.Reg.reg) == HRcInt32);
         vassert(hregIsVirtual(ri->Xri.Reg.reg));
         return ri;
      default:
         vpanic("iselIntExpr_RI: unknown x86 RI tag");
   }
}

/* DO NOT CALL THIS DIRECTLY ! */
static X86RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);

   /* special case: immediate */
   if (e->tag == Iex_Const) {
      UInt u;
      switch (e->Iex.Const.con->tag) {
         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
         case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
         case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
         default: vpanic("iselIntExpr_RI.Iex_Const(x86h)");
      }
      return X86RI_Imm(u);
   }

   /* default case: calculate into a register and return that */
   {
      HReg r = iselIntExpr_R ( env, e );
      return X86RI_Reg(r);
   }
}


/* --------------------- RMs --------------------- */

/* Similarly, calculate an expression into an X86RM operand.  As with
   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */

static X86RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
{
   X86RM* rm = iselIntExpr_RM_wrk(env, e);
   /* sanity checks ... */
   switch (rm->tag) {
      case Xrm_Reg:
         vassert(hregClass(rm->Xrm.Reg.reg) == HRcInt32);
         vassert(hregIsVirtual(rm->Xrm.Reg.reg));
         return rm;
      case Xrm_Mem:
         vassert(sane_AMode(rm->Xrm.Mem.am));
         return rm;
      default:
         vpanic("iselIntExpr_RM: unknown x86 RM tag");
   }
}

/* DO NOT CALL THIS DIRECTLY ! */
static X86RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);

   /* special case: 32-bit GET */
   if (e->tag == Iex_Get && ty == Ity_I32) {
      return X86RM_Mem(X86AMode_IR(e->Iex.Get.offset,
                                   hregX86_EBP()));
   }

   /* special case: load from memory -- not handled specially here;
      such loads simply fall through to the default case below. */

   /* default case: calculate into a register and return that */
   {
      HReg r = iselIntExpr_R ( env, e );
      return X86RM_Reg(r);
   }
}


/* --------------------- CONDCODE --------------------- */

/* Generate code to evaluate a bit-typed expression, returning the
   condition code which would be set if the expression had notionally
   evaluated to 1. */

static X86CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
{
   /* Uh, there's nothing we can sanity check here, unfortunately. */
   return iselCondCode_wrk(env,e);
}

/* DO NOT CALL THIS DIRECTLY ! */
static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
{
   MatchInfo mi;

   vassert(e);
   vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);

   /* var */
   if (e->tag == Iex_RdTmp) {
      HReg r32 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
      /* Test32 doesn't modify r32; so this is OK. */
      addInstr(env, X86Instr_Test32(1,X86RM_Reg(r32)));
      return Xcc_NZ;
   }

   /* Constant 1:Bit */
   if (e->tag == Iex_Const) {
      HReg r;
      vassert(e->Iex.Const.con->tag == Ico_U1);
      vassert(e->Iex.Const.con->Ico.U1 == True
              || e->Iex.Const.con->Ico.U1 == False);
      r = newVRegI(env);
      addInstr(env, X86Instr_Alu32R(Xalu_MOV,X86RMI_Imm(0),r));
      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(r),r));
      return e->Iex.Const.con->Ico.U1 ? Xcc_Z : Xcc_NZ;
   }
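
   /* The xorl above is really just a flag-setting idiom: xor r,r
      leaves r at zero and sets ZF, so returning Xcc_Z for a constant
      1 yields an always-true condition, and Xcc_NZ an always-false
      one.  (A sketch of what gets emitted:  movl $0,r ; xorl r,r .) */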

   /* Not1(e) */
   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
      /* Generate code for the arg, and negate the test condition */
      return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
   }

   /* --- patterns rooted at: 32to1 --- */

   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_32to1) {
      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
      addInstr(env, X86Instr_Test32(1,rm));
      return Xcc_NZ;
   }

   /* --- patterns rooted at: CmpNEZ8 --- */

   /* CmpNEZ8(x) */
   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ8) {
      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
      addInstr(env, X86Instr_Test32(0xFF,rm));
      return Xcc_NZ;
   }

   /* --- patterns rooted at: CmpNEZ16 --- */

   /* CmpNEZ16(x) */
   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ16) {
      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
      addInstr(env, X86Instr_Test32(0xFFFF,rm));
      return Xcc_NZ;
   }

   /* --- patterns rooted at: CmpNEZ32 --- */

   /* CmpNEZ32(And32(x,y)) */
   {
      DECLARE_PATTERN(p_CmpNEZ32_And32);
      DEFINE_PATTERN(p_CmpNEZ32_And32,
                     unop(Iop_CmpNEZ32, binop(Iop_And32, bind(0), bind(1))));
      if (matchIRExpr(&mi, p_CmpNEZ32_And32, e)) {
         HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
         X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
         HReg    tmp  = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR(r0, tmp));
         addInstr(env, X86Instr_Alu32R(Xalu_AND,rmi1,tmp));
         return Xcc_NZ;
      }
   }

   /* CmpNEZ32(Or32(x,y)) */
   {
      DECLARE_PATTERN(p_CmpNEZ32_Or32);
      DEFINE_PATTERN(p_CmpNEZ32_Or32,
                     unop(Iop_CmpNEZ32, binop(Iop_Or32, bind(0), bind(1))));
      if (matchIRExpr(&mi, p_CmpNEZ32_Or32, e)) {
         HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
         X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
         HReg    tmp  = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR(r0, tmp));
         addInstr(env, X86Instr_Alu32R(Xalu_OR,rmi1,tmp));
         return Xcc_NZ;
      }
   }

   /* CmpNEZ32(GET(..):I32) */
   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ32
       && e->Iex.Unop.arg->tag == Iex_Get) {
      X86AMode* am = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
                                 hregX86_EBP());
      addInstr(env, X86Instr_Alu32M(Xalu_CMP, X86RI_Imm(0), am));
      return Xcc_NZ;
   }

   /* CmpNEZ32(x) */
   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ32) {
      HReg    r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
      X86RMI* rmi2 = X86RMI_Imm(0);
      addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
      return Xcc_NZ;
   }

   /* --- patterns rooted at: CmpNEZ64 --- */

   /* CmpNEZ64(Or64(x,y)) */
   {
      DECLARE_PATTERN(p_CmpNEZ64_Or64);
      DEFINE_PATTERN(p_CmpNEZ64_Or64,
                     unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
      if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
         HReg    hi1, lo1, hi2, lo2;
         HReg    tmp  = newVRegI(env);
         iselInt64Expr( &hi1, &lo1, env, mi.bindee[0] );
         addInstr(env, mk_iMOVsd_RR(hi1, tmp));
         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo1),tmp));
         iselInt64Expr( &hi2, &lo2, env, mi.bindee[1] );
         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(hi2),tmp));
         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo2),tmp));
         return Xcc_NZ;
      }
   }

   /* CmpNEZ64(x) */
   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ64) {
      HReg hi, lo;
      HReg tmp = newVRegI(env);
      iselInt64Expr( &hi, &lo, env, e->Iex.Unop.arg );
      addInstr(env, mk_iMOVsd_RR(hi, tmp));
      addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo), tmp));
      return Xcc_NZ;
   }
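
   /* In other words CmpNEZ64(x) is decided entirely by ZF after
      or-ing the two halves together, roughly
         movl %hi, %tmp
         orl  %lo, %tmp
      leaving Xcc_NZ to signal "some bit of x is set". */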

   /* --- patterns rooted at: Cmp{EQ,NE}{8,16} --- */

   /* CmpEQ8 / CmpNE8 */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_CmpEQ8
           || e->Iex.Binop.op == Iop_CmpNE8
           || e->Iex.Binop.op == Iop_CasCmpEQ8
           || e->Iex.Binop.op == Iop_CasCmpNE8)) {
      if (isZeroU8(e->Iex.Binop.arg2)) {
         HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
         addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r1)));
         switch (e->Iex.Binop.op) {
            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
            case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
            default: vpanic("iselCondCode(x86): CmpXX8(expr,0:I8)");
         }
      } else {
         HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
         X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
         HReg    r    = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR(r1,r));
         addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
         addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r)));
         switch (e->Iex.Binop.op) {
            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
            case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
            default: vpanic("iselCondCode(x86): CmpXX8(expr,expr)");
         }
      }
   }

   /* CmpEQ16 / CmpNE16 */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_CmpEQ16
           || e->Iex.Binop.op == Iop_CmpNE16
           || e->Iex.Binop.op == Iop_CasCmpEQ16
           || e->Iex.Binop.op == Iop_CasCmpNE16)) {
      HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
      X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
      HReg    r    = newVRegI(env);
      addInstr(env, mk_iMOVsd_RR(r1,r));
      addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
      addInstr(env, X86Instr_Test32(0xFFFF,X86RM_Reg(r)));
      switch (e->Iex.Binop.op) {
         case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Xcc_Z;
         case Iop_CmpNE16: case Iop_CasCmpNE16: return Xcc_NZ;
         default: vpanic("iselCondCode(x86): CmpXX16");
      }
   }

   /* Cmp*32*(x,y) */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_CmpEQ32
           || e->Iex.Binop.op == Iop_CmpNE32
           || e->Iex.Binop.op == Iop_CmpLT32S
           || e->Iex.Binop.op == Iop_CmpLT32U
           || e->Iex.Binop.op == Iop_CmpLE32S
           || e->Iex.Binop.op == Iop_CmpLE32U
           || e->Iex.Binop.op == Iop_CasCmpEQ32
           || e->Iex.Binop.op == Iop_CasCmpNE32)) {
      HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
      X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
      addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
      switch (e->Iex.Binop.op) {
         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Xcc_Z;
         case Iop_CmpNE32: case Iop_CasCmpNE32: return Xcc_NZ;
         case Iop_CmpLT32S: return Xcc_L;
         case Iop_CmpLT32U: return Xcc_B;
         case Iop_CmpLE32S: return Xcc_LE;
         case Iop_CmpLE32U: return Xcc_BE;
         default: vpanic("iselCondCode(x86): CmpXX32");
      }
   }

   /* CmpEQ64 / CmpNE64 */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_CmpNE64
           || e->Iex.Binop.op == Iop_CmpEQ64)) {
      HReg hi1, hi2, lo1, lo2;
      HReg tHi = newVRegI(env);
      HReg tLo = newVRegI(env);
      iselInt64Expr( &hi1, &lo1, env, e->Iex.Binop.arg1 );
      iselInt64Expr( &hi2, &lo2, env, e->Iex.Binop.arg2 );
      addInstr(env, mk_iMOVsd_RR(hi1, tHi));
      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(hi2), tHi));
      addInstr(env, mk_iMOVsd_RR(lo1, tLo));
      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(lo2), tLo));
      addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(tHi), tLo));
      switch (e->Iex.Binop.op) {
         case Iop_CmpNE64: return Xcc_NZ;
         case Iop_CmpEQ64: return Xcc_Z;
         default: vpanic("iselCondCode(x86): CmpXX64");
      }
   }

   ppIRExpr(e);
   vpanic("iselCondCode");
}


/*---------------------------------------------------------*/
/*--- ISEL: Integer expressions (64 bit)                ---*/
/*---------------------------------------------------------*/

/* Compute a 64-bit value into a register pair, which is returned as
   the first two parameters.  As with iselIntExpr_R, these may be
   either real or virtual regs; in any case they must not be changed
   by subsequent code emitted by the caller.  */

static void iselInt64Expr ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
{
   iselInt64Expr_wrk(rHi, rLo, env, e);
#  if 0
   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
#  endif
   vassert(hregClass(*rHi) == HRcInt32);
   vassert(hregIsVirtual(*rHi));
   vassert(hregClass(*rLo) == HRcInt32);
   vassert(hregIsVirtual(*rLo));
}

/* DO NOT CALL THIS DIRECTLY ! */
static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
{
   MatchInfo mi;
   HWord fn = 0; /* helper fn for most SIMD64 stuff */
   vassert(e);
   vassert(typeOfIRExpr(env->type_env,e) == Ity_I64);

   /* 64-bit literal */
   if (e->tag == Iex_Const) {
      ULong w64 = e->Iex.Const.con->Ico.U64;
      UInt  wHi = toUInt(w64 >> 32);
      UInt  wLo = toUInt(w64);
      HReg  tLo = newVRegI(env);
      HReg  tHi = newVRegI(env);
      vassert(e->Iex.Const.con->tag == Ico_U64);
      if (wLo == wHi) {
         /* Save a precious Int register in this special case. */
         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
         *rHi = tLo;
         *rLo = tLo;
      } else {
         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi));
         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
         *rHi = tHi;
         *rLo = tLo;
      }
      return;
   }

   /* read 64-bit IRTemp */
   if (e->tag == Iex_RdTmp) {
      lookupIRTemp64( rHi, rLo, env, e->Iex.RdTmp.tmp);
      return;
   }

   /* 64-bit load */
   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
      HReg     tLo, tHi;
      X86AMode *am0, *am4;
      vassert(e->Iex.Load.ty == Ity_I64);
      tLo = newVRegI(env);
      tHi = newVRegI(env);
      am0 = iselIntExpr_AMode(env, e->Iex.Load.addr);
      am4 = advance4(am0);
      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo ));
      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
      *rHi = tHi;
      *rLo = tLo;
      return;
   }

   /* 64-bit GET */
   if (e->tag == Iex_Get) {
      X86AMode* am  = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP());
      X86AMode* am4 = advance4(am);
      HReg tLo = newVRegI(env);
      HReg tHi = newVRegI(env);
      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
      *rHi = tHi;
      *rLo = tLo;
      return;
   }

   /* 64-bit GETI */
   if (e->tag == Iex_GetI) {
      X86AMode* am
         = genGuestArrayOffset( env, e->Iex.GetI.descr,
                                     e->Iex.GetI.ix, e->Iex.GetI.bias );
      X86AMode* am4 = advance4(am);
      HReg tLo = newVRegI(env);
      HReg tHi = newVRegI(env);
      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
      *rHi = tHi;
      *rLo = tLo;
      return;
   }

   /* 64-bit Mux0X: Mux0X(g, expr, 0:I64) */
   if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.exprX)) {
      X86RM* r8;
      HReg e0Lo, e0Hi;
      HReg tLo = newVRegI(env);
      HReg tHi = newVRegI(env);
      X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
      iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
      r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
      addInstr(env, mk_iMOVsd_RR( e0Hi, tHi ) );
      addInstr(env, mk_iMOVsd_RR( e0Lo, tLo ) );
      addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
      addInstr(env, X86Instr_Test32(0xFF, r8));
      addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tHi));
      addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tLo));
      add_to_esp(env, 4);
      *rHi = tHi;
      *rLo = tLo;
      return;
   }
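
   /* The push of $0 above is a small trick: there is no
      cmov-from-immediate form, so a literal zero is parked at
      0(%esp) and conditionally cmov'd from memory into both halves;
      the add_to_esp(env, 4) then discards it. */
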
   /* 64-bit Mux0X: Mux0X(g, 0:I64, expr) */
   if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.expr0)) {
      X86RM* r8;
      HReg eXLo, eXHi;
      HReg tLo = newVRegI(env);
      HReg tHi = newVRegI(env);
      X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
      iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
      r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
      addInstr(env, mk_iMOVsd_RR( eXHi, tHi ) );
      addInstr(env, mk_iMOVsd_RR( eXLo, tLo ) );
      addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
      addInstr(env, X86Instr_Test32(0xFF, r8));
      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tHi));
      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tLo));
      add_to_esp(env, 4);
      *rHi = tHi;
      *rLo = tLo;
      return;
   }

   /* 64-bit Mux0X: Mux0X(g, expr, expr) */
   if (e->tag == Iex_Mux0X) {
      X86RM* r8;
      HReg e0Lo, e0Hi, eXLo, eXHi;
      HReg tLo = newVRegI(env);
      HReg tHi = newVRegI(env);
      iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
      iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
      addInstr(env, mk_iMOVsd_RR(eXHi, tHi));
      addInstr(env, mk_iMOVsd_RR(eXLo, tLo));
      r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
      addInstr(env, X86Instr_Test32(0xFF, r8));
      /* This assumes the first cmov32 doesn't trash the condition
         codes, so they are still available for the second cmov32 */
      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Hi),tHi));
      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Lo),tLo));
      *rHi = tHi;
      *rLo = tLo;
      return;
   }

   /* --------- BINARY ops --------- */
   if (e->tag == Iex_Binop) {
      switch (e->Iex.Binop.op) {
         /* 32 x 32 -> 64 multiply */
         case Iop_MullU32:
         case Iop_MullS32: {
            /* get one operand into %eax, and the other into a R/M.
               Need to make an educated guess about which operand is
               better off in which position. */
            HReg   tLo    = newVRegI(env);
            HReg   tHi    = newVRegI(env);
            Bool   syned  = toBool(e->Iex.Binop.op == Iop_MullS32);
            X86RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
            HReg   rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(rRight, hregX86_EAX()));
            addInstr(env, X86Instr_MulL(syned, rmLeft));
            /* Result is now in EDX:EAX.  Tell the caller. */
            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }
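
         /* Sketch of the above: one operand is forced into %eax and
            then a one-operand
               mull/imull rmLeft
            leaves the 64-bit product in %edx:%eax, which is copied
            out into fresh vregs straight away so the fixed registers
            stay free. */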

         /* 64 x 32 -> (32(rem),32(div)) division */
         case Iop_DivModU64to32:
         case Iop_DivModS64to32: {
            /* Get the 64-bit operand into edx:eax, and the other into
               any old R/M. */
            HReg sHi, sLo;
            HReg   tLo     = newVRegI(env);
            HReg   tHi     = newVRegI(env);
            Bool   syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
            X86RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
            addInstr(env, mk_iMOVsd_RR(sHi, hregX86_EDX()));
            addInstr(env, mk_iMOVsd_RR(sLo, hregX86_EAX()));
            addInstr(env, X86Instr_Div(syned, rmRight));
            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* Or64/And64/Xor64 */
         case Iop_Or64:
         case Iop_And64:
         case Iop_Xor64: {
            HReg xLo, xHi, yLo, yHi;
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR
                          : e->Iex.Binop.op==Iop_And64 ? Xalu_AND
                          : Xalu_XOR;
            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(xHi, tHi));
            addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
            addInstr(env, mk_iMOVsd_RR(xLo, tLo));
            addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* Add64/Sub64 */
         case Iop_Add64:
            if (e->Iex.Binop.arg2->tag == Iex_Const) {
               /* special case Add64(e, const) */
               ULong w64 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
               UInt  wHi = toUInt(w64 >> 32);
               UInt  wLo = toUInt(w64);
               HReg  tLo = newVRegI(env);
               HReg  tHi = newVRegI(env);
               HReg  xLo, xHi;
               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64);
               iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
               addInstr(env, mk_iMOVsd_RR(xHi, tHi));
               addInstr(env, mk_iMOVsd_RR(xLo, tLo));
               addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(wLo), tLo));
               addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Imm(wHi), tHi));
               *rHi = tHi;
               *rLo = tLo;
               return;
            }
            /* else fall through to the generic case */
         case Iop_Sub64: {
            HReg xLo, xHi, yLo, yHi;
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
            addInstr(env, mk_iMOVsd_RR(xHi, tHi));
            addInstr(env, mk_iMOVsd_RR(xLo, tLo));
            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
            if (e->Iex.Binop.op==Iop_Add64) {
               addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo));
               addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi));
            } else {
               addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
               addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
            }
            *rHi = tHi;
            *rLo = tLo;
            return;
         }
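
         /* The carry chain does the 64-bit arithmetic here: e.g. for
            Add64,
               addl yLo, tLo
               adcl yHi, tHi
            propagates the carry out of the low halves into the high
            halves; SUB/SBB is the analogous borrow chain. */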

         /* 32HLto64(e1,e2) */
         case Iop_32HLto64:
            *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
            *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
            return;

         /* 64-bit shifts */
         case Iop_Shl64: {
            /* We use the same ingenious scheme as gcc.  Put the value
               to be shifted into %hi:%lo, and the shift amount into
               %cl.  Then (dsts on right, a la ATT syntax):

               shldl %cl, %lo, %hi   -- make %hi be right for the
                                     -- shift amt %cl % 32
               shll  %cl, %lo        -- make %lo be right for the
                                     -- shift amt %cl % 32

               Now, if (shift amount % 64) is in the range 32 .. 63,
               we have to do a fixup, which puts the result low half
               into the result high half, and zeroes the low half:

               testl $32, %ecx

               cmovnz %lo, %hi
               movl $0, %tmp         -- sigh; need yet another reg
               cmovnz %tmp, %lo
            */
            HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
            tLo = newVRegI(env);
            tHi = newVRegI(env);
            tTemp = newVRegI(env);
            rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
            addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
            /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
               and those regs are legitimately modifiable. */
            addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi));
            addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, tLo));
            addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi));
            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         case Iop_Shr64: {
            /* We use the same ingenious scheme as gcc.  Put the value
               to be shifted into %hi:%lo, and the shift amount into
               %cl.  Then:

               shrdl %cl, %hi, %lo   -- make %lo be right for the
                                     -- shift amt %cl % 32
               shrl  %cl, %hi        -- make %hi be right for the
                                     -- shift amt %cl % 32

               Now, if (shift amount % 64) is in the range 32 .. 63,
               we have to do a fixup, which puts the result high half
               into the result low half, and zeroes the high half:

               testl $32, %ecx

               cmovnz %hi, %lo
               movl $0, %tmp         -- sigh; need yet another reg
               cmovnz %tmp, %hi
            */
            HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
            tLo = newVRegI(env);
            tHi = newVRegI(env);
            tTemp = newVRegI(env);
            rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
            addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
            /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
               and those regs are legitimately modifiable. */
            addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo));
            addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, tHi));
            addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo));
            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* F64 -> I64 */
         /* Sigh, this is an almost exact copy of the F64 -> I32/I16
            case.  Unfortunately I see no easy way to avoid the
            duplication. */
         case Iop_F64toI64S: {
            HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);

            /* Used several times ... */
            /* Careful ... this sharing is only safe because
               zero_esp/four_esp do not hold any registers which the
               register allocator could attempt to swizzle later. */
            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
            X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());

            /* rf now holds the value to be converted, and
               e->Iex.Binop.arg1 is the rounding mode, encoded as per
               the IRRoundingMode enum.  The first thing to do is set
               the FPU's rounding mode accordingly. */

            /* Create a space for the format conversion. */
            /* subl $8, %esp */
            sub_from_esp(env, 8);

            /* Set host rounding mode */
            set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );

            /* gistll %rf, 0(%esp) */
            addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp));

            /* movl 0(%esp), %dstLo */
            /* movl 4(%esp), %dstHi */
            addInstr(env, X86Instr_Alu32R(
                             Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
            addInstr(env, X86Instr_Alu32R(
                             Xalu_MOV, X86RMI_Mem(four_esp), tHi));

            /* Restore default FPU rounding. */
            set_FPU_rounding_default( env );

            /* addl $8, %esp */
            add_to_esp(env, 8);

            *rHi = tHi;
            *rLo = tLo;
            return;
         }
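
         /* (A note on the store above: an 8-byte integer FpLdStI
            store is an fistp-style store, which rounds according to
            the current FPU control word -- hence the set/restore
            dance around it.) */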

         case Iop_Add8x8:
            fn = (HWord)h_generic_calc_Add8x8; goto binnish;
         case Iop_Add16x4:
            fn = (HWord)h_generic_calc_Add16x4; goto binnish;
         case Iop_Add32x2:
            fn = (HWord)h_generic_calc_Add32x2; goto binnish;

         case Iop_Avg8Ux8:
            fn = (HWord)h_generic_calc_Avg8Ux8; goto binnish;
         case Iop_Avg16Ux4:
            fn = (HWord)h_generic_calc_Avg16Ux4; goto binnish;

         case Iop_CmpEQ8x8:
            fn = (HWord)h_generic_calc_CmpEQ8x8; goto binnish;
         case Iop_CmpEQ16x4:
            fn = (HWord)h_generic_calc_CmpEQ16x4; goto binnish;
         case Iop_CmpEQ32x2:
            fn = (HWord)h_generic_calc_CmpEQ32x2; goto binnish;

         case Iop_CmpGT8Sx8:
            fn = (HWord)h_generic_calc_CmpGT8Sx8; goto binnish;
         case Iop_CmpGT16Sx4:
            fn = (HWord)h_generic_calc_CmpGT16Sx4; goto binnish;
         case Iop_CmpGT32Sx2:
            fn = (HWord)h_generic_calc_CmpGT32Sx2; goto binnish;

         case Iop_InterleaveHI8x8:
            fn = (HWord)h_generic_calc_InterleaveHI8x8; goto binnish;
         case Iop_InterleaveLO8x8:
            fn = (HWord)h_generic_calc_InterleaveLO8x8; goto binnish;
         case Iop_InterleaveHI16x4:
            fn = (HWord)h_generic_calc_InterleaveHI16x4; goto binnish;
         case Iop_InterleaveLO16x4:
            fn = (HWord)h_generic_calc_InterleaveLO16x4; goto binnish;
         case Iop_InterleaveHI32x2:
            fn = (HWord)h_generic_calc_InterleaveHI32x2; goto binnish;
         case Iop_InterleaveLO32x2:
            fn = (HWord)h_generic_calc_InterleaveLO32x2; goto binnish;
         case Iop_CatOddLanes16x4:
            fn = (HWord)h_generic_calc_CatOddLanes16x4; goto binnish;
         case Iop_CatEvenLanes16x4:
            fn = (HWord)h_generic_calc_CatEvenLanes16x4; goto binnish;
         case Iop_Perm8x8:
            fn = (HWord)h_generic_calc_Perm8x8; goto binnish;

         case Iop_Max8Ux8:
            fn = (HWord)h_generic_calc_Max8Ux8; goto binnish;
         case Iop_Max16Sx4:
            fn = (HWord)h_generic_calc_Max16Sx4; goto binnish;
         case Iop_Min8Ux8:
            fn = (HWord)h_generic_calc_Min8Ux8; goto binnish;
         case Iop_Min16Sx4:
            fn = (HWord)h_generic_calc_Min16Sx4; goto binnish;

         case Iop_Mul16x4:
            fn = (HWord)h_generic_calc_Mul16x4; goto binnish;
         case Iop_Mul32x2:
            fn = (HWord)h_generic_calc_Mul32x2; goto binnish;
         case Iop_MulHi16Sx4:
            fn = (HWord)h_generic_calc_MulHi16Sx4; goto binnish;
         case Iop_MulHi16Ux4:
            fn = (HWord)h_generic_calc_MulHi16Ux4; goto binnish;

         case Iop_QAdd8Sx8:
            fn = (HWord)h_generic_calc_QAdd8Sx8; goto binnish;
         case Iop_QAdd16Sx4:
            fn = (HWord)h_generic_calc_QAdd16Sx4; goto binnish;
         case Iop_QAdd8Ux8:
            fn = (HWord)h_generic_calc_QAdd8Ux8; goto binnish;
         case Iop_QAdd16Ux4:
            fn = (HWord)h_generic_calc_QAdd16Ux4; goto binnish;

         case Iop_QNarrow32Sx2:
            fn = (HWord)h_generic_calc_QNarrow32Sx2; goto binnish;
         case Iop_QNarrow16Sx4:
            fn = (HWord)h_generic_calc_QNarrow16Sx4; goto binnish;
         case Iop_QNarrow16Ux4:
            fn = (HWord)h_generic_calc_QNarrow16Ux4; goto binnish;

         case Iop_QSub8Sx8:
            fn = (HWord)h_generic_calc_QSub8Sx8; goto binnish;
         case Iop_QSub16Sx4:
            fn = (HWord)h_generic_calc_QSub16Sx4; goto binnish;
         case Iop_QSub8Ux8:
            fn = (HWord)h_generic_calc_QSub8Ux8; goto binnish;
         case Iop_QSub16Ux4:
            fn = (HWord)h_generic_calc_QSub16Ux4; goto binnish;

         case Iop_Sub8x8:
            fn = (HWord)h_generic_calc_Sub8x8; goto binnish;
         case Iop_Sub16x4:
            fn = (HWord)h_generic_calc_Sub16x4; goto binnish;
         case Iop_Sub32x2:
            fn = (HWord)h_generic_calc_Sub32x2; goto binnish;

         binnish: {
            /* Note: the following assumes all helpers are of
               signature
                  ULong fn ( ULong, ULong ), and they are
               not marked as regparm functions.
            */
            HReg xLo, xHi, yLo, yHi;
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
            addInstr(env, X86Instr_Push(X86RMI_Reg(yHi)));
            addInstr(env, X86Instr_Push(X86RMI_Reg(yLo)));
            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
            add_to_esp(env, 4*4);
            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }
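
         /* Call sketch, for a non-regparm ULong helper:
               pushl yHi ; pushl yLo      -- arg 2
               pushl xHi ; pushl xLo      -- arg 1
               call  fn                   -- result in %edx:%eax
               addl  $16, %esp
            which is exactly what the pushes and add_to_esp(env,4*4)
            above arrange. */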

         case Iop_ShlN32x2:
            fn = (HWord)h_generic_calc_ShlN32x2; goto shifty;
         case Iop_ShlN16x4:
            fn = (HWord)h_generic_calc_ShlN16x4; goto shifty;
         case Iop_ShlN8x8:
            fn = (HWord)h_generic_calc_ShlN8x8;  goto shifty;
         case Iop_ShrN32x2:
            fn = (HWord)h_generic_calc_ShrN32x2; goto shifty;
         case Iop_ShrN16x4:
            fn = (HWord)h_generic_calc_ShrN16x4; goto shifty;
         case Iop_SarN32x2:
            fn = (HWord)h_generic_calc_SarN32x2; goto shifty;
         case Iop_SarN16x4:
            fn = (HWord)h_generic_calc_SarN16x4; goto shifty;
         case Iop_SarN8x8:
            fn = (HWord)h_generic_calc_SarN8x8;  goto shifty;
         shifty: {
            /* Note: the following assumes all helpers are of
               signature
                  ULong fn ( ULong, UInt ), and they are
               not marked as regparm functions.
            */
            HReg xLo, xHi;
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            X86RMI* y = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
            addInstr(env, X86Instr_Push(y));
            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
            add_to_esp(env, 3*4);
            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         default:
            break;
      }
   } /* if (e->tag == Iex_Binop) */


   /* --------- UNARY ops --------- */
   if (e->tag == Iex_Unop) {
      switch (e->Iex.Unop.op) {

         /* 32Sto64(e) */
         case Iop_32Sto64: {
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,tHi));
            addInstr(env, mk_iMOVsd_RR(src,tLo));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tHi));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* 32Uto64(e) */
         case Iop_32Uto64: {
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,tLo));
            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* 16Uto64(e) */
         case Iop_16Uto64: {
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,tLo));
            addInstr(env, X86Instr_Alu32R(Xalu_AND,
                                          X86RMI_Imm(0xFFFF), tLo));
            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }
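
         /* All three widening cases above share one idea: the low
            half is the (possibly masked) source, and the high half is
            either a zero immediate (unsigned widening) or, for
            32Sto64, a copy of the source arithmetically shifted
            right by 31 to replicate the sign bit. */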

         /* V128{HI}to64 */
         case Iop_V128HIto64:
         case Iop_V128to64: {
            Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0;
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
            X86AMode* esp0  = X86AMode_IR(0,     hregX86_ESP());
            X86AMode* espLO = X86AMode_IR(off,   hregX86_ESP());
            X86AMode* espHI = X86AMode_IR(off+4, hregX86_ESP());
            sub_from_esp(env, 16);
            addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
            addInstr(env, X86Instr_Alu32R( Xalu_MOV,
                                           X86RMI_Mem(espLO), tLo ));
            addInstr(env, X86Instr_Alu32R( Xalu_MOV,
                                           X86RMI_Mem(espHI), tHi ));
            add_to_esp(env, 16);
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* could do better than this, but for now ... */
         case Iop_1Sto64: {
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Set32(cond,tLo));
            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, tLo));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tLo));
            addInstr(env, mk_iMOVsd_RR(tLo, tHi));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* Not64(e) */
         case Iop_Not64: {
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            HReg sHi, sLo;
            iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
            addInstr(env, X86Instr_Unary32(Xun_NOT,tHi));
            addInstr(env, X86Instr_Unary32(Xun_NOT,tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* Left64(e) */
         case Iop_Left64: {
            HReg yLo, yHi;
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            /* yHi:yLo = arg */
            iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
            /* tLo = 0 - yLo, and set carry */
            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tLo));
            addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
            /* tHi = 0 - yHi - carry */
            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
            addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
            /* So now we have tHi:tLo = -arg.  To finish off, or 'arg'
               back in, so as to give the final result
               tHi:tLo = arg | -arg. */
            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yLo), tLo));
            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yHi), tHi));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }
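
         /* Left64(x) is x | -x: every bit from the lowest set bit of
            x upwards ends up set.  E.g. (illustration only)
            x = 0x0000000000000100  ==>  0xFFFFFFFFFFFFFF00. */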

         /* --- patterns rooted at: CmpwNEZ64 --- */

         /* CmpwNEZ64(e) */
         case Iop_CmpwNEZ64: {

         DECLARE_PATTERN(p_CmpwNEZ64_Or64);
         DEFINE_PATTERN(p_CmpwNEZ64_Or64,
                        unop(Iop_CmpwNEZ64,binop(Iop_Or64,bind(0),bind(1))));
         if (matchIRExpr(&mi, p_CmpwNEZ64_Or64, e)) {
            /* CmpwNEZ64(Or64(x,y)) */
            HReg xHi,xLo,yHi,yLo;
            HReg xBoth = newVRegI(env);
            HReg merged = newVRegI(env);
            HReg tmp2 = newVRegI(env);

            iselInt64Expr(&xHi,&xLo, env, mi.bindee[0]);
            addInstr(env, mk_iMOVsd_RR(xHi,xBoth));
            addInstr(env, X86Instr_Alu32R(Xalu_OR,
                                          X86RMI_Reg(xLo),xBoth));

            iselInt64Expr(&yHi,&yLo, env, mi.bindee[1]);
            addInstr(env, mk_iMOVsd_RR(yHi,merged));
            addInstr(env, X86Instr_Alu32R(Xalu_OR,
                                          X86RMI_Reg(yLo),merged));
            addInstr(env, X86Instr_Alu32R(Xalu_OR,
                                          X86RMI_Reg(xBoth),merged));

            /* tmp2 = (merged | -merged) >>s 31 */
            addInstr(env, mk_iMOVsd_RR(merged,tmp2));
            addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
            addInstr(env, X86Instr_Alu32R(Xalu_OR,
                                          X86RMI_Reg(merged), tmp2));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
            *rHi = tmp2;
            *rLo = tmp2;
            return;
         } else {
            /* CmpwNEZ64(e) */
            HReg srcLo, srcHi;
            HReg tmp1  = newVRegI(env);
            HReg tmp2  = newVRegI(env);
            /* srcHi:srcLo = arg */
            iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg);
            /* tmp1 = srcHi | srcLo */
            addInstr(env, mk_iMOVsd_RR(srcHi,tmp1));
            addInstr(env, X86Instr_Alu32R(Xalu_OR,
                                          X86RMI_Reg(srcLo), tmp1));
            /* tmp2 = (tmp1 | -tmp1) >>s 31 */
            addInstr(env, mk_iMOVsd_RR(tmp1,tmp2));
            addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
            addInstr(env, X86Instr_Alu32R(Xalu_OR,
                                          X86RMI_Reg(tmp1), tmp2));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
            *rHi = tmp2;
            *rLo = tmp2;
            return;
         }
         }
   2637 
   2638          /* ReinterpF64asI64(e) */
   2639          /* Given an IEEE754 double, produce an I64 with the same bit
   2640             pattern. */
   2641          case Iop_ReinterpF64asI64: {
   2642             HReg rf   = iselDblExpr(env, e->Iex.Unop.arg);
   2643             HReg tLo  = newVRegI(env);
   2644             HReg tHi  = newVRegI(env);
   2645             X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   2646             X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
   2647             /* paranoia */
   2648             set_FPU_rounding_default(env);
   2649             /* subl $8, %esp */
   2650             sub_from_esp(env, 8);
   2651             /* gstD %rf, 0(%esp) */
   2652             addInstr(env,
   2653                      X86Instr_FpLdSt(False/*store*/, 8, rf, zero_esp));
   2654             /* movl 0(%esp), %tLo */
   2655             addInstr(env,
   2656                      X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
   2657             /* movl 4(%esp), %tHi */
   2658             addInstr(env,
   2659                      X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(four_esp), tHi));
   2660             /* addl $8, %esp */
   2661             add_to_esp(env, 8);
   2662             *rHi = tHi;
   2663             *rLo = tLo;
   2664             return;
   2665          }
   2666 
   2667          case Iop_CmpNEZ32x2:
   2668             fn = (HWord)h_generic_calc_CmpNEZ32x2; goto unish;
   2669          case Iop_CmpNEZ16x4:
   2670             fn = (HWord)h_generic_calc_CmpNEZ16x4; goto unish;
   2671          case Iop_CmpNEZ8x8:
   2672             fn = (HWord)h_generic_calc_CmpNEZ8x8; goto unish;
   2673          unish: {
   2674             /* Note: the following assumes all helpers have the
   2675                signature
   2676                   ULong fn ( ULong )
   2677                and are not marked as regparm functions.
   2678             */
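                    /* In effect this is a plain cdecl call; e.g. for
                       CmpNEZ32x2 the emitted code is roughly
                          pushl hi ; pushl lo ; call fn ; addl $8,%esp
                       with the 64-bit result coming back in %edx:%eax,
                       as the x86 calling convention requires. */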
   2679             HReg xLo, xHi;
   2680             HReg tLo = newVRegI(env);
   2681             HReg tHi = newVRegI(env);
   2682             iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
   2683             addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
   2684             addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
   2685             addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0/*no regparms*/ ));
   2686             add_to_esp(env, 2*4);
   2687             addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
   2688             addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
   2689             *rHi = tHi;
   2690             *rLo = tLo;
   2691             return;
   2692          }
   2693 
   2694          default:
   2695             break;
   2696       }
   2697    } /* if (e->tag == Iex_Unop) */
   2698 
   2699 
   2700    /* --------- CCALL --------- */
   2701    if (e->tag == Iex_CCall) {
   2702       HReg tLo = newVRegI(env);
   2703       HReg tHi = newVRegI(env);
   2704 
   2705       /* Marshal args, do the call, clear stack. */
   2706       doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
   2707 
   2708       addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
   2709       addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
   2710       *rHi = tHi;
   2711       *rLo = tLo;
   2712       return;
   2713    }
   2714 
   2715    ppIRExpr(e);
   2716    vpanic("iselInt64Expr");
   2717 }
   2718 
   2719 
   2720 /*---------------------------------------------------------*/
   2721 /*--- ISEL: Floating point expressions (32 bit)         ---*/
   2722 /*---------------------------------------------------------*/
   2723 
   2724 /* Nothing interesting here; really just wrappers for
   2725    64-bit stuff. */
   2726 
   2727 static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
   2728 {
   2729    HReg r = iselFltExpr_wrk( env, e );
   2730 #  if 0
   2731    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2732 #  endif
   2733    vassert(hregClass(r) == HRcFlt64); /* yes, really Flt64 */
   2734    vassert(hregIsVirtual(r));
   2735    return r;
   2736 }
   2737 
   2738 /* DO NOT CALL THIS DIRECTLY */
   2739 static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
   2740 {
   2741    IRType ty = typeOfIRExpr(env->type_env,e);
   2742    vassert(ty == Ity_F32);
   2743 
   2744    if (e->tag == Iex_RdTmp) {
   2745       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2746    }
   2747 
   2748    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2749       X86AMode* am;
   2750       HReg res = newVRegF(env);
   2751       vassert(e->Iex.Load.ty == Ity_F32);
   2752       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2753       addInstr(env, X86Instr_FpLdSt(True/*load*/, 4, res, am));
   2754       return res;
   2755    }
   2756 
   2757    if (e->tag == Iex_Binop
   2758        && e->Iex.Binop.op == Iop_F64toF32) {
   2759       /* Although the result is still held in a standard FPU register,
   2760          we need to round it to reflect the loss of accuracy/range
   2761          entailed in casting it to a 32-bit float. */
   2762       HReg dst = newVRegF(env);
   2763       HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
   2764       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2765       addInstr(env, X86Instr_Fp64to32(src,dst));
   2766       set_FPU_rounding_default( env );
   2767       return dst;
   2768    }
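           /* For instance, F64toF32(Irrm_NEAREST, d), for a d not exactly
              representable as an F32, must return the nearest F32; hence
              the explicit Fp64to32 step above, even though the result
              continues to live in an F64-capable register. */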
   2769 
   2770    if (e->tag == Iex_Get) {
   2771       X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
   2772                                   hregX86_EBP() );
   2773       HReg res = newVRegF(env);
   2774       addInstr(env, X86Instr_FpLdSt( True/*load*/, 4, res, am ));
   2775       return res;
   2776    }
   2777 
   2778    if (e->tag == Iex_Unop
   2779        && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
   2780       /* Given an I32, produce an IEEE754 float with the same bit
   2781          pattern. */
   2782       HReg    dst = newVRegF(env);
   2783       X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
   2784       /* paranoia */
   2785       addInstr(env, X86Instr_Push(rmi));
   2786       addInstr(env, X86Instr_FpLdSt(
   2787                        True/*load*/, 4, dst,
   2788                        X86AMode_IR(0, hregX86_ESP())));
   2789       add_to_esp(env, 4);
   2790       return dst;
   2791    }
   2792 
   2793    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
   2794       HReg rf  = iselFltExpr(env, e->Iex.Binop.arg2);
   2795       HReg dst = newVRegF(env);
   2796 
   2797       /* rf now holds the value to be rounded.  The first thing to do
   2798          is set the FPU's rounding mode accordingly. */
   2799 
   2800       /* Set host rounding mode */
   2801       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2802 
   2803       /* grndint %rf, %dst */
   2804       addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
   2805 
   2806       /* Restore default FPU rounding. */
   2807       set_FPU_rounding_default( env );
   2808 
   2809       return dst;
   2810    }
   2811 
   2812    ppIRExpr(e);
   2813    vpanic("iselFltExpr_wrk");
   2814 }
   2815 
   2816 
   2817 /*---------------------------------------------------------*/
   2818 /*--- ISEL: Floating point expressions (64 bit)         ---*/
   2819 /*---------------------------------------------------------*/
   2820 
   2821 /* Compute a 64-bit floating point value into a register, the identity
   2822    of which is returned.  As with iselIntExpr_R, the reg may be either
   2823    real or virtual; in any case it must not be changed by subsequent
   2824    code emitted by the caller.  */
   2825 
   2826 /* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
   2827 
   2828     Type                  S (1 bit)   E (11 bits)   F (52 bits)
   2829     ----                  ---------   -----------   -----------
   2830     signalling NaN        u           2047 (max)    .0uuuuu---u
   2831                                                     (with at least
   2832                                                      one 1 bit)
   2833     quiet NaN             u           2047 (max)    .1uuuuu---u
   2834 
   2835     negative infinity     1           2047 (max)    .000000---0
   2836 
   2837     positive infinity     0           2047 (max)    .000000---0
   2838 
   2839     negative zero         1           0             .000000---0
   2840 
   2841     positive zero         0           0             .000000---0
   2842 */
   2843 
   2844 static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
   2845 {
   2846    HReg r = iselDblExpr_wrk( env, e );
   2847 #  if 0
   2848    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2849 #  endif
   2850    vassert(hregClass(r) == HRcFlt64);
   2851    vassert(hregIsVirtual(r));
   2852    return r;
   2853 }
   2854 
   2855 /* DO NOT CALL THIS DIRECTLY */
   2856 static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
   2857 {
   2858    IRType ty = typeOfIRExpr(env->type_env,e);
   2859    vassert(e);
   2860    vassert(ty == Ity_F64);
   2861 
   2862    if (e->tag == Iex_RdTmp) {
   2863       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2864    }
   2865 
   2866    if (e->tag == Iex_Const) {
   2867       union { UInt u32x2[2]; ULong u64; Double f64; } u;
   2868       HReg freg = newVRegF(env);
   2869       vassert(sizeof(u) == 8);
   2870       vassert(sizeof(u.u64) == 8);
   2871       vassert(sizeof(u.f64) == 8);
   2872       vassert(sizeof(u.u32x2) == 8);
   2873 
   2874       if (e->Iex.Const.con->tag == Ico_F64) {
   2875          u.f64 = e->Iex.Const.con->Ico.F64;
   2876       }
   2877       else if (e->Iex.Const.con->tag == Ico_F64i) {
   2878          u.u64 = e->Iex.Const.con->Ico.F64i;
   2879       }
   2880       else
   2881          vpanic("iselDblExpr(x86): const");
   2882 
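              /* Little-endian layout: push the high word first so that
                 the low word ends up at 0(%esp).  E.g. the constant 1.0
                 is 0x3FF0000000000000, so this pushes 0x3FF00000 and
                 then 0x00000000. */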
   2883       addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[1])));
   2884       addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[0])));
   2885       addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, freg,
   2886                                     X86AMode_IR(0, hregX86_ESP())));
   2887       add_to_esp(env, 8);
   2888       return freg;
   2889    }
   2890 
   2891    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2892       X86AMode* am;
   2893       HReg res = newVRegF(env);
   2894       vassert(e->Iex.Load.ty == Ity_F64);
   2895       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2896       addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, res, am));
   2897       return res;
   2898    }
   2899 
   2900    if (e->tag == Iex_Get) {
   2901       X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
   2902                                   hregX86_EBP() );
   2903       HReg res = newVRegF(env);
   2904       addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
   2905       return res;
   2906    }
   2907 
   2908    if (e->tag == Iex_GetI) {
   2909       X86AMode* am
   2910          = genGuestArrayOffset(
   2911               env, e->Iex.GetI.descr,
   2912                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   2913       HReg res = newVRegF(env);
   2914       addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
   2915       return res;
   2916    }
   2917 
   2918    if (e->tag == Iex_Triop) {
   2919       X86FpOp fpop = Xfp_INVALID;
   2920       switch (e->Iex.Triop.op) {
   2921          case Iop_AddF64:    fpop = Xfp_ADD; break;
   2922          case Iop_SubF64:    fpop = Xfp_SUB; break;
   2923          case Iop_MulF64:    fpop = Xfp_MUL; break;
   2924          case Iop_DivF64:    fpop = Xfp_DIV; break;
   2925          case Iop_ScaleF64:  fpop = Xfp_SCALE; break;
   2926          case Iop_Yl2xF64:   fpop = Xfp_YL2X; break;
   2927          case Iop_Yl2xp1F64: fpop = Xfp_YL2XP1; break;
   2928          case Iop_AtanF64:   fpop = Xfp_ATAN; break;
   2929          case Iop_PRemF64:   fpop = Xfp_PREM; break;
   2930          case Iop_PRem1F64:  fpop = Xfp_PREM1; break;
   2931          default: break;
   2932       }
   2933       if (fpop != Xfp_INVALID) {
   2934          HReg res  = newVRegF(env);
   2935          HReg srcL = iselDblExpr(env, e->Iex.Triop.arg2);
   2936          HReg srcR = iselDblExpr(env, e->Iex.Triop.arg3);
   2937          /* XXXROUNDINGFIXME */
   2938          /* set roundingmode here */
   2939          addInstr(env, X86Instr_FpBinary(fpop,srcL,srcR,res));
   2940          if (fpop != Xfp_ADD && fpop != Xfp_SUB
   2941              && fpop != Xfp_MUL && fpop != Xfp_DIV)
   2942             roundToF64(env, res);
   2943          return res;
   2944       }
   2945    }
   2946 
   2947    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
   2948       HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
   2949       HReg dst = newVRegF(env);
   2950 
   2951       /* rf now holds the value to be rounded.  The first thing to do
   2952          is set the FPU's rounding mode accordingly. */
   2953 
   2954       /* Set host rounding mode */
   2955       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2956 
   2957       /* grndint %rf, %dst */
   2958       addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
   2959 
   2960       /* Restore default FPU rounding. */
   2961       set_FPU_rounding_default( env );
   2962 
   2963       return dst;
   2964    }
   2965 
   2966    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
   2967       HReg dst = newVRegF(env);
   2968       HReg rHi,rLo;
   2969       iselInt64Expr( &rHi, &rLo, env, e->Iex.Binop.arg2);
   2970       addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
   2971       addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
   2972 
   2973       /* Set host rounding mode */
   2974       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2975 
   2976       addInstr(env, X86Instr_FpLdStI(
   2977                        True/*load*/, 8, dst,
   2978                        X86AMode_IR(0, hregX86_ESP())));
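              /* FpLdStI with size 8 is an integer-load-and-convert (an
                 x87 fild of the 64-bit operand pushed above), so the
                 conversion honours the rounding mode just set. */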
   2979 
   2980       /* Restore default FPU rounding. */
   2981       set_FPU_rounding_default( env );
   2982 
   2983       add_to_esp(env, 8);
   2984       return dst;
   2985    }
   2986 
   2987    if (e->tag == Iex_Binop) {
   2988       X86FpOp fpop = Xfp_INVALID;
   2989       switch (e->Iex.Binop.op) {
   2990          case Iop_SinF64:  fpop = Xfp_SIN; break;
   2991          case Iop_CosF64:  fpop = Xfp_COS; break;
   2992          case Iop_TanF64:  fpop = Xfp_TAN; break;
   2993          case Iop_2xm1F64: fpop = Xfp_2XM1; break;
   2994          case Iop_SqrtF64: fpop = Xfp_SQRT; break;
   2995          default: break;
   2996       }
   2997       if (fpop != Xfp_INVALID) {
   2998          HReg res = newVRegF(env);
   2999          HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
   3000          /* XXXROUNDINGFIXME */
   3001          /* set roundingmode here */
   3002          addInstr(env, X86Instr_FpUnary(fpop,src,res));
   3003          if (fpop != Xfp_SQRT
   3004              && fpop != Xfp_NEG && fpop != Xfp_ABS)
   3005             roundToF64(env, res);
   3006          return res;
   3007       }
   3008    }
   3009 
   3010    if (e->tag == Iex_Unop) {
   3011       X86FpOp fpop = Xfp_INVALID;
   3012       switch (e->Iex.Unop.op) {
   3013          case Iop_NegF64:  fpop = Xfp_NEG; break;
   3014          case Iop_AbsF64:  fpop = Xfp_ABS; break;
   3015          default: break;
   3016       }
   3017       if (fpop != Xfp_INVALID) {
   3018          HReg res = newVRegF(env);
   3019          HReg src = iselDblExpr(env, e->Iex.Unop.arg);
   3020          addInstr(env, X86Instr_FpUnary(fpop,src,res));
   3021          if (fpop != Xfp_NEG && fpop != Xfp_ABS)
   3022             roundToF64(env, res);
   3023          return res;
   3024       }
   3025    }
   3026 
   3027    if (e->tag == Iex_Unop) {
   3028       switch (e->Iex.Unop.op) {
   3029          case Iop_I32StoF64: {
   3030             HReg dst = newVRegF(env);
   3031             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
   3032             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
   3033             set_FPU_rounding_default(env);
   3034             addInstr(env, X86Instr_FpLdStI(
   3035                              True/*load*/, 4, dst,
   3036                              X86AMode_IR(0, hregX86_ESP())));
   3037             add_to_esp(env, 4);
   3038             return dst;
   3039          }
   3040          case Iop_ReinterpI64asF64: {
   3041             /* Given an I64, produce an IEEE754 double with the same
   3042                bit pattern. */
   3043             HReg dst = newVRegF(env);
   3044             HReg rHi, rLo;
   3045             iselInt64Expr( &rHi, &rLo, env, e->Iex.Unop.arg);
   3046             /* paranoia */
   3047             set_FPU_rounding_default(env);
   3048             addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
   3049             addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
   3050             addInstr(env, X86Instr_FpLdSt(
   3051                              True/*load*/, 8, dst,
   3052                              X86AMode_IR(0, hregX86_ESP())));
   3053             add_to_esp(env, 8);
   3054             return dst;
   3055          }
   3056          case Iop_F32toF64: {
   3057             /* this is a no-op */
   3058             HReg res = iselFltExpr(env, e->Iex.Unop.arg);
   3059             return res;
   3060          }
   3061          default:
   3062             break;
   3063       }
   3064    }
   3065 
   3066    /* --------- MULTIPLEX --------- */
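           /* Mux0X yields expr0 when the low 8 bits of cond are zero,
              and exprX otherwise.  Sketch of the lowering below:
                 dst := exprX ; testl $0xFF,cond ; if (Z) dst := expr0
           */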
   3067    if (e->tag == Iex_Mux0X) {
   3068       if (ty == Ity_F64
   3069           && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
   3070          X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
   3071          HReg rX  = iselDblExpr(env, e->Iex.Mux0X.exprX);
   3072          HReg r0  = iselDblExpr(env, e->Iex.Mux0X.expr0);
   3073          HReg dst = newVRegF(env);
   3074          addInstr(env, X86Instr_FpUnary(Xfp_MOV,rX,dst));
   3075          addInstr(env, X86Instr_Test32(0xFF, r8));
   3076          addInstr(env, X86Instr_FpCMov(Xcc_Z,r0,dst));
   3077          return dst;
   3078       }
   3079    }
   3080 
   3081    ppIRExpr(e);
   3082    vpanic("iselDblExpr_wrk");
   3083 }
   3084 
   3085 
   3086 /*---------------------------------------------------------*/
   3087 /*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
   3088 /*---------------------------------------------------------*/
   3089 
   3090 static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
   3091 {
   3092    HReg r = iselVecExpr_wrk( env, e );
   3093 #  if 0
   3094    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   3095 #  endif
   3096    vassert(hregClass(r) == HRcVec128);
   3097    vassert(hregIsVirtual(r));
   3098    return r;
   3099 }
   3100 
   3101 
   3102 /* DO NOT CALL THIS DIRECTLY */
   3103 static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
   3104 {
   3105 
   3106 #  define REQUIRE_SSE1                                    \
   3107       do { if (env->hwcaps == 0/*baseline, no sse*/)      \
   3108               goto vec_fail;                              \
   3109       } while (0)
   3110 
   3111 #  define REQUIRE_SSE2                                    \
   3112       do { if (0 == (env->hwcaps & VEX_HWCAPS_X86_SSE2))  \
   3113               goto vec_fail;                              \
   3114       } while (0)
   3115 
   3116 #  define SSE2_OR_ABOVE                                   \
   3117        (env->hwcaps & VEX_HWCAPS_X86_SSE2)
   3118 
   3119    MatchInfo mi;
   3120    Bool      arg1isEReg = False;
   3121    X86SseOp  op = Xsse_INVALID;
   3122    IRType    ty = typeOfIRExpr(env->type_env,e);
   3123    vassert(e);
   3124    vassert(ty == Ity_V128);
   3125 
   3126    REQUIRE_SSE1;
   3127 
   3128    if (e->tag == Iex_RdTmp) {
   3129       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   3130    }
   3131 
   3132    if (e->tag == Iex_Get) {
   3133       HReg dst = newVRegV(env);
   3134       addInstr(env, X86Instr_SseLdSt(
   3135                        True/*load*/,
   3136                        dst,
   3137                        X86AMode_IR(e->Iex.Get.offset, hregX86_EBP())
   3138                     )
   3139               );
   3140       return dst;
   3141    }
   3142 
   3143    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   3144       HReg      dst = newVRegV(env);
   3145       X86AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
   3146       addInstr(env, X86Instr_SseLdSt( True/*load*/, dst, am ));
   3147       return dst;
   3148    }
   3149 
   3150    if (e->tag == Iex_Const) {
   3151       HReg dst = newVRegV(env);
   3152       vassert(e->Iex.Const.con->tag == Ico_V128);
   3153       addInstr(env, X86Instr_SseConst(e->Iex.Const.con->Ico.V128, dst));
   3154       return dst;
   3155    }
   3156 
   3157    if (e->tag == Iex_Unop) {
   3158 
   3159    if (SSE2_OR_ABOVE) {
   3160       /* 64UtoV128(LDle:I64(addr)) */
   3161       DECLARE_PATTERN(p_zwiden_load64);
   3162       DEFINE_PATTERN(p_zwiden_load64,
   3163                      unop(Iop_64UtoV128,
   3164                           IRExpr_Load(Iend_LE,Ity_I64,bind(0))));
   3165       if (matchIRExpr(&mi, p_zwiden_load64, e)) {
   3166          X86AMode* am = iselIntExpr_AMode(env, mi.bindee[0]);
   3167          HReg dst = newVRegV(env);
   3168          addInstr(env, X86Instr_SseLdzLO(8, dst, am));
   3169          return dst;
   3170       }
   3171    }
   3172 
   3173    switch (e->Iex.Unop.op) {
   3174 
   3175       case Iop_NotV128: {
   3176          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3177          return do_sse_Not128(env, arg);
   3178       }
   3179 
   3180       case Iop_CmpNEZ64x2: {
   3181          /* We can use SSE2 instructions for this. */
   3182          /* Ideally, we want to do a 64Ix2 comparison against zero of
   3183             the operand.  Problem is no such insn exists.  Solution
   3184             therefore is to do a 32Ix4 comparison instead, and bitwise-
   3185             negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
   3186             let the not'd result of this initial comparison be a:b:c:d.
   3187             What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
   3188             pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
   3189             giving the required result.
   3190 
   3191             The required selection sequence is 2,3,0,1, which
   3192             according to Intel's documentation means the pshufd
   3193             literal value is 0xB1, that is,
   3194             (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
   3195          */
   3196          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
   3197          HReg tmp  = newVRegV(env);
   3198          HReg dst  = newVRegV(env);
   3199          REQUIRE_SSE2;
   3200          addInstr(env, X86Instr_SseReRg(Xsse_XOR, tmp, tmp));
   3201          addInstr(env, X86Instr_SseReRg(Xsse_CMPEQ32, arg, tmp));
   3202          tmp = do_sse_Not128(env, tmp);
   3203          addInstr(env, X86Instr_SseShuf(0xB1, tmp, dst));
   3204          addInstr(env, X86Instr_SseReRg(Xsse_OR, tmp, dst));
   3205          return dst;
   3206       }
   3207 
   3208       case Iop_CmpNEZ32x4: {
   3209          /* Sigh, we have to generate lousy code since this has to
   3210             work on SSE1 hosts */
   3211          /* basically, the idea is: for each lane:
   3212                movl lane, %r ; negl %r   (now CF = lane==0 ? 0 : 1)
   3213                sbbl %r, %r               (now %r = 1Sto32(CF))
   3214                movl %r, lane
   3215          */
   3216          Int       i;
   3217          X86AMode* am;
   3218          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3219          HReg      arg  = iselVecExpr(env, e->Iex.Unop.arg);
   3220          HReg      dst  = newVRegV(env);
   3221          HReg      r32  = newVRegI(env);
   3222          sub_from_esp(env, 16);
   3223          addInstr(env, X86Instr_SseLdSt(False/*store*/, arg, esp0));
   3224          for (i = 0; i < 4; i++) {
   3225             am = X86AMode_IR(i*4, hregX86_ESP());
   3226             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), r32));
   3227             addInstr(env, X86Instr_Unary32(Xun_NEG, r32));
   3228             addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(r32), r32));
   3229             addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r32), am));
   3230          }
   3231          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
   3232          add_to_esp(env, 16);
   3233          return dst;
   3234       }
   3235 
   3236       case Iop_CmpNEZ8x16:
   3237       case Iop_CmpNEZ16x8: {
   3238          /* We can use SSE2 instructions for this. */
   3239          HReg arg;
   3240          HReg vec0 = newVRegV(env);
   3241          HReg vec1 = newVRegV(env);
   3242          HReg dst  = newVRegV(env);
   3243          X86SseOp cmpOp
   3244             = e->Iex.Unop.op==Iop_CmpNEZ16x8 ? Xsse_CMPEQ16
   3245                                              : Xsse_CMPEQ8;
   3246          REQUIRE_SSE2;
   3247          addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec0, vec0));
   3248          addInstr(env, mk_vMOVsd_RR(vec0, vec1));
   3249          addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, vec1, vec1));
   3250          /* defer arg computation to here so as to give CMPEQF as long
   3251             as possible to complete */
   3252          arg = iselVecExpr(env, e->Iex.Unop.arg);
   3253          /* vec0 is all 0s; vec1 is all 1s */
   3254          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3255          /* 16x8 or 8x16 comparison == */
   3256          addInstr(env, X86Instr_SseReRg(cmpOp, vec0, dst));
   3257          /* invert result */
   3258          addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec1, dst));
   3259          return dst;
   3260       }
   3261 
   3262       case Iop_Recip32Fx4: op = Xsse_RCPF;   goto do_32Fx4_unary;
   3263       case Iop_RSqrt32Fx4: op = Xsse_RSQRTF; goto do_32Fx4_unary;
   3264       case Iop_Sqrt32Fx4:  op = Xsse_SQRTF;  goto do_32Fx4_unary;
   3265       do_32Fx4_unary:
   3266       {
   3267          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3268          HReg dst = newVRegV(env);
   3269          addInstr(env, X86Instr_Sse32Fx4(op, arg, dst));
   3270          return dst;
   3271       }
   3272 
   3273       case Iop_Recip64Fx2: op = Xsse_RCPF;   goto do_64Fx2_unary;
   3274       case Iop_RSqrt64Fx2: op = Xsse_RSQRTF; goto do_64Fx2_unary;
   3275       case Iop_Sqrt64Fx2:  op = Xsse_SQRTF;  goto do_64Fx2_unary;
   3276       do_64Fx2_unary:
   3277       {
   3278          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3279          HReg dst = newVRegV(env);
   3280          REQUIRE_SSE2;
   3281          addInstr(env, X86Instr_Sse64Fx2(op, arg, dst));
   3282          return dst;
   3283       }
   3284 
   3285       case Iop_Recip32F0x4: op = Xsse_RCPF;   goto do_32F0x4_unary;
   3286       case Iop_RSqrt32F0x4: op = Xsse_RSQRTF; goto do_32F0x4_unary;
   3287       case Iop_Sqrt32F0x4:  op = Xsse_SQRTF;  goto do_32F0x4_unary;
   3288       do_32F0x4_unary:
   3289       {
   3290          /* A bit subtle.  We have to copy the arg to the result
   3291             register first, because actually doing the SSE scalar insn
   3292             leaves the upper 3/4 of the destination register
   3293             unchanged.  Whereas the required semantics of these
   3294             primops is that the upper 3/4 is simply copied in from the
   3295             argument. */
   3296          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3297          HReg dst = newVRegV(env);
   3298          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3299          addInstr(env, X86Instr_Sse32FLo(op, arg, dst));
   3300          return dst;
   3301       }
   3302 
   3303       case Iop_Recip64F0x2: op = Xsse_RCPF;   goto do_64F0x2_unary;
   3304       case Iop_RSqrt64F0x2: op = Xsse_RSQRTF; goto do_64F0x2_unary;
   3305       case Iop_Sqrt64F0x2:  op = Xsse_SQRTF;  goto do_64F0x2_unary;
   3306       do_64F0x2_unary:
   3307       {
   3308          /* A bit subtle.  We have to copy the arg to the result
   3309             register first, because actually doing the SSE scalar insn
   3310             leaves the upper half of the destination register
   3311             unchanged.  Whereas the required semantics of these
   3312             primops is that the upper half is simply copied in from the
   3313             argument. */
   3314          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3315          HReg dst = newVRegV(env);
   3316          REQUIRE_SSE2;
   3317          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3318          addInstr(env, X86Instr_Sse64FLo(op, arg, dst));
   3319          return dst;
   3320       }
   3321 
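              /* For the two widening cases below: SseLdzLO is a
                 movss/movsd-style load which zeroes the lanes it does
                 not write, so pushing the scalar and reloading it from
                 0(%esp) yields the zero-extended V128 directly. */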
   3322       case Iop_32UtoV128: {
   3323          HReg      dst  = newVRegV(env);
   3324          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3325          X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
   3326          addInstr(env, X86Instr_Push(rmi));
   3327          addInstr(env, X86Instr_SseLdzLO(4, dst, esp0));
   3328          add_to_esp(env, 4);
   3329          return dst;
   3330       }
   3331 
   3332       case Iop_64UtoV128: {
   3333          HReg      rHi, rLo;
   3334          HReg      dst  = newVRegV(env);
   3335          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3336          iselInt64Expr(&rHi, &rLo, env, e->Iex.Unop.arg);
   3337          addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
   3338          addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
   3339          addInstr(env, X86Instr_SseLdzLO(8, dst, esp0));
   3340          add_to_esp(env, 8);
   3341          return dst;
   3342       }
   3343 
   3344       default:
   3345          break;
   3346    } /* switch (e->Iex.Unop.op) */
   3347    } /* if (e->tag == Iex_Unop) */
   3348 
   3349    if (e->tag == Iex_Binop) {
   3350    switch (e->Iex.Binop.op) {
   3351 
   3352       case Iop_SetV128lo32: {
   3353          HReg dst = newVRegV(env);
   3354          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3355          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3356          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3357          sub_from_esp(env, 16);
   3358          addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
   3359          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcI), esp0));
   3360          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
   3361          add_to_esp(env, 16);
   3362          return dst;
   3363       }
   3364 
   3365       case Iop_SetV128lo64: {
   3366          HReg dst = newVRegV(env);
   3367          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3368          HReg srcIhi, srcIlo;
   3369          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3370          X86AMode* esp4 = advance4(esp0);
   3371          iselInt64Expr(&srcIhi, &srcIlo, env, e->Iex.Binop.arg2);
   3372          sub_from_esp(env, 16);
   3373          addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
   3374          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIlo), esp0));
   3375          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIhi), esp4));
   3376          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
   3377          add_to_esp(env, 16);
   3378          return dst;
   3379       }
   3380 
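              /* 64HLtoV128(hi64,lo64): the stack image built below is,
                 from low to high address,
                    0(%esp)=lo64.lo  4(%esp)=lo64.hi
                    8(%esp)=hi64.lo  12(%esp)=hi64.hi
                 so arg1 forms the more significant half of the vector. */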
   3381       case Iop_64HLtoV128: {
   3382          HReg r3, r2, r1, r0;
   3383          X86AMode* esp0  = X86AMode_IR(0, hregX86_ESP());
   3384          X86AMode* esp4  = advance4(esp0);
   3385          X86AMode* esp8  = advance4(esp4);
   3386          X86AMode* esp12 = advance4(esp8);
   3387          HReg dst = newVRegV(env);
   3388          /* do this via the stack (easy, convenient, etc) */
   3389          sub_from_esp(env, 16);
   3390          /* Do the less significant 64 bits */
   3391          iselInt64Expr(&r1, &r0, env, e->Iex.Binop.arg2);
   3392          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r0), esp0));
   3393          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r1), esp4));
   3394          /* Do the more significant 64 bits */
   3395          iselInt64Expr(&r3, &r2, env, e->Iex.Binop.arg1);
   3396          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r2), esp8));
   3397          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r3), esp12));
   3398          /* Fetch result back from stack. */
   3399          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
   3400          add_to_esp(env, 16);
   3401          return dst;
   3402       }
   3403 
   3404       case Iop_CmpEQ32Fx4: op = Xsse_CMPEQF; goto do_32Fx4;
   3405       case Iop_CmpLT32Fx4: op = Xsse_CMPLTF; goto do_32Fx4;
   3406       case Iop_CmpLE32Fx4: op = Xsse_CMPLEF; goto do_32Fx4;
   3407       case Iop_CmpUN32Fx4: op = Xsse_CMPUNF; goto do_32Fx4;
   3408       case Iop_Add32Fx4:   op = Xsse_ADDF;   goto do_32Fx4;
   3409       case Iop_Div32Fx4:   op = Xsse_DIVF;   goto do_32Fx4;
   3410       case Iop_Max32Fx4:   op = Xsse_MAXF;   goto do_32Fx4;
   3411       case Iop_Min32Fx4:   op = Xsse_MINF;   goto do_32Fx4;
   3412       case Iop_Mul32Fx4:   op = Xsse_MULF;   goto do_32Fx4;
   3413       case Iop_Sub32Fx4:   op = Xsse_SUBF;   goto do_32Fx4;
   3414       do_32Fx4:
   3415       {
   3416          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3417          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3418          HReg dst = newVRegV(env);
   3419          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3420          addInstr(env, X86Instr_Sse32Fx4(op, argR, dst));
   3421          return dst;
   3422       }
   3423 
   3424       case Iop_CmpEQ64Fx2: op = Xsse_CMPEQF; goto do_64Fx2;
   3425       case Iop_CmpLT64Fx2: op = Xsse_CMPLTF; goto do_64Fx2;
   3426       case Iop_CmpLE64Fx2: op = Xsse_CMPLEF; goto do_64Fx2;
   3427       case Iop_CmpUN64Fx2: op = Xsse_CMPUNF; goto do_64Fx2;
   3428       case Iop_Add64Fx2:   op = Xsse_ADDF;   goto do_64Fx2;
   3429       case Iop_Div64Fx2:   op = Xsse_DIVF;   goto do_64Fx2;
   3430       case Iop_Max64Fx2:   op = Xsse_MAXF;   goto do_64Fx2;
   3431       case Iop_Min64Fx2:   op = Xsse_MINF;   goto do_64Fx2;
   3432       case Iop_Mul64Fx2:   op = Xsse_MULF;   goto do_64Fx2;
   3433       case Iop_Sub64Fx2:   op = Xsse_SUBF;   goto do_64Fx2;
   3434       do_64Fx2:
   3435       {
   3436          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3437          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3438          HReg dst = newVRegV(env);
   3439          REQUIRE_SSE2;
   3440          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3441          addInstr(env, X86Instr_Sse64Fx2(op, argR, dst));
   3442          return dst;
   3443       }
   3444 
   3445       case Iop_CmpEQ32F0x4: op = Xsse_CMPEQF; goto do_32F0x4;
   3446       case Iop_CmpLT32F0x4: op = Xsse_CMPLTF; goto do_32F0x4;
   3447       case Iop_CmpLE32F0x4: op = Xsse_CMPLEF; goto do_32F0x4;
   3448       case Iop_CmpUN32F0x4: op = Xsse_CMPUNF; goto do_32F0x4;
   3449       case Iop_Add32F0x4:   op = Xsse_ADDF;   goto do_32F0x4;
   3450       case Iop_Div32F0x4:   op = Xsse_DIVF;   goto do_32F0x4;
   3451       case Iop_Max32F0x4:   op = Xsse_MAXF;   goto do_32F0x4;
   3452       case Iop_Min32F0x4:   op = Xsse_MINF;   goto do_32F0x4;
   3453       case Iop_Mul32F0x4:   op = Xsse_MULF;   goto do_32F0x4;
   3454       case Iop_Sub32F0x4:   op = Xsse_SUBF;   goto do_32F0x4;
   3455       do_32F0x4: {
   3456          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3457          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3458          HReg dst = newVRegV(env);
   3459          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3460          addInstr(env, X86Instr_Sse32FLo(op, argR, dst));
   3461          return dst;
   3462       }
   3463 
   3464       case Iop_CmpEQ64F0x2: op = Xsse_CMPEQF; goto do_64F0x2;
   3465       case Iop_CmpLT64F0x2: op = Xsse_CMPLTF; goto do_64F0x2;
   3466       case Iop_CmpLE64F0x2: op = Xsse_CMPLEF; goto do_64F0x2;
   3467       case Iop_CmpUN64F0x2: op = Xsse_CMPUNF; goto do_64F0x2;
   3468       case Iop_Add64F0x2:   op = Xsse_ADDF;   goto do_64F0x2;
   3469       case Iop_Div64F0x2:   op = Xsse_DIVF;   goto do_64F0x2;
   3470       case Iop_Max64F0x2:   op = Xsse_MAXF;   goto do_64F0x2;
   3471       case Iop_Min64F0x2:   op = Xsse_MINF;   goto do_64F0x2;
   3472       case Iop_Mul64F0x2:   op = Xsse_MULF;   goto do_64F0x2;
   3473       case Iop_Sub64F0x2:   op = Xsse_SUBF;   goto do_64F0x2;
   3474       do_64F0x2: {
   3475          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3476          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3477          HReg dst = newVRegV(env);
   3478          REQUIRE_SSE2;
   3479          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3480          addInstr(env, X86Instr_Sse64FLo(op, argR, dst));
   3481          return dst;
   3482       }
   3483 
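              /* The pack/unpack ops below are non-commutative, and the
                 underlying insns compute dst = op(dst, E-operand).
                 Setting arg1isEReg makes do_SseReRg copy arg2 into dst
                 and route arg1 to the E position, giving the operand
                 order the IR semantics require. */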
   3484       case Iop_QNarrow32Sx4:
   3485          op = Xsse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
   3486       case Iop_QNarrow16Sx8:
   3487          op = Xsse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
   3488       case Iop_QNarrow16Ux8:
   3489          op = Xsse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
   3490 
   3491       case Iop_InterleaveHI8x16:
   3492          op = Xsse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
   3493       case Iop_InterleaveHI16x8:
   3494          op = Xsse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
   3495       case Iop_InterleaveHI32x4:
   3496          op = Xsse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
   3497       case Iop_InterleaveHI64x2:
   3498          op = Xsse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
   3499 
   3500       case Iop_InterleaveLO8x16:
   3501          op = Xsse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
   3502       case Iop_InterleaveLO16x8:
   3503          op = Xsse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
   3504       case Iop_InterleaveLO32x4:
   3505          op = Xsse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
   3506       case Iop_InterleaveLO64x2:
   3507          op = Xsse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
   3508 
   3509       case Iop_AndV128:    op = Xsse_AND;      goto do_SseReRg;
   3510       case Iop_OrV128:     op = Xsse_OR;       goto do_SseReRg;
   3511       case Iop_XorV128:    op = Xsse_XOR;      goto do_SseReRg;
   3512       case Iop_Add8x16:    op = Xsse_ADD8;     goto do_SseReRg;
   3513       case Iop_Add16x8:    op = Xsse_ADD16;    goto do_SseReRg;
   3514       case Iop_Add32x4:    op = Xsse_ADD32;    goto do_SseReRg;
   3515       case Iop_Add64x2:    op = Xsse_ADD64;    goto do_SseReRg;
   3516       case Iop_QAdd8Sx16:  op = Xsse_QADD8S;   goto do_SseReRg;
   3517       case Iop_QAdd16Sx8:  op = Xsse_QADD16S;  goto do_SseReRg;
   3518       case Iop_QAdd8Ux16:  op = Xsse_QADD8U;   goto do_SseReRg;
   3519       case Iop_QAdd16Ux8:  op = Xsse_QADD16U;  goto do_SseReRg;
   3520       case Iop_Avg8Ux16:   op = Xsse_AVG8U;    goto do_SseReRg;
   3521       case Iop_Avg16Ux8:   op = Xsse_AVG16U;   goto do_SseReRg;
   3522       case Iop_CmpEQ8x16:  op = Xsse_CMPEQ8;   goto do_SseReRg;
   3523       case Iop_CmpEQ16x8:  op = Xsse_CMPEQ16;  goto do_SseReRg;
   3524       case Iop_CmpEQ32x4:  op = Xsse_CMPEQ32;  goto do_SseReRg;
   3525       case Iop_CmpGT8Sx16: op = Xsse_CMPGT8S;  goto do_SseReRg;
   3526       case Iop_CmpGT16Sx8: op = Xsse_CMPGT16S; goto do_SseReRg;
   3527       case Iop_CmpGT32Sx4: op = Xsse_CMPGT32S; goto do_SseReRg;
   3528       case Iop_Max16Sx8:   op = Xsse_MAX16S;   goto do_SseReRg;
   3529       case Iop_Max8Ux16:   op = Xsse_MAX8U;    goto do_SseReRg;
   3530       case Iop_Min16Sx8:   op = Xsse_MIN16S;   goto do_SseReRg;
   3531       case Iop_Min8Ux16:   op = Xsse_MIN8U;    goto do_SseReRg;
   3532       case Iop_MulHi16Ux8: op = Xsse_MULHI16U; goto do_SseReRg;
   3533       case Iop_MulHi16Sx8: op = Xsse_MULHI16S; goto do_SseReRg;
   3534       case Iop_Mul16x8:    op = Xsse_MUL16;    goto do_SseReRg;
   3535       case Iop_Sub8x16:    op = Xsse_SUB8;     goto do_SseReRg;
   3536       case Iop_Sub16x8:    op = Xsse_SUB16;    goto do_SseReRg;
   3537       case Iop_Sub32x4:    op = Xsse_SUB32;    goto do_SseReRg;
   3538       case Iop_Sub64x2:    op = Xsse_SUB64;    goto do_SseReRg;
   3539       case Iop_QSub8Sx16:  op = Xsse_QSUB8S;   goto do_SseReRg;
   3540       case Iop_QSub16Sx8:  op = Xsse_QSUB16S;  goto do_SseReRg;
   3541       case Iop_QSub8Ux16:  op = Xsse_QSUB8U;   goto do_SseReRg;
   3542       case Iop_QSub16Ux8:  op = Xsse_QSUB16U;  goto do_SseReRg;
   3543       do_SseReRg: {
   3544          HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
   3545          HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
   3546          HReg dst = newVRegV(env);
   3547          if (op != Xsse_OR && op != Xsse_AND && op != Xsse_XOR)
   3548             REQUIRE_SSE2;
   3549          if (arg1isEReg) {
   3550             addInstr(env, mk_vMOVsd_RR(arg2, dst));
   3551             addInstr(env, X86Instr_SseReRg(op, arg1, dst));
   3552          } else {
   3553             addInstr(env, mk_vMOVsd_RR(arg1, dst));
   3554             addInstr(env, X86Instr_SseReRg(op, arg2, dst));
   3555          }
   3556          return dst;
   3557       }
   3558 
   3559       case Iop_ShlN16x8: op = Xsse_SHL16; goto do_SseShift;
   3560       case Iop_ShlN32x4: op = Xsse_SHL32; goto do_SseShift;
   3561       case Iop_ShlN64x2: op = Xsse_SHL64; goto do_SseShift;
   3562       case Iop_SarN16x8: op = Xsse_SAR16; goto do_SseShift;
   3563       case Iop_SarN32x4: op = Xsse_SAR32; goto do_SseShift;
   3564       case Iop_ShrN16x8: op = Xsse_SHR16; goto do_SseShift;
   3565       case Iop_ShrN32x4: op = Xsse_SHR32; goto do_SseShift;
   3566       case Iop_ShrN64x2: op = Xsse_SHR64; goto do_SseShift;
   3567       do_SseShift: {
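                 /* The SSE2 shift-by-amount insns take the count from
                    the low 64 bits of an xmm register, so build
                    0:0:0:count on the stack, load that into ereg, and
                    apply the shift to a copy of greg. */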
   3568          HReg      greg = iselVecExpr(env, e->Iex.Binop.arg1);
   3569          X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   3570          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3571          HReg      ereg = newVRegV(env);
   3572          HReg      dst  = newVRegV(env);
   3573          REQUIRE_SSE2;
   3574          addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
   3575          addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
   3576          addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
   3577          addInstr(env, X86Instr_Push(rmi));
   3578          addInstr(env, X86Instr_SseLdSt(True/*load*/, ereg, esp0));
   3579          addInstr(env, mk_vMOVsd_RR(greg, dst));
   3580          addInstr(env, X86Instr_SseReRg(op, ereg, dst));
   3581          add_to_esp(env, 16);
   3582          return dst;
   3583       }
   3584 
   3585       default:
   3586          break;
   3587    } /* switch (e->Iex.Binop.op) */
   3588    } /* if (e->tag == Iex_Binop) */
   3589 
   3590    if (e->tag == Iex_Mux0X) {
   3591       X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
   3592       HReg rX  = iselVecExpr(env, e->Iex.Mux0X.exprX);
   3593       HReg r0  = iselVecExpr(env, e->Iex.Mux0X.expr0);
   3594       HReg dst = newVRegV(env);
   3595       addInstr(env, mk_vMOVsd_RR(rX,dst));
   3596       addInstr(env, X86Instr_Test32(0xFF, r8));
   3597       addInstr(env, X86Instr_SseCMov(Xcc_Z,r0,dst));
   3598       return dst;
   3599    }
   3600 
   3601    vec_fail:
   3602    vex_printf("iselVecExpr (hwcaps = %s): can't reduce\n",
   3603               LibVEX_ppVexHwCaps(VexArchX86,env->hwcaps));
   3604    ppIRExpr(e);
   3605    vpanic("iselVecExpr_wrk");
   3606 
   3607 #  undef REQUIRE_SSE1
   3608 #  undef REQUIRE_SSE2
   3609 #  undef SSE2_OR_ABOVE
   3610 }
   3611 
   3612 
   3613 /*---------------------------------------------------------*/
   3614 /*--- ISEL: Statements                                  ---*/
   3615 /*---------------------------------------------------------*/
   3616 
   3617 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
   3618 {
   3619    if (vex_traceflags & VEX_TRACE_VCODE) {
   3620       vex_printf("\n-- ");
   3621       ppIRStmt(stmt);
   3622       vex_printf("\n");
   3623    }
   3624 
   3625    switch (stmt->tag) {
   3626 
   3627    /* --------- STORE --------- */
   3628    case Ist_Store: {
   3629       IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
   3630       IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
   3631       IREndness end   = stmt->Ist.Store.end;
   3632 
   3633       if (tya != Ity_I32 || end != Iend_LE)
   3634          goto stmt_fail;
   3635 
   3636       if (tyd == Ity_I32) {
   3637          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3638          X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
   3639          addInstr(env, X86Instr_Alu32M(Xalu_MOV,ri,am));
   3640          return;
   3641       }
   3642       if (tyd == Ity_I8 || tyd == Ity_I16) {
   3643          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3644          HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
   3645          addInstr(env, X86Instr_Store( toUChar(tyd==Ity_I8 ? 1 : 2),
   3646                                        r,am ));
   3647          return;
   3648       }
   3649       if (tyd == Ity_F64) {
   3650          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3651          HReg r = iselDblExpr(env, stmt->Ist.Store.data);
   3652          addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, r, am));
   3653          return;
   3654       }
   3655       if (tyd == Ity_F32) {
   3656          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3657          HReg r = iselFltExpr(env, stmt->Ist.Store.data);
   3658          addInstr(env, X86Instr_FpLdSt(False/*store*/, 4, r, am));
   3659          return;
   3660       }
   3661       if (tyd == Ity_I64) {
   3662          HReg vHi, vLo, rA;
   3663          iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Store.data);
   3664          rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
   3665          addInstr(env, X86Instr_Alu32M(
   3666                           Xalu_MOV, X86RI_Reg(vLo), X86AMode_IR(0, rA)));
   3667          addInstr(env, X86Instr_Alu32M(
   3668                           Xalu_MOV, X86RI_Reg(vHi), X86AMode_IR(4, rA)));
   3669          return;
   3670       }
   3671       if (tyd == Ity_V128) {
   3672          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3673          HReg r = iselVecExpr(env, stmt->Ist.Store.data);
   3674          addInstr(env, X86Instr_SseLdSt(False/*store*/, r, am));
   3675          return;
   3676       }
   3677       break;
   3678    }
   3679 
   3680    /* --------- PUT --------- */
   3681    case Ist_Put: {
   3682       IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
   3683       if (ty == Ity_I32) {
   3684          /* We're going to write to memory, so compute the RHS into an
   3685             X86RI. */
   3686          X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
   3687          addInstr(env,
   3688                   X86Instr_Alu32M(
   3689                      Xalu_MOV,
   3690                      ri,
   3691                      X86AMode_IR(stmt->Ist.Put.offset,hregX86_EBP())
   3692                  ));
   3693          return;
   3694       }
   3695       if (ty == Ity_I8 || ty == Ity_I16) {
   3696          HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
   3697          addInstr(env, X86Instr_Store(
   3698                           toUChar(ty==Ity_I8 ? 1 : 2),
   3699                           r,
   3700                           X86AMode_IR(stmt->Ist.Put.offset,
   3701                                       hregX86_EBP())));
   3702          return;
   3703       }
   3704       if (ty == Ity_I64) {
   3705          HReg vHi, vLo;
   3706          X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
   3707          X86AMode* am4 = advance4(am);
   3708          iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Put.data);
   3709          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vLo), am ));
   3710          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vHi), am4 ));
   3711          return;
   3712       }
   3713       if (ty == Ity_V128) {
   3714          HReg      vec = iselVecExpr(env, stmt->Ist.Put.data);
   3715          X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
   3716          addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, am));
   3717          return;
   3718       }
   3719       if (ty == Ity_F32) {
   3720          HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
   3721          X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
   3722          set_FPU_rounding_default(env); /* paranoia */
   3723          addInstr(env, X86Instr_FpLdSt( False/*store*/, 4, f32, am ));
   3724          return;
   3725       }
   3726       if (ty == Ity_F64) {
   3727          HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
   3728          X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
   3729          set_FPU_rounding_default(env); /* paranoia */
   3730          addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, f64, am ));
   3731          return;
   3732       }
   3733       break;
   3734    }
   3735 
   3736    /* --------- Indexed PUT --------- */
   3737    case Ist_PutI: {
   3738       X86AMode* am
   3739          = genGuestArrayOffset(
   3740               env, stmt->Ist.PutI.descr,
   3741                    stmt->Ist.PutI.ix, stmt->Ist.PutI.bias );
   3742 
   3743       IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.PutI.data);
   3744       if (ty == Ity_F64) {
   3745          HReg val = iselDblExpr(env, stmt->Ist.PutI.data);
   3746          addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, val, am ));
   3747          return;
   3748       }
   3749       if (ty == Ity_I8) {
   3750          HReg r = iselIntExpr_R(env, stmt->Ist.PutI.data);
   3751          addInstr(env, X86Instr_Store( 1, r, am ));
   3752          return;
   3753       }
   3754       if (ty == Ity_I32) {
   3755          HReg r = iselIntExpr_R(env, stmt->Ist.PutI.data);
   3756          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(r), am ));
   3757          return;
   3758       }
   3759       if (ty == Ity_I64) {
   3760          HReg rHi, rLo;
   3761          X86AMode* am4 = advance4(am);
   3762          iselInt64Expr(&rHi, &rLo, env, stmt->Ist.PutI.data);
   3763          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rLo), am ));
   3764          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rHi), am4 ));
   3765          return;
   3766       }
   3767       break;
   3768    }
   3769 
   3770    /* --------- TMP --------- */
   3771    case Ist_WrTmp: {
   3772       IRTemp tmp = stmt->Ist.WrTmp.tmp;
   3773       IRType ty = typeOfIRTemp(env->type_env, tmp);
   3774 
   3775       /* optimisation: if stmt->Ist.WrTmp.data is Add32(..,..),
   3776          compute it into an AMode and then use LEA.  This usually
   3777          produces fewer instructions, often because (for memcheck
   3778          created IR) we get t = address-expression, (t is later used
   3779          twice) and so doing this naturally turns address-expression
   3780          back into an X86 amode. */
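      /* For example, t = Add32(r, 4) can then be emitted as a single
         leal 4(%r), %t  rather than a move followed by an add. */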
   3781       if (ty == Ity_I32
   3782           && stmt->Ist.WrTmp.data->tag == Iex_Binop
   3783           && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add32) {
   3784          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
   3785          HReg dst = lookupIRTemp(env, tmp);
   3786          if (am->tag == Xam_IR && am->Xam.IR.imm == 0) {
   3787             /* Hmm, iselIntExpr_AMode wimped out and just computed the
   3788                value into a register.  Just emit a normal reg-reg move
   3789                so reg-alloc can coalesce it away in the usual way. */
   3790             HReg src = am->Xam.IR.reg;
   3791             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst));
   3792          } else {
   3793             addInstr(env, X86Instr_Lea32(am,dst));
   3794          }
   3795          return;
   3796       }
   3797 
   3798       if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
   3799          X86RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
   3800          HReg dst = lookupIRTemp(env, tmp);
   3801          addInstr(env, X86Instr_Alu32R(Xalu_MOV,rmi,dst));
   3802          return;
   3803       }
   3804       if (ty == Ity_I64) {
   3805          HReg rHi, rLo, dstHi, dstLo;
   3806          iselInt64Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
   3807          lookupIRTemp64( &dstHi, &dstLo, env, tmp);
   3808          addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
   3809          addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
   3810          return;
   3811       }
   3812       if (ty == Ity_I1) {
   3813          X86CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
   3814          HReg dst = lookupIRTemp(env, tmp);
   3815          addInstr(env, X86Instr_Set32(cond, dst));
   3816          return;
   3817       }
   3818       if (ty == Ity_F64) {
   3819          HReg dst = lookupIRTemp(env, tmp);
   3820          HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
   3821          addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
   3822          return;
   3823       }
   3824       if (ty == Ity_F32) {
   3825          HReg dst = lookupIRTemp(env, tmp);
   3826          HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
   3827          addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
   3828          return;
   3829       }
   3830       if (ty == Ity_V128) {
   3831          HReg dst = lookupIRTemp(env, tmp);
   3832          HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
   3833          addInstr(env, mk_vMOVsd_RR(src,dst));
   3834          return;
   3835       }
   3836       break;
   3837    }
   3838 
   3839    /* --------- Call to DIRTY helper --------- */
   3840    case Ist_Dirty: {
   3841       IRType   retty;
   3842       IRDirty* d = stmt->Ist.Dirty.details;
   3843       Bool     passBBP = False;
   3844 
   3845       if (d->nFxState == 0)
   3846          vassert(!d->needsBBP);
   3847 
   3848       passBBP = toBool(d->nFxState > 0 && d->needsBBP);
   3849 
   3850       /* Marshal args, do the call, clear stack. */
   3851       doHelperCall( env, passBBP, d->guard, d->cee, d->args );
   3852 
   3853       /* Now figure out what to do with the returned value, if any. */
   3854       if (d->tmp == IRTemp_INVALID)
   3855          /* No return value.  Nothing to do. */
   3856          return;
   3857 
   3858       retty = typeOfIRTemp(env->type_env, d->tmp);
   3859       if (retty == Ity_I64) {
   3860          HReg dstHi, dstLo;
   3861          /* The returned value is in %edx:%eax.  Park it in the
   3862             register-pair associated with tmp. */
   3863          lookupIRTemp64( &dstHi, &dstLo, env, d->tmp);
   3864          addInstr(env, mk_iMOVsd_RR(hregX86_EDX(),dstHi) );
   3865          addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dstLo) );
   3866          return;
   3867       }
   3868       if (retty == Ity_I32 || retty == Ity_I16 || retty == Ity_I8) {
   3869          /* The returned value is in %eax.  Park it in the register
   3870             associated with tmp. */
   3871          HReg dst = lookupIRTemp(env, d->tmp);
   3872          addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dst) );
   3873          return;
   3874       }
   3875       break;
   3876    }
   3877 
   3878    /* --------- MEM FENCE --------- */
   3879    case Ist_MBE:
   3880       switch (stmt->Ist.MBE.event) {
   3881          case Imbe_Fence:
   3882             addInstr(env, X86Instr_MFence(env->hwcaps));
   3883             return;
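                     /* hwcaps travels with the insn so the emitter can
                        pick a fence the host actually supports: mfence
                        where SSE2 is present, otherwise something like
                        a locked stack operation (lock addl $0,0(%esp))
                        on pre-SSE2 parts.  (A sketch; the exact choice
                        belongs to the emitter.) */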
   3884          default:
   3885             break;
   3886       }
   3887       break;
   3888 
   3889    /* --------- ACAS --------- */
   3890    case Ist_CAS:
   3891       if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
   3892          /* "normal" singleton CAS */
   3893          UChar  sz;
   3894          IRCAS* cas = stmt->Ist.CAS.details;
   3895          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
   3896          /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
   3897          X86AMode* am = iselIntExpr_AMode(env, cas->addr);
   3898          HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
   3899          HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
   3900          HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
   3901          vassert(cas->expdHi == NULL);
   3902          vassert(cas->dataHi == NULL);
   3903          addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
   3904          addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
   3905          addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
   3906          switch (ty) {
   3907             case Ity_I32: sz = 4; break;
   3908             case Ity_I16: sz = 2; break;
   3909             case Ity_I8:  sz = 1; break;
   3910             default: goto unhandled_cas;
   3911          }
   3912          addInstr(env, X86Instr_ACAS(am, sz));
   3913          addInstr(env,
   3914                   X86Instr_CMov32(Xcc_NZ,
   3915                                   X86RM_Reg(hregX86_EAX()), rOldLo));
   3916          return;
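                  /* In sketch form the whole 32-bit sequence is:
                        movl %expd, %old       -- assume success
                        movl %expd, %eax
                        movl %data, %ebx
                        lock cmpxchgl %ebx, <am>
                        cmovnz %eax, %old      -- failed: take what we saw
                     cmpxchg sets ZF on success, hence Xcc_NZ: only on
                     failure does %eax differ from the expected value. */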
   3917       } else {
   3918          /* double CAS */
   3919          IRCAS* cas = stmt->Ist.CAS.details;
   3920          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
    3921          /* only 2 x 32-bit halves allowed here: the double CAS maps onto cmpxchg8b */
   3922          /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
   3923          /* get: cas->expdHi into %edx, and cas->dataHi into %ecx */
   3924          X86AMode* am = iselIntExpr_AMode(env, cas->addr);
   3925          HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
   3926          HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
   3927          HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
   3928          HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
   3929          HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
   3930          HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
   3931          if (ty != Ity_I32)
   3932             goto unhandled_cas;
   3933          addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
   3934          addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
   3935          addInstr(env, mk_iMOVsd_RR(rExpdHi, hregX86_EDX()));
   3936          addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
   3937          addInstr(env, mk_iMOVsd_RR(rDataHi, hregX86_ECX()));
   3938          addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
   3939          addInstr(env, X86Instr_DACAS(am));
   3940          addInstr(env,
   3941                   X86Instr_CMov32(Xcc_NZ,
   3942                                   X86RM_Reg(hregX86_EDX()), rOldHi));
   3943          addInstr(env,
   3944                   X86Instr_CMov32(Xcc_NZ,
   3945                                   X86RM_Reg(hregX86_EAX()), rOldLo));
   3946          return;
   3947       }
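               /* The double case maps onto lock cmpxchg8b -- sketch:
                     movl %expdHi, %edx   ;  movl %expdLo, %eax
                     movl %dataHi, %ecx   ;  movl %dataLo, %ebx
                     lock cmpxchg8b <am>
                     cmovnz %edx, %oldHi  ;  cmovnz %eax, %oldLo
                  On failure cmpxchg8b leaves the value found in memory
                  in %edx:%eax, which is exactly what oldHi:oldLo must
                  receive. */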
   3948       unhandled_cas:
   3949       break;
   3950 
   3951    /* --------- INSTR MARK --------- */
   3952    /* Doesn't generate any executable code ... */
   3953    case Ist_IMark:
    3954       return;
   3955 
   3956    /* --------- NO-OP --------- */
   3957    /* Fairly self-explanatory, wouldn't you say? */
   3958    case Ist_NoOp:
    3959       return;
   3960 
   3961    /* --------- EXIT --------- */
   3962    case Ist_Exit: {
   3963       X86RI*      dst;
   3964       X86CondCode cc;
   3965       if (stmt->Ist.Exit.dst->tag != Ico_U32)
   3966          vpanic("isel_x86: Ist_Exit: dst is not a 32-bit value");
   3967       dst = iselIntExpr_RI(env, IRExpr_Const(stmt->Ist.Exit.dst));
   3968       cc  = iselCondCode(env,stmt->Ist.Exit.guard);
   3969       addInstr(env, X86Instr_Goto(stmt->Ist.Exit.jk, cc, dst));
   3970       return;
   3971    }
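            /* In sketch form a guarded side-exit is roughly:
                  j<!cond> over
                  movl $next-guest-EIP, %eax
                  <return to the dispatcher>
               over:
               with the dispatcher reading the next guest address out of
               %eax.  (A sketch; the precise handshake is the Goto
               emitter's business.) */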
   3972 
   3973    default: break;
   3974    }
   3975   stmt_fail:
   3976    ppIRStmt(stmt);
   3977    vpanic("iselStmt");
   3978 }
   3979 
   3980 
   3981 /*---------------------------------------------------------*/
   3982 /*--- ISEL: Basic block terminators (Nexts)             ---*/
   3983 /*---------------------------------------------------------*/
   3984 
   3985 static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
   3986 {
   3987    X86RI* ri;
   3988    if (vex_traceflags & VEX_TRACE_VCODE) {
   3989       vex_printf("\n-- goto {");
   3990       ppIRJumpKind(jk);
   3991       vex_printf("} ");
   3992       ppIRExpr(next);
   3993       vex_printf("\n");
   3994    }
   3995    ri = iselIntExpr_RI(env, next);
    3996    addInstr(env, X86Instr_Goto(jk, Xcc_ALWAYS, ri));
   3997 }
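         /* E.g. an IRSB ending "goto {Boring} 0x8048000:I32" arrives
            here as an unconditional jump -- in sketch form just
               movl $0x8048000, %eax
               <return to the dispatcher>
            since Xcc_ALWAYS suppresses the guard test. */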
   3998 
   3999 
   4000 /*---------------------------------------------------------*/
   4001 /*--- Insn selector top-level                           ---*/
   4002 /*---------------------------------------------------------*/
   4003 
   4004 /* Translate an entire SB to x86 code. */
   4005 
   4006 HInstrArray* iselSB_X86 ( IRSB* bb, VexArch      arch_host,
   4007                                     VexArchInfo* archinfo_host,
   4008                                     VexAbiInfo*  vbi/*UNUSED*/ )
   4009 {
   4010    Int      i, j;
   4011    HReg     hreg, hregHI;
   4012    ISelEnv* env;
   4013    UInt     hwcaps_host = archinfo_host->hwcaps;
   4014 
   4015    /* sanity ... */
   4016    vassert(arch_host == VexArchX86);
   4017    vassert(0 == (hwcaps_host
   4018                  & ~(VEX_HWCAPS_X86_SSE1
   4019                      | VEX_HWCAPS_X86_SSE2
   4020                      | VEX_HWCAPS_X86_SSE3
   4021                      | VEX_HWCAPS_X86_LZCNT)));
   4022 
   4023    /* Make up an initial environment to use. */
   4024    env = LibVEX_Alloc(sizeof(ISelEnv));
   4025    env->vreg_ctr = 0;
   4026 
   4027    /* Set up output code array. */
   4028    env->code = newHInstrArray();
   4029 
   4030    /* Copy BB's type env. */
   4031    env->type_env = bb->tyenv;
   4032 
   4033    /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
   4034       change as we go along. */
   4035    env->n_vregmap = bb->tyenv->types_used;
   4036    env->vregmap   = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   4037    env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   4038 
   4039    /* and finally ... */
   4040    env->hwcaps = hwcaps_host;
   4041 
   4042    /* For each IR temporary, allocate a suitably-kinded virtual
   4043       register. */
   4044    j = 0;
   4045    for (i = 0; i < env->n_vregmap; i++) {
   4046       hregHI = hreg = INVALID_HREG;
   4047       switch (bb->tyenv->types[i]) {
   4048          case Ity_I1:
   4049          case Ity_I8:
   4050          case Ity_I16:
   4051          case Ity_I32:  hreg   = mkHReg(j++, HRcInt32, True); break;
   4052          case Ity_I64:  hreg   = mkHReg(j++, HRcInt32, True);
   4053                         hregHI = mkHReg(j++, HRcInt32, True); break;
   4054          case Ity_F32:
   4055          case Ity_F64:  hreg   = mkHReg(j++, HRcFlt64, True); break;
   4056          case Ity_V128: hreg   = mkHReg(j++, HRcVec128, True); break;
   4057          default: ppIRType(bb->tyenv->types[i]);
   4058                   vpanic("iselBB: IRTemp type");
   4059       }
   4060       env->vregmap[i]   = hreg;
   4061       env->vregmapHI[i] = hregHI;
   4062    }
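            /* Example of the mapping (a sketch): a tyenv of
               [Ity_I32, Ity_I64, Ity_V128] comes out as
                  t0 -> int32 vreg v0
                  t1 -> int32 pair, lo = v1, hi = v2
                  t2 -> vec128 vreg v3
               with vregmapHI[i] left INVALID_HREG except for the
               Ity_I64 entry. */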
   4063    env->vreg_ctr = j;
   4064 
   4065    /* Ok, finally we can iterate over the statements. */
   4066    for (i = 0; i < bb->stmts_used; i++)
   4067       iselStmt(env,bb->stmts[i]);
   4068 
   4069    iselNext(env,bb->next,bb->jumpkind);
   4070 
   4071    /* record the number of vregs we used. */
   4072    env->code->n_vregs = env->vreg_ctr;
   4073    return env->code;
   4074 }
   4075 
   4076 
   4077 /*---------------------------------------------------------------*/
   4078 /*--- end                                     host_x86_isel.c ---*/
   4079 /*---------------------------------------------------------------*/
   4080