
/*---------------------------------------------------------------*/
/*--- begin                                   host_x86_isel.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2012 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "ir_match.h"
#include "main_util.h"
#include "main_globals.h"
#include "host_generic_regs.h"
#include "host_generic_simd64.h"
#include "host_generic_simd128.h"
#include "host_x86_defs.h"

/* TODO 21 Apr 2005:

   -- (Really an assembler issue) don't emit CMov32 as a cmov
      insn, since that's expensive on P4 and conditional branch
      is cheaper if (as we expect) the condition is highly predictable

   -- preserve xmm registers across function calls (by declaring them
      as trashed by call insns)

   -- preserve x87 ST stack discipline across function calls.  Sigh.

   -- Check doHelperCall: if a call is conditional, we cannot safely
      compute any regparm args directly to registers.  Hence, the
      fast-regparm marshalling should be restricted to unconditional
      calls only.
*/

/*---------------------------------------------------------*/
/*--- x87 control word stuff                            ---*/
/*---------------------------------------------------------*/

/* Vex-generated code expects to run with the FPU set as follows: all
   exceptions masked, round-to-nearest, precision = 53 bits.  This
   corresponds to a FPU control word value of 0x027F.

   Similarly the SSE control word (%mxcsr) should be 0x1F80.

   %fpucw and %mxcsr should have these values on entry to
   Vex-generated code, and those values should be unchanged
   at exit.
*/
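
/* For reference, 0x027F decodes as: bits 0..5 set (all six x87
   exceptions masked), PC (bits 8..9) = 10b selecting 53-bit
   precision, RC (bits 10..11) = 00b selecting round-to-nearest.
   Likewise 0x1F80 sets %mxcsr bits 7..12 (all SSE exceptions
   masked), with RC (bits 13..14) = 00b (round-to-nearest) and
   FZ/DAZ clear. */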

#define DEFAULT_FPUCW 0x027F

/* debugging only, do not use */
/* define DEFAULT_FPUCW 0x037F */


/*---------------------------------------------------------*/
/*--- misc helpers                                      ---*/
/*---------------------------------------------------------*/

/* These are duplicated in guest-x86/toIR.c */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* bind ( Int binder )
{
   return IRExpr_Binder(binder);
}

static Bool isZeroU8 ( IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U8
          && e->Iex.Const.con->Ico.U8 == 0;
}

static Bool isZeroU32 ( IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U32
          && e->Iex.Const.con->Ico.U32 == 0;
}

static Bool isZeroU64 ( IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U64
          && e->Iex.Const.con->Ico.U64 == 0ULL;
}


/*---------------------------------------------------------*/
/*--- ISelEnv                                           ---*/
/*---------------------------------------------------------*/

/* This carries around:

   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
     might encounter.  This is computed before insn selection starts,
     and does not change.

   - A mapping from IRTemp to HReg.  This tells the insn selector
     which virtual register(s) are associated with each IRTemp
     temporary.  This is computed before insn selection starts, and
     does not change.  We expect this mapping to map precisely the
     same set of IRTemps as the type mapping does.

        - vregmap   holds the primary register for the IRTemp.
        - vregmapHI is only used for 64-bit integer-typed
             IRTemps.  It holds the identity of a second
             32-bit virtual HReg, which holds the high half
             of the value.

   - The code array, that is, the insns selected so far.

   - A counter, for generating new virtual registers.

   - The host subarchitecture we are selecting insns for.
     This is set at the start and does not change.

   - A Bool for indicating whether we may generate chain-me
     instructions for control flow transfers, or whether we must use
     XAssisted.

   - The maximum guest address of any guest insn in this block.
     Actually, the address of the highest-addressed byte from any insn
     in this block.  Is set at the start and does not change.  This is
     used for detecting jumps which are definitely forward-edges from
     this block, and therefore can be made (chained) to the fast entry
     point of the destination, thereby avoiding the destination's
     event check.

   Note, this is all (well, mostly) host-independent.
*/

typedef
   struct {
      /* Constant -- set at the start and do not change. */
      IRTypeEnv*   type_env;

      HReg*        vregmap;
      HReg*        vregmapHI;
      Int          n_vregmap;

      UInt         hwcaps;

      Bool         chainingAllowed;
      Addr64       max_ga;

      /* These are modified as we go along. */
      HInstrArray* code;
      Int          vreg_ctr;
   }
   ISelEnv;


static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
{
   vassert(tmp >= 0);
   vassert(tmp < env->n_vregmap);
   return env->vregmap[tmp];
}

static void lookupIRTemp64 ( HReg* vrHI, HReg* vrLO, ISelEnv* env, IRTemp tmp )
{
   vassert(tmp >= 0);
   vassert(tmp < env->n_vregmap);
   vassert(env->vregmapHI[tmp] != INVALID_HREG);
   *vrLO = env->vregmap[tmp];
   *vrHI = env->vregmapHI[tmp];
}

static void addInstr ( ISelEnv* env, X86Instr* instr )
{
   addHInstr(env->code, instr);
   if (vex_traceflags & VEX_TRACE_VCODE) {
      ppX86Instr(instr, False);
      vex_printf("\n");
   }
}

static HReg newVRegI ( ISelEnv* env )
{
   HReg reg = mkHReg(env->vreg_ctr, HRcInt32, True/*virtual reg*/);
   env->vreg_ctr++;
   return reg;
}

static HReg newVRegF ( ISelEnv* env )
{
   HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/);
   env->vreg_ctr++;
   return reg;
}

static HReg newVRegV ( ISelEnv* env )
{
   HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
   env->vreg_ctr++;
   return reg;
}


/*---------------------------------------------------------*/
/*--- ISEL: Forward declarations                        ---*/
/*---------------------------------------------------------*/

/* These are organised as iselXXX and iselXXX_wrk pairs.  The
   iselXXX_wrk do the real work, but are not to be called directly.
   For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
   checks that all returned registers are virtual.  You should not
   call the _wrk version directly.
*/
static X86RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
static X86RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );

static X86RI*      iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e );
static X86RI*      iselIntExpr_RI     ( ISelEnv* env, IRExpr* e );

static X86RM*      iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e );
static X86RM*      iselIntExpr_RM     ( ISelEnv* env, IRExpr* e );

static HReg        iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselIntExpr_R     ( ISelEnv* env, IRExpr* e );

static X86AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
static X86AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );

static void        iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
                                       ISelEnv* env, IRExpr* e );
static void        iselInt64Expr     ( HReg* rHi, HReg* rLo,
                                       ISelEnv* env, IRExpr* e );

static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e );
static X86CondCode iselCondCode     ( ISelEnv* env, IRExpr* e );

static HReg        iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselDblExpr     ( ISelEnv* env, IRExpr* e );

static HReg        iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselFltExpr     ( ISelEnv* env, IRExpr* e );

static HReg        iselVecExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselVecExpr     ( ISelEnv* env, IRExpr* e );


/*---------------------------------------------------------*/
/*--- ISEL: Misc helpers                                ---*/
/*---------------------------------------------------------*/

/* Make an int reg-reg move. */

static X86Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcInt32);
   vassert(hregClass(dst) == HRcInt32);
   return X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst);
}


/* Make a vector reg-reg move. */

static X86Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcVec128);
   vassert(hregClass(dst) == HRcVec128);
   return X86Instr_SseReRg(Xsse_MOV, src, dst);
}

/* Advance/retreat %esp by n. */

static void add_to_esp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%4) == 0);
   addInstr(env,
            X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(n), hregX86_ESP()));
}

static void sub_from_esp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%4) == 0);
   addInstr(env,
            X86Instr_Alu32R(Xalu_SUB, X86RMI_Imm(n), hregX86_ESP()));
}


/* Given an amode, return one which references 4 bytes further
   along. */

static X86AMode* advance4 ( X86AMode* am )
{
   X86AMode* am4 = dopyX86AMode(am);
   switch (am4->tag) {
      case Xam_IRRS:
         am4->Xam.IRRS.imm += 4; break;
      case Xam_IR:
         am4->Xam.IR.imm += 4; break;
      default:
         vpanic("advance4(x86,host)");
   }
   return am4;
}


/* Push an arg onto the host stack, in preparation for a call to a
   helper function of some kind.  Returns the number of 32-bit words
   pushed. */

static Int pushArg ( ISelEnv* env, IRExpr* arg )
{
   IRType arg_ty = typeOfIRExpr(env->type_env, arg);
   if (arg_ty == Ity_I32) {
      addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg)));
      return 1;
   } else
   if (arg_ty == Ity_I64) {
      HReg rHi, rLo;
      iselInt64Expr(&rHi, &rLo, env, arg);
      addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
      addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
      return 2;
   }
   ppIRExpr(arg);
   vpanic("pushArg(x86): can't handle arg of this type");
}
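
/* Note the I64 push ordering above: pushing rHi first and rLo second
   leaves the low word at the lower address, which is the in-memory
   layout a little-endian callee expects for a 64-bit value. */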

/* Complete the call to a helper function, by calling the
   helper and clearing the args off the stack. */

static
void callHelperAndClearArgs ( ISelEnv* env, X86CondCode cc,
                              IRCallee* cee, Int n_arg_ws )
{
   /* Complication.  Need to decide which reg to use as the fn address
      pointer, in a way that doesn't trash regparm-passed
      parameters. */
   vassert(sizeof(void*) == 4);

   addInstr(env, X86Instr_Call( cc, toUInt(Ptr_to_ULong(cee->addr)),
                                    cee->regparms));
   if (n_arg_ws > 0)
      add_to_esp(env, 4*n_arg_ws);
}


/* Used only in doHelperCall.  See big comment in doHelperCall re
   handling of regparm args.  This function figures out whether
   evaluation of an expression might require use of a fixed register.
   If in doubt return True (safe but suboptimal).
*/
static
Bool mightRequireFixedRegs ( IRExpr* e )
{
   switch (e->tag) {
      case Iex_RdTmp: case Iex_Const: case Iex_Get:
         return False;
      default:
         return True;
   }
}


/* Do a complete function call.  guard is a Ity_Bit expression
   indicating whether or not the call happens.  If guard==NULL, the
   call is unconditional. */

static
void doHelperCall ( ISelEnv* env,
                    Bool passBBP,
                    IRExpr* guard, IRCallee* cee, IRExpr** args )
{
   X86CondCode cc;
   HReg        argregs[3];
   HReg        tmpregs[3];
   Bool        danger;
   Int         not_done_yet, n_args, n_arg_ws, stack_limit,
               i, argreg, argregX;

   /* Marshal args for a call, do the call, and clear the stack.
      Complexities to consider:

      * if passBBP is True, %ebp (the baseblock pointer) is to be
        passed as the first arg.

      * If the callee claims regparmness of 1, 2 or 3, we must pass the
        first 1, 2 or 3 args in registers (EAX, EDX, and ECX
        respectively).  To keep things relatively simple, only args of
        type I32 may be passed as regparms -- just bomb out if anything
        else turns up.  Clearly this depends on the front ends not
        trying to pass any other types as regparms.
   */

   /* 16 Nov 2004: the regparm handling is complicated by the
      following problem.

      Consider a call to a function with two regparm parameters:
      f(e1,e2).  We need to compute e1 into %eax and e2 into %edx.
      Suppose code is first generated to compute e1 into %eax.  Then,
      code is generated to compute e2 into %edx.  Unfortunately, if
      the latter code sequence uses %eax, it will trash the value of
      e1 computed by the former sequence.  This could happen if (for
      example) e2 itself involved a function call.  In the code below,
      args are evaluated right-to-left, not left-to-right, but the
      principle and the problem are the same.

      One solution is to compute all regparm-bound args into vregs
      first, and once they are all done, move them to the relevant
      real regs.  This always gives correct code, but it also gives
      a bunch of vreg-to-rreg moves which are usually redundant but
      are hard for the register allocator to get rid of.

      A compromise is to first examine all regparm'd argument
      expressions.  If they are all so simple that it is clear
      they will be evaluated without use of any fixed registers,
      use the old compute-directly-to-fixed-target scheme.  If not,
      be safe and use the via-vregs scheme.

      Note this requires being able to examine an expression and
      determine whether or not evaluation of it might use a fixed
      register.  That requires knowledge of how the rest of this
      insn selector works.  Currently just the following 3 are
      regarded as safe -- hopefully they cover the majority of
      arguments in practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
   */
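
   /* Illustration (hypothetical case): for a regparms==2 call
      f(e1,e2) where e2 is itself a Ccall, the "danger" path below
      first evaluates both args into fresh vregs t1 and t2, and only
      then emits
         movl t1,%eax ; movl t2,%edx
      whereas when both args are just RdTmp/Const/Get, each one is
      computed directly into %eax/%edx. */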
   vassert(cee->regparms >= 0 && cee->regparms <= 3);

   n_args = n_arg_ws = 0;
   while (args[n_args]) n_args++;

   not_done_yet = n_args;
   if (passBBP)
      not_done_yet++;

   stack_limit = cee->regparms;
   if (cee->regparms > 0 && passBBP) stack_limit--;

   /* ------ BEGIN marshall all arguments ------ */

   /* Push (R to L) the stack-passed args, [n_args-1 .. stack_limit] */
   for (i = n_args-1; i >= stack_limit; i--) {
      n_arg_ws += pushArg(env, args[i]);
      not_done_yet--;
   }

   /* args [stack_limit-1 .. 0] and possibly %ebp are to be passed in
      registers. */

   if (cee->regparms > 0) {

      /* ------ BEGIN deal with regparms ------ */

      /* deal with regparms, not forgetting %ebp if needed. */
      argregs[0] = hregX86_EAX();
      argregs[1] = hregX86_EDX();
      argregs[2] = hregX86_ECX();
      tmpregs[0] = tmpregs[1] = tmpregs[2] = INVALID_HREG;

      argreg = cee->regparms;

      /* In keeping with big comment above, detect potential danger
         and use the via-vregs scheme if needed. */
      danger = False;
      for (i = stack_limit-1; i >= 0; i--) {
         if (mightRequireFixedRegs(args[i])) {
            danger = True;
            break;
         }
      }

      if (danger) {

         /* Move via temporaries */
         argregX = argreg;
         for (i = stack_limit-1; i >= 0; i--) {

            if (0) {
               vex_printf("x86 host: register param is complex: ");
               ppIRExpr(args[i]);
               vex_printf("\n");
            }

            argreg--;
            vassert(argreg >= 0);
            vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32);
            tmpregs[argreg] = iselIntExpr_R(env, args[i]);
            not_done_yet--;
         }
         for (i = stack_limit-1; i >= 0; i--) {
            argregX--;
            vassert(argregX >= 0);
            addInstr( env, mk_iMOVsd_RR( tmpregs[argregX], argregs[argregX] ) );
         }

      } else {
         /* It's safe to compute all regparm args directly into their
            target registers. */
         for (i = stack_limit-1; i >= 0; i--) {
            argreg--;
            vassert(argreg >= 0);
            vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32);
            addInstr(env, X86Instr_Alu32R(Xalu_MOV,
                                          iselIntExpr_RMI(env, args[i]),
                                          argregs[argreg]));
            not_done_yet--;
         }

      }

      /* Not forgetting %ebp if needed. */
      if (passBBP) {
         vassert(argreg == 1);
         addInstr(env, mk_iMOVsd_RR( hregX86_EBP(), argregs[0]));
         not_done_yet--;
      }

      /* ------ END deal with regparms ------ */

   } else {

      /* No regparms.  Heave %ebp on the stack if needed. */
      if (passBBP) {
         addInstr(env, X86Instr_Push(X86RMI_Reg(hregX86_EBP())));
         n_arg_ws++;
         not_done_yet--;
      }

   }

   vassert(not_done_yet == 0);

   /* ------ END marshall all arguments ------ */

   /* Now we can compute the condition.  We can't do it earlier
      because the argument computations could trash the condition
      codes.  Be a bit clever to handle the common case where the
      guard is 1:Bit. */
   cc = Xcc_ALWAYS;
   if (guard) {
      if (guard->tag == Iex_Const
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional -- do nothing */
      } else {
         cc = iselCondCode( env, guard );
      }
   }

   /* call the helper, and get the args off the stack afterwards. */
   callHelperAndClearArgs( env, cc, cee, n_arg_ws );
}


/* Given a guest-state array descriptor, an index expression and a
   bias, generate an X86AMode holding the relevant guest state
   offset. */

static
X86AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
                                IRExpr* off, Int bias )
{
   HReg tmp, roff;
   Int  elemSz = sizeofIRType(descr->elemTy);
   Int  nElems = descr->nElems;
   Int  shift  = 0;

   /* throw out any cases not generated by an x86 front end.  In
      theory there might be a day where we need to handle them -- if
      we ever run non-x86-guest on x86 host. */

   if (nElems != 8)
      vpanic("genGuestArrayOffset(x86 host)(1)");

   switch (elemSz) {
      case 1:  shift = 0; break;
      case 4:  shift = 2; break;
      case 8:  shift = 3; break;
      default: vpanic("genGuestArrayOffset(x86 host)(2)");
   }

   /* Compute off into a reg, %off.  Then return:

         movl %off, %tmp
         addl $bias, %tmp  (if bias != 0)
         andl $7, %tmp
         ... base(%ebp, %tmp, shift) ...
   */
   tmp  = newVRegI(env);
   roff = iselIntExpr_R(env, off);
   addInstr(env, mk_iMOVsd_RR(roff, tmp));
   if (bias != 0) {
      addInstr(env,
               X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(bias), tmp));
   }
   addInstr(env,
            X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(7), tmp));
   return
      X86AMode_IRRS( descr->base, hregX86_EBP(), tmp, shift );
}
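
/* Worked example: with (say) elemTy == Ity_F64 and nElems == 8,
   elemSz is 8 so shift == 3; for an index computed into %tmp and a
   bias of 0, the result is the amode sketched above with shift == 3
   (scale 8), %tmp having been masked to 0..7 so the access stays
   inside the array. */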


/* Mess with the FPU's rounding mode: set to the default rounding mode
   (DEFAULT_FPUCW). */
static
void set_FPU_rounding_default ( ISelEnv* env )
{
   /* pushl $DEFAULT_FPUCW
      fldcw 0(%esp)
      addl $4, %esp
   */
   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   addInstr(env, X86Instr_Push(X86RMI_Imm(DEFAULT_FPUCW)));
   addInstr(env, X86Instr_FpLdCW(zero_esp));
   add_to_esp(env, 4);
}


/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
   expression denoting a value in the range 0 .. 3, indicating a round
   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
   the same rounding.
*/
static
void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   HReg rrm  = iselIntExpr_R(env, mode);
   HReg rrm2 = newVRegI(env);
   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());

   /* movl  %rrm, %rrm2
      andl  $3, %rrm2   -- shouldn't be needed; paranoia
      shll  $10, %rrm2
      orl   $DEFAULT_FPUCW, %rrm2
      pushl %rrm2
      fldcw 0(%esp)
      addl  $4, %esp
   */
   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
   addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(3), rrm2));
   addInstr(env, X86Instr_Sh32(Xsh_SHL, 10, rrm2));
   addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Imm(DEFAULT_FPUCW), rrm2));
   addInstr(env, X86Instr_Push(X86RMI_Reg(rrm2)));
   addInstr(env, X86Instr_FpLdCW(zero_esp));
   add_to_esp(env, 4);
}
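
/* For example, mode == 3 (Irrm_ZERO) gives (3 << 10) | 0x027F
   = 0x0E7F: the default control word with RC = 11b (truncate).
   This works because IRRoundingMode uses the same 2-bit encoding
   as the x87 RC field (00 nearest, 01 down, 10 up, 11 zero). */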


/* Generate !src into a new vector register, and be sure that the code
   is SSE1 compatible.  Amazing that Intel doesn't offer a less crappy
   way to do this.
*/
static HReg do_sse_Not128 ( ISelEnv* env, HReg src )
{
   HReg dst = newVRegV(env);
   /* Set dst to zero.  If dst contains a NaN then all hell might
      break loose after the comparison.  So, first zero it. */
   addInstr(env, X86Instr_SseReRg(Xsse_XOR, dst, dst));
   /* And now make it all 1s ... */
   addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, dst, dst));
   /* Finally, xor 'src' into it. */
   addInstr(env, X86Instr_SseReRg(Xsse_XOR, src, dst));
   /* Doesn't that just totally suck? */
   return dst;
}
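
/* Net effect: dst = (all ones) XOR src = ~src.  The all-ones value
   comes from comparing a (NaN-free) register equal against itself,
   which needs no memory operand and stays within SSE1. */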


/* Round an x87 FPU value to 53-bit-mantissa precision, to be used
   after most non-simple FPU operations (simple = +, -, *, / and
   sqrt).

   This could be done a lot more efficiently if needed, by loading
   zero and adding it to the value to be rounded (fldz ; faddp?).
*/
static void roundToF64 ( ISelEnv* env, HReg reg )
{
   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   sub_from_esp(env, 8);
   addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp));
   addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp));
   add_to_esp(env, 8);
}
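
/* The store/reload pair does the rounding: storing as an 8-byte
   double forces conversion to IEEE754 64-bit format, discarding the
   extra mantissa bits of the x87's 80-bit internal representation,
   and the reload brings the now-rounded value back. */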


/*---------------------------------------------------------*/
/*--- ISEL: Integer expressions (32/16/8 bit)           ---*/
/*---------------------------------------------------------*/

/* Select insns for an integer-typed expression, and add them to the
   code list.  Return a reg holding the result.  This reg will be a
   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
   want to modify it, ask for a new vreg, copy it in there, and modify
   the copy.  The register allocator will do its best to map both
   vregs to the same real register, so the copies will often disappear
   later in the game.

   This should handle expressions of 32, 16 and 8-bit type.  All
   results are returned in a 32-bit register.  For 16- and 8-bit
   expressions, the upper 16/24 bits are arbitrary, so you should mask
   or sign extend partial values if necessary.
*/

static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
{
   HReg r = iselIntExpr_R_wrk(env, e);
   /* sanity checks ... */
#  if 0
   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
#  endif
   vassert(hregClass(r) == HRcInt32);
   vassert(hregIsVirtual(r));
   return r;
}

/* DO NOT CALL THIS DIRECTLY ! */
static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
{
   MatchInfo mi;

   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);

   switch (e->tag) {

   /* --------- TEMP --------- */
   case Iex_RdTmp: {
      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   }

   /* --------- LOAD --------- */
   case Iex_Load: {
      HReg dst = newVRegI(env);
      X86AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );

      /* We can't handle big-endian loads, nor load-linked. */
      if (e->Iex.Load.end != Iend_LE)
         goto irreducible;

      if (ty == Ity_I32) {
         addInstr(env, X86Instr_Alu32R(Xalu_MOV,
                                       X86RMI_Mem(amode), dst) );
         return dst;
      }
      if (ty == Ity_I16) {
         addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
         return dst;
      }
      if (ty == Ity_I8) {
         addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
         return dst;
      }
      break;
   }

   /* --------- TERNARY OP --------- */
   case Iex_Triop: {
      IRTriop *triop = e->Iex.Triop.details;
      /* C3210 flags following FPU partial remainder (fprem), both
         IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
      if (triop->op == Iop_PRemC3210F64
          || triop->op == Iop_PRem1C3210F64) {
         HReg junk = newVRegF(env);
         HReg dst  = newVRegI(env);
         HReg srcL = iselDblExpr(env, triop->arg2);
         HReg srcR = iselDblExpr(env, triop->arg3);
         /* XXXROUNDINGFIXME */
         /* set roundingmode here */
         addInstr(env, X86Instr_FpBinary(
                           triop->op==Iop_PRemC3210F64
                              ? Xfp_PREM : Xfp_PREM1,
                           srcL,srcR,junk
                 ));
         /* The previous pseudo-insn will have left the FPU's C3210
            flags set correctly.  So bag them. */
         addInstr(env, X86Instr_FpStSW_AX());
         addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
         return dst;
      }

      break;
   }

   /* --------- BINARY OP --------- */
   case Iex_Binop: {
      X86AluOp   aluOp;
      X86ShiftOp shOp;

      /* Pattern: Sub32(0,x) */
      if (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1)) {
         HReg dst = newVRegI(env);
         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(reg,dst));
         addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
         return dst;
      }

      /* Is it an addition or logical style op? */
      switch (e->Iex.Binop.op) {
         case Iop_Add8: case Iop_Add16: case Iop_Add32:
            aluOp = Xalu_ADD; break;
         case Iop_Sub8: case Iop_Sub16: case Iop_Sub32:
            aluOp = Xalu_SUB; break;
         case Iop_And8: case Iop_And16: case Iop_And32:
            aluOp = Xalu_AND; break;
         case Iop_Or8: case Iop_Or16: case Iop_Or32:
            aluOp = Xalu_OR; break;
         case Iop_Xor8: case Iop_Xor16: case Iop_Xor32:
            aluOp = Xalu_XOR; break;
         case Iop_Mul16: case Iop_Mul32:
            aluOp = Xalu_MUL; break;
         default:
            aluOp = Xalu_INVALID; break;
      }
      /* For commutative ops we assume any literal
         values are on the second operand. */
      if (aluOp != Xalu_INVALID) {
         HReg dst    = newVRegI(env);
         HReg reg    = iselIntExpr_R(env, e->Iex.Binop.arg1);
         X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(reg,dst));
         addInstr(env, X86Instr_Alu32R(aluOp, rmi, dst));
         return dst;
      }
      /* Could do better here; forcing the first arg into a reg
         isn't always clever.
         -- t70 = Xor32(And32(Xor32(LDle:I32(Add32(t41,0xFFFFFFA0:I32)),
                        LDle:I32(Add32(t41,0xFFFFFFA4:I32))),LDle:I32(Add32(
                        t41,0xFFFFFFA8:I32))),LDle:I32(Add32(t41,0xFFFFFFA0:I32)))
            movl 0xFFFFFFA0(%vr41),%vr107
            movl 0xFFFFFFA4(%vr41),%vr108
            movl %vr107,%vr106
            xorl %vr108,%vr106
            movl 0xFFFFFFA8(%vr41),%vr109
            movl %vr106,%vr105
            andl %vr109,%vr105
            movl 0xFFFFFFA0(%vr41),%vr110
            movl %vr105,%vr104
            xorl %vr110,%vr104
            movl %vr104,%vr70
      */

      /* Perhaps a shift op? */
      switch (e->Iex.Binop.op) {
         case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
            shOp = Xsh_SHL; break;
         case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
            shOp = Xsh_SHR; break;
         case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
            shOp = Xsh_SAR; break;
         default:
            shOp = Xsh_INVALID; break;
      }
      if (shOp != Xsh_INVALID) {
         HReg dst = newVRegI(env);

         /* regL = the value to be shifted */
         HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
         addInstr(env, mk_iMOVsd_RR(regL,dst));

         /* Do any necessary widening for 16/8 bit operands */
         switch (e->Iex.Binop.op) {
            case Iop_Shr8:
               addInstr(env, X86Instr_Alu32R(
                                Xalu_AND, X86RMI_Imm(0xFF), dst));
               break;
            case Iop_Shr16:
               addInstr(env, X86Instr_Alu32R(
                                Xalu_AND, X86RMI_Imm(0xFFFF), dst));
               break;
            case Iop_Sar8:
               addInstr(env, X86Instr_Sh32(Xsh_SHL, 24, dst));
               addInstr(env, X86Instr_Sh32(Xsh_SAR, 24, dst));
               break;
            case Iop_Sar16:
               addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, dst));
               addInstr(env, X86Instr_Sh32(Xsh_SAR, 16, dst));
               break;
            default: break;
         }

         /* Now consider the shift amount.  If it's a literal, we
            can do a much better job than the general case. */
         if (e->Iex.Binop.arg2->tag == Iex_Const) {
            /* assert that the IR is well-typed */
            Int nshift;
            vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
            nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
            vassert(nshift >= 0);
            if (nshift > 0)
               /* Can't allow nshift==0 since that means %cl */
               addInstr(env, X86Instr_Sh32( shOp, nshift, dst ));
         } else {
            /* General case; we have to force the amount into %cl. */
            HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(regR,hregX86_ECX()));
            addInstr(env, X86Instr_Sh32(shOp, 0/* %cl */, dst));
         }
         return dst;
      }

      /* Handle misc other ops. */

      if (e->Iex.Binop.op == Iop_Max32U) {
         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg dst  = newVRegI(env);
         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(src1,dst));
         addInstr(env, X86Instr_Alu32R(Xalu_CMP, X86RMI_Reg(src2), dst));
         addInstr(env, X86Instr_CMov32(Xcc_B, X86RM_Reg(src2), dst));
         return dst;
      }

      if (e->Iex.Binop.op == Iop_8HLto16) {
         HReg hi8  = newVRegI(env);
         HReg lo8  = newVRegI(env);
         HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
         addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, 8, hi8));
         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFF), lo8));
         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo8), hi8));
         return hi8;
      }

      if (e->Iex.Binop.op == Iop_16HLto32) {
         HReg hi16  = newVRegI(env);
         HReg lo16  = newVRegI(env);
         HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
         addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, hi16));
         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFFFF), lo16));
         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo16), hi16));
         return hi16;
      }

      if (e->Iex.Binop.op == Iop_MullS16 || e->Iex.Binop.op == Iop_MullS8
          || e->Iex.Binop.op == Iop_MullU16 || e->Iex.Binop.op == Iop_MullU8) {
         HReg a16   = newVRegI(env);
         HReg b16   = newVRegI(env);
         HReg a16s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg b16s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
         Int  shift = (e->Iex.Binop.op == Iop_MullS8
                       || e->Iex.Binop.op == Iop_MullU8)
                         ? 24 : 16;
         X86ShiftOp shr_op = (e->Iex.Binop.op == Iop_MullS8
                              || e->Iex.Binop.op == Iop_MullS16)
                                ? Xsh_SAR : Xsh_SHR;

         addInstr(env, mk_iMOVsd_RR(a16s, a16));
         addInstr(env, mk_iMOVsd_RR(b16s, b16));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, a16));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, b16));
         addInstr(env, X86Instr_Sh32(shr_op,  shift, a16));
         addInstr(env, X86Instr_Sh32(shr_op,  shift, b16));
         addInstr(env, X86Instr_Alu32R(Xalu_MUL, X86RMI_Reg(a16), b16));
         return b16;
      }
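
      /* e.g. for MullU8, the shl-24/shr-24 pairs leave the two
         zero-extended byte values in a16/b16; the 32-bit MUL then
         has the full 16-bit product in its low half.  For the
         signed variants, SAR sign-extends instead. */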

      if (e->Iex.Binop.op == Iop_CmpF64) {
         HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
         HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegI(env);
         addInstr(env, X86Instr_FpCmp(fL,fR,dst));
         /* shift this right 8 bits so as to conform to CmpF64
            definition. */
         addInstr(env, X86Instr_Sh32(Xsh_SHR, 8, dst));
         return dst;
      }

      if (e->Iex.Binop.op == Iop_F64toI32S
          || e->Iex.Binop.op == Iop_F64toI16S) {
         Int  sz  = e->Iex.Binop.op == Iop_F64toI16S ? 2 : 4;
         HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegI(env);

         /* Used several times ... */
         X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());

         /* rf now holds the value to be converted; arg1 is the
            rounding mode, encoded as per the IRRoundingMode enum.
            The first thing to do is set the FPU's rounding mode
            accordingly. */

         /* Create a space for the format conversion. */
         /* subl $4, %esp */
         sub_from_esp(env, 4);

         /* Set host rounding mode */
         set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );

         /* gistw/l %rf, 0(%esp) */
         addInstr(env, X86Instr_FpLdStI(False/*store*/,
                                        toUChar(sz), rf, zero_esp));

         if (sz == 2) {
            /* movzwl 0(%esp), %dst */
            addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst));
         } else {
            /* movl 0(%esp), %dst */
            vassert(sz == 4);
            addInstr(env, X86Instr_Alu32R(
                             Xalu_MOV, X86RMI_Mem(zero_esp), dst));
         }

         /* Restore default FPU rounding. */
         set_FPU_rounding_default( env );

         /* addl $4, %esp */
         add_to_esp(env, 4);
         return dst;
      }

      break;
   }

   /* --------- UNARY OP --------- */
   case Iex_Unop: {

      /* 1Uto8(32to1(expr32)) */
      if (e->Iex.Unop.op == Iop_1Uto8) {
         DECLARE_PATTERN(p_32to1_then_1Uto8);
         DEFINE_PATTERN(p_32to1_then_1Uto8,
                        unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
         if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
            IRExpr* expr32 = mi.bindee[0];
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, expr32);
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Alu32R(Xalu_AND,
                                          X86RMI_Imm(1), dst));
            return dst;
         }
      }

      /* 8Uto32(LDle(expr32)) */
      if (e->Iex.Unop.op == Iop_8Uto32) {
         DECLARE_PATTERN(p_LDle8_then_8Uto32);
         DEFINE_PATTERN(p_LDle8_then_8Uto32,
                        unop(Iop_8Uto32,
                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
         if (matchIRExpr(&mi,p_LDle8_then_8Uto32,e)) {
            HReg dst = newVRegI(env);
            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
            return dst;
         }
      }

      /* 8Sto32(LDle(expr32)) */
      if (e->Iex.Unop.op == Iop_8Sto32) {
         DECLARE_PATTERN(p_LDle8_then_8Sto32);
         DEFINE_PATTERN(p_LDle8_then_8Sto32,
                        unop(Iop_8Sto32,
                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
         if (matchIRExpr(&mi,p_LDle8_then_8Sto32,e)) {
            HReg dst = newVRegI(env);
            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, X86Instr_LoadEX(1,True,amode,dst));
            return dst;
         }
      }

      /* 16Uto32(LDle(expr32)) */
      if (e->Iex.Unop.op == Iop_16Uto32) {
         DECLARE_PATTERN(p_LDle16_then_16Uto32);
         DEFINE_PATTERN(p_LDle16_then_16Uto32,
                        unop(Iop_16Uto32,
                             IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
         if (matchIRExpr(&mi,p_LDle16_then_16Uto32,e)) {
            HReg dst = newVRegI(env);
            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
            return dst;
         }
      }

      /* 8Uto32(GET:I8) */
      if (e->Iex.Unop.op == Iop_8Uto32) {
         if (e->Iex.Unop.arg->tag == Iex_Get) {
            HReg      dst;
            X86AMode* amode;
            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I8);
            dst = newVRegI(env);
            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
                                hregX86_EBP());
            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
            return dst;
         }
      }

      /* 16Uto32(GET:I16) */
      if (e->Iex.Unop.op == Iop_16Uto32) {
         if (e->Iex.Unop.arg->tag == Iex_Get) {
            HReg      dst;
            X86AMode* amode;
            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I16);
            dst = newVRegI(env);
            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
                                hregX86_EBP());
            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
            return dst;
         }
      }

      switch (e->Iex.Unop.op) {
         case Iop_8Uto16:
         case Iop_8Uto32:
         case Iop_16Uto32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            UInt mask = e->Iex.Unop.op==Iop_16Uto32 ? 0xFFFF : 0xFF;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Alu32R(Xalu_AND,
                                          X86RMI_Imm(mask), dst));
            return dst;
         }
         case Iop_8Sto16:
         case Iop_8Sto32:
         case Iop_16Sto32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            UInt amt = e->Iex.Unop.op==Iop_16Sto32 ? 16 : 24;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Sh32(Xsh_SHL, amt, dst));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, amt, dst));
            return dst;
         }
         case Iop_Not8:
         case Iop_Not16:
         case Iop_Not32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Unary32(Xun_NOT,dst));
            return dst;
         }
         case Iop_64HIto32: {
            HReg rHi, rLo;
            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
            return rHi; /* and abandon rLo .. poor wee thing :-) */
         }
         case Iop_64to32: {
            HReg rHi, rLo;
            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
            return rLo; /* similar stupid comment to the above ... */
         }
         case Iop_16HIto8:
         case Iop_32HIto16: {
            HReg dst  = newVRegI(env);
            HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
            Int shift = e->Iex.Unop.op == Iop_16HIto8 ? 8 : 16;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Sh32(Xsh_SHR, shift, dst));
            return dst;
         }
         case Iop_1Uto32:
         case Iop_1Uto8: {
            HReg dst         = newVRegI(env);
            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Set32(cond,dst));
            return dst;
         }
         case Iop_1Sto8:
         case Iop_1Sto16:
         case Iop_1Sto32: {
            /* could do better than this, but for now ... */
            HReg dst         = newVRegI(env);
            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Set32(cond,dst));
            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, dst));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
            return dst;
         }
         case Iop_Ctz32: {
            /* Count trailing zeroes, implemented by x86 'bsfl' */
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Bsfr32(True,src,dst));
            return dst;
         }
         case Iop_Clz32: {
            /* Count leading zeroes.  Do 'bsrl' to establish the index
               of the highest set bit, and subtract that value from
               31. */
            HReg tmp = newVRegI(env);
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Bsfr32(False,src,tmp));
            addInstr(env, X86Instr_Alu32R(Xalu_MOV,
                                          X86RMI_Imm(31), dst));
            addInstr(env, X86Instr_Alu32R(Xalu_SUB,
                                          X86RMI_Reg(tmp), dst));
            return dst;
         }
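
         /* Example: src = 0x00008000 -> bsrl finds bit 15, so the
            result is 31 - 15 = 16 leading zeroes.  (As with bsfl
            above, a zero src leaves the bsrl result, and hence
            Clz32, undefined.) */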

         case Iop_CmpwNEZ32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,dst));
            addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
            addInstr(env, X86Instr_Alu32R(Xalu_OR,
                                          X86RMI_Reg(src), dst));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
            return dst;
         }
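
         /* i.e. dst = (-src | src) >>s 31: the OR has its sign bit
            set iff src != 0, so the arithmetic shift produces 0 or
            0xFFFFFFFF, as CmpwNEZ32 requires. */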
   1245          case Iop_Left8:
   1246          case Iop_Left16:
   1247          case Iop_Left32: {
   1248             HReg dst = newVRegI(env);
   1249             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1250             addInstr(env, mk_iMOVsd_RR(src, dst));
   1251             addInstr(env, X86Instr_Unary32(Xun_NEG, dst));
   1252             addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(src), dst));
   1253             return dst;
   1254          }
   1255 
   1256          case Iop_V128to32: {
   1257             HReg      dst  = newVRegI(env);
   1258             HReg      vec  = iselVecExpr(env, e->Iex.Unop.arg);
   1259             X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   1260             sub_from_esp(env, 16);
   1261             addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
   1262             addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(esp0), dst ));
   1263             add_to_esp(env, 16);
   1264             return dst;
   1265          }
   1266 
   1267          /* ReinterpF32asI32(e) */
   1268          /* Given an IEEE754 single, produce an I32 with the same bit
   1269             pattern.  Keep stack 8-aligned even though only using 4
   1270             bytes. */
   1271          case Iop_ReinterpF32asI32: {
   1272             HReg rf   = iselFltExpr(env, e->Iex.Unop.arg);
   1273             HReg dst  = newVRegI(env);
   1274             X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   1275             /* paranoia */
   1276             set_FPU_rounding_default(env);
   1277             /* subl $8, %esp */
   1278             sub_from_esp(env, 8);
   1279             /* gstF %rf, 0(%esp) */
   1280             addInstr(env,
   1281                      X86Instr_FpLdSt(False/*store*/, 4, rf, zero_esp));
   1282             /* movl 0(%esp), %dst */
   1283             addInstr(env,
   1284                      X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), dst));
   1285             /* addl $8, %esp */
   1286             add_to_esp(env, 8);
   1287             return dst;
   1288          }
   1289 
   1290          case Iop_16to8:
   1291          case Iop_32to8:
   1292          case Iop_32to16:
   1293             /* These are no-ops. */
   1294             return iselIntExpr_R(env, e->Iex.Unop.arg);
   1295 
   1296          default:
   1297             break;
   1298       }
   1299       break;
   1300    }
   1301 
   1302    /* --------- GET --------- */
   1303    case Iex_Get: {
   1304       if (ty == Ity_I32) {
   1305          HReg dst = newVRegI(env);
   1306          addInstr(env, X86Instr_Alu32R(
   1307                           Xalu_MOV,
   1308                           X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
   1309                                                  hregX86_EBP())),
   1310                           dst));
   1311          return dst;
   1312       }
   1313       if (ty == Ity_I8 || ty == Ity_I16) {
   1314          HReg dst = newVRegI(env);
   1315          addInstr(env, X86Instr_LoadEX(
   1316                           toUChar(ty==Ity_I8 ? 1 : 2),
   1317                           False,
   1318                           X86AMode_IR(e->Iex.Get.offset,hregX86_EBP()),
   1319                           dst));
   1320          return dst;
   1321       }
   1322       break;
   1323    }
   1324 
   1325    case Iex_GetI: {
   1326       X86AMode* am
   1327          = genGuestArrayOffset(
   1328               env, e->Iex.GetI.descr,
   1329                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   1330       HReg dst = newVRegI(env);
   1331       if (ty == Ity_I8) {
   1332          addInstr(env, X86Instr_LoadEX( 1, False, am, dst ));
   1333          return dst;
   1334       }
   1335       if (ty == Ity_I32) {
   1336          addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), dst));
   1337          return dst;
   1338       }
   1339       break;
   1340    }
   1341 
   1342    /* --------- CCALL --------- */
   1343    case Iex_CCall: {
   1344       HReg    dst = newVRegI(env);
   1345       vassert(ty == e->Iex.CCall.retty);
   1346 
   1347       /* be very restrictive for now.  Only 32/64-bit ints allowed
   1348          for args, and 32 bits for return type. */
   1349       if (e->Iex.CCall.retty != Ity_I32)
   1350          goto irreducible;
   1351 
   1352       /* Marshal args, do the call, clear stack. */
   1353       doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
   1354 
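               /* The 32-bit result comes back in %eax, per the x86 C
                  calling convention. */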
   1355       addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
   1356       return dst;
   1357    }
   1358 
   1359    /* --------- LITERAL --------- */
   1360    /* 32/16/8-bit literals */
   1361    case Iex_Const: {
   1362       X86RMI* rmi = iselIntExpr_RMI ( env, e );
   1363       HReg    r   = newVRegI(env);
   1364       addInstr(env, X86Instr_Alu32R(Xalu_MOV, rmi, r));
   1365       return r;
   1366    }
   1367 
   1368    /* --------- MULTIPLEX --------- */
   1369    case Iex_Mux0X: {
   1370      if ((ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
   1371          && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
   1372         X86RM* r8;
   1373         HReg   rX  = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
   1374         X86RM* r0  = iselIntExpr_RM(env, e->Iex.Mux0X.expr0);
   1375         HReg   dst = newVRegI(env);
   1376         addInstr(env, mk_iMOVsd_RR(rX,dst));
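                 /* dst starts out as exprX; if the condition byte is
                    zero, overwrite it with expr0 via the cmov below. */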
   1377         r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
   1378         addInstr(env, X86Instr_Test32(0xFF, r8));
   1379         addInstr(env, X86Instr_CMov32(Xcc_Z,r0,dst));
   1380         return dst;
   1381       }
   1382       break;
   1383    }
   1384 
    1385    default:
    1386       break;
   1387    } /* switch (e->tag) */
   1388 
   1389    /* We get here if no pattern matched. */
   1390   irreducible:
   1391    ppIRExpr(e);
   1392    vpanic("iselIntExpr_R: cannot reduce tree");
   1393 }
   1394 
   1395 
   1396 /*---------------------------------------------------------*/
   1397 /*--- ISEL: Integer expression auxiliaries              ---*/
   1398 /*---------------------------------------------------------*/
   1399 
   1400 /* --------------------- AMODEs --------------------- */
   1401 
   1402 /* Return an AMode which computes the value of the specified
   1403    expression, possibly also adding insns to the code list as a
   1404    result.  The expression may only be a 32-bit one.
   1405 */
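         /* For illustration (t8 and t9 are hypothetical temps): the tree
               Add32(Add32(t8, Shl32(t9, 0x2:I8)), 0x1C:I32)
            matches the first pattern in the _wrk function below and
            comes out as the amode 0x1C(%t8,%t9,4), costing no extra
            instructions beyond evaluating t8 and t9. */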
   1406 
   1407 static Bool sane_AMode ( X86AMode* am )
   1408 {
   1409    switch (am->tag) {
   1410       case Xam_IR:
   1411          return
   1412             toBool( hregClass(am->Xam.IR.reg) == HRcInt32
   1413                     && (hregIsVirtual(am->Xam.IR.reg)
   1414                         || am->Xam.IR.reg == hregX86_EBP()) );
   1415       case Xam_IRRS:
   1416          return
   1417             toBool( hregClass(am->Xam.IRRS.base) == HRcInt32
   1418                     && hregIsVirtual(am->Xam.IRRS.base)
   1419                     && hregClass(am->Xam.IRRS.index) == HRcInt32
   1420                     && hregIsVirtual(am->Xam.IRRS.index) );
   1421       default:
   1422         vpanic("sane_AMode: unknown x86 amode tag");
   1423    }
   1424 }
   1425 
   1426 static X86AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
   1427 {
   1428    X86AMode* am = iselIntExpr_AMode_wrk(env, e);
   1429    vassert(sane_AMode(am));
   1430    return am;
   1431 }
   1432 
   1433 /* DO NOT CALL THIS DIRECTLY ! */
   1434 static X86AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
   1435 {
   1436    IRType ty = typeOfIRExpr(env->type_env,e);
   1437    vassert(ty == Ity_I32);
   1438 
   1439    /* Add32( Add32(expr1, Shl32(expr2, simm)), imm32 ) */
   1440    if (e->tag == Iex_Binop
   1441        && e->Iex.Binop.op == Iop_Add32
   1442        && e->Iex.Binop.arg2->tag == Iex_Const
   1443        && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32
   1444        && e->Iex.Binop.arg1->tag == Iex_Binop
   1445        && e->Iex.Binop.arg1->Iex.Binop.op == Iop_Add32
   1446        && e->Iex.Binop.arg1->Iex.Binop.arg2->tag == Iex_Binop
   1447        && e->Iex.Binop.arg1->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
   1448        && e->Iex.Binop.arg1
   1449            ->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
   1450        && e->Iex.Binop.arg1
   1451            ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
   1452       UInt shift = e->Iex.Binop.arg1
   1453                     ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
   1454       UInt imm32 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
   1455       if (shift == 1 || shift == 2 || shift == 3) {
   1456          HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1->Iex.Binop.arg1);
   1457          HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg1
   1458                                        ->Iex.Binop.arg2->Iex.Binop.arg1 );
   1459          return X86AMode_IRRS(imm32, r1, r2, shift);
   1460       }
   1461    }
   1462 
   1463    /* Add32(expr1, Shl32(expr2, imm)) */
   1464    if (e->tag == Iex_Binop
   1465        && e->Iex.Binop.op == Iop_Add32
   1466        && e->Iex.Binop.arg2->tag == Iex_Binop
   1467        && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
   1468        && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
   1469        && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
   1470       UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
   1471       if (shift == 1 || shift == 2 || shift == 3) {
   1472          HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1473          HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
   1474          return X86AMode_IRRS(0, r1, r2, shift);
   1475       }
   1476    }
   1477 
   1478    /* Add32(expr,i) */
   1479    if (e->tag == Iex_Binop
   1480        && e->Iex.Binop.op == Iop_Add32
   1481        && e->Iex.Binop.arg2->tag == Iex_Const
   1482        && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) {
   1483       HReg r1 = iselIntExpr_R(env,  e->Iex.Binop.arg1);
   1484       return X86AMode_IR(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32, r1);
   1485    }
   1486 
   1487    /* Doesn't match anything in particular.  Generate it into
   1488       a register and use that. */
   1489    {
   1490       HReg r1 = iselIntExpr_R(env, e);
   1491       return X86AMode_IR(0, r1);
   1492    }
   1493 }
   1494 
   1495 
   1496 /* --------------------- RMIs --------------------- */
   1497 
   1498 /* Similarly, calculate an expression into an X86RMI operand.  As with
   1499    iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */
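         /* For example: a constant 0x2A:I32 becomes X86RMI_Imm(0x2A),
            a 32-bit Get becomes X86RMI_Mem of an %ebp-relative amode,
            and anything else is computed into a virtual register and
            returned as X86RMI_Reg -- see the cases in the _wrk
            function below. */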
   1500 
   1501 static X86RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
   1502 {
   1503    X86RMI* rmi = iselIntExpr_RMI_wrk(env, e);
   1504    /* sanity checks ... */
   1505    switch (rmi->tag) {
   1506       case Xrmi_Imm:
   1507          return rmi;
   1508       case Xrmi_Reg:
   1509          vassert(hregClass(rmi->Xrmi.Reg.reg) == HRcInt32);
   1510          vassert(hregIsVirtual(rmi->Xrmi.Reg.reg));
   1511          return rmi;
   1512       case Xrmi_Mem:
   1513          vassert(sane_AMode(rmi->Xrmi.Mem.am));
   1514          return rmi;
   1515       default:
   1516          vpanic("iselIntExpr_RMI: unknown x86 RMI tag");
   1517    }
   1518 }
   1519 
   1520 /* DO NOT CALL THIS DIRECTLY ! */
   1521 static X86RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
   1522 {
   1523    IRType ty = typeOfIRExpr(env->type_env,e);
   1524    vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
   1525 
   1526    /* special case: immediate */
   1527    if (e->tag == Iex_Const) {
   1528       UInt u;
   1529       switch (e->Iex.Const.con->tag) {
   1530          case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
   1531          case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
   1532          case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
   1533          default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
   1534       }
   1535       return X86RMI_Imm(u);
   1536    }
   1537 
   1538    /* special case: 32-bit GET */
   1539    if (e->tag == Iex_Get && ty == Ity_I32) {
   1540       return X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
   1541                                     hregX86_EBP()));
   1542    }
   1543 
   1544    /* special case: 32-bit load from memory */
   1545    if (e->tag == Iex_Load && ty == Ity_I32
   1546        && e->Iex.Load.end == Iend_LE) {
   1547       X86AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   1548       return X86RMI_Mem(am);
   1549    }
   1550 
   1551    /* default case: calculate into a register and return that */
   1552    {
   1553       HReg r = iselIntExpr_R ( env, e );
   1554       return X86RMI_Reg(r);
   1555    }
   1556 }
   1557 
   1558 
   1559 /* --------------------- RIs --------------------- */
   1560 
   1561 /* Calculate an expression into an X86RI operand.  As with
   1562    iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */
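         /* Note that an RI, unlike an RMI, has no memory form.  That is
            what makes it usable where the instruction's other operand is
            itself memory (x86 has no memory-to-memory ALU encodings);
            see e.g. the X86RI_Imm(0) fed to X86Instr_Alu32M in the
            CmpNEZ32(GET(..)) case further down. */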
   1563 
   1564 static X86RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
   1565 {
   1566    X86RI* ri = iselIntExpr_RI_wrk(env, e);
   1567    /* sanity checks ... */
   1568    switch (ri->tag) {
   1569       case Xri_Imm:
   1570          return ri;
   1571       case Xri_Reg:
   1572          vassert(hregClass(ri->Xri.Reg.reg) == HRcInt32);
   1573          vassert(hregIsVirtual(ri->Xri.Reg.reg));
   1574          return ri;
   1575       default:
   1576          vpanic("iselIntExpr_RI: unknown x86 RI tag");
   1577    }
   1578 }
   1579 
   1580 /* DO NOT CALL THIS DIRECTLY ! */
   1581 static X86RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
   1582 {
   1583    IRType ty = typeOfIRExpr(env->type_env,e);
   1584    vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
   1585 
   1586    /* special case: immediate */
   1587    if (e->tag == Iex_Const) {
   1588       UInt u;
   1589       switch (e->Iex.Const.con->tag) {
   1590          case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
   1591          case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
   1592          case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
    1593          default: vpanic("iselIntExpr_RI.Iex_Const(x86h)");
   1594       }
   1595       return X86RI_Imm(u);
   1596    }
   1597 
   1598    /* default case: calculate into a register and return that */
   1599    {
   1600       HReg r = iselIntExpr_R ( env, e );
   1601       return X86RI_Reg(r);
   1602    }
   1603 }
   1604 
   1605 
   1606 /* --------------------- RMs --------------------- */
   1607 
   1608 /* Similarly, calculate an expression into an X86RM operand.  As with
   1609    iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */
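         /* An RM is reg-or-mem, with no immediate form; in this file it
            is what X86Instr_Test32, CMov32, MulL and Div consume. */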
   1610 
   1611 static X86RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
   1612 {
   1613    X86RM* rm = iselIntExpr_RM_wrk(env, e);
   1614    /* sanity checks ... */
   1615    switch (rm->tag) {
   1616       case Xrm_Reg:
   1617          vassert(hregClass(rm->Xrm.Reg.reg) == HRcInt32);
   1618          vassert(hregIsVirtual(rm->Xrm.Reg.reg));
   1619          return rm;
   1620       case Xrm_Mem:
   1621          vassert(sane_AMode(rm->Xrm.Mem.am));
   1622          return rm;
   1623       default:
   1624          vpanic("iselIntExpr_RM: unknown x86 RM tag");
   1625    }
   1626 }
   1627 
   1628 /* DO NOT CALL THIS DIRECTLY ! */
   1629 static X86RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
   1630 {
   1631    IRType ty = typeOfIRExpr(env->type_env,e);
   1632    vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
   1633 
   1634    /* special case: 32-bit GET */
   1635    if (e->tag == Iex_Get && ty == Ity_I32) {
   1636       return X86RM_Mem(X86AMode_IR(e->Iex.Get.offset,
   1637                                    hregX86_EBP()));
   1638    }
   1639 
   1640    /* special case: load from memory */
    1641    /* special case: load from memory -- not done; the default case handles it */
   1642    /* default case: calculate into a register and return that */
   1643    {
   1644       HReg r = iselIntExpr_R ( env, e );
   1645       return X86RM_Reg(r);
   1646    }
   1647 }
   1648 
   1649 
   1650 /* --------------------- CONDCODE --------------------- */
   1651 
    1652 /* Generate code to evaluate a bit-typed expression, returning
    1653    the condition code that corresponds to the expression
    1654    notionally returning 1. */
   1655 
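         /* Callers typically feed the returned condition code straight
            into a Set32, CMov32 or conditional branch, so the flags set
            here must still be live when that consumer executes. */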
   1656 static X86CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
   1657 {
   1658    /* Uh, there's nothing we can sanity check here, unfortunately. */
   1659    return iselCondCode_wrk(env,e);
   1660 }
   1661 
   1662 /* DO NOT CALL THIS DIRECTLY ! */
   1663 static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
   1664 {
   1665    MatchInfo mi;
   1666 
   1667    vassert(e);
   1668    vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
   1669 
   1670    /* var */
   1671    if (e->tag == Iex_RdTmp) {
   1672       HReg r32 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
    1673       /* Test32 doesn't modify r32, so this is OK. */
   1674       addInstr(env, X86Instr_Test32(1,X86RM_Reg(r32)));
   1675       return Xcc_NZ;
   1676    }
   1677 
   1678    /* Constant 1:Bit */
   1679    if (e->tag == Iex_Const) {
   1680       HReg r;
   1681       vassert(e->Iex.Const.con->tag == Ico_U1);
   1682       vassert(e->Iex.Const.con->Ico.U1 == True
   1683               || e->Iex.Const.con->Ico.U1 == False);
   1684       r = newVRegI(env);
   1685       addInstr(env, X86Instr_Alu32R(Xalu_MOV,X86RMI_Imm(0),r));
   1686       addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(r),r));
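               /* xorl %r,%r always sets ZF, so Xcc_Z is the "true"
                  condition at this point and Xcc_NZ the "false" one. */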
   1687       return e->Iex.Const.con->Ico.U1 ? Xcc_Z : Xcc_NZ;
   1688    }
   1689 
   1690    /* Not1(e) */
   1691    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
   1692       /* Generate code for the arg, and negate the test condition */
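               /* This relies on the X86CondCode encodings coming in
                  complementary pairs that differ only in bit 0, so
                  flipping that bit negates the condition. */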
   1693       return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
   1694    }
   1695 
   1696    /* --- patterns rooted at: 32to1 --- */
   1697 
   1698    if (e->tag == Iex_Unop
   1699        && e->Iex.Unop.op == Iop_32to1) {
   1700       X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
   1701       addInstr(env, X86Instr_Test32(1,rm));
   1702       return Xcc_NZ;
   1703    }
   1704 
   1705    /* --- patterns rooted at: CmpNEZ8 --- */
   1706 
   1707    /* CmpNEZ8(x) */
   1708    if (e->tag == Iex_Unop
   1709        && e->Iex.Unop.op == Iop_CmpNEZ8) {
   1710       X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
   1711       addInstr(env, X86Instr_Test32(0xFF,rm));
   1712       return Xcc_NZ;
   1713    }
   1714 
   1715    /* --- patterns rooted at: CmpNEZ16 --- */
   1716 
   1717    /* CmpNEZ16(x) */
   1718    if (e->tag == Iex_Unop
   1719        && e->Iex.Unop.op == Iop_CmpNEZ16) {
   1720       X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
   1721       addInstr(env, X86Instr_Test32(0xFFFF,rm));
   1722       return Xcc_NZ;
   1723    }
   1724 
   1725    /* --- patterns rooted at: CmpNEZ32 --- */
   1726 
   1727    /* CmpNEZ32(And32(x,y)) */
   1728    {
   1729       DECLARE_PATTERN(p_CmpNEZ32_And32);
   1730       DEFINE_PATTERN(p_CmpNEZ32_And32,
   1731                      unop(Iop_CmpNEZ32, binop(Iop_And32, bind(0), bind(1))));
   1732       if (matchIRExpr(&mi, p_CmpNEZ32_And32, e)) {
   1733          HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
   1734          X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
   1735          HReg    tmp  = newVRegI(env);
   1736          addInstr(env, mk_iMOVsd_RR(r0, tmp));
   1737          addInstr(env, X86Instr_Alu32R(Xalu_AND,rmi1,tmp));
   1738          return Xcc_NZ;
   1739       }
   1740    }
   1741 
   1742    /* CmpNEZ32(Or32(x,y)) */
   1743    {
   1744       DECLARE_PATTERN(p_CmpNEZ32_Or32);
   1745       DEFINE_PATTERN(p_CmpNEZ32_Or32,
   1746                      unop(Iop_CmpNEZ32, binop(Iop_Or32, bind(0), bind(1))));
   1747       if (matchIRExpr(&mi, p_CmpNEZ32_Or32, e)) {
   1748          HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
   1749          X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
   1750          HReg    tmp  = newVRegI(env);
   1751          addInstr(env, mk_iMOVsd_RR(r0, tmp));
   1752          addInstr(env, X86Instr_Alu32R(Xalu_OR,rmi1,tmp));
   1753          return Xcc_NZ;
   1754       }
   1755    }
   1756 
   1757    /* CmpNEZ32(GET(..):I32) */
   1758    if (e->tag == Iex_Unop
   1759        && e->Iex.Unop.op == Iop_CmpNEZ32
   1760        && e->Iex.Unop.arg->tag == Iex_Get) {
   1761       X86AMode* am = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
   1762                                  hregX86_EBP());
   1763       addInstr(env, X86Instr_Alu32M(Xalu_CMP, X86RI_Imm(0), am));
   1764       return Xcc_NZ;
   1765    }
   1766 
   1767    /* CmpNEZ32(x) */
   1768    if (e->tag == Iex_Unop
   1769        && e->Iex.Unop.op == Iop_CmpNEZ32) {
   1770       HReg    r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
   1771       X86RMI* rmi2 = X86RMI_Imm(0);
   1772       addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
   1773       return Xcc_NZ;
   1774    }
   1775 
   1776    /* --- patterns rooted at: CmpNEZ64 --- */
   1777 
   1778    /* CmpNEZ64(Or64(x,y)) */
   1779    {
   1780       DECLARE_PATTERN(p_CmpNEZ64_Or64);
   1781       DEFINE_PATTERN(p_CmpNEZ64_Or64,
   1782                      unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
   1783       if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
   1784          HReg    hi1, lo1, hi2, lo2;
   1785          HReg    tmp  = newVRegI(env);
   1786          iselInt64Expr( &hi1, &lo1, env, mi.bindee[0] );
   1787          addInstr(env, mk_iMOVsd_RR(hi1, tmp));
   1788          addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo1),tmp));
   1789          iselInt64Expr( &hi2, &lo2, env, mi.bindee[1] );
   1790          addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(hi2),tmp));
   1791          addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo2),tmp));
   1792          return Xcc_NZ;
   1793       }
   1794    }
   1795 
   1796    /* CmpNEZ64(x) */
   1797    if (e->tag == Iex_Unop
   1798        && e->Iex.Unop.op == Iop_CmpNEZ64) {
   1799       HReg hi, lo;
   1800       HReg tmp = newVRegI(env);
   1801       iselInt64Expr( &hi, &lo, env, e->Iex.Unop.arg );
   1802       addInstr(env, mk_iMOVsd_RR(hi, tmp));
   1803       addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo), tmp));
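               /* ZF is now set iff (hi | lo) == 0, i.e. iff the full
                  64-bit value is zero. */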
   1804       return Xcc_NZ;
   1805    }
   1806 
   1807    /* --- patterns rooted at: Cmp{EQ,NE}{8,16} --- */
   1808 
   1809    /* CmpEQ8 / CmpNE8 */
   1810    if (e->tag == Iex_Binop
   1811        && (e->Iex.Binop.op == Iop_CmpEQ8
   1812            || e->Iex.Binop.op == Iop_CmpNE8
   1813            || e->Iex.Binop.op == Iop_CasCmpEQ8
   1814            || e->Iex.Binop.op == Iop_CasCmpNE8)) {
   1815       if (isZeroU8(e->Iex.Binop.arg2)) {
   1816          HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1817          addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r1)));
   1818          switch (e->Iex.Binop.op) {
   1819             case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
   1820             case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
   1821             default: vpanic("iselCondCode(x86): CmpXX8(expr,0:I8)");
   1822          }
   1823       } else {
   1824          HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1825          X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   1826          HReg    r    = newVRegI(env);
   1827          addInstr(env, mk_iMOVsd_RR(r1,r));
   1828          addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
   1829          addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r)));
   1830          switch (e->Iex.Binop.op) {
   1831             case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
   1832             case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
   1833             default: vpanic("iselCondCode(x86): CmpXX8(expr,expr)");
   1834          }
   1835       }
   1836    }
   1837 
   1838    /* CmpEQ16 / CmpNE16 */
   1839    if (e->tag == Iex_Binop
   1840        && (e->Iex.Binop.op == Iop_CmpEQ16
   1841            || e->Iex.Binop.op == Iop_CmpNE16
   1842            || e->Iex.Binop.op == Iop_CasCmpEQ16
   1843            || e->Iex.Binop.op == Iop_CasCmpNE16)) {
   1844       HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1845       X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   1846       HReg    r    = newVRegI(env);
   1847       addInstr(env, mk_iMOVsd_RR(r1,r));
   1848       addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
   1849       addInstr(env, X86Instr_Test32(0xFFFF,X86RM_Reg(r)));
   1850       switch (e->Iex.Binop.op) {
   1851          case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Xcc_Z;
   1852          case Iop_CmpNE16: case Iop_CasCmpNE16: return Xcc_NZ;
   1853          default: vpanic("iselCondCode(x86): CmpXX16");
   1854       }
   1855    }
   1856 
   1857    /* CmpNE32(ccall, 32-bit constant) (--smc-check=all optimisation).
   1858       Saves a "movl %eax, %tmp" compared to the default route. */
   1859    if (e->tag == Iex_Binop
   1860        && e->Iex.Binop.op == Iop_CmpNE32
   1861        && e->Iex.Binop.arg1->tag == Iex_CCall
   1862        && e->Iex.Binop.arg2->tag == Iex_Const) {
   1863       IRExpr* cal = e->Iex.Binop.arg1;
   1864       IRExpr* con = e->Iex.Binop.arg2;
   1865       /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
   1866       vassert(cal->Iex.CCall.retty == Ity_I32); /* else ill-typed IR */
   1867       vassert(con->Iex.Const.con->tag == Ico_U32);
   1868       /* Marshal args, do the call. */
   1869       doHelperCall( env, False, NULL, cal->Iex.CCall.cee, cal->Iex.CCall.args );
   1870       addInstr(env, X86Instr_Alu32R(Xalu_CMP,
   1871                                     X86RMI_Imm(con->Iex.Const.con->Ico.U32),
   1872                                     hregX86_EAX()));
   1873       return Xcc_NZ;
   1874    }
   1875 
   1876    /* Cmp*32*(x,y) */
   1877    if (e->tag == Iex_Binop
   1878        && (e->Iex.Binop.op == Iop_CmpEQ32
   1879            || e->Iex.Binop.op == Iop_CmpNE32
   1880            || e->Iex.Binop.op == Iop_CmpLT32S
   1881            || e->Iex.Binop.op == Iop_CmpLT32U
   1882            || e->Iex.Binop.op == Iop_CmpLE32S
   1883            || e->Iex.Binop.op == Iop_CmpLE32U
   1884            || e->Iex.Binop.op == Iop_CasCmpEQ32
   1885            || e->Iex.Binop.op == Iop_CasCmpNE32)) {
   1886       HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1887       X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   1888       addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
   1889       switch (e->Iex.Binop.op) {
   1890          case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Xcc_Z;
   1891          case Iop_CmpNE32: case Iop_CasCmpNE32: return Xcc_NZ;
   1892          case Iop_CmpLT32S: return Xcc_L;
   1893          case Iop_CmpLT32U: return Xcc_B;
   1894          case Iop_CmpLE32S: return Xcc_LE;
   1895          case Iop_CmpLE32U: return Xcc_BE;
   1896          default: vpanic("iselCondCode(x86): CmpXX32");
   1897       }
   1898    }
   1899 
   1900    /* CmpNE64 */
   1901    if (e->tag == Iex_Binop
   1902        && (e->Iex.Binop.op == Iop_CmpNE64
   1903            || e->Iex.Binop.op == Iop_CmpEQ64)) {
   1904       HReg hi1, hi2, lo1, lo2;
   1905       HReg tHi = newVRegI(env);
   1906       HReg tLo = newVRegI(env);
   1907       iselInt64Expr( &hi1, &lo1, env, e->Iex.Binop.arg1 );
   1908       iselInt64Expr( &hi2, &lo2, env, e->Iex.Binop.arg2 );
   1909       addInstr(env, mk_iMOVsd_RR(hi1, tHi));
   1910       addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(hi2), tHi));
   1911       addInstr(env, mk_iMOVsd_RR(lo1, tLo));
   1912       addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(lo2), tLo));
   1913       addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(tHi), tLo));
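               /* tLo is now zero iff the two 64-bit operands are
                  bitwise equal. */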
   1914       switch (e->Iex.Binop.op) {
   1915          case Iop_CmpNE64: return Xcc_NZ;
   1916          case Iop_CmpEQ64: return Xcc_Z;
   1917          default: vpanic("iselCondCode(x86): CmpXX64");
   1918       }
   1919    }
   1920 
   1921    ppIRExpr(e);
   1922    vpanic("iselCondCode");
   1923 }
   1924 
   1925 
   1926 /*---------------------------------------------------------*/
   1927 /*--- ISEL: Integer expressions (64 bit)                ---*/
   1928 /*---------------------------------------------------------*/
   1929 
   1930 /* Compute a 64-bit value into a register pair, which is returned as
   1931    the first two parameters.  As with iselIntExpr_R, these may be
   1932    either real or virtual regs; in any case they must not be changed
   1933    by subsequent code emitted by the caller.  */
   1934 
   1935 static void iselInt64Expr ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
   1936 {
   1937    iselInt64Expr_wrk(rHi, rLo, env, e);
   1938 #  if 0
   1939    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   1940 #  endif
   1941    vassert(hregClass(*rHi) == HRcInt32);
   1942    vassert(hregIsVirtual(*rHi));
   1943    vassert(hregClass(*rLo) == HRcInt32);
   1944    vassert(hregIsVirtual(*rLo));
   1945 }
   1946 
   1947 /* DO NOT CALL THIS DIRECTLY ! */
   1948 static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
   1949 {
   1950    MatchInfo mi;
   1951    HWord fn = 0; /* helper fn for most SIMD64 stuff */
   1952    vassert(e);
   1953    vassert(typeOfIRExpr(env->type_env,e) == Ity_I64);
   1954 
   1955    /* 64-bit literal */
   1956    if (e->tag == Iex_Const) {
   1957       ULong w64 = e->Iex.Const.con->Ico.U64;
   1958       UInt  wHi = toUInt(w64 >> 32);
   1959       UInt  wLo = toUInt(w64);
   1960       HReg  tLo = newVRegI(env);
   1961       HReg  tHi = newVRegI(env);
   1962       vassert(e->Iex.Const.con->tag == Ico_U64);
   1963       if (wLo == wHi) {
   1964          /* Save a precious Int register in this special case. */
   1965          addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
   1966          *rHi = tLo;
   1967          *rLo = tLo;
   1968       } else {
   1969          addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi));
   1970          addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
   1971          *rHi = tHi;
   1972          *rLo = tLo;
   1973       }
   1974       return;
   1975    }
   1976 
   1977    /* read 64-bit IRTemp */
   1978    if (e->tag == Iex_RdTmp) {
   1979       lookupIRTemp64( rHi, rLo, env, e->Iex.RdTmp.tmp);
   1980       return;
   1981    }
   1982 
   1983    /* 64-bit load */
   1984    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   1985       HReg     tLo, tHi;
   1986       X86AMode *am0, *am4;
   1987       vassert(e->Iex.Load.ty == Ity_I64);
   1988       tLo = newVRegI(env);
   1989       tHi = newVRegI(env);
   1990       am0 = iselIntExpr_AMode(env, e->Iex.Load.addr);
   1991       am4 = advance4(am0);
   1992       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo ));
   1993       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
   1994       *rHi = tHi;
   1995       *rLo = tLo;
   1996       return;
   1997    }
   1998 
   1999    /* 64-bit GET */
   2000    if (e->tag == Iex_Get) {
   2001       X86AMode* am  = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP());
   2002       X86AMode* am4 = advance4(am);
   2003       HReg tLo = newVRegI(env);
   2004       HReg tHi = newVRegI(env);
   2005       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
   2006       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
   2007       *rHi = tHi;
   2008       *rLo = tLo;
   2009       return;
   2010    }
   2011 
   2012    /* 64-bit GETI */
   2013    if (e->tag == Iex_GetI) {
   2014       X86AMode* am
   2015          = genGuestArrayOffset( env, e->Iex.GetI.descr,
   2016                                      e->Iex.GetI.ix, e->Iex.GetI.bias );
   2017       X86AMode* am4 = advance4(am);
   2018       HReg tLo = newVRegI(env);
   2019       HReg tHi = newVRegI(env);
   2020       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
   2021       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
   2022       *rHi = tHi;
   2023       *rLo = tLo;
   2024       return;
   2025    }
   2026 
   2027    /* 64-bit Mux0X: Mux0X(g, expr, 0:I64) */
   2028    if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.exprX)) {
   2029       X86RM* r8;
   2030       HReg e0Lo, e0Hi;
   2031       HReg tLo = newVRegI(env);
   2032       HReg tHi = newVRegI(env);
   2033       X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   2034       iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
   2035       r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
   2036       addInstr(env, mk_iMOVsd_RR( e0Hi, tHi ) );
   2037       addInstr(env, mk_iMOVsd_RR( e0Lo, tLo ) );
   2038       addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
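               /* CMov32 takes a reg-or-mem source, not an immediate, so
                  park the zero at 0(%esp) and cmov from there; the
                  add_to_esp below pops it again. */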
   2039       addInstr(env, X86Instr_Test32(0xFF, r8));
   2040       addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tHi));
   2041       addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tLo));
   2042       add_to_esp(env, 4);
   2043       *rHi = tHi;
   2044       *rLo = tLo;
   2045       return;
   2046    }
   2047    /* 64-bit Mux0X: Mux0X(g, 0:I64, expr) */
   2048    if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.expr0)) {
   2049       X86RM* r8;
   2050       HReg e0Lo, e0Hi;
   2051       HReg tLo = newVRegI(env);
   2052       HReg tHi = newVRegI(env);
   2053       X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   2054       iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.exprX);
   2055       r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
   2056       addInstr(env, mk_iMOVsd_RR( e0Hi, tHi ) );
   2057       addInstr(env, mk_iMOVsd_RR( e0Lo, tLo ) );
   2058       addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
   2059       addInstr(env, X86Instr_Test32(0xFF, r8));
   2060       addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tHi));
   2061       addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tLo));
   2062       add_to_esp(env, 4);
   2063       *rHi = tHi;
   2064       *rLo = tLo;
   2065       return;
   2066    }
   2067 
   2068    /* 64-bit Mux0X: Mux0X(g, expr, expr) */
   2069    if (e->tag == Iex_Mux0X) {
   2070       X86RM* r8;
   2071       HReg e0Lo, e0Hi, eXLo, eXHi;
   2072       HReg tLo = newVRegI(env);
   2073       HReg tHi = newVRegI(env);
   2074       iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
   2075       iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
   2076       addInstr(env, mk_iMOVsd_RR(eXHi, tHi));
   2077       addInstr(env, mk_iMOVsd_RR(eXLo, tLo));
   2078       r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
   2079       addInstr(env, X86Instr_Test32(0xFF, r8));
   2080       /* This assumes the first cmov32 doesn't trash the condition
   2081          codes, so they are still available for the second cmov32 */
   2082       addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Hi),tHi));
   2083       addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Lo),tLo));
   2084       *rHi = tHi;
   2085       *rLo = tLo;
   2086       return;
   2087    }
   2088 
   2089    /* --------- BINARY ops --------- */
   2090    if (e->tag == Iex_Binop) {
   2091       switch (e->Iex.Binop.op) {
   2092          /* 32 x 32 -> 64 multiply */
   2093          case Iop_MullU32:
   2094          case Iop_MullS32: {
    2095             /* get one operand into %eax, and the other into an R/M.
    2096                We need to make an educated guess about which operand
    2097                is better placed where. */
   2098             HReg   tLo    = newVRegI(env);
   2099             HReg   tHi    = newVRegI(env);
   2100             Bool   syned  = toBool(e->Iex.Binop.op == Iop_MullS32);
   2101             X86RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
   2102             HReg   rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2103             addInstr(env, mk_iMOVsd_RR(rRight, hregX86_EAX()));
   2104             addInstr(env, X86Instr_MulL(syned, rmLeft));
   2105             /* Result is now in EDX:EAX.  Tell the caller. */
   2106             addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
   2107             addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
   2108             *rHi = tHi;
   2109             *rLo = tLo;
   2110             return;
   2111          }
   2112 
   2113          /* 64 x 32 -> (32(rem),32(div)) division */
   2114          case Iop_DivModU64to32:
   2115          case Iop_DivModS64to32: {
   2116             /* Get the 64-bit operand into edx:eax, and the other into
   2117                any old R/M. */
   2118             HReg sHi, sLo;
   2119             HReg   tLo     = newVRegI(env);
   2120             HReg   tHi     = newVRegI(env);
   2121             Bool   syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
   2122             X86RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
   2123             iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
   2124             addInstr(env, mk_iMOVsd_RR(sHi, hregX86_EDX()));
   2125             addInstr(env, mk_iMOVsd_RR(sLo, hregX86_EAX()));
   2126             addInstr(env, X86Instr_Div(syned, rmRight));
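                     /* The divide leaves the quotient in %eax and the
                        remainder in %edx. */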
   2127             addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
   2128             addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
   2129             *rHi = tHi;
   2130             *rLo = tLo;
   2131             return;
   2132          }
   2133 
   2134          /* Or64/And64/Xor64 */
   2135          case Iop_Or64:
   2136          case Iop_And64:
   2137          case Iop_Xor64: {
   2138             HReg xLo, xHi, yLo, yHi;
   2139             HReg tLo = newVRegI(env);
   2140             HReg tHi = newVRegI(env);
   2141             X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR
   2142                           : e->Iex.Binop.op==Iop_And64 ? Xalu_AND
   2143                           : Xalu_XOR;
   2144             iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
   2145             iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
   2146             addInstr(env, mk_iMOVsd_RR(xHi, tHi));
   2147             addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
   2148             addInstr(env, mk_iMOVsd_RR(xLo, tLo));
   2149             addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
   2150             *rHi = tHi;
   2151             *rLo = tLo;
   2152             return;
   2153          }
   2154 
   2155          /* Add64/Sub64 */
   2156          case Iop_Add64:
   2157             if (e->Iex.Binop.arg2->tag == Iex_Const) {
   2158                /* special case Add64(e, const) */
   2159                ULong w64 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
   2160                UInt  wHi = toUInt(w64 >> 32);
   2161                UInt  wLo = toUInt(w64);
   2162                HReg  tLo = newVRegI(env);
   2163                HReg  tHi = newVRegI(env);
   2164                HReg  xLo, xHi;
   2165                vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64);
   2166                iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
   2167                addInstr(env, mk_iMOVsd_RR(xHi, tHi));
   2168                addInstr(env, mk_iMOVsd_RR(xLo, tLo));
   2169                addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(wLo), tLo));
   2170                addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Imm(wHi), tHi));
   2171                *rHi = tHi;
   2172                *rLo = tLo;
   2173                return;
   2174             }
   2175             /* else fall through to the generic case */
   2176          case Iop_Sub64: {
   2177             HReg xLo, xHi, yLo, yHi;
   2178             HReg tLo = newVRegI(env);
   2179             HReg tHi = newVRegI(env);
   2180             iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
   2181             addInstr(env, mk_iMOVsd_RR(xHi, tHi));
   2182             addInstr(env, mk_iMOVsd_RR(xLo, tLo));
   2183             iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
   2184             if (e->Iex.Binop.op==Iop_Add64) {
   2185                addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo));
   2186                addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi));
   2187             } else {
   2188                addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
   2189                addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
   2190             }
   2191             *rHi = tHi;
   2192             *rLo = tLo;
   2193             return;
   2194          }
   2195 
   2196          /* 32HLto64(e1,e2) */
   2197          case Iop_32HLto64:
   2198             *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2199             *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2200             return;
   2201 
   2202          /* 64-bit shifts */
   2203          case Iop_Shl64: {
   2204             /* We use the same ingenious scheme as gcc.  Put the value
   2205                to be shifted into %hi:%lo, and the shift amount into
   2206                %cl.  Then (dsts on right, a la ATT syntax):
   2207 
   2208                shldl %cl, %lo, %hi   -- make %hi be right for the
   2209                                      -- shift amt %cl % 32
   2210                shll  %cl, %lo        -- make %lo be right for the
   2211                                      -- shift amt %cl % 32
   2212 
   2213                Now, if (shift amount % 64) is in the range 32 .. 63,
   2214                we have to do a fixup, which puts the result low half
   2215                into the result high half, and zeroes the low half:
   2216 
   2217                testl $32, %ecx
   2218 
   2219                cmovnz %lo, %hi
   2220                movl $0, %tmp         -- sigh; need yet another reg
   2221                cmovnz %tmp, %lo
   2222             */
   2223             HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
   2224             tLo = newVRegI(env);
   2225             tHi = newVRegI(env);
   2226             tTemp = newVRegI(env);
   2227             rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2228             iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
   2229             addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
   2230             addInstr(env, mk_iMOVsd_RR(sHi, tHi));
   2231             addInstr(env, mk_iMOVsd_RR(sLo, tLo));
   2232             /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
   2233                and those regs are legitimately modifiable. */
   2234             addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi));
   2235             addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, tLo));
   2236             addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
   2237             addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi));
   2238             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
   2239             addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo));
   2240             *rHi = tHi;
   2241             *rLo = tLo;
   2242             return;
   2243          }
   2244 
   2245          case Iop_Shr64: {
   2246             /* We use the same ingenious scheme as gcc.  Put the value
   2247                to be shifted into %hi:%lo, and the shift amount into
   2248                %cl.  Then:
   2249 
   2250                shrdl %cl, %hi, %lo   -- make %lo be right for the
   2251                                      -- shift amt %cl % 32
   2252                shrl  %cl, %hi        -- make %hi be right for the
   2253                                      -- shift amt %cl % 32
   2254 
   2255                Now, if (shift amount % 64) is in the range 32 .. 63,
   2256                we have to do a fixup, which puts the result high half
   2257                into the result low half, and zeroes the high half:
   2258 
   2259                testl $32, %ecx
   2260 
   2261                cmovnz %hi, %lo
   2262                movl $0, %tmp         -- sigh; need yet another reg
   2263                cmovnz %tmp, %hi
   2264             */
   2265             HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
   2266             tLo = newVRegI(env);
   2267             tHi = newVRegI(env);
   2268             tTemp = newVRegI(env);
   2269             rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2270             iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
   2271             addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
   2272             addInstr(env, mk_iMOVsd_RR(sHi, tHi));
   2273             addInstr(env, mk_iMOVsd_RR(sLo, tLo));
   2274             /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
   2275                and those regs are legitimately modifiable. */
   2276             addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo));
   2277             addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, tHi));
   2278             addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
   2279             addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo));
   2280             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
   2281             addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi));
   2282             *rHi = tHi;
   2283             *rLo = tLo;
   2284             return;
   2285          }
   2286 
   2287          /* F64 -> I64 */
   2288          /* Sigh, this is an almost exact copy of the F64 -> I32/I16
   2289             case.  Unfortunately I see no easy way to avoid the
   2290             duplication. */
   2291          case Iop_F64toI64S: {
   2292             HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
   2293             HReg tLo = newVRegI(env);
   2294             HReg tHi = newVRegI(env);
   2295 
   2296             /* Used several times ... */
   2297             /* Careful ... this sharing is only safe because
    2298                zero_esp/four_esp do not hold any registers which the
    2299                register allocator could attempt to swizzle later. */
   2300             X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   2301             X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
   2302 
    2303             /* rf now holds the value to be converted;
    2304                e->Iex.Binop.arg1 supplies the rounding mode, encoded
    2305                as per the IRRoundingMode enum.  The first thing to
    2306                do is set the FPU's rounding mode accordingly. */
   2307 
   2308             /* Create a space for the format conversion. */
   2309             /* subl $8, %esp */
   2310             sub_from_esp(env, 8);
   2311 
   2312             /* Set host rounding mode */
   2313             set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2314 
   2315             /* gistll %rf, 0(%esp) */
   2316             addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp));
   2317 
   2318             /* movl 0(%esp), %dstLo */
   2319             /* movl 4(%esp), %dstHi */
   2320             addInstr(env, X86Instr_Alu32R(
   2321                              Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
   2322             addInstr(env, X86Instr_Alu32R(
   2323                              Xalu_MOV, X86RMI_Mem(four_esp), tHi));
   2324 
   2325             /* Restore default FPU rounding. */
   2326             set_FPU_rounding_default( env );
   2327 
   2328             /* addl $8, %esp */
   2329             add_to_esp(env, 8);
   2330 
   2331             *rHi = tHi;
   2332             *rLo = tLo;
   2333             return;
   2334          }
   2335 
   2336          case Iop_Add8x8:
   2337             fn = (HWord)h_generic_calc_Add8x8; goto binnish;
   2338          case Iop_Add16x4:
   2339             fn = (HWord)h_generic_calc_Add16x4; goto binnish;
   2340          case Iop_Add32x2:
   2341             fn = (HWord)h_generic_calc_Add32x2; goto binnish;
   2342 
   2343          case Iop_Avg8Ux8:
   2344             fn = (HWord)h_generic_calc_Avg8Ux8; goto binnish;
   2345          case Iop_Avg16Ux4:
   2346             fn = (HWord)h_generic_calc_Avg16Ux4; goto binnish;
   2347 
   2348          case Iop_CmpEQ8x8:
   2349             fn = (HWord)h_generic_calc_CmpEQ8x8; goto binnish;
   2350          case Iop_CmpEQ16x4:
   2351             fn = (HWord)h_generic_calc_CmpEQ16x4; goto binnish;
   2352          case Iop_CmpEQ32x2:
   2353             fn = (HWord)h_generic_calc_CmpEQ32x2; goto binnish;
   2354 
   2355          case Iop_CmpGT8Sx8:
   2356             fn = (HWord)h_generic_calc_CmpGT8Sx8; goto binnish;
   2357          case Iop_CmpGT16Sx4:
   2358             fn = (HWord)h_generic_calc_CmpGT16Sx4; goto binnish;
   2359          case Iop_CmpGT32Sx2:
   2360             fn = (HWord)h_generic_calc_CmpGT32Sx2; goto binnish;
   2361 
   2362          case Iop_InterleaveHI8x8:
   2363             fn = (HWord)h_generic_calc_InterleaveHI8x8; goto binnish;
   2364          case Iop_InterleaveLO8x8:
   2365             fn = (HWord)h_generic_calc_InterleaveLO8x8; goto binnish;
   2366          case Iop_InterleaveHI16x4:
   2367             fn = (HWord)h_generic_calc_InterleaveHI16x4; goto binnish;
   2368          case Iop_InterleaveLO16x4:
   2369             fn = (HWord)h_generic_calc_InterleaveLO16x4; goto binnish;
   2370          case Iop_InterleaveHI32x2:
   2371             fn = (HWord)h_generic_calc_InterleaveHI32x2; goto binnish;
   2372          case Iop_InterleaveLO32x2:
   2373             fn = (HWord)h_generic_calc_InterleaveLO32x2; goto binnish;
   2374          case Iop_CatOddLanes16x4:
   2375             fn = (HWord)h_generic_calc_CatOddLanes16x4; goto binnish;
   2376          case Iop_CatEvenLanes16x4:
   2377             fn = (HWord)h_generic_calc_CatEvenLanes16x4; goto binnish;
   2378          case Iop_Perm8x8:
   2379             fn = (HWord)h_generic_calc_Perm8x8; goto binnish;
   2380 
   2381          case Iop_Max8Ux8:
   2382             fn = (HWord)h_generic_calc_Max8Ux8; goto binnish;
   2383          case Iop_Max16Sx4:
   2384             fn = (HWord)h_generic_calc_Max16Sx4; goto binnish;
   2385          case Iop_Min8Ux8:
   2386             fn = (HWord)h_generic_calc_Min8Ux8; goto binnish;
   2387          case Iop_Min16Sx4:
   2388             fn = (HWord)h_generic_calc_Min16Sx4; goto binnish;
   2389 
   2390          case Iop_Mul16x4:
   2391             fn = (HWord)h_generic_calc_Mul16x4; goto binnish;
   2392          case Iop_Mul32x2:
   2393             fn = (HWord)h_generic_calc_Mul32x2; goto binnish;
   2394          case Iop_MulHi16Sx4:
   2395             fn = (HWord)h_generic_calc_MulHi16Sx4; goto binnish;
   2396          case Iop_MulHi16Ux4:
   2397             fn = (HWord)h_generic_calc_MulHi16Ux4; goto binnish;
   2398 
   2399          case Iop_QAdd8Sx8:
   2400             fn = (HWord)h_generic_calc_QAdd8Sx8; goto binnish;
   2401          case Iop_QAdd16Sx4:
   2402             fn = (HWord)h_generic_calc_QAdd16Sx4; goto binnish;
   2403          case Iop_QAdd8Ux8:
   2404             fn = (HWord)h_generic_calc_QAdd8Ux8; goto binnish;
   2405          case Iop_QAdd16Ux4:
   2406             fn = (HWord)h_generic_calc_QAdd16Ux4; goto binnish;
   2407 
   2408          case Iop_QNarrowBin32Sto16Sx4:
   2409             fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; goto binnish;
   2410          case Iop_QNarrowBin16Sto8Sx8:
   2411             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; goto binnish;
   2412          case Iop_QNarrowBin16Sto8Ux8:
   2413             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; goto binnish;
   2414          case Iop_NarrowBin16to8x8:
   2415             fn = (HWord)h_generic_calc_NarrowBin16to8x8; goto binnish;
   2416          case Iop_NarrowBin32to16x4:
   2417             fn = (HWord)h_generic_calc_NarrowBin32to16x4; goto binnish;
   2418 
   2419          case Iop_QSub8Sx8:
   2420             fn = (HWord)h_generic_calc_QSub8Sx8; goto binnish;
   2421          case Iop_QSub16Sx4:
   2422             fn = (HWord)h_generic_calc_QSub16Sx4; goto binnish;
   2423          case Iop_QSub8Ux8:
   2424             fn = (HWord)h_generic_calc_QSub8Ux8; goto binnish;
   2425          case Iop_QSub16Ux4:
   2426             fn = (HWord)h_generic_calc_QSub16Ux4; goto binnish;
   2427 
   2428          case Iop_Sub8x8:
   2429             fn = (HWord)h_generic_calc_Sub8x8; goto binnish;
   2430          case Iop_Sub16x4:
   2431             fn = (HWord)h_generic_calc_Sub16x4; goto binnish;
   2432          case Iop_Sub32x2:
   2433             fn = (HWord)h_generic_calc_Sub32x2; goto binnish;
   2434 
   2435          binnish: {
    2436             /* Note: the following assumes all helpers have the
    2437                signature
    2438                   ULong fn ( ULong, ULong )
    2439                and are not marked as regparm functions.
    2440             */
   2441             HReg xLo, xHi, yLo, yHi;
   2442             HReg tLo = newVRegI(env);
   2443             HReg tHi = newVRegI(env);
   2444             iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
   2445             addInstr(env, X86Instr_Push(X86RMI_Reg(yHi)));
   2446             addInstr(env, X86Instr_Push(X86RMI_Reg(yLo)));
   2447             iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
   2448             addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
   2449             addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
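                     /* The four pushes above, together with the call and
                        stack fixup below, amount to roughly:
                           pushl yHi ; pushl yLo ; pushl xHi ; pushl xLo
                           call  fn
                           addl  $16, %esp
                        with the 64-bit result returned in %edx:%eax, as
                        the cdecl convention specifies for a ULong. */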
   2450             addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
   2451             add_to_esp(env, 4*4);
   2452             addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
   2453             addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
   2454             *rHi = tHi;
   2455             *rLo = tLo;
   2456             return;
   2457          }
   2458 
   2459          case Iop_ShlN32x2:
   2460             fn = (HWord)h_generic_calc_ShlN32x2; goto shifty;
   2461          case Iop_ShlN16x4:
   2462             fn = (HWord)h_generic_calc_ShlN16x4; goto shifty;
   2463          case Iop_ShlN8x8:
   2464             fn = (HWord)h_generic_calc_ShlN8x8;  goto shifty;
   2465          case Iop_ShrN32x2:
   2466             fn = (HWord)h_generic_calc_ShrN32x2; goto shifty;
   2467          case Iop_ShrN16x4:
   2468             fn = (HWord)h_generic_calc_ShrN16x4; goto shifty;
   2469          case Iop_SarN32x2:
   2470             fn = (HWord)h_generic_calc_SarN32x2; goto shifty;
   2471          case Iop_SarN16x4:
   2472             fn = (HWord)h_generic_calc_SarN16x4; goto shifty;
   2473          case Iop_SarN8x8:
   2474             fn = (HWord)h_generic_calc_SarN8x8;  goto shifty;
   2475          shifty: {
    2476             /* Note: the following assumes all helpers have the
    2477                signature
    2478                   ULong fn ( ULong, UInt )
    2479                and are not marked as regparm functions.
    2480             */
   2481             HReg xLo, xHi;
   2482             HReg tLo = newVRegI(env);
   2483             HReg tHi = newVRegI(env);
   2484             X86RMI* y = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2485             addInstr(env, X86Instr_Push(y));
   2486             iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
   2487             addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
   2488             addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
   2489             addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
   2490             add_to_esp(env, 3*4);
   2491             addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
   2492             addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
   2493             *rHi = tHi;
   2494             *rLo = tLo;
   2495             return;
   2496          }
   2497 
   2498          default:
   2499             break;
   2500       }
   2501    } /* if (e->tag == Iex_Binop) */
   2502 
   2503 
   2504    /* --------- UNARY ops --------- */
   2505    if (e->tag == Iex_Unop) {
   2506       switch (e->Iex.Unop.op) {
   2507 
   2508          /* 32Sto64(e) */
   2509          case Iop_32Sto64: {
   2510             HReg tLo = newVRegI(env);
   2511             HReg tHi = newVRegI(env);
   2512             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   2513             addInstr(env, mk_iMOVsd_RR(src,tHi));
   2514             addInstr(env, mk_iMOVsd_RR(src,tLo));
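                     /* Arithmetic-shifting tHi right by 31 replicates
                        src's sign bit across all 32 bits. */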
   2515             addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tHi));
   2516             *rHi = tHi;
   2517             *rLo = tLo;
   2518             return;
   2519          }
   2520 
   2521          /* 32Uto64(e) */
   2522          case Iop_32Uto64: {
   2523             HReg tLo = newVRegI(env);
   2524             HReg tHi = newVRegI(env);
   2525             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   2526             addInstr(env, mk_iMOVsd_RR(src,tLo));
   2527             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
   2528             *rHi = tHi;
   2529             *rLo = tLo;
   2530             return;
   2531          }
   2532 
   2533          /* 16Uto64(e) */
   2534          case Iop_16Uto64: {
   2535             HReg tLo = newVRegI(env);
   2536             HReg tHi = newVRegI(env);
   2537             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   2538             addInstr(env, mk_iMOVsd_RR(src,tLo));
   2539             addInstr(env, X86Instr_Alu32R(Xalu_AND,
   2540                                           X86RMI_Imm(0xFFFF), tLo));
   2541             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
   2542             *rHi = tHi;
   2543             *rLo = tLo;
   2544             return;
   2545          }
   2546 
   2547          /* V128{HI}to64 */
   2548          case Iop_V128HIto64:
   2549          case Iop_V128to64: {
   2550             Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0;
   2551             HReg tLo = newVRegI(env);
   2552             HReg tHi = newVRegI(env);
   2553             HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
   2554             X86AMode* esp0  = X86AMode_IR(0,     hregX86_ESP());
   2555             X86AMode* espLO = X86AMode_IR(off,   hregX86_ESP());
   2556             X86AMode* espHI = X86AMode_IR(off+4, hregX86_ESP());
   2557             sub_from_esp(env, 16);
   2558             addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
   2559             addInstr(env, X86Instr_Alu32R( Xalu_MOV,
   2560                                            X86RMI_Mem(espLO), tLo ));
   2561             addInstr(env, X86Instr_Alu32R( Xalu_MOV,
   2562                                            X86RMI_Mem(espHI), tHi ));
   2563             add_to_esp(env, 16);
   2564             *rHi = tHi;
   2565             *rLo = tLo;
   2566             return;
   2567          }
   2568 
   2569          /* could do better than this, but for now ... */
   2570          case Iop_1Sto64: {
   2571             HReg tLo = newVRegI(env);
   2572             HReg tHi = newVRegI(env);
   2573             X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
   2574             addInstr(env, X86Instr_Set32(cond,tLo));
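                     /* Set32 leaves 0 or 1 in tLo; the SHL/SAR pair
                        below smears bit 0 across the whole word, giving
                        0 or 0xFFFFFFFF. */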
   2575             addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, tLo));
   2576             addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tLo));
   2577             addInstr(env, mk_iMOVsd_RR(tLo, tHi));
   2578             *rHi = tHi;
   2579             *rLo = tLo;
   2580             return;
   2581          }
   2582 
   2583          /* Not64(e) */
   2584          case Iop_Not64: {
   2585             HReg tLo = newVRegI(env);
   2586             HReg tHi = newVRegI(env);
   2587             HReg sHi, sLo;
   2588             iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg);
   2589             addInstr(env, mk_iMOVsd_RR(sHi, tHi));
   2590             addInstr(env, mk_iMOVsd_RR(sLo, tLo));
   2591             addInstr(env, X86Instr_Unary32(Xun_NOT,tHi));
   2592             addInstr(env, X86Instr_Unary32(Xun_NOT,tLo));
   2593             *rHi = tHi;
   2594             *rLo = tLo;
   2595             return;
   2596          }
   2597 
   2598          /* Left64(e) */
   2599          case Iop_Left64: {
   2600             HReg yLo, yHi;
   2601             HReg tLo = newVRegI(env);
   2602             HReg tHi = newVRegI(env);
   2603             /* yHi:yLo = arg */
   2604             iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
   2605             /* tLo = 0 - yLo, and set carry */
   2606             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tLo));
   2607             addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
   2608             /* tHi = 0 - yHi - carry */
   2609             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
   2610             addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
   2611             /* So now we have tHi:tLo = -arg.  To finish off, or 'arg'
   2612                back in, so as to give the final result
   2613                tHi:tLo = arg | -arg. */
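                    /* Worked example: arg = 0x28 (lowest set bit = bit 3)
                       gives -arg = 0x...FFD8, so arg | -arg = 0x...FFF8:
                       every bit at or above the lowest set bit is 1. */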
   2614             addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yLo), tLo));
   2615             addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yHi), tHi));
   2616             *rHi = tHi;
   2617             *rLo = tLo;
   2618             return;
   2619          }
   2620 
   2621          /* --- patterns rooted at: CmpwNEZ64 --- */
   2622 
   2623          /* CmpwNEZ64(e) */
   2624          case Iop_CmpwNEZ64: {
   2625 
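                    /* Both arms below use the same idiom: OR the two 32-bit
                       halves into a single word t, then compute
                       (t | -t) >>s 31.  If t != 0 the sign bit of t | -t is
                       always set, so the arithmetic shift yields 0xFFFFFFFF;
                       if t == 0 it yields 0. */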
   2626             DECLARE_PATTERN(p_CmpwNEZ64_Or64);
   2627             DEFINE_PATTERN(p_CmpwNEZ64_Or64,
   2628                            unop(Iop_CmpwNEZ64,binop(Iop_Or64,bind(0),bind(1))));
   2629             if (matchIRExpr(&mi, p_CmpwNEZ64_Or64, e)) {
   2630                /* CmpwNEZ64(Or64(x,y)) */
   2631                HReg xHi,xLo,yHi,yLo;
   2632                HReg xBoth  = newVRegI(env);
   2633                HReg merged = newVRegI(env);
   2634                HReg tmp2   = newVRegI(env);
   2635 
   2636                iselInt64Expr(&xHi,&xLo, env, mi.bindee[0]);
   2637                addInstr(env, mk_iMOVsd_RR(xHi,xBoth));
   2638                addInstr(env, X86Instr_Alu32R(Xalu_OR,
   2639                                              X86RMI_Reg(xLo),xBoth));
   2640 
   2641                iselInt64Expr(&yHi,&yLo, env, mi.bindee[1]);
   2642                addInstr(env, mk_iMOVsd_RR(yHi,merged));
   2643                addInstr(env, X86Instr_Alu32R(Xalu_OR,
   2644                                              X86RMI_Reg(yLo),merged));
   2645                addInstr(env, X86Instr_Alu32R(Xalu_OR,
   2646                                              X86RMI_Reg(xBoth),merged));
   2647 
   2648                /* tmp2 = (merged | -merged) >>s 31 */
   2649                addInstr(env, mk_iMOVsd_RR(merged,tmp2));
   2650                addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
   2651                addInstr(env, X86Instr_Alu32R(Xalu_OR,
   2652                                              X86RMI_Reg(merged), tmp2));
   2653                addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
   2654                *rHi = tmp2;
   2655                *rLo = tmp2;
   2656                return;
   2657             } else {
   2658                /* CmpwNEZ64(e) */
   2659                HReg srcLo, srcHi;
   2660                HReg tmp1  = newVRegI(env);
   2661                HReg tmp2  = newVRegI(env);
   2662                /* srcHi:srcLo = arg */
   2663                iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg);
   2664                /* tmp1 = srcHi | srcLo */
   2665                addInstr(env, mk_iMOVsd_RR(srcHi,tmp1));
   2666                addInstr(env, X86Instr_Alu32R(Xalu_OR,
   2667                                              X86RMI_Reg(srcLo), tmp1));
   2668                /* tmp2 = (tmp1 | -tmp1) >>s 31 */
   2669                addInstr(env, mk_iMOVsd_RR(tmp1,tmp2));
   2670                addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
   2671                addInstr(env, X86Instr_Alu32R(Xalu_OR,
   2672                                              X86RMI_Reg(tmp1), tmp2));
   2673                addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
   2674                *rHi = tmp2;
   2675                *rLo = tmp2;
   2676                return;
   2677             }
   2678          } /* case Iop_CmpwNEZ64 */
   2679 
   2680          /* ReinterpF64asI64(e) */
   2681          /* Given an IEEE754 double, produce an I64 with the same bit
   2682             pattern. */
   2683          case Iop_ReinterpF64asI64: {
   2684             HReg rf   = iselDblExpr(env, e->Iex.Unop.arg);
   2685             HReg tLo  = newVRegI(env);
   2686             HReg tHi  = newVRegI(env);
   2687             X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   2688             X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
   2689             /* paranoia */
   2690             set_FPU_rounding_default(env);
   2691             /* subl $8, %esp */
   2692             sub_from_esp(env, 8);
   2693             /* gstD %rf, 0(%esp) */
   2694             addInstr(env,
   2695                      X86Instr_FpLdSt(False/*store*/, 8, rf, zero_esp));
   2696             /* movl 0(%esp), %tLo */
   2697             addInstr(env,
   2698                      X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
   2699             /* movl 4(%esp), %tHi */
   2700             addInstr(env,
   2701                      X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(four_esp), tHi));
   2702             /* addl $8, %esp */
   2703             add_to_esp(env, 8);
   2704             *rHi = tHi;
   2705             *rLo = tLo;
   2706             return;
   2707          }
   2708 
   2709          case Iop_CmpNEZ32x2:
   2710             fn = (HWord)h_generic_calc_CmpNEZ32x2; goto unish;
   2711          case Iop_CmpNEZ16x4:
   2712             fn = (HWord)h_generic_calc_CmpNEZ16x4; goto unish;
   2713          case Iop_CmpNEZ8x8:
   2714             fn = (HWord)h_generic_calc_CmpNEZ8x8; goto unish;
   2715          unish: {
   2716             /* Note: the following assumes that all helpers have
   2717                signature
   2718                   ULong fn ( ULong ),
   2719                and are not marked as regparm functions.
   2720             */
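                    /* Concretely: the ULong argument is passed cdecl-style
                       as two 32-bit pushes, high word first, and the ULong
                       result comes back in EDX:EAX, which is what the moves
                       after the call assume. */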
   2721             HReg xLo, xHi;
   2722             HReg tLo = newVRegI(env);
   2723             HReg tHi = newVRegI(env);
   2724             iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
   2725             addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
   2726             addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
   2727             addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
   2728             add_to_esp(env, 2*4);
   2729             addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
   2730             addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
   2731             *rHi = tHi;
   2732             *rLo = tLo;
   2733             return;
   2734          }
   2735 
   2736          default:
   2737             break;
   2738       }
   2739    } /* if (e->tag == Iex_Unop) */
   2740 
   2741 
   2742    /* --------- CCALL --------- */
   2743    if (e->tag == Iex_CCall) {
   2744       HReg tLo = newVRegI(env);
   2745       HReg tHi = newVRegI(env);
   2746 
   2747       /* Marshal args, do the call, clear stack. */
   2748       doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
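              /* Under the x86 cdecl convention a 64-bit result is returned
                 in EDX:EAX, hence the two moves below. */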
   2749 
   2750       addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
   2751       addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
   2752       *rHi = tHi;
   2753       *rLo = tLo;
   2754       return;
   2755    }
   2756 
   2757    ppIRExpr(e);
   2758    vpanic("iselInt64Expr");
   2759 }
   2760 
   2761 
   2762 /*---------------------------------------------------------*/
   2763 /*--- ISEL: Floating point expressions (32 bit)         ---*/
   2764 /*---------------------------------------------------------*/
   2765 
   2766 /* Nothing interesting here; really just wrappers for
   2767    64-bit stuff. */
   2768 
   2769 static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
   2770 {
   2771    HReg r = iselFltExpr_wrk( env, e );
   2772 #  if 0
   2773    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2774 #  endif
   2775    vassert(hregClass(r) == HRcFlt64); /* yes, really Flt64 */
   2776    vassert(hregIsVirtual(r));
   2777    return r;
   2778 }
   2779 
   2780 /* DO NOT CALL THIS DIRECTLY */
   2781 static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
   2782 {
   2783    IRType ty = typeOfIRExpr(env->type_env,e);
   2784    vassert(ty == Ity_F32);
   2785 
   2786    if (e->tag == Iex_RdTmp) {
   2787       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2788    }
   2789 
   2790    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2791       X86AMode* am;
   2792       HReg res = newVRegF(env);
   2793       vassert(e->Iex.Load.ty == Ity_F32);
   2794       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2795       addInstr(env, X86Instr_FpLdSt(True/*load*/, 4, res, am));
   2796       return res;
   2797    }
   2798 
   2799    if (e->tag == Iex_Binop
   2800        && e->Iex.Binop.op == Iop_F64toF32) {
   2801       /* Although the result is still held in a standard FPU register,
   2802          we need to round it to reflect the loss of accuracy/range
   2803          entailed in casting it to a 32-bit float. */
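              /* Note the bracketing: set the requested rounding mode, emit
                 the single conversion insn, then restore the VEX default so
                 that later code sees the expected FPU state. */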
   2804       HReg dst = newVRegF(env);
   2805       HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
   2806       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2807       addInstr(env, X86Instr_Fp64to32(src,dst));
   2808       set_FPU_rounding_default( env );
   2809       return dst;
   2810    }
   2811 
   2812    if (e->tag == Iex_Get) {
   2813       X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
   2814                                   hregX86_EBP() );
   2815       HReg res = newVRegF(env);
   2816       addInstr(env, X86Instr_FpLdSt( True/*load*/, 4, res, am ));
   2817       return res;
   2818    }
   2819 
   2820    if (e->tag == Iex_Unop
   2821        && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
   2822       /* Given an I32, produce an IEEE754 float with the same bit
   2823          pattern. */
   2824       HReg    dst = newVRegF(env);
   2825       X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
   2826       /* paranoia */
   2827       addInstr(env, X86Instr_Push(rmi));
   2828       addInstr(env, X86Instr_FpLdSt(
   2829                        True/*load*/, 4, dst,
   2830                        X86AMode_IR(0, hregX86_ESP())));
   2831       add_to_esp(env, 4);
   2832       return dst;
   2833    }
   2834 
   2835    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
   2836       HReg rf  = iselFltExpr(env, e->Iex.Binop.arg2);
   2837       HReg dst = newVRegF(env);
   2838 
   2839       /* rf now holds the value to be rounded.  The first thing to do
   2840          is set the FPU's rounding mode accordingly. */
   2841 
   2842       /* Set host rounding mode */
   2843       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2844 
   2845       /* grndint %rf, %dst */
   2846       addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
   2847 
   2848       /* Restore default FPU rounding. */
   2849       set_FPU_rounding_default( env );
   2850 
   2851       return dst;
   2852    }
   2853 
   2854    ppIRExpr(e);
   2855    vpanic("iselFltExpr_wrk");
   2856 }
   2857 
   2858 
   2859 /*---------------------------------------------------------*/
   2860 /*--- ISEL: Floating point expressions (64 bit)         ---*/
   2861 /*---------------------------------------------------------*/
   2862 
   2863 /* Compute a 64-bit floating point value into a register, the identity
   2864    of which is returned.  As with iselIntExpr_R, the reg may be either
   2865    real or virtual; in any case it must not be changed by subsequent
   2866    code emitted by the caller.  */
   2867 
   2868 /* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
   2869 
   2870     Type                  S (1 bit)   E (11 bits)   F (52 bits)
   2871     ----                  ---------   -----------   -----------
   2872     signalling NaN        u           2047 (max)    .0uuuuu---u
   2873                                                     (with at least
   2874                                                      one 1 bit)
   2875     quiet NaN             u           2047 (max)    .1uuuuu---u
   2876 
   2877     negative infinity     1           2047 (max)    .000000---0
   2878 
   2879     positive infinity     0           2047 (max)    .000000---0
   2880 
   2881     negative zero         1           0             .000000---0
   2882 
   2883     positive zero         0           0             .000000---0
   2884 */
   2885 
   2886 static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
   2887 {
   2888    HReg r = iselDblExpr_wrk( env, e );
   2889 #  if 0
   2890    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2891 #  endif
   2892    vassert(hregClass(r) == HRcFlt64);
   2893    vassert(hregIsVirtual(r));
   2894    return r;
   2895 }
   2896 
   2897 /* DO NOT CALL THIS DIRECTLY */
   2898 static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
   2899 {
   2900    IRType ty = typeOfIRExpr(env->type_env,e);
   2901    vassert(e);
   2902    vassert(ty == Ity_F64);
   2903 
   2904    if (e->tag == Iex_RdTmp) {
   2905       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2906    }
   2907 
   2908    if (e->tag == Iex_Const) {
   2909       union { UInt u32x2[2]; ULong u64; Double f64; } u;
   2910       HReg freg = newVRegF(env);
   2911       vassert(sizeof(u) == 8);
   2912       vassert(sizeof(u.u64) == 8);
   2913       vassert(sizeof(u.f64) == 8);
   2914       vassert(sizeof(u.u32x2) == 8);
   2915 
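              /* Viewing the constant through the union lets it be pushed
                 as two 32-bit immediates, high word first, so the eight
                 bytes sit little-endian at 0(%esp), ready for the 8-byte
                 FP load below. */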
   2916       if (e->Iex.Const.con->tag == Ico_F64) {
   2917          u.f64 = e->Iex.Const.con->Ico.F64;
   2918       }
   2919       else if (e->Iex.Const.con->tag == Ico_F64i) {
   2920          u.u64 = e->Iex.Const.con->Ico.F64i;
   2921       }
   2922       else
   2923          vpanic("iselDblExpr(x86): const");
   2924 
   2925       addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[1])));
   2926       addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[0])));
   2927       addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, freg,
   2928                                     X86AMode_IR(0, hregX86_ESP())));
   2929       add_to_esp(env, 8);
   2930       return freg;
   2931    }
   2932 
   2933    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2934       X86AMode* am;
   2935       HReg res = newVRegF(env);
   2936       vassert(e->Iex.Load.ty == Ity_F64);
   2937       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2938       addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, res, am));
   2939       return res;
   2940    }
   2941 
   2942    if (e->tag == Iex_Get) {
   2943       X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
   2944                                   hregX86_EBP() );
   2945       HReg res = newVRegF(env);
   2946       addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
   2947       return res;
   2948    }
   2949 
   2950    if (e->tag == Iex_GetI) {
   2951       X86AMode* am
   2952          = genGuestArrayOffset(
   2953               env, e->Iex.GetI.descr,
   2954                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   2955       HReg res = newVRegF(env);
   2956       addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
   2957       return res;
   2958    }
   2959 
   2960    if (e->tag == Iex_Triop) {
   2961       X86FpOp fpop = Xfp_INVALID;
   2962       IRTriop *triop = e->Iex.Triop.details;
   2963       switch (triop->op) {
   2964          case Iop_AddF64:    fpop = Xfp_ADD; break;
   2965          case Iop_SubF64:    fpop = Xfp_SUB; break;
   2966          case Iop_MulF64:    fpop = Xfp_MUL; break;
   2967          case Iop_DivF64:    fpop = Xfp_DIV; break;
   2968          case Iop_ScaleF64:  fpop = Xfp_SCALE; break;
   2969          case Iop_Yl2xF64:   fpop = Xfp_YL2X; break;
   2970          case Iop_Yl2xp1F64: fpop = Xfp_YL2XP1; break;
   2971          case Iop_AtanF64:   fpop = Xfp_ATAN; break;
   2972          case Iop_PRemF64:   fpop = Xfp_PREM; break;
   2973          case Iop_PRem1F64:  fpop = Xfp_PREM1; break;
   2974          default: break;
   2975       }
   2976       if (fpop != Xfp_INVALID) {
   2977          HReg res  = newVRegF(env);
   2978          HReg srcL = iselDblExpr(env, triop->arg2);
   2979          HReg srcR = iselDblExpr(env, triop->arg3);
   2980          /* XXXROUNDINGFIXME */
   2981          /* set roundingmode here */
   2982          addInstr(env, X86Instr_FpBinary(fpop,srcL,srcR,res));
   2983          if (fpop != Xfp_ADD && fpop != Xfp_SUB
   2984              && fpop != Xfp_MUL && fpop != Xfp_DIV)
   2985             roundToF64(env, res);
   2986          return res;
   2987       }
   2988    }
   2989 
   2990    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
   2991       HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
   2992       HReg dst = newVRegF(env);
   2993 
   2994       /* rf now holds the value to be rounded.  The first thing to do
   2995          is set the FPU's rounding mode accordingly. */
   2996 
   2997       /* Set host rounding mode */
   2998       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2999 
   3000       /* grndint %rf, %dst */
   3001       addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
   3002 
   3003       /* Restore default FPU rounding. */
   3004       set_FPU_rounding_default( env );
   3005 
   3006       return dst;
   3007    }
   3008 
   3009    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
   3010       HReg dst = newVRegF(env);
   3011       HReg rHi,rLo;
   3012       iselInt64Expr( &rHi, &rLo, env, e->Iex.Binop.arg2);
   3013       addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
   3014       addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
   3015 
   3016       /* Set host rounding mode */
   3017       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   3018 
   3019       addInstr(env, X86Instr_FpLdStI(
   3020                        True/*load*/, 8, dst,
   3021                        X86AMode_IR(0, hregX86_ESP())));
   3022 
   3023       /* Restore default FPU rounding. */
   3024       set_FPU_rounding_default( env );
   3025 
   3026       add_to_esp(env, 8);
   3027       return dst;
   3028    }
   3029 
   3030    if (e->tag == Iex_Binop) {
   3031       X86FpOp fpop = Xfp_INVALID;
   3032       switch (e->Iex.Binop.op) {
   3033          case Iop_SinF64:  fpop = Xfp_SIN; break;
   3034          case Iop_CosF64:  fpop = Xfp_COS; break;
   3035          case Iop_TanF64:  fpop = Xfp_TAN; break;
   3036          case Iop_2xm1F64: fpop = Xfp_2XM1; break;
   3037          case Iop_SqrtF64: fpop = Xfp_SQRT; break;
   3038          default: break;
   3039       }
   3040       if (fpop != Xfp_INVALID) {
   3041          HReg res = newVRegF(env);
   3042          HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
   3043          /* XXXROUNDINGFIXME */
   3044          /* set roundingmode here */
   3045          addInstr(env, X86Instr_FpUnary(fpop,src,res));
   3046          if (fpop != Xfp_SQRT
   3047              && fpop != Xfp_NEG && fpop != Xfp_ABS)
   3048             roundToF64(env, res);
   3049          return res;
   3050       }
   3051    }
   3052 
   3053    if (e->tag == Iex_Unop) {
   3054       X86FpOp fpop = Xfp_INVALID;
   3055       switch (e->Iex.Unop.op) {
   3056          case Iop_NegF64:  fpop = Xfp_NEG; break;
   3057          case Iop_AbsF64:  fpop = Xfp_ABS; break;
   3058          default: break;
   3059       }
   3060       if (fpop != Xfp_INVALID) {
   3061          HReg res = newVRegF(env);
   3062          HReg src = iselDblExpr(env, e->Iex.Unop.arg);
   3063          addInstr(env, X86Instr_FpUnary(fpop,src,res));
   3064          if (fpop != Xfp_NEG && fpop != Xfp_ABS)
   3065             roundToF64(env, res);
   3066          return res;
   3067       }
   3068    }
   3069 
   3070    if (e->tag == Iex_Unop) {
   3071       switch (e->Iex.Unop.op) {
   3072          case Iop_I32StoF64: {
   3073             HReg dst = newVRegF(env);
   3074             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
   3075             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
   3076             set_FPU_rounding_default(env);
   3077             addInstr(env, X86Instr_FpLdStI(
   3078                              True/*load*/, 4, dst,
   3079                              X86AMode_IR(0, hregX86_ESP())));
   3080             add_to_esp(env, 4);
   3081             return dst;
   3082          }
   3083          case Iop_ReinterpI64asF64: {
   3084             /* Given an I64, produce an IEEE754 double with the same
   3085                bit pattern. */
   3086             HReg dst = newVRegF(env);
   3087             HReg rHi, rLo;
   3088             iselInt64Expr( &rHi, &rLo, env, e->Iex.Unop.arg);
   3089             /* paranoia */
   3090             set_FPU_rounding_default(env);
   3091             addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
   3092             addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
   3093             addInstr(env, X86Instr_FpLdSt(
   3094                              True/*load*/, 8, dst,
   3095                              X86AMode_IR(0, hregX86_ESP())));
   3096             add_to_esp(env, 8);
   3097             return dst;
   3098          }
   3099          case Iop_F32toF64: {
   3100             /* this is a no-op */
   3101             HReg res = iselFltExpr(env, e->Iex.Unop.arg);
   3102             return res;
   3103          }
   3104          default:
   3105             break;
   3106       }
   3107    }
   3108 
   3109    /* --------- MULTIPLEX --------- */
   3110    if (e->tag == Iex_Mux0X) {
   3111       if (ty == Ity_F64
   3112           && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
   3113          X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
   3114          HReg rX  = iselDblExpr(env, e->Iex.Mux0X.exprX);
   3115          HReg r0  = iselDblExpr(env, e->Iex.Mux0X.expr0);
   3116          HReg dst = newVRegF(env);
   3117          addInstr(env, X86Instr_FpUnary(Xfp_MOV,rX,dst));
   3118          addInstr(env, X86Instr_Test32(0xFF, r8));
   3119          addInstr(env, X86Instr_FpCMov(Xcc_Z,r0,dst));
   3120          return dst;
   3121       }
   3122    }
   3123 
   3124    ppIRExpr(e);
   3125    vpanic("iselDblExpr_wrk");
   3126 }
   3127 
   3128 
   3129 /*---------------------------------------------------------*/
   3130 /*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
   3131 /*---------------------------------------------------------*/
   3132 
   3133 static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
   3134 {
   3135    HReg r = iselVecExpr_wrk( env, e );
   3136 #  if 0
   3137    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   3138 #  endif
   3139    vassert(hregClass(r) == HRcVec128);
   3140    vassert(hregIsVirtual(r));
   3141    return r;
   3142 }
   3143 
   3144 
   3145 /* DO NOT CALL THIS DIRECTLY */
   3146 static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
   3147 {
   3148 
   3149 #  define REQUIRE_SSE1                                    \
   3150       do { if (env->hwcaps == 0/*baseline, no sse*/)      \
   3151               goto vec_fail;                              \
   3152       } while (0)
   3153 
   3154 #  define REQUIRE_SSE2                                    \
   3155       do { if (0 == (env->hwcaps & VEX_HWCAPS_X86_SSE2))  \
   3156               goto vec_fail;                              \
   3157       } while (0)
   3158 
   3159 #  define SSE2_OR_ABOVE                                   \
   3160        (env->hwcaps & VEX_HWCAPS_X86_SSE2)
   3161 
   3162    HWord     fn = 0; /* address of helper fn, if required */
   3163    MatchInfo mi;
   3164    Bool      arg1isEReg = False;
   3165    X86SseOp  op = Xsse_INVALID;
   3166    IRType    ty = typeOfIRExpr(env->type_env,e);
   3167    vassert(e);
   3168    vassert(ty == Ity_V128);
   3169 
   3170    REQUIRE_SSE1;
   3171 
   3172    if (e->tag == Iex_RdTmp) {
   3173       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   3174    }
   3175 
   3176    if (e->tag == Iex_Get) {
   3177       HReg dst = newVRegV(env);
   3178       addInstr(env, X86Instr_SseLdSt(
   3179                        True/*load*/,
   3180                        dst,
   3181                        X86AMode_IR(e->Iex.Get.offset, hregX86_EBP())
   3182                     )
   3183               );
   3184       return dst;
   3185    }
   3186 
   3187    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   3188       HReg      dst = newVRegV(env);
   3189       X86AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
   3190       addInstr(env, X86Instr_SseLdSt( True/*load*/, dst, am ));
   3191       return dst;
   3192    }
   3193 
   3194    if (e->tag == Iex_Const) {
   3195       HReg dst = newVRegV(env);
   3196       vassert(e->Iex.Const.con->tag == Ico_V128);
   3197       addInstr(env, X86Instr_SseConst(e->Iex.Const.con->Ico.V128, dst));
   3198       return dst;
   3199    }
   3200 
   3201    if (e->tag == Iex_Unop) {
   3202 
   3203    if (SSE2_OR_ABOVE) {
   3204       /* 64UtoV128(LDle:I64(addr)) */
   3205       DECLARE_PATTERN(p_zwiden_load64);
   3206       DEFINE_PATTERN(p_zwiden_load64,
   3207                      unop(Iop_64UtoV128,
   3208                           IRExpr_Load(Iend_LE,Ity_I64,bind(0))));
   3209       if (matchIRExpr(&mi, p_zwiden_load64, e)) {
   3210          X86AMode* am = iselIntExpr_AMode(env, mi.bindee[0]);
   3211          HReg dst = newVRegV(env);
   3212          addInstr(env, X86Instr_SseLdzLO(8, dst, am));
   3213          return dst;
   3214       }
   3215    }
   3216 
   3217    switch (e->Iex.Unop.op) {
   3218 
   3219       case Iop_NotV128: {
   3220          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3221          return do_sse_Not128(env, arg);
   3222       }
   3223 
   3224       case Iop_CmpNEZ64x2: {
   3225          /* We can use SSE2 instructions for this. */
   3226          /* Ideally, we want to do a 64Ix2 comparison against zero of
   3227             the operand.  Problem is no such insn exists.  Solution
   3228             therefore is to do a 32Ix4 comparison instead, and bitwise-
   3229             negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
   3230             let the not'd result of this initial comparison be a:b:c:d.
   3231             What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
   3232             pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
   3233             giving the required result.
   3234 
   3235             The required selection sequence is 2,3,0,1, which
   3236             according to Intel's documentation means the pshufd
   3237             literal value is 0xB1, that is,
   3238             (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
   3239          */
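                 /* (Check: 0x80 | 0x30 | 0x00 | 0x01 == 0xB1.  Each 2-bit
                    field, low to high, selects the source lane for dest
                    lanes 0..3, so 0xB1 swaps each pair of adjacent 32-bit
                    lanes.) */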
   3240          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
   3241          HReg tmp  = newVRegV(env);
   3242          HReg dst  = newVRegV(env);
   3243          REQUIRE_SSE2;
   3244          addInstr(env, X86Instr_SseReRg(Xsse_XOR, tmp, tmp));
   3245          addInstr(env, X86Instr_SseReRg(Xsse_CMPEQ32, arg, tmp));
   3246          tmp = do_sse_Not128(env, tmp);
   3247          addInstr(env, X86Instr_SseShuf(0xB1, tmp, dst));
   3248          addInstr(env, X86Instr_SseReRg(Xsse_OR, tmp, dst));
   3249          return dst;
   3250       }
   3251 
   3252       case Iop_CmpNEZ32x4: {
   3253          /* Sigh, we have to generate lousy code since this has to
   3254             work on SSE1 hosts */
   3255          /* basically, the idea is: for each lane:
   3256                movl lane, %r ; negl %r   (now CF = lane==0 ? 0 : 1)
   3257                sbbl %r, %r               (now %r = 1Sto32(CF))
   3258                movl %r, lane
   3259          */
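                 /* (sbbl %r,%r computes %r - %r - CF = 0 - CF, i.e. all
                    zeroes if the lane was zero, all ones otherwise.) */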
   3260          Int       i;
   3261          X86AMode* am;
   3262          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3263          HReg      arg  = iselVecExpr(env, e->Iex.Unop.arg);
   3264          HReg      dst  = newVRegV(env);
   3265          HReg      r32  = newVRegI(env);
   3266          sub_from_esp(env, 16);
   3267          addInstr(env, X86Instr_SseLdSt(False/*store*/, arg, esp0));
   3268          for (i = 0; i < 4; i++) {
   3269             am = X86AMode_IR(i*4, hregX86_ESP());
   3270             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), r32));
   3271             addInstr(env, X86Instr_Unary32(Xun_NEG, r32));
   3272             addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(r32), r32));
   3273             addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r32), am));
   3274          }
   3275          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
   3276          add_to_esp(env, 16);
   3277          return dst;
   3278       }
   3279 
   3280       case Iop_CmpNEZ8x16:
   3281       case Iop_CmpNEZ16x8: {
   3282          /* We can use SSE2 instructions for this. */
   3283          HReg arg;
   3284          HReg vec0 = newVRegV(env);
   3285          HReg vec1 = newVRegV(env);
   3286          HReg dst  = newVRegV(env);
   3287          X86SseOp cmpOp
   3288             = e->Iex.Unop.op==Iop_CmpNEZ16x8 ? Xsse_CMPEQ16
   3289                                              : Xsse_CMPEQ8;
   3290          REQUIRE_SSE2;
   3291          addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec0, vec0));
   3292          addInstr(env, mk_vMOVsd_RR(vec0, vec1));
   3293          addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, vec1, vec1));
   3294          /* defer the arg computation to here, so as to give the
   3295             CMPEQF as much time as possible to complete */
   3296          arg = iselVecExpr(env, e->Iex.Unop.arg);
   3297          /* vec0 is all 0s; vec1 is all 1s */
   3298          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3299          /* 16x8 or 8x16 comparison == */
   3300          addInstr(env, X86Instr_SseReRg(cmpOp, vec0, dst));
   3301          /* invert result */
   3302          addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec1, dst));
   3303          return dst;
   3304       }
   3305 
   3306       case Iop_Recip32Fx4: op = Xsse_RCPF;   goto do_32Fx4_unary;
   3307       case Iop_RSqrt32Fx4: op = Xsse_RSQRTF; goto do_32Fx4_unary;
   3308       case Iop_Sqrt32Fx4:  op = Xsse_SQRTF;  goto do_32Fx4_unary;
   3309       do_32Fx4_unary:
   3310       {
   3311          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3312          HReg dst = newVRegV(env);
   3313          addInstr(env, X86Instr_Sse32Fx4(op, arg, dst));
   3314          return dst;
   3315       }
   3316 
   3317       case Iop_Recip64Fx2: op = Xsse_RCPF;   goto do_64Fx2_unary;
   3318       case Iop_RSqrt64Fx2: op = Xsse_RSQRTF; goto do_64Fx2_unary;
   3319       case Iop_Sqrt64Fx2:  op = Xsse_SQRTF;  goto do_64Fx2_unary;
   3320       do_64Fx2_unary:
   3321       {
   3322          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3323          HReg dst = newVRegV(env);
   3324          REQUIRE_SSE2;
   3325          addInstr(env, X86Instr_Sse64Fx2(op, arg, dst));
   3326          return dst;
   3327       }
   3328 
   3329       case Iop_Recip32F0x4: op = Xsse_RCPF;   goto do_32F0x4_unary;
   3330       case Iop_RSqrt32F0x4: op = Xsse_RSQRTF; goto do_32F0x4_unary;
   3331       case Iop_Sqrt32F0x4:  op = Xsse_SQRTF;  goto do_32F0x4_unary;
   3332       do_32F0x4_unary:
   3333       {
   3334          /* A bit subtle.  We have to copy the arg to the result
   3335             register first, because actually doing the SSE scalar insn
   3336             leaves the upper 3/4 of the destination register
   3337             unchanged, whereas the required semantics of these
   3338             primops are that the upper 3/4 is simply copied in from
   3339             the argument. */
   3340          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3341          HReg dst = newVRegV(env);
   3342          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3343          addInstr(env, X86Instr_Sse32FLo(op, arg, dst));
   3344          return dst;
   3345       }
   3346 
   3347       case Iop_Recip64F0x2: op = Xsse_RCPF;   goto do_64F0x2_unary;
   3348       case Iop_RSqrt64F0x2: op = Xsse_RSQRTF; goto do_64F0x2_unary;
   3349       case Iop_Sqrt64F0x2:  op = Xsse_SQRTF;  goto do_64F0x2_unary;
   3350       do_64F0x2_unary:
   3351       {
   3352          /* A bit subtle.  We have to copy the arg to the result
   3353             register first, because actually doing the SSE scalar insn
   3354             leaves the upper half of the destination register
   3355             unchanged, whereas the required semantics of these
   3356             primops are that the upper half is simply copied in from
   3357             the argument. */
   3358          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3359          HReg dst = newVRegV(env);
   3360          REQUIRE_SSE2;
   3361          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3362          addInstr(env, X86Instr_Sse64FLo(op, arg, dst));
   3363          return dst;
   3364       }
   3365 
   3366       case Iop_32UtoV128: {
   3367          HReg      dst  = newVRegV(env);
   3368          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3369          X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
   3370          addInstr(env, X86Instr_Push(rmi));
   3371          addInstr(env, X86Instr_SseLdzLO(4, dst, esp0));
   3372          add_to_esp(env, 4);
   3373          return dst;
   3374       }
   3375 
   3376       case Iop_64UtoV128: {
   3377          HReg      rHi, rLo;
   3378          HReg      dst  = newVRegV(env);
   3379          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3380          iselInt64Expr(&rHi, &rLo, env, e->Iex.Unop.arg);
   3381          addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
   3382          addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
   3383          addInstr(env, X86Instr_SseLdzLO(8, dst, esp0));
   3384          add_to_esp(env, 8);
   3385          return dst;
   3386       }
   3387 
   3388       default:
   3389          break;
   3390    } /* switch (e->Iex.Unop.op) */
   3391    } /* if (e->tag == Iex_Unop) */
   3392 
   3393    if (e->tag == Iex_Binop) {
   3394    switch (e->Iex.Binop.op) {
   3395 
   3396       case Iop_SetV128lo32: {
   3397          HReg dst = newVRegV(env);
   3398          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3399          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3400          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3401          sub_from_esp(env, 16);
   3402          addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
   3403          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcI), esp0));
   3404          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
   3405          add_to_esp(env, 16);
   3406          return dst;
   3407       }
   3408 
   3409       case Iop_SetV128lo64: {
   3410          HReg dst = newVRegV(env);
   3411          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3412          HReg srcIhi, srcIlo;
   3413          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3414          X86AMode* esp4 = advance4(esp0);
   3415          iselInt64Expr(&srcIhi, &srcIlo, env, e->Iex.Binop.arg2);
   3416          sub_from_esp(env, 16);
   3417          addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
   3418          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIlo), esp0));
   3419          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIhi), esp4));
   3420          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
   3421          add_to_esp(env, 16);
   3422          return dst;
   3423       }
   3424 
   3425       case Iop_64HLtoV128: {
   3426          HReg r3, r2, r1, r0;
   3427          X86AMode* esp0  = X86AMode_IR(0, hregX86_ESP());
   3428          X86AMode* esp4  = advance4(esp0);
   3429          X86AMode* esp8  = advance4(esp4);
   3430          X86AMode* esp12 = advance4(esp8);
   3431          HReg dst = newVRegV(env);
   3432          /* do this via the stack (easy, convenient, etc) */
   3433          sub_from_esp(env, 16);
   3434          /* Do the less significant 64 bits */
   3435          iselInt64Expr(&r1, &r0, env, e->Iex.Binop.arg2);
   3436          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r0), esp0));
   3437          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r1), esp4));
   3438          /* Do the more significant 64 bits */
   3439          iselInt64Expr(&r3, &r2, env, e->Iex.Binop.arg1);
   3440          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r2), esp8));
   3441          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r3), esp12));
   3442          /* Fetch result back from stack. */
   3443          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
   3444          add_to_esp(env, 16);
   3445          return dst;
   3446       }
   3447 
   3448       case Iop_CmpEQ32Fx4: op = Xsse_CMPEQF; goto do_32Fx4;
   3449       case Iop_CmpLT32Fx4: op = Xsse_CMPLTF; goto do_32Fx4;
   3450       case Iop_CmpLE32Fx4: op = Xsse_CMPLEF; goto do_32Fx4;
   3451       case Iop_CmpUN32Fx4: op = Xsse_CMPUNF; goto do_32Fx4;
   3452       case Iop_Add32Fx4:   op = Xsse_ADDF;   goto do_32Fx4;
   3453       case Iop_Div32Fx4:   op = Xsse_DIVF;   goto do_32Fx4;
   3454       case Iop_Max32Fx4:   op = Xsse_MAXF;   goto do_32Fx4;
   3455       case Iop_Min32Fx4:   op = Xsse_MINF;   goto do_32Fx4;
   3456       case Iop_Mul32Fx4:   op = Xsse_MULF;   goto do_32Fx4;
   3457       case Iop_Sub32Fx4:   op = Xsse_SUBF;   goto do_32Fx4;
   3458       do_32Fx4:
   3459       {
   3460          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3461          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3462          HReg dst = newVRegV(env);
   3463          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3464          addInstr(env, X86Instr_Sse32Fx4(op, argR, dst));
   3465          return dst;
   3466       }
   3467 
   3468       case Iop_CmpEQ64Fx2: op = Xsse_CMPEQF; goto do_64Fx2;
   3469       case Iop_CmpLT64Fx2: op = Xsse_CMPLTF; goto do_64Fx2;
   3470       case Iop_CmpLE64Fx2: op = Xsse_CMPLEF; goto do_64Fx2;
   3471       case Iop_CmpUN64Fx2: op = Xsse_CMPUNF; goto do_64Fx2;
   3472       case Iop_Add64Fx2:   op = Xsse_ADDF;   goto do_64Fx2;
   3473       case Iop_Div64Fx2:   op = Xsse_DIVF;   goto do_64Fx2;
   3474       case Iop_Max64Fx2:   op = Xsse_MAXF;   goto do_64Fx2;
   3475       case Iop_Min64Fx2:   op = Xsse_MINF;   goto do_64Fx2;
   3476       case Iop_Mul64Fx2:   op = Xsse_MULF;   goto do_64Fx2;
   3477       case Iop_Sub64Fx2:   op = Xsse_SUBF;   goto do_64Fx2;
   3478       do_64Fx2:
   3479       {
   3480          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3481          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3482          HReg dst = newVRegV(env);
   3483          REQUIRE_SSE2;
   3484          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3485          addInstr(env, X86Instr_Sse64Fx2(op, argR, dst));
   3486          return dst;
   3487       }
   3488 
   3489       case Iop_CmpEQ32F0x4: op = Xsse_CMPEQF; goto do_32F0x4;
   3490       case Iop_CmpLT32F0x4: op = Xsse_CMPLTF; goto do_32F0x4;
   3491       case Iop_CmpLE32F0x4: op = Xsse_CMPLEF; goto do_32F0x4;
   3492       case Iop_CmpUN32F0x4: op = Xsse_CMPUNF; goto do_32F0x4;
   3493       case Iop_Add32F0x4:   op = Xsse_ADDF;   goto do_32F0x4;
   3494       case Iop_Div32F0x4:   op = Xsse_DIVF;   goto do_32F0x4;
   3495       case Iop_Max32F0x4:   op = Xsse_MAXF;   goto do_32F0x4;
   3496       case Iop_Min32F0x4:   op = Xsse_MINF;   goto do_32F0x4;
   3497       case Iop_Mul32F0x4:   op = Xsse_MULF;   goto do_32F0x4;
   3498       case Iop_Sub32F0x4:   op = Xsse_SUBF;   goto do_32F0x4;
   3499       do_32F0x4: {
   3500          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3501          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3502          HReg dst = newVRegV(env);
   3503          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3504          addInstr(env, X86Instr_Sse32FLo(op, argR, dst));
   3505          return dst;
   3506       }
   3507 
   3508       case Iop_CmpEQ64F0x2: op = Xsse_CMPEQF; goto do_64F0x2;
   3509       case Iop_CmpLT64F0x2: op = Xsse_CMPLTF; goto do_64F0x2;
   3510       case Iop_CmpLE64F0x2: op = Xsse_CMPLEF; goto do_64F0x2;
   3511       case Iop_CmpUN64F0x2: op = Xsse_CMPUNF; goto do_64F0x2;
   3512       case Iop_Add64F0x2:   op = Xsse_ADDF;   goto do_64F0x2;
   3513       case Iop_Div64F0x2:   op = Xsse_DIVF;   goto do_64F0x2;
   3514       case Iop_Max64F0x2:   op = Xsse_MAXF;   goto do_64F0x2;
   3515       case Iop_Min64F0x2:   op = Xsse_MINF;   goto do_64F0x2;
   3516       case Iop_Mul64F0x2:   op = Xsse_MULF;   goto do_64F0x2;
   3517       case Iop_Sub64F0x2:   op = Xsse_SUBF;   goto do_64F0x2;
   3518       do_64F0x2: {
   3519          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3520          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3521          HReg dst = newVRegV(env);
   3522          REQUIRE_SSE2;
   3523          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3524          addInstr(env, X86Instr_Sse64FLo(op, argR, dst));
   3525          return dst;
   3526       }
   3527 
   3528       case Iop_QNarrowBin32Sto16Sx8:
   3529          op = Xsse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
   3530       case Iop_QNarrowBin16Sto8Sx16:
   3531          op = Xsse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
   3532       case Iop_QNarrowBin16Sto8Ux16:
   3533          op = Xsse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
   3534 
   3535       case Iop_InterleaveHI8x16:
   3536          op = Xsse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
   3537       case Iop_InterleaveHI16x8:
   3538          op = Xsse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
   3539       case Iop_InterleaveHI32x4:
   3540          op = Xsse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
   3541       case Iop_InterleaveHI64x2:
   3542          op = Xsse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
   3543 
   3544       case Iop_InterleaveLO8x16:
   3545          op = Xsse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
   3546       case Iop_InterleaveLO16x8:
   3547          op = Xsse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
   3548       case Iop_InterleaveLO32x4:
   3549          op = Xsse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
   3550       case Iop_InterleaveLO64x2:
   3551          op = Xsse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
   3552 
   3553       case Iop_AndV128:    op = Xsse_AND;      goto do_SseReRg;
   3554       case Iop_OrV128:     op = Xsse_OR;       goto do_SseReRg;
   3555       case Iop_XorV128:    op = Xsse_XOR;      goto do_SseReRg;
   3556       case Iop_Add8x16:    op = Xsse_ADD8;     goto do_SseReRg;
   3557       case Iop_Add16x8:    op = Xsse_ADD16;    goto do_SseReRg;
   3558       case Iop_Add32x4:    op = Xsse_ADD32;    goto do_SseReRg;
   3559       case Iop_Add64x2:    op = Xsse_ADD64;    goto do_SseReRg;
   3560       case Iop_QAdd8Sx16:  op = Xsse_QADD8S;   goto do_SseReRg;
   3561       case Iop_QAdd16Sx8:  op = Xsse_QADD16S;  goto do_SseReRg;
   3562       case Iop_QAdd8Ux16:  op = Xsse_QADD8U;   goto do_SseReRg;
   3563       case Iop_QAdd16Ux8:  op = Xsse_QADD16U;  goto do_SseReRg;
   3564       case Iop_Avg8Ux16:   op = Xsse_AVG8U;    goto do_SseReRg;
   3565       case Iop_Avg16Ux8:   op = Xsse_AVG16U;   goto do_SseReRg;
   3566       case Iop_CmpEQ8x16:  op = Xsse_CMPEQ8;   goto do_SseReRg;
   3567       case Iop_CmpEQ16x8:  op = Xsse_CMPEQ16;  goto do_SseReRg;
   3568       case Iop_CmpEQ32x4:  op = Xsse_CMPEQ32;  goto do_SseReRg;
   3569       case Iop_CmpGT8Sx16: op = Xsse_CMPGT8S;  goto do_SseReRg;
   3570       case Iop_CmpGT16Sx8: op = Xsse_CMPGT16S; goto do_SseReRg;
   3571       case Iop_CmpGT32Sx4: op = Xsse_CMPGT32S; goto do_SseReRg;
   3572       case Iop_Max16Sx8:   op = Xsse_MAX16S;   goto do_SseReRg;
   3573       case Iop_Max8Ux16:   op = Xsse_MAX8U;    goto do_SseReRg;
   3574       case Iop_Min16Sx8:   op = Xsse_MIN16S;   goto do_SseReRg;
   3575       case Iop_Min8Ux16:   op = Xsse_MIN8U;    goto do_SseReRg;
   3576       case Iop_MulHi16Ux8: op = Xsse_MULHI16U; goto do_SseReRg;
   3577       case Iop_MulHi16Sx8: op = Xsse_MULHI16S; goto do_SseReRg;
   3578       case Iop_Mul16x8:    op = Xsse_MUL16;    goto do_SseReRg;
   3579       case Iop_Sub8x16:    op = Xsse_SUB8;     goto do_SseReRg;
   3580       case Iop_Sub16x8:    op = Xsse_SUB16;    goto do_SseReRg;
   3581       case Iop_Sub32x4:    op = Xsse_SUB32;    goto do_SseReRg;
   3582       case Iop_Sub64x2:    op = Xsse_SUB64;    goto do_SseReRg;
   3583       case Iop_QSub8Sx16:  op = Xsse_QSUB8S;   goto do_SseReRg;
   3584       case Iop_QSub16Sx8:  op = Xsse_QSUB16S;  goto do_SseReRg;
   3585       case Iop_QSub8Ux16:  op = Xsse_QSUB8U;   goto do_SseReRg;
   3586       case Iop_QSub16Ux8:  op = Xsse_QSUB16U;  goto do_SseReRg;
   3587       do_SseReRg: {
   3588          HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
   3589          HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
   3590          HReg dst = newVRegV(env);
   3591          if (op != Xsse_OR && op != Xsse_AND && op != Xsse_XOR)
   3592             REQUIRE_SSE2;
   3593          if (arg1isEReg) {
   3594             addInstr(env, mk_vMOVsd_RR(arg2, dst));
   3595             addInstr(env, X86Instr_SseReRg(op, arg1, dst));
   3596          } else {
   3597             addInstr(env, mk_vMOVsd_RR(arg1, dst));
   3598             addInstr(env, X86Instr_SseReRg(op, arg2, dst));
   3599          }
   3600          return dst;
   3601       }
   3602 
   3603       case Iop_ShlN16x8: op = Xsse_SHL16; goto do_SseShift;
   3604       case Iop_ShlN32x4: op = Xsse_SHL32; goto do_SseShift;
   3605       case Iop_ShlN64x2: op = Xsse_SHL64; goto do_SseShift;
   3606       case Iop_SarN16x8: op = Xsse_SAR16; goto do_SseShift;
   3607       case Iop_SarN32x4: op = Xsse_SAR32; goto do_SseShift;
   3608       case Iop_ShrN16x8: op = Xsse_SHR16; goto do_SseShift;
   3609       case Iop_ShrN32x4: op = Xsse_SHR32; goto do_SseShift;
   3610       case Iop_ShrN64x2: op = Xsse_SHR64; goto do_SseShift;
   3611       do_SseShift: {
   3612          HReg      greg = iselVecExpr(env, e->Iex.Binop.arg1);
   3613          X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   3614          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3615          HReg      ereg = newVRegV(env);
   3616          HReg      dst  = newVRegV(env);
   3617          REQUIRE_SSE2;
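                 /* Build a 128-bit shift count on the stack: three zero
                    words plus the 32-bit amount.  Little-endian, so the
                    amount lands in the low lane of ereg and the upper 96
                    bits are zero, as the SSE2 shift insns expect. */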
   3618          addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
   3619          addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
   3620          addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
   3621          addInstr(env, X86Instr_Push(rmi));
   3622          addInstr(env, X86Instr_SseLdSt(True/*load*/, ereg, esp0));
   3623          addInstr(env, mk_vMOVsd_RR(greg, dst));
   3624          addInstr(env, X86Instr_SseReRg(op, ereg, dst));
   3625          add_to_esp(env, 16);
   3626          return dst;
   3627       }
   3628 
   3629       case Iop_NarrowBin32to16x8:
   3630          fn = (HWord)h_generic_calc_NarrowBin32to16x8;
   3631          goto do_SseAssistedBinary;
   3632       case Iop_NarrowBin16to8x16:
   3633          fn = (HWord)h_generic_calc_NarrowBin16to8x16;
   3634          goto do_SseAssistedBinary;
   3635       do_SseAssistedBinary: {
   3636          /* As with the amd64 case (where this is copied from) we
   3637             generate pretty bad code. */
   3638          vassert(fn != 0);
   3639          HReg dst = newVRegV(env);
   3640          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3641          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3642          HReg argp = newVRegI(env);
   3643          /* subl $112, %esp         -- make a space */
   3644          sub_from_esp(env, 112);
   3645          /* leal 48(%esp), %r_argp  -- point into it */
   3646          addInstr(env, X86Instr_Lea32(X86AMode_IR(48, hregX86_ESP()),
   3647                                       argp));
   3648          /* andl $-16, %r_argp      -- 16-align the pointer */
   3649          addInstr(env, X86Instr_Alu32R(Xalu_AND,
   3650                                        X86RMI_Imm( ~(UInt)15 ),
   3651                                        argp));
   3652          /* Prepare 3 arg regs:
   3653             leal  0(%r_argp), %eax
   3654             leal 16(%r_argp), %edx
   3655             leal 32(%r_argp), %ecx
   3656          */
   3657          addInstr(env, X86Instr_Lea32(X86AMode_IR(0, argp),
   3658                                       hregX86_EAX()));
   3659          addInstr(env, X86Instr_Lea32(X86AMode_IR(16, argp),
   3660                                       hregX86_EDX()));
   3661          addInstr(env, X86Instr_Lea32(X86AMode_IR(32, argp),
   3662                                       hregX86_ECX()));
   3663          /* Store the two args, at (%edx) and (%ecx):
   3664             movupd  %argL, 0(%edx)
   3665             movupd  %argR, 0(%ecx)
   3666          */
   3667          addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argL,
   3668                                         X86AMode_IR(0, hregX86_EDX())));
   3669          addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argR,
   3670                                         X86AMode_IR(0, hregX86_ECX())));
   3671          /* call the helper */
   3672          addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn, 3 ));
   3673          /* fetch the result from memory, using %r_argp, which the
   3674             register allocator will keep alive across the call. */
   3675          addInstr(env, X86Instr_SseLdSt(True/*isLoad*/, dst,
   3676                                         X86AMode_IR(0, argp)));
   3677          /* and finally, clear the space */
   3678          add_to_esp(env, 112);
   3679          return dst;
   3680       }
   3681 
   3682       default:
   3683          break;
   3684    } /* switch (e->Iex.Binop.op) */
   3685    } /* if (e->tag == Iex_Binop) */
   3686 
   3687    if (e->tag == Iex_Mux0X) {
   3688       X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
   3689       HReg rX  = iselVecExpr(env, e->Iex.Mux0X.exprX);
   3690       HReg r0  = iselVecExpr(env, e->Iex.Mux0X.expr0);
   3691       HReg dst = newVRegV(env);
   3692       addInstr(env, mk_vMOVsd_RR(rX,dst));
   3693       addInstr(env, X86Instr_Test32(0xFF, r8));
   3694       addInstr(env, X86Instr_SseCMov(Xcc_Z,r0,dst));
   3695       return dst;
   3696    }
   3697 
   3698    vec_fail:
   3699    vex_printf("iselVecExpr (hwcaps = %s): can't reduce\n",
   3700               LibVEX_ppVexHwCaps(VexArchX86,env->hwcaps));
   3701    ppIRExpr(e);
   3702    vpanic("iselVecExpr_wrk");
   3703 
   3704 #  undef REQUIRE_SSE1
   3705 #  undef REQUIRE_SSE2
   3706 #  undef SSE2_OR_ABOVE
   3707 }
   3708 
   3709 
   3710 /*---------------------------------------------------------*/
   3711 /*--- ISEL: Statements                                  ---*/
   3712 /*---------------------------------------------------------*/
   3713 
   3714 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
   3715 {
   3716    if (vex_traceflags & VEX_TRACE_VCODE) {
   3717       vex_printf("\n-- ");
   3718       ppIRStmt(stmt);
   3719       vex_printf("\n");
   3720    }
   3721 
   3722    switch (stmt->tag) {
   3723 
   3724    /* --------- STORE --------- */
   3725    case Ist_Store: {
   3726       IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
   3727       IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
   3728       IREndness end   = stmt->Ist.Store.end;
   3729 
   3730       if (tya != Ity_I32 || end != Iend_LE)
   3731          goto stmt_fail;
   3732 
   3733       if (tyd == Ity_I32) {
   3734          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3735          X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
   3736          addInstr(env, X86Instr_Alu32M(Xalu_MOV,ri,am));
   3737          return;
   3738       }
   3739       if (tyd == Ity_I8 || tyd == Ity_I16) {
   3740          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3741          HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
   3742          addInstr(env, X86Instr_Store( toUChar(tyd==Ity_I8 ? 1 : 2),
   3743                                        r,am ));
   3744          return;
   3745       }
   3746       if (tyd == Ity_F64) {
   3747          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3748          HReg r = iselDblExpr(env, stmt->Ist.Store.data);
   3749          addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, r, am));
   3750          return;
   3751       }
   3752       if (tyd == Ity_F32) {
   3753          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3754          HReg r = iselFltExpr(env, stmt->Ist.Store.data);
   3755          addInstr(env, X86Instr_FpLdSt(False/*store*/, 4, r, am));
   3756          return;
   3757       }
   3758       if (tyd == Ity_I64) {
   3759          HReg vHi, vLo, rA;
   3760          iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Store.data);
   3761          rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
   3762          addInstr(env, X86Instr_Alu32M(
   3763                           Xalu_MOV, X86RI_Reg(vLo), X86AMode_IR(0, rA)));
   3764          addInstr(env, X86Instr_Alu32M(
   3765                           Xalu_MOV, X86RI_Reg(vHi), X86AMode_IR(4, rA)));
   3766          return;
   3767       }
   3768       if (tyd == Ity_V128) {
   3769          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3770          HReg r = iselVecExpr(env, stmt->Ist.Store.data);
   3771          addInstr(env, X86Instr_SseLdSt(False/*store*/, r, am));
   3772          return;
   3773       }
   3774       break;
   3775    }
   3776 
   3777    /* --------- PUT --------- */
   3778    case Ist_Put: {
   3779       IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
   3780       if (ty == Ity_I32) {
   3781          /* We're going to write to memory, so compute the RHS into an
   3782             X86RI. */
   3783          X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
   3784          addInstr(env,
   3785                   X86Instr_Alu32M(
   3786                      Xalu_MOV,
   3787                      ri,
   3788                      X86AMode_IR(stmt->Ist.Put.offset,hregX86_EBP())
   3789                  ));
   3790          return;
   3791       }
   3792       if (ty == Ity_I8 || ty == Ity_I16) {
   3793          HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
   3794          addInstr(env, X86Instr_Store(
   3795                           toUChar(ty==Ity_I8 ? 1 : 2),
   3796                           r,
   3797                           X86AMode_IR(stmt->Ist.Put.offset,
   3798                                       hregX86_EBP())));
   3799          return;
   3800       }
   3801       if (ty == Ity_I64) {
   3802          HReg vHi, vLo;
   3803          X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
   3804          X86AMode* am4 = advance4(am);
   3805          iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Put.data);
   3806          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vLo), am ));
   3807          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vHi), am4 ));
   3808          return;
   3809       }
   3810       if (ty == Ity_V128) {
   3811          HReg      vec = iselVecExpr(env, stmt->Ist.Put.data);
   3812          X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
   3813          addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, am));
   3814          return;
   3815       }
   3816       if (ty == Ity_F32) {
   3817          HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
   3818          X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
   3819          set_FPU_rounding_default(env); /* paranoia */
   3820          addInstr(env, X86Instr_FpLdSt( False/*store*/, 4, f32, am ));
   3821          return;
   3822       }
   3823       if (ty == Ity_F64) {
   3824          HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
   3825          X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
   3826          set_FPU_rounding_default(env); /* paranoia */
   3827          addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, f64, am ));
   3828          return;
   3829       }
   3830       break;
   3831    }
   3832 
   3833    /* --------- Indexed PUT --------- */
   3834    case Ist_PutI: {
   3835       IRPutI *puti = stmt->Ist.PutI.details;
   3836 
   3837       X86AMode* am
   3838          = genGuestArrayOffset(
   3839               env, puti->descr,
   3840                    puti->ix, puti->bias );
   3841 
   3842       IRType ty = typeOfIRExpr(env->type_env, puti->data);
   3843       if (ty == Ity_F64) {
   3844          HReg val = iselDblExpr(env, puti->data);
   3845          addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, val, am ));
   3846          return;
   3847       }
   3848       if (ty == Ity_I8) {
   3849          HReg r = iselIntExpr_R(env, puti->data);
   3850          addInstr(env, X86Instr_Store( 1, r, am ));
   3851          return;
   3852       }
   3853       if (ty == Ity_I32) {
   3854          HReg r = iselIntExpr_R(env, puti->data);
   3855          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(r), am ));
   3856          return;
   3857       }
   3858       if (ty == Ity_I64) {
   3859          HReg rHi, rLo;
   3860          X86AMode* am4 = advance4(am);
   3861          iselInt64Expr(&rHi, &rLo, env, puti->data);
   3862          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rLo), am ));
   3863          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rHi), am4 ));
   3864          return;
   3865       }
   3866       break;
   3867    }
   3868 
   3869    /* --------- TMP --------- */
   3870    case Ist_WrTmp: {
   3871       IRTemp tmp = stmt->Ist.WrTmp.tmp;
   3872       IRType ty = typeOfIRTemp(env->type_env, tmp);
   3873 
    3874       /* optimisation: if stmt->Ist.WrTmp.data is Add32(..,..),
    3875          compute it into an AMode and then use LEA.  This usually
    3876          produces fewer instructions, often because (for memcheck-
    3877          created IR) we get t = address-expression, with t later
    3878          used twice, and so doing this naturally turns the address
    3879          expression back into an X86 amode. */
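               /* For instance (a sketch): t9 = Add32(t7,0x42) can become
                  a single  leal 0x42(<t7>), <t9>  rather than a move
                  followed by an add. */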
   3880       if (ty == Ity_I32
   3881           && stmt->Ist.WrTmp.data->tag == Iex_Binop
   3882           && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add32) {
   3883          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
   3884          HReg dst = lookupIRTemp(env, tmp);
   3885          if (am->tag == Xam_IR && am->Xam.IR.imm == 0) {
   3886             /* Hmm, iselIntExpr_AMode wimped out and just computed the
   3887                value into a register.  Just emit a normal reg-reg move
   3888                so reg-alloc can coalesce it away in the usual way. */
   3889             HReg src = am->Xam.IR.reg;
   3890             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst));
   3891          } else {
   3892             addInstr(env, X86Instr_Lea32(am,dst));
   3893          }
   3894          return;
   3895       }
   3896 
   3897       if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
   3898          X86RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
   3899          HReg dst = lookupIRTemp(env, tmp);
   3900          addInstr(env, X86Instr_Alu32R(Xalu_MOV,rmi,dst));
   3901          return;
   3902       }
   3903       if (ty == Ity_I64) {
   3904          HReg rHi, rLo, dstHi, dstLo;
   3905          iselInt64Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
   3906          lookupIRTemp64( &dstHi, &dstLo, env, tmp);
   3907          addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
   3908          addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
   3909          return;
   3910       }
   3911       if (ty == Ity_I1) {
   3912          X86CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
   3913          HReg dst = lookupIRTemp(env, tmp);
   3914          addInstr(env, X86Instr_Set32(cond, dst));
   3915          return;
   3916       }
   3917       if (ty == Ity_F64) {
   3918          HReg dst = lookupIRTemp(env, tmp);
   3919          HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
   3920          addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
   3921          return;
   3922       }
   3923       if (ty == Ity_F32) {
   3924          HReg dst = lookupIRTemp(env, tmp);
   3925          HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
   3926          addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
   3927          return;
   3928       }
   3929       if (ty == Ity_V128) {
   3930          HReg dst = lookupIRTemp(env, tmp);
   3931          HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
   3932          addInstr(env, mk_vMOVsd_RR(src,dst));
   3933          return;
   3934       }
   3935       break;
   3936    }
   3937 
   3938    /* --------- Call to DIRTY helper --------- */
   3939    case Ist_Dirty: {
   3940       IRType   retty;
   3941       IRDirty* d = stmt->Ist.Dirty.details;
   3942       Bool     passBBP = False;
   3943 
   3944       if (d->nFxState == 0)
   3945          vassert(!d->needsBBP);
   3946 
   3947       passBBP = toBool(d->nFxState > 0 && d->needsBBP);
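               /* passBBP == True means the guest state pointer (%ebp) is
                  passed to the helper as a hidden first argument, since
                  the helper reads and/or writes the guest state. */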
   3948 
   3949       /* Marshal args, do the call, clear stack. */
   3950       doHelperCall( env, passBBP, d->guard, d->cee, d->args );
   3951 
   3952       /* Now figure out what to do with the returned value, if any. */
   3953       if (d->tmp == IRTemp_INVALID)
   3954          /* No return value.  Nothing to do. */
   3955          return;
   3956 
   3957       retty = typeOfIRTemp(env->type_env, d->tmp);
   3958       if (retty == Ity_I64) {
   3959          HReg dstHi, dstLo;
   3960          /* The returned value is in %edx:%eax.  Park it in the
   3961             register-pair associated with tmp. */
   3962          lookupIRTemp64( &dstHi, &dstLo, env, d->tmp);
   3963          addInstr(env, mk_iMOVsd_RR(hregX86_EDX(),dstHi) );
   3964          addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dstLo) );
   3965          return;
   3966       }
   3967       if (retty == Ity_I32 || retty == Ity_I16 || retty == Ity_I8) {
   3968          /* The returned value is in %eax.  Park it in the register
   3969             associated with tmp. */
   3970          HReg dst = lookupIRTemp(env, d->tmp);
   3971          addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dst) );
   3972          return;
   3973       }
   3974       break;
   3975    }
   3976 
   3977    /* --------- MEM FENCE --------- */
   3978    case Ist_MBE:
   3979       switch (stmt->Ist.MBE.event) {
   3980          case Imbe_Fence:
   3981             addInstr(env, X86Instr_MFence(env->hwcaps));
   3982             return;
   3983          default:
   3984             break;
   3985       }
   3986       break;
   3987 
   3988    /* --------- ACAS --------- */
   3989    case Ist_CAS:
   3990       if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
   3991          /* "normal" singleton CAS */
   3992          UChar  sz;
   3993          IRCAS* cas = stmt->Ist.CAS.details;
   3994          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
   3995          /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
   3996          X86AMode* am = iselIntExpr_AMode(env, cas->addr);
   3997          HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
   3998          HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
   3999          HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
   4000          vassert(cas->expdHi == NULL);
   4001          vassert(cas->dataHi == NULL);
   4002          addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
   4003          addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
   4004          addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
   4005          switch (ty) {
   4006             case Ity_I32: sz = 4; break;
   4007             case Ity_I16: sz = 2; break;
   4008             case Ity_I8:  sz = 1; break;
   4009             default: goto unhandled_cas;
   4010          }
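                  /* In outline, the emitted sequence is
                          movl rExpdLo, rOldLo ; movl rExpdLo, %eax
                          movl rDataLo, %ebx
                          lock cmpxchg{b,w,l} %ebx, (am)
                     and on failure (ZF clear) the CMov below copies the
                     value actually observed, now in %eax, into rOldLo. */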
   4011          addInstr(env, X86Instr_ACAS(am, sz));
   4012          addInstr(env,
   4013                   X86Instr_CMov32(Xcc_NZ,
   4014                                   X86RM_Reg(hregX86_EAX()), rOldLo));
   4015          return;
   4016       } else {
   4017          /* double CAS */
   4018          IRCAS* cas = stmt->Ist.CAS.details;
   4019          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
    4020          /* only 32-bit halves are allowed in this case */
   4021          /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
   4022          /* get: cas->expdHi into %edx, and cas->dataHi into %ecx */
   4023          X86AMode* am = iselIntExpr_AMode(env, cas->addr);
   4024          HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
   4025          HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
   4026          HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
   4027          HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
   4028          HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
   4029          HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
   4030          if (ty != Ity_I32)
   4031             goto unhandled_cas;
   4032          addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
   4033          addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
   4034          addInstr(env, mk_iMOVsd_RR(rExpdHi, hregX86_EDX()));
   4035          addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
   4036          addInstr(env, mk_iMOVsd_RR(rDataHi, hregX86_ECX()));
   4037          addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
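                  /* DACAS is, in outline, a lock cmpxchg8b on (am): the
                     expected value sits in %edx:%eax and the new value in
                     %ecx:%ebx; on failure the two CMovs below recover the
                     observed old value from %edx:%eax. */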
   4038          addInstr(env, X86Instr_DACAS(am));
   4039          addInstr(env,
   4040                   X86Instr_CMov32(Xcc_NZ,
   4041                                   X86RM_Reg(hregX86_EDX()), rOldHi));
   4042          addInstr(env,
   4043                   X86Instr_CMov32(Xcc_NZ,
   4044                                   X86RM_Reg(hregX86_EAX()), rOldLo));
   4045          return;
   4046       }
   4047       unhandled_cas:
   4048       break;
   4049 
   4050    /* --------- INSTR MARK --------- */
   4051    /* Doesn't generate any executable code ... */
   4052    case Ist_IMark:
    4053       return;
   4054 
   4055    /* --------- NO-OP --------- */
   4056    /* Fairly self-explanatory, wouldn't you say? */
   4057    case Ist_NoOp:
    4058       return;
   4059 
   4060    /* --------- EXIT --------- */
   4061    case Ist_Exit: {
   4062       if (stmt->Ist.Exit.dst->tag != Ico_U32)
   4063          vpanic("iselStmt(x86): Ist_Exit: dst is not a 32-bit value");
   4064 
   4065       X86CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
   4066       X86AMode*   amEIP = X86AMode_IR(stmt->Ist.Exit.offsIP,
   4067                                       hregX86_EBP());
   4068 
   4069       /* Case: boring transfer to known address */
   4070       if (stmt->Ist.Exit.jk == Ijk_Boring) {
   4071          if (env->chainingAllowed) {
   4072             /* .. almost always true .. */
   4073             /* Skip the event check at the dst if this is a forwards
   4074                edge. */
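                     /* Rationale: a backwards edge may close a loop, so
                        such a transfer must take the event check at the
                        destination; a forwards edge cannot, so it may
                        enter at the fast entry point, which omits it. */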
   4075             Bool toFastEP
    4076                = ((Addr64)stmt->Ist.Exit.dst->Ico.U32) > env->max_ga;
    4077             if (0) vex_printf("%s", toFastEP ? "Y" : ".");
   4078             addInstr(env, X86Instr_XDirect(stmt->Ist.Exit.dst->Ico.U32,
   4079                                            amEIP, cc, toFastEP));
   4080          } else {
   4081             /* .. very occasionally .. */
   4082             /* We can't use chaining, so ask for an assisted transfer,
   4083                as that's the only alternative that is allowable. */
   4084             HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
   4085             addInstr(env, X86Instr_XAssisted(r, amEIP, cc, Ijk_Boring));
   4086          }
   4087          return;
   4088       }
   4089 
   4090       /* Case: assisted transfer to arbitrary address */
   4091       switch (stmt->Ist.Exit.jk) {
   4092          /* Keep this list in sync with that in iselNext below */
   4093          case Ijk_ClientReq:
   4094          case Ijk_EmWarn:
   4095          case Ijk_MapFail:
   4096          case Ijk_NoDecode:
   4097          case Ijk_NoRedir:
   4098          case Ijk_SigSEGV:
   4099          case Ijk_SigTRAP:
   4100          case Ijk_Sys_int128:
   4101          case Ijk_Sys_int129:
   4102          case Ijk_Sys_int130:
   4103          case Ijk_Sys_sysenter:
   4104          case Ijk_TInval:
   4105          case Ijk_Yield:
   4106          {
   4107             HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
   4108             addInstr(env, X86Instr_XAssisted(r, amEIP, cc, stmt->Ist.Exit.jk));
   4109             return;
   4110          }
   4111          default:
   4112             break;
   4113       }
   4114 
   4115       /* Do we ever expect to see any other kind? */
   4116       goto stmt_fail;
   4117    }
   4118 
   4119    default: break;
   4120    }
   4121   stmt_fail:
   4122    ppIRStmt(stmt);
   4123    vpanic("iselStmt");
   4124 }
   4125 
   4126 
   4127 /*---------------------------------------------------------*/
   4128 /*--- ISEL: Basic block terminators (Nexts)             ---*/
   4129 /*---------------------------------------------------------*/
   4130 
   4131 static void iselNext ( ISelEnv* env,
   4132                        IRExpr* next, IRJumpKind jk, Int offsIP )
   4133 {
   4134    if (vex_traceflags & VEX_TRACE_VCODE) {
   4135       vex_printf( "\n-- PUT(%d) = ", offsIP);
   4136       ppIRExpr( next );
   4137       vex_printf( "; exit-");
   4138       ppIRJumpKind(jk);
   4139       vex_printf( "\n");
   4140    }
   4141 
   4142    /* Case: boring transfer to known address */
   4143    if (next->tag == Iex_Const) {
   4144       IRConst* cdst = next->Iex.Const.con;
   4145       vassert(cdst->tag == Ico_U32);
   4146       if (jk == Ijk_Boring || jk == Ijk_Call) {
   4147          /* Boring transfer to known address */
   4148          X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
   4149          if (env->chainingAllowed) {
   4150             /* .. almost always true .. */
   4151             /* Skip the event check at the dst if this is a forwards
   4152                edge. */
   4153             Bool toFastEP
   4154                = ((Addr64)cdst->Ico.U32) > env->max_ga;
   4155             if (0) vex_printf("%s", toFastEP ? "X" : ".");
   4156             addInstr(env, X86Instr_XDirect(cdst->Ico.U32,
   4157                                            amEIP, Xcc_ALWAYS,
   4158                                            toFastEP));
   4159          } else {
   4160             /* .. very occasionally .. */
   4161             /* We can't use chaining, so ask for an assisted transfer,
   4162                as that's the only alternative that is allowable. */
   4163             HReg r = iselIntExpr_R(env, next);
   4164             addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS,
   4165                                              Ijk_Boring));
   4166          }
   4167          return;
   4168       }
   4169    }
   4170 
   4171    /* Case: call/return (==boring) transfer to any address */
   4172    switch (jk) {
   4173       case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
   4174          HReg      r     = iselIntExpr_R(env, next);
   4175          X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
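                  /* In outline: XIndir is an indirect transfer that still
                     goes via the fast dispatcher path, whereas XAssisted
                     hands control back to the scheduler along with a
                     reason code (the IRJumpKind). */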
   4176          if (env->chainingAllowed) {
   4177             addInstr(env, X86Instr_XIndir(r, amEIP, Xcc_ALWAYS));
   4178          } else {
   4179             addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS,
   4180                                                Ijk_Boring));
   4181          }
   4182          return;
   4183       }
   4184       default:
   4185          break;
   4186    }
   4187 
   4188    /* Case: assisted transfer to arbitrary address */
   4189    switch (jk) {
   4190       /* Keep this list in sync with that for Ist_Exit above */
   4191       case Ijk_ClientReq:
   4192       case Ijk_EmWarn:
   4193       case Ijk_MapFail:
   4194       case Ijk_NoDecode:
   4195       case Ijk_NoRedir:
   4196       case Ijk_SigSEGV:
   4197       case Ijk_SigTRAP:
   4198       case Ijk_Sys_int128:
   4199       case Ijk_Sys_int129:
   4200       case Ijk_Sys_int130:
   4201       case Ijk_Sys_sysenter:
   4202       case Ijk_TInval:
   4203       case Ijk_Yield:
   4204       {
   4205          HReg      r     = iselIntExpr_R(env, next);
   4206          X86AMode* amEIP = X86AMode_IR(offsIP, hregX86_EBP());
   4207          addInstr(env, X86Instr_XAssisted(r, amEIP, Xcc_ALWAYS, jk));
   4208          return;
   4209       }
   4210       default:
   4211          break;
   4212    }
   4213 
   4214    vex_printf( "\n-- PUT(%d) = ", offsIP);
   4215    ppIRExpr( next );
   4216    vex_printf( "; exit-");
   4217    ppIRJumpKind(jk);
   4218    vex_printf( "\n");
    4219    vassert(0); /* are we expecting any other kind? */
   4220 }
   4221 
   4222 
   4223 /*---------------------------------------------------------*/
   4224 /*--- Insn selector top-level                           ---*/
   4225 /*---------------------------------------------------------*/
   4226 
   4227 /* Translate an entire SB to x86 code. */
   4228 
   4229 HInstrArray* iselSB_X86 ( IRSB* bb,
   4230                           VexArch      arch_host,
   4231                           VexArchInfo* archinfo_host,
   4232                           VexAbiInfo*  vbi/*UNUSED*/,
   4233                           Int offs_Host_EvC_Counter,
   4234                           Int offs_Host_EvC_FailAddr,
   4235                           Bool chainingAllowed,
   4236                           Bool addProfInc,
   4237                           Addr64 max_ga )
   4238 {
   4239    Int      i, j;
   4240    HReg     hreg, hregHI;
   4241    ISelEnv* env;
   4242    UInt     hwcaps_host = archinfo_host->hwcaps;
   4243    X86AMode *amCounter, *amFailAddr;
   4244 
   4245    /* sanity ... */
   4246    vassert(arch_host == VexArchX86);
   4247    vassert(0 == (hwcaps_host
   4248                  & ~(VEX_HWCAPS_X86_SSE1
   4249                      | VEX_HWCAPS_X86_SSE2
   4250                      | VEX_HWCAPS_X86_SSE3
   4251                      | VEX_HWCAPS_X86_LZCNT)));
   4252    vassert(sizeof(max_ga) == 8);
   4253    vassert((max_ga >> 32) == 0);
   4254 
   4255    /* Make up an initial environment to use. */
   4256    env = LibVEX_Alloc(sizeof(ISelEnv));
   4257    env->vreg_ctr = 0;
   4258 
   4259    /* Set up output code array. */
   4260    env->code = newHInstrArray();
   4261 
   4262    /* Copy BB's type env. */
   4263    env->type_env = bb->tyenv;
   4264 
   4265    /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
   4266       change as we go along. */
   4267    env->n_vregmap = bb->tyenv->types_used;
   4268    env->vregmap   = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   4269    env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   4270 
   4271    /* and finally ... */
   4272    env->chainingAllowed = chainingAllowed;
   4273    env->hwcaps          = hwcaps_host;
   4274    env->max_ga          = max_ga;
   4275 
   4276    /* For each IR temporary, allocate a suitably-kinded virtual
   4277       register. */
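            /* Note that 64-bit temporaries get a pair of 32-bit vregs:
               the low half lives in vregmap[] and the high half in
               vregmapHI[]; all other types use vregmap[] alone. */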
   4278    j = 0;
   4279    for (i = 0; i < env->n_vregmap; i++) {
   4280       hregHI = hreg = INVALID_HREG;
   4281       switch (bb->tyenv->types[i]) {
   4282          case Ity_I1:
   4283          case Ity_I8:
   4284          case Ity_I16:
   4285          case Ity_I32:  hreg   = mkHReg(j++, HRcInt32, True); break;
   4286          case Ity_I64:  hreg   = mkHReg(j++, HRcInt32, True);
   4287                         hregHI = mkHReg(j++, HRcInt32, True); break;
   4288          case Ity_F32:
   4289          case Ity_F64:  hreg   = mkHReg(j++, HRcFlt64, True); break;
   4290          case Ity_V128: hreg   = mkHReg(j++, HRcVec128, True); break;
   4291          default: ppIRType(bb->tyenv->types[i]);
    4292                   vpanic("iselSB_X86: IRTemp type");
   4293       }
   4294       env->vregmap[i]   = hreg;
   4295       env->vregmapHI[i] = hregHI;
   4296    }
   4297    env->vreg_ctr = j;
   4298 
   4299    /* The very first instruction must be an event check. */
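            /* Roughly: it decrements the counter at amCounter and, once
               that goes negative, side-exits to the dispatcher address
               held at amFailAddr, so the scheduler can regain control. */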
   4300    amCounter  = X86AMode_IR(offs_Host_EvC_Counter,  hregX86_EBP());
   4301    amFailAddr = X86AMode_IR(offs_Host_EvC_FailAddr, hregX86_EBP());
   4302    addInstr(env, X86Instr_EvCheck(amCounter, amFailAddr));
   4303 
   4304    /* Possibly a block counter increment (for profiling).  At this
   4305       point we don't know the address of the counter, so just pretend
   4306       it is zero.  It will have to be patched later, but before this
    4307       translation is used, by a call to LibVEX_PatchProfInc. */
   4308    if (addProfInc) {
   4309       addInstr(env, X86Instr_ProfInc());
   4310    }
   4311 
   4312    /* Ok, finally we can iterate over the statements. */
   4313    for (i = 0; i < bb->stmts_used; i++)
   4314       iselStmt(env, bb->stmts[i]);
   4315 
   4316    iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
   4317 
   4318    /* record the number of vregs we used. */
   4319    env->code->n_vregs = env->vreg_ctr;
   4320    return env->code;
   4321 }
   4322 
   4323 
   4324 /*---------------------------------------------------------------*/
   4325 /*--- end                                     host_x86_isel.c ---*/
   4326 /*---------------------------------------------------------------*/
   4327