
/*---------------------------------------------------------------*/
/*--- begin                                   host_x86_isel.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2011 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "ir_match.h"
#include "main_util.h"
#include "main_globals.h"
#include "host_generic_regs.h"
#include "host_generic_simd64.h"
#include "host_generic_simd128.h"
#include "host_x86_defs.h"

/* TODO 21 Apr 2005:

   -- (Really an assembler issue) don't emit CMov32 as a cmov
      insn, since that's expensive on P4 and conditional branch
      is cheaper if (as we expect) the condition is highly predictable

   -- preserve xmm registers across function calls (by declaring them
      as trashed by call insns)

   -- preserve x87 ST stack discipline across function calls.  Sigh.

   -- Check doHelperCall: if a call is conditional, we cannot safely
      compute any regparm args directly to registers.  Hence, the
      fast-regparm marshalling should be restricted to unconditional
      calls only.
*/

/*---------------------------------------------------------*/
/*--- x87 control word stuff                            ---*/
/*---------------------------------------------------------*/

/* Vex-generated code expects to run with the FPU set as follows: all
   exceptions masked, round-to-nearest, precision = 53 bits.  This
   corresponds to a FPU control word value of 0x027F.

   Similarly the SSE control word (%mxcsr) should be 0x1F80.

   %fpucw and %mxcsr should have these values on entry to
   Vex-generated code, and those values should be unchanged
   at exit.
*/

#define DEFAULT_FPUCW 0x027F
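
/* For reference, the bit-level decode of 0x027F: bits 5:0 = 0x3F mask
   all six x87 exceptions (IM DM ZM OM UM PM), bits 9:8 = 10b select
   53-bit (double) precision, and bits 11:10 = 00b select
   round-to-nearest.  Likewise 0x1F80 in %mxcsr sets the six SSE
   exception mask bits (bits 12:7), with round-to-nearest (bits
   14:13 = 00b) and flush-to-zero disabled. */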

/* debugging only, do not use */
/* define DEFAULT_FPUCW 0x037F */


/*---------------------------------------------------------*/
/*--- misc helpers                                      ---*/
/*---------------------------------------------------------*/

/* These are duplicated in guest-x86/toIR.c */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* bind ( Int binder )
{
   return IRExpr_Binder(binder);
}

static Bool isZeroU8 ( IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U8
          && e->Iex.Const.con->Ico.U8 == 0;
}

static Bool isZeroU32 ( IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U32
          && e->Iex.Const.con->Ico.U32 == 0;
}

static Bool isZeroU64 ( IRExpr* e )
{
   return e->tag == Iex_Const
          && e->Iex.Const.con->tag == Ico_U64
          && e->Iex.Const.con->Ico.U64 == 0ULL;
}


/*---------------------------------------------------------*/
/*--- ISelEnv                                           ---*/
/*---------------------------------------------------------*/

/* This carries around:

   - A mapping from IRTemp to IRType, giving the type of any IRTemp we
     might encounter.  This is computed before insn selection starts,
     and does not change.

   - A mapping from IRTemp to HReg.  This tells the insn selector
     which virtual register(s) are associated with each IRTemp
     temporary.  This is computed before insn selection starts, and
     does not change.  We expect this mapping to map precisely the
     same set of IRTemps as the type mapping does.

        - vregmap   holds the primary register for the IRTemp.
        - vregmapHI is only used for 64-bit integer-typed
             IRTemps.  It holds the identity of a second
             32-bit virtual HReg, which holds the high half
             of the value.

   - The code array, that is, the insns selected so far.

   - A counter, for generating new virtual registers.

   - The host subarchitecture we are selecting insns for.
     This is set at the start and does not change.

   Note, this is all host-independent.  */

typedef
   struct {
      IRTypeEnv*   type_env;

      HReg*        vregmap;
      HReg*        vregmapHI;
      Int          n_vregmap;

      HInstrArray* code;

      Int          vreg_ctr;

      UInt         hwcaps;
   }
   ISelEnv;


static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
{
   vassert(tmp >= 0);
   vassert(tmp < env->n_vregmap);
   return env->vregmap[tmp];
}

static void lookupIRTemp64 ( HReg* vrHI, HReg* vrLO, ISelEnv* env, IRTemp tmp )
{
   vassert(tmp >= 0);
   vassert(tmp < env->n_vregmap);
   vassert(env->vregmapHI[tmp] != INVALID_HREG);
   *vrLO = env->vregmap[tmp];
   *vrHI = env->vregmapHI[tmp];
}

static void addInstr ( ISelEnv* env, X86Instr* instr )
{
   addHInstr(env->code, instr);
   if (vex_traceflags & VEX_TRACE_VCODE) {
      ppX86Instr(instr, False);
      vex_printf("\n");
   }
}

static HReg newVRegI ( ISelEnv* env )
{
   HReg reg = mkHReg(env->vreg_ctr, HRcInt32, True/*virtual reg*/);
   env->vreg_ctr++;
   return reg;
}

static HReg newVRegF ( ISelEnv* env )
{
   HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/);
   env->vreg_ctr++;
   return reg;
}

static HReg newVRegV ( ISelEnv* env )
{
   HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
   env->vreg_ctr++;
   return reg;
}


/*---------------------------------------------------------*/
/*--- ISEL: Forward declarations                        ---*/
/*---------------------------------------------------------*/

/* These are organised as iselXXX and iselXXX_wrk pairs.  The
   iselXXX_wrk functions do the real work, but are not to be called
   directly.  For each XXX, iselXXX calls its iselXXX_wrk
   counterpart, then checks that all returned registers are virtual.
*/
static X86RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
static X86RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );

static X86RI*      iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e );
static X86RI*      iselIntExpr_RI     ( ISelEnv* env, IRExpr* e );

static X86RM*      iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e );
static X86RM*      iselIntExpr_RM     ( ISelEnv* env, IRExpr* e );

static HReg        iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselIntExpr_R     ( ISelEnv* env, IRExpr* e );

static X86AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
static X86AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );

static void        iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
                                       ISelEnv* env, IRExpr* e );
static void        iselInt64Expr     ( HReg* rHi, HReg* rLo,
                                       ISelEnv* env, IRExpr* e );

static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e );
static X86CondCode iselCondCode     ( ISelEnv* env, IRExpr* e );

static HReg        iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselDblExpr     ( ISelEnv* env, IRExpr* e );

static HReg        iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselFltExpr     ( ISelEnv* env, IRExpr* e );

static HReg        iselVecExpr_wrk ( ISelEnv* env, IRExpr* e );
static HReg        iselVecExpr     ( ISelEnv* env, IRExpr* e );


/*---------------------------------------------------------*/
/*--- ISEL: Misc helpers                                ---*/
/*---------------------------------------------------------*/

/* Make an int reg-reg move. */

static X86Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcInt32);
   vassert(hregClass(dst) == HRcInt32);
   return X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst);
}


/* Make a vector reg-reg move. */

static X86Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
{
   vassert(hregClass(src) == HRcVec128);
   vassert(hregClass(dst) == HRcVec128);
   return X86Instr_SseReRg(Xsse_MOV, src, dst);
}

/* Advance/retreat %esp by n. */

static void add_to_esp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%4) == 0);
   addInstr(env,
            X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(n), hregX86_ESP()));
}

static void sub_from_esp ( ISelEnv* env, Int n )
{
   vassert(n > 0 && n < 256 && (n%4) == 0);
   addInstr(env,
            X86Instr_Alu32R(Xalu_SUB, X86RMI_Imm(n), hregX86_ESP()));
}


/* Given an amode, return one which references 4 bytes further
   along. */

static X86AMode* advance4 ( X86AMode* am )
{
   X86AMode* am4 = dopyX86AMode(am);
   switch (am4->tag) {
      case Xam_IRRS:
         am4->Xam.IRRS.imm += 4; break;
      case Xam_IR:
         am4->Xam.IR.imm += 4; break;
      default:
         vpanic("advance4(x86,host)");
   }
   return am4;
}


/* Push an arg onto the host stack, in preparation for a call to a
   helper function of some kind.  Returns the number of 32-bit words
   pushed. */

static Int pushArg ( ISelEnv* env, IRExpr* arg )
{
   IRType arg_ty = typeOfIRExpr(env->type_env, arg);
   if (arg_ty == Ity_I32) {
      addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg)));
      return 1;
   } else
   if (arg_ty == Ity_I64) {
      HReg rHi, rLo;
      iselInt64Expr(&rHi, &rLo, env, arg);
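      /* Push the high half first so the low half ends up at the lower
         address; the callee then sees the 64-bit value laid out
         little-endian in memory, as it expects. */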
      addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
      addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
      return 2;
   }
   ppIRExpr(arg);
   vpanic("pushArg(x86): can't handle arg of this type");
}


/* Complete the call to a helper function, by calling the
   helper and clearing the args off the stack. */

static
void callHelperAndClearArgs ( ISelEnv* env, X86CondCode cc,
                              IRCallee* cee, Int n_arg_ws )
{
   /* Complication.  Need to decide which reg to use as the fn address
      pointer, in a way that doesn't trash regparm-passed
      parameters. */
   vassert(sizeof(void*) == 4);

   addInstr(env, X86Instr_Call( cc, toUInt(Ptr_to_ULong(cee->addr)),
                                    cee->regparms));
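   /* Note the stack adjustment below is emitted unconditionally, which
      is what we want: even if cc causes the call itself to be skipped,
      the args were already pushed and must still be cleared. */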
   if (n_arg_ws > 0)
      add_to_esp(env, 4*n_arg_ws);
}


/* Used only in doHelperCall.  See big comment in doHelperCall re
   handling of regparm args.  This function figures out whether
   evaluation of an expression might require use of a fixed register.
   If in doubt return True (safe but suboptimal).
*/
static
Bool mightRequireFixedRegs ( IRExpr* e )
{
   switch (e->tag) {
      case Iex_RdTmp: case Iex_Const: case Iex_Get:
         return False;
      default:
         return True;
   }
}


/* Do a complete function call.  guard is a Ity_Bit expression
   indicating whether or not the call happens.  If guard==NULL, the
   call is unconditional. */

static
void doHelperCall ( ISelEnv* env,
                    Bool passBBP,
                    IRExpr* guard, IRCallee* cee, IRExpr** args )
{
   X86CondCode cc;
   HReg        argregs[3];
   HReg        tmpregs[3];
   Bool        danger;
   Int         not_done_yet, n_args, n_arg_ws, stack_limit,
               i, argreg, argregX;

   /* Marshal args for a call, do the call, and clear the stack.
      Complexities to consider:

      * if passBBP is True, %ebp (the baseblock pointer) is to be
        passed as the first arg.

      * If the callee claims regparmness of 1, 2 or 3, we must pass the
        first 1, 2 or 3 args in registers (EAX, EDX, and ECX
        respectively).  To keep things relatively simple, only args of
        type I32 may be passed as regparms -- just bomb out if anything
        else turns up.  Clearly this depends on the front ends not
        trying to pass any other types as regparms.
   */

   /* 16 Nov 2004: the regparm handling is complicated by the
      following problem.

      Consider a call to a function with two regparm parameters:
      f(e1,e2).  We need to compute e1 into %eax and e2 into %edx.
      Suppose code is first generated to compute e1 into %eax.  Then,
      code is generated to compute e2 into %edx.  Unfortunately, if
      the latter code sequence uses %eax, it will trash the value of
      e1 computed by the former sequence.  This could happen if (for
      example) e2 itself involved a function call.  In the code below,
      args are evaluated right-to-left, not left-to-right, but the
      principle and the problem are the same.

      One solution is to compute all regparm-bound args into vregs
      first, and once they are all done, move them to the relevant
      real regs.  This always gives correct code, but it also gives
      a bunch of vreg-to-rreg moves which are usually redundant but
      are hard for the register allocator to get rid of.

      A compromise is to first examine all regparm'd argument
      expressions.  If they are all so simple that it is clear
      they will be evaluated without use of any fixed registers,
      use the old compute-directly-to-fixed-target scheme.  If not,
      be safe and use the via-vregs scheme.

      Note this requires being able to examine an expression and
      determine whether or not evaluation of it might use a fixed
      register.  That requires knowledge of how the rest of this
      insn selector works.  Currently just the following 3 are
      regarded as safe -- hopefully they cover the majority of
      arguments in practice: IRExpr_RdTmp IRExpr_Const IRExpr_Get.
   */
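   /* To make the hazard concrete, a hypothetical instance: for a
      2-regparm call f(t1, h(t2)) where h is itself a helper call,
      evaluating h(t2) emits a call whose result comes back in %eax,
      clobbering any arg already computed into %eax.
      mightRequireFixedRegs flags the Iex_CCall arg, so the via-vregs
      scheme gets used for such cases. */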
   vassert(cee->regparms >= 0 && cee->regparms <= 3);

   n_args = n_arg_ws = 0;
   while (args[n_args]) n_args++;

   not_done_yet = n_args;
   if (passBBP)
      not_done_yet++;

   stack_limit = cee->regparms;
   if (cee->regparms > 0 && passBBP) stack_limit--;

   /* ------ BEGIN marshall all arguments ------ */

   /* Push (R to L) the stack-passed args, [n_args-1 .. stack_limit] */
   for (i = n_args-1; i >= stack_limit; i--) {
      n_arg_ws += pushArg(env, args[i]);
      not_done_yet--;
   }

   /* args [stack_limit-1 .. 0] and possibly %ebp are to be passed in
      registers. */

   if (cee->regparms > 0) {

      /* ------ BEGIN deal with regparms ------ */

      /* deal with regparms, not forgetting %ebp if needed. */
      argregs[0] = hregX86_EAX();
      argregs[1] = hregX86_EDX();
      argregs[2] = hregX86_ECX();
      tmpregs[0] = tmpregs[1] = tmpregs[2] = INVALID_HREG;

      argreg = cee->regparms;

      /* In keeping with big comment above, detect potential danger
         and use the via-vregs scheme if needed. */
      danger = False;
      for (i = stack_limit-1; i >= 0; i--) {
         if (mightRequireFixedRegs(args[i])) {
            danger = True;
            break;
         }
      }

      if (danger) {

         /* Move via temporaries */
         argregX = argreg;
         for (i = stack_limit-1; i >= 0; i--) {

            if (0) {
               vex_printf("x86 host: register param is complex: ");
               ppIRExpr(args[i]);
               vex_printf("\n");
            }

            argreg--;
            vassert(argreg >= 0);
            vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32);
            tmpregs[argreg] = iselIntExpr_R(env, args[i]);
            not_done_yet--;
         }
         for (i = stack_limit-1; i >= 0; i--) {
            argregX--;
            vassert(argregX >= 0);
            addInstr( env, mk_iMOVsd_RR( tmpregs[argregX], argregs[argregX] ) );
         }

      } else {
         /* It's safe to compute all regparm args directly into their
            target registers. */
         for (i = stack_limit-1; i >= 0; i--) {
            argreg--;
            vassert(argreg >= 0);
            vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32);
            addInstr(env, X86Instr_Alu32R(Xalu_MOV,
                                          iselIntExpr_RMI(env, args[i]),
                                          argregs[argreg]));
            not_done_yet--;
         }

      }

      /* Not forgetting %ebp if needed. */
      if (passBBP) {
         vassert(argreg == 1);
         addInstr(env, mk_iMOVsd_RR( hregX86_EBP(), argregs[0]));
         not_done_yet--;
      }

      /* ------ END deal with regparms ------ */

   } else {

      /* No regparms.  Heave %ebp on the stack if needed. */
      if (passBBP) {
         addInstr(env, X86Instr_Push(X86RMI_Reg(hregX86_EBP())));
         n_arg_ws++;
         not_done_yet--;
      }

   }

   vassert(not_done_yet == 0);

   /* ------ END marshall all arguments ------ */

   /* Now we can compute the condition.  We can't do it earlier
      because the argument computations could trash the condition
      codes.  Be a bit clever to handle the common case where the
      guard is 1:Bit. */
   cc = Xcc_ALWAYS;
   if (guard) {
      if (guard->tag == Iex_Const
          && guard->Iex.Const.con->tag == Ico_U1
          && guard->Iex.Const.con->Ico.U1 == True) {
         /* unconditional -- do nothing */
      } else {
         cc = iselCondCode( env, guard );
      }
   }

   /* call the helper, and get the args off the stack afterwards. */
   callHelperAndClearArgs( env, cc, cee, n_arg_ws );
}


/* Given a guest-state array descriptor, an index expression and a
   bias, generate an X86AMode holding the relevant guest state
   offset. */

static
X86AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
                                IRExpr* off, Int bias )
{
   HReg tmp, roff;
   Int  elemSz = sizeofIRType(descr->elemTy);
   Int  nElems = descr->nElems;
   Int  shift  = 0;

   /* throw out any cases not generated by an x86 front end.  In
      theory there might be a day where we need to handle them -- if
      we ever run non-x86-guest on x86 host. */

   if (nElems != 8)
      vpanic("genGuestArrayOffset(x86 host)(1)");

   switch (elemSz) {
      case 1:  shift = 0; break;
      case 4:  shift = 2; break;
      case 8:  shift = 3; break;
      default: vpanic("genGuestArrayOffset(x86 host)(2)");
   }

   /* Compute off into a reg, %off.  Then return:

         movl %off, %tmp
         addl $bias, %tmp  (if bias != 0)
         andl $7, %tmp
         ... base(%ebp, %tmp, shift) ...
   */
   tmp  = newVRegI(env);
   roff = iselIntExpr_R(env, off);
   addInstr(env, mk_iMOVsd_RR(roff, tmp));
   if (bias != 0) {
      addInstr(env,
               X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(bias), tmp));
   }
   addInstr(env,
            X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(7), tmp));
   return
      X86AMode_IRRS( descr->base, hregX86_EBP(), tmp, shift );
}
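
/* A worked instance (illustrative only): for an 8-element F64 array
   (elemSz 8, hence shift 3) at guest offset descr->base, index
   expression %ix and bias 3, the above yields

      movl %ix, %tmp
      addl $3, %tmp
      andl $7, %tmp
      ... base(%ebp, %tmp, 8) ...

   i.e. element number (ix+3) & 7, giving the wraparound-modulo-8
   indexing that top-of-stack-relative x87 register accesses need. */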


/* Mess with the FPU's rounding mode: set to the default rounding mode
   (DEFAULT_FPUCW). */
static
void set_FPU_rounding_default ( ISelEnv* env )
{
   /* pushl $DEFAULT_FPUCW
      fldcw 0(%esp)
      addl $4, %esp
   */
   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   addInstr(env, X86Instr_Push(X86RMI_Imm(DEFAULT_FPUCW)));
   addInstr(env, X86Instr_FpLdCW(zero_esp));
   add_to_esp(env, 4);
}


/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
   expression denoting a value in the range 0 .. 3, indicating a round
   mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
   the same rounding.
*/
static
void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
{
   HReg rrm  = iselIntExpr_R(env, mode);
   HReg rrm2 = newVRegI(env);
   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());

   /* movl  %rrm, %rrm2
      andl  $3, %rrm2   -- shouldn't be needed; paranoia
      shll  $10, %rrm2
      orl   $DEFAULT_FPUCW, %rrm2
      pushl %rrm2
      fldcw 0(%esp)
      addl  $4, %esp
   */
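   /* The shift by 10 works because the IRRoundingMode encoding
      (0 = nearest, 1 = -inf, 2 = +inf, 3 = zero) coincides with the
      x87 RC field encoding, which lives in bits 11:10 of %fpucw;
      DEFAULT_FPUCW has those bits clear, so the OR just drops the
      mode into place. */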
   addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
   addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(3), rrm2));
   addInstr(env, X86Instr_Sh32(Xsh_SHL, 10, rrm2));
   addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Imm(DEFAULT_FPUCW), rrm2));
   addInstr(env, X86Instr_Push(X86RMI_Reg(rrm2)));
   addInstr(env, X86Instr_FpLdCW(zero_esp));
   add_to_esp(env, 4);
}


/* Generate !src into a new vector register, and be sure that the code
   is SSE1 compatible.  Amazing that Intel doesn't offer a less crappy
   way to do this.
*/
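/* The emitted sequence is, in effect (sketch):

      xorps   %dst, %dst      -- dst := 0, so no NaN surprises below
      cmpeqps %dst, %dst      -- 0 == 0 everywhere, so dst := all-ones
      xorps   %src, %dst      -- dst := ~src

   using only SSE1 instructions. */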
static HReg do_sse_Not128 ( ISelEnv* env, HReg src )
{
   HReg dst = newVRegV(env);
   /* Set dst to zero.  If dst contains a NaN then all hell might
      break loose after the comparison.  So, first zero it. */
   addInstr(env, X86Instr_SseReRg(Xsse_XOR, dst, dst));
   /* And now make it all 1s ... */
   addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, dst, dst));
   /* Finally, xor 'src' into it. */
   addInstr(env, X86Instr_SseReRg(Xsse_XOR, src, dst));
   /* Doesn't that just totally suck? */
   return dst;
}


/* Round an x87 FPU value to 53-bit-mantissa precision, to be used
   after most non-simple FPU operations (simple = +, -, *, / and
   sqrt).

   This could be done a lot more efficiently if needed, by loading
   zero and adding it to the value to be rounded (fldz ; faddp?).
*/
static void roundToF64 ( ISelEnv* env, HReg reg )
{
   X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   sub_from_esp(env, 8);
   addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp));
   addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp));
   add_to_esp(env, 8);
}


/*---------------------------------------------------------*/
/*--- ISEL: Integer expressions (32/16/8 bit)           ---*/
/*---------------------------------------------------------*/

/* Select insns for an integer-typed expression, and add them to the
   code list.  Return a reg holding the result.  This reg will be a
   virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
   want to modify it, ask for a new vreg, copy it in there, and modify
   the copy.  The register allocator will do its best to map both
   vregs to the same real register, so the copies will often disappear
   later in the game.

   This should handle expressions of 32, 16 and 8-bit type.  All
   results are returned in a 32-bit register.  For 16- and 8-bit
   expressions, the upper 16/24 bits are arbitrary, so you should mask
   or sign extend partial values if necessary.
*/

static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
{
   HReg r = iselIntExpr_R_wrk(env, e);
   /* sanity checks ... */
#  if 0
   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
#  endif
   vassert(hregClass(r) == HRcInt32);
   vassert(hregIsVirtual(r));
   return r;
}

/* DO NOT CALL THIS DIRECTLY ! */
static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
{
   MatchInfo mi;

   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);

   switch (e->tag) {

   /* --------- TEMP --------- */
   case Iex_RdTmp: {
      return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   }

   /* --------- LOAD --------- */
   case Iex_Load: {
      HReg dst = newVRegI(env);
      X86AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );

      /* We can't handle big-endian loads, nor load-linked. */
      if (e->Iex.Load.end != Iend_LE)
         goto irreducible;

      if (ty == Ity_I32) {
         addInstr(env, X86Instr_Alu32R(Xalu_MOV,
                                       X86RMI_Mem(amode), dst) );
         return dst;
      }
      if (ty == Ity_I16) {
         addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
         return dst;
      }
      if (ty == Ity_I8) {
         addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
         return dst;
      }
      break;
   }

   /* --------- TERNARY OP --------- */
   case Iex_Triop: {
      /* C3210 flags following FPU partial remainder (fprem), both
         IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
      if (e->Iex.Triop.op == Iop_PRemC3210F64
          || e->Iex.Triop.op == Iop_PRem1C3210F64) {
         HReg junk = newVRegF(env);
         HReg dst  = newVRegI(env);
         HReg srcL = iselDblExpr(env, e->Iex.Triop.arg2);
         HReg srcR = iselDblExpr(env, e->Iex.Triop.arg3);
         /* XXXROUNDINGFIXME */
         /* set roundingmode here */
         addInstr(env, X86Instr_FpBinary(
                           e->Iex.Triop.op==Iop_PRemC3210F64
                              ? Xfp_PREM : Xfp_PREM1,
                           srcL,srcR,junk
                 ));
         /* The previous pseudo-insn will have left the FPU's C3210
            flags set correctly.  So bag them. */
         addInstr(env, X86Instr_FpStSW_AX());
         addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
         return dst;
      }

      break;
   }

   /* --------- BINARY OP --------- */
   case Iex_Binop: {
      X86AluOp   aluOp;
      X86ShiftOp shOp;

      /* Pattern: Sub32(0,x) */
      if (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1)) {
         HReg dst = newVRegI(env);
         HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(reg,dst));
         addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
         return dst;
      }

      /* Is it an addition or logical style op? */
      switch (e->Iex.Binop.op) {
         case Iop_Add8: case Iop_Add16: case Iop_Add32:
            aluOp = Xalu_ADD; break;
         case Iop_Sub8: case Iop_Sub16: case Iop_Sub32:
            aluOp = Xalu_SUB; break;
         case Iop_And8: case Iop_And16: case Iop_And32:
            aluOp = Xalu_AND; break;
         case Iop_Or8: case Iop_Or16: case Iop_Or32:
            aluOp = Xalu_OR; break;
         case Iop_Xor8: case Iop_Xor16: case Iop_Xor32:
            aluOp = Xalu_XOR; break;
         case Iop_Mul16: case Iop_Mul32:
            aluOp = Xalu_MUL; break;
         default:
            aluOp = Xalu_INVALID; break;
      }
      /* For commutative ops we assume any literal
         values are on the second operand. */
      if (aluOp != Xalu_INVALID) {
         HReg dst    = newVRegI(env);
         HReg reg    = iselIntExpr_R(env, e->Iex.Binop.arg1);
         X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(reg,dst));
         addInstr(env, X86Instr_Alu32R(aluOp, rmi, dst));
         return dst;
      }
      /* Could do better here; forcing the first arg into a reg
         isn't always clever.
         -- t70 = Xor32(And32(Xor32(LDle:I32(Add32(t41,0xFFFFFFA0:I32)),
                        LDle:I32(Add32(t41,0xFFFFFFA4:I32))),LDle:I32(Add32(
                        t41,0xFFFFFFA8:I32))),LDle:I32(Add32(t41,0xFFFFFFA0:I32)))
            movl 0xFFFFFFA0(%vr41),%vr107
            movl 0xFFFFFFA4(%vr41),%vr108
            movl %vr107,%vr106
            xorl %vr108,%vr106
            movl 0xFFFFFFA8(%vr41),%vr109
            movl %vr106,%vr105
            andl %vr109,%vr105
            movl 0xFFFFFFA0(%vr41),%vr110
            movl %vr105,%vr104
            xorl %vr110,%vr104
            movl %vr104,%vr70
      */

      /* Perhaps a shift op? */
      switch (e->Iex.Binop.op) {
         case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
            shOp = Xsh_SHL; break;
         case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
            shOp = Xsh_SHR; break;
         case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
            shOp = Xsh_SAR; break;
         default:
            shOp = Xsh_INVALID; break;
      }
      if (shOp != Xsh_INVALID) {
         HReg dst = newVRegI(env);

         /* regL = the value to be shifted */
         HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
         addInstr(env, mk_iMOVsd_RR(regL,dst));

         /* Do any necessary widening for 16/8 bit operands */
         switch (e->Iex.Binop.op) {
            case Iop_Shr8:
               addInstr(env, X86Instr_Alu32R(
                                Xalu_AND, X86RMI_Imm(0xFF), dst));
               break;
            case Iop_Shr16:
               addInstr(env, X86Instr_Alu32R(
                                Xalu_AND, X86RMI_Imm(0xFFFF), dst));
               break;
            case Iop_Sar8:
               addInstr(env, X86Instr_Sh32(Xsh_SHL, 24, dst));
               addInstr(env, X86Instr_Sh32(Xsh_SAR, 24, dst));
               break;
            case Iop_Sar16:
               addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, dst));
               addInstr(env, X86Instr_Sh32(Xsh_SAR, 16, dst));
               break;
            default: break;
         }

         /* Now consider the shift amount.  If it's a literal, we
            can do a much better job than the general case. */
         if (e->Iex.Binop.arg2->tag == Iex_Const) {
            /* assert that the IR is well-typed */
            Int nshift;
            vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
            nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
            vassert(nshift >= 0);
            if (nshift > 0)
               /* Can't allow nshift==0 since that means %cl */
               addInstr(env, X86Instr_Sh32( shOp, nshift, dst ));
         } else {
            /* General case; we have to force the amount into %cl. */
            HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(regR,hregX86_ECX()));
            addInstr(env, X86Instr_Sh32(shOp, 0/* %cl */, dst));
         }
         return dst;
      }

      /* Handle misc other ops. */

      if (e->Iex.Binop.op == Iop_Max32U) {
         HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg dst  = newVRegI(env);
         HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(src1,dst));
         addInstr(env, X86Instr_Alu32R(Xalu_CMP, X86RMI_Reg(src2), dst));
         addInstr(env, X86Instr_CMov32(Xcc_B, X86RM_Reg(src2), dst));
         return dst;
      }

      if (e->Iex.Binop.op == Iop_8HLto16) {
         HReg hi8  = newVRegI(env);
         HReg lo8  = newVRegI(env);
         HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
         addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, 8, hi8));
         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFF), lo8));
         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo8), hi8));
         return hi8;
      }

      if (e->Iex.Binop.op == Iop_16HLto32) {
         HReg hi16  = newVRegI(env);
         HReg lo16  = newVRegI(env);
         HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
         addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
         addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, hi16));
         addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFFFF), lo16));
         addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo16), hi16));
         return hi16;
      }

      if (e->Iex.Binop.op == Iop_MullS16 || e->Iex.Binop.op == Iop_MullS8
          || e->Iex.Binop.op == Iop_MullU16 || e->Iex.Binop.op == Iop_MullU8) {
         HReg a16   = newVRegI(env);
         HReg b16   = newVRegI(env);
         HReg a16s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg b16s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
         Int  shift = (e->Iex.Binop.op == Iop_MullS8
                       || e->Iex.Binop.op == Iop_MullU8)
                         ? 24 : 16;
         X86ShiftOp shr_op = (e->Iex.Binop.op == Iop_MullS8
                              || e->Iex.Binop.op == Iop_MullS16)
                                ? Xsh_SAR : Xsh_SHR;

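         /* Widen each 8/16-bit operand to full register width by
            shifting it up and back down again -- arithmetically for
            the signed variants, logically for the unsigned ones.  A
            single 32-bit multiply then leaves the whole 16/32-bit
            product in the low bits. */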
         addInstr(env, mk_iMOVsd_RR(a16s, a16));
         addInstr(env, mk_iMOVsd_RR(b16s, b16));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, a16));
         addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, b16));
         addInstr(env, X86Instr_Sh32(shr_op,  shift, a16));
         addInstr(env, X86Instr_Sh32(shr_op,  shift, b16));
         addInstr(env, X86Instr_Alu32R(Xalu_MUL, X86RMI_Reg(a16), b16));
         return b16;
      }

      if (e->Iex.Binop.op == Iop_CmpF64) {
         HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
         HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegI(env);
         addInstr(env, X86Instr_FpCmp(fL,fR,dst));
         /* shift this right 8 bits so as to conform to CmpF64
            definition. */
         addInstr(env, X86Instr_Sh32(Xsh_SHR, 8, dst));
         return dst;
      }

      if (e->Iex.Binop.op == Iop_F64toI32S
          || e->Iex.Binop.op == Iop_F64toI16S) {
         Int  sz  = e->Iex.Binop.op == Iop_F64toI16S ? 2 : 4;
         HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
         HReg dst = newVRegI(env);

         /* Used several times ... */
         X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());

         /* rf now holds the value to be converted; the rounding mode,
            encoded as per the IRRoundingMode enum, is given by arg1.
            The first thing to do is set the FPU's rounding mode
            accordingly. */

         /* Create a space for the format conversion. */
         /* subl $4, %esp */
         sub_from_esp(env, 4);

         /* Set host rounding mode */
         set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );

         /* gistw/l %rf, 0(%esp) */
         addInstr(env, X86Instr_FpLdStI(False/*store*/,
                                        toUChar(sz), rf, zero_esp));

         if (sz == 2) {
            /* movzwl 0(%esp), %dst */
            addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst));
         } else {
            /* movl 0(%esp), %dst */
            vassert(sz == 4);
            addInstr(env, X86Instr_Alu32R(
                             Xalu_MOV, X86RMI_Mem(zero_esp), dst));
         }

         /* Restore default FPU rounding. */
         set_FPU_rounding_default( env );

         /* addl $4, %esp */
         add_to_esp(env, 4);
         return dst;
      }

      break;
   }

   /* --------- UNARY OP --------- */
   case Iex_Unop: {

      /* 1Uto8(32to1(expr32)) */
      if (e->Iex.Unop.op == Iop_1Uto8) {
         DECLARE_PATTERN(p_32to1_then_1Uto8);
         DEFINE_PATTERN(p_32to1_then_1Uto8,
                        unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
         if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
            IRExpr* expr32 = mi.bindee[0];
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, expr32);
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Alu32R(Xalu_AND,
                                          X86RMI_Imm(1), dst));
            return dst;
         }
      }

      /* 8Uto32(LDle(expr32)) */
      if (e->Iex.Unop.op == Iop_8Uto32) {
         DECLARE_PATTERN(p_LDle8_then_8Uto32);
         DEFINE_PATTERN(p_LDle8_then_8Uto32,
                        unop(Iop_8Uto32,
                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
         if (matchIRExpr(&mi,p_LDle8_then_8Uto32,e)) {
            HReg dst = newVRegI(env);
            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
            return dst;
         }
      }

      /* 8Sto32(LDle(expr32)) */
      if (e->Iex.Unop.op == Iop_8Sto32) {
         DECLARE_PATTERN(p_LDle8_then_8Sto32);
         DEFINE_PATTERN(p_LDle8_then_8Sto32,
                        unop(Iop_8Sto32,
                             IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
         if (matchIRExpr(&mi,p_LDle8_then_8Sto32,e)) {
            HReg dst = newVRegI(env);
            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, X86Instr_LoadEX(1,True,amode,dst));
            return dst;
         }
      }

      /* 16Uto32(LDle(expr32)) */
      if (e->Iex.Unop.op == Iop_16Uto32) {
         DECLARE_PATTERN(p_LDle16_then_16Uto32);
         DEFINE_PATTERN(p_LDle16_then_16Uto32,
                        unop(Iop_16Uto32,
                             IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
         if (matchIRExpr(&mi,p_LDle16_then_16Uto32,e)) {
            HReg dst = newVRegI(env);
            X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
            return dst;
         }
      }

      /* 8Uto32(GET:I8) */
      if (e->Iex.Unop.op == Iop_8Uto32) {
         if (e->Iex.Unop.arg->tag == Iex_Get) {
            HReg      dst;
            X86AMode* amode;
            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I8);
            dst = newVRegI(env);
            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
                                hregX86_EBP());
            addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
            return dst;
         }
      }

      /* 16Uto32(GET:I16) */
      if (e->Iex.Unop.op == Iop_16Uto32) {
         if (e->Iex.Unop.arg->tag == Iex_Get) {
            HReg      dst;
            X86AMode* amode;
            vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I16);
            dst = newVRegI(env);
            amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
                                hregX86_EBP());
            addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
            return dst;
         }
      }

      switch (e->Iex.Unop.op) {
         case Iop_8Uto16:
         case Iop_8Uto32:
         case Iop_16Uto32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            UInt mask = e->Iex.Unop.op==Iop_16Uto32 ? 0xFFFF : 0xFF;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Alu32R(Xalu_AND,
                                          X86RMI_Imm(mask), dst));
            return dst;
         }
         case Iop_8Sto16:
         case Iop_8Sto32:
         case Iop_16Sto32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            UInt amt = e->Iex.Unop.op==Iop_16Sto32 ? 16 : 24;
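            /* shl/sar by 24 (or 16) moves the sign bit of the low
               8 (16) bits up to bit 31 and then replicates it back
               down, i.e. a register-width sign extension. */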
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Sh32(Xsh_SHL, amt, dst));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, amt, dst));
            return dst;
         }
         case Iop_Not8:
         case Iop_Not16:
         case Iop_Not32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Unary32(Xun_NOT,dst));
            return dst;
         }
         case Iop_64HIto32: {
            HReg rHi, rLo;
            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
            return rHi; /* and abandon rLo .. poor wee thing :-) */
         }
         case Iop_64to32: {
            HReg rHi, rLo;
            iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
            return rLo; /* similar stupid comment to the above ... */
         }
         case Iop_16HIto8:
         case Iop_32HIto16: {
            HReg dst  = newVRegI(env);
            HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
            Int shift = e->Iex.Unop.op == Iop_16HIto8 ? 8 : 16;
            addInstr(env, mk_iMOVsd_RR(src,dst) );
            addInstr(env, X86Instr_Sh32(Xsh_SHR, shift, dst));
            return dst;
         }
         case Iop_1Uto32:
         case Iop_1Uto8: {
            HReg dst         = newVRegI(env);
            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Set32(cond,dst));
            return dst;
         }
         case Iop_1Sto8:
         case Iop_1Sto16:
         case Iop_1Sto32: {
            /* could do better than this, but for now ... */
            HReg dst         = newVRegI(env);
            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Set32(cond,dst));
            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, dst));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
            return dst;
         }
         case Iop_Ctz32: {
            /* Count trailing zeroes, implemented by x86 'bsfl' */
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Bsfr32(True,src,dst));
            return dst;
         }
         case Iop_Clz32: {
            /* Count leading zeroes.  Do 'bsrl' to establish the index
               of the highest set bit, and subtract that value from
               31. */
            HReg tmp = newVRegI(env);
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Bsfr32(False,src,tmp));
            addInstr(env, X86Instr_Alu32R(Xalu_MOV,
                                          X86RMI_Imm(31), dst));
            addInstr(env, X86Instr_Alu32R(Xalu_SUB,
                                          X86RMI_Reg(tmp), dst));
            return dst;
         }

         case Iop_CmpwNEZ32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
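            /* dst = (-src | src) >>s 31.  The sign bit of (-src | src)
               is set iff src != 0, so the arithmetic shift yields
               all-ones for nonzero src and all-zeroes otherwise. */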
            addInstr(env, mk_iMOVsd_RR(src,dst));
            addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
            addInstr(env, X86Instr_Alu32R(Xalu_OR,
                                          X86RMI_Reg(src), dst));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
            return dst;
         }
         case Iop_Left8:
         case Iop_Left16:
         case Iop_Left32: {
            HReg dst = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src, dst));
            addInstr(env, X86Instr_Unary32(Xun_NEG, dst));
            addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(src), dst));
            return dst;
         }

         case Iop_V128to32: {
            HReg      dst  = newVRegI(env);
            HReg      vec  = iselVecExpr(env, e->Iex.Unop.arg);
            X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
            sub_from_esp(env, 16);
            addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
            addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(esp0), dst ));
            add_to_esp(env, 16);
            return dst;
         }

         /* ReinterpF32asI32(e) */
         /* Given an IEEE754 single, produce an I32 with the same bit
            pattern.  Keep stack 8-aligned even though only using 4
            bytes. */
         case Iop_ReinterpF32asI32: {
            HReg rf   = iselFltExpr(env, e->Iex.Unop.arg);
            HReg dst  = newVRegI(env);
            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
            /* paranoia */
            set_FPU_rounding_default(env);
            /* subl $8, %esp */
            sub_from_esp(env, 8);
            /* gstF %rf, 0(%esp) */
            addInstr(env,
                     X86Instr_FpLdSt(False/*store*/, 4, rf, zero_esp));
            /* movl 0(%esp), %dst */
            addInstr(env,
                     X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), dst));
            /* addl $8, %esp */
            add_to_esp(env, 8);
            return dst;
         }

         case Iop_16to8:
         case Iop_32to8:
         case Iop_32to16:
            /* These are no-ops. */
            return iselIntExpr_R(env, e->Iex.Unop.arg);

         default:
            break;
      }
      break;
   }

   /* --------- GET --------- */
   case Iex_Get: {
      if (ty == Ity_I32) {
         HReg dst = newVRegI(env);
         addInstr(env, X86Instr_Alu32R(
                          Xalu_MOV,
                          X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
                                                 hregX86_EBP())),
                          dst));
         return dst;
      }
      if (ty == Ity_I8 || ty == Ity_I16) {
         HReg dst = newVRegI(env);
         addInstr(env, X86Instr_LoadEX(
                          toUChar(ty==Ity_I8 ? 1 : 2),
                          False,
                          X86AMode_IR(e->Iex.Get.offset,hregX86_EBP()),
                          dst));
         return dst;
      }
      break;
   }

   case Iex_GetI: {
      X86AMode* am
         = genGuestArrayOffset(
              env, e->Iex.GetI.descr,
                   e->Iex.GetI.ix, e->Iex.GetI.bias );
      HReg dst = newVRegI(env);
      if (ty == Ity_I8) {
         addInstr(env, X86Instr_LoadEX( 1, False, am, dst ));
         return dst;
      }
      if (ty == Ity_I32) {
         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), dst));
         return dst;
      }
      break;
   }

   /* --------- CCALL --------- */
   case Iex_CCall: {
      HReg    dst = newVRegI(env);
      vassert(ty == e->Iex.CCall.retty);

      /* be very restrictive for now.  Only 32/64-bit ints allowed
         for args, and 32 bits for return type. */
      if (e->Iex.CCall.retty != Ity_I32)
         goto irreducible;

      /* Marshal args, do the call, clear stack. */
      doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );

      addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
      return dst;
   }

   /* --------- LITERAL --------- */
   /* 32/16/8-bit literals */
   case Iex_Const: {
      X86RMI* rmi = iselIntExpr_RMI ( env, e );
      HReg    r   = newVRegI(env);
      addInstr(env, X86Instr_Alu32R(Xalu_MOV, rmi, r));
      return r;
   }

   /* --------- MULTIPLEX --------- */
   case Iex_Mux0X: {
      if ((ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
          && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
         X86RM* r8;
         HReg   rX  = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
         X86RM* r0  = iselIntExpr_RM(env, e->Iex.Mux0X.expr0);
         HReg   dst = newVRegI(env);
   1358         addInstr(env, mk_iMOVsd_RR(rX,dst));
   1359         r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
   1360         addInstr(env, X86Instr_Test32(0xFF, r8));
   1361         addInstr(env, X86Instr_CMov32(Xcc_Z,r0,dst));
   1362         return dst;
   1363       }
   1364       break;
   1365    }
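
   /* Schematically, the case above computes
         dst = (cond == 0) ? expr0 : exprX
      as
         movl   %rX, %dst
         testl  $0xFF, r8       -- r8 holds the I8 condition
         cmovz  r0, %dst        -- overwrite with expr0 if cond==0
      relying on the test immediately preceding the cmov. */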

   default:
   break;
   } /* switch (e->tag) */

   /* We get here if no pattern matched. */
  irreducible:
   ppIRExpr(e);
   vpanic("iselIntExpr_R: cannot reduce tree");
}


/*---------------------------------------------------------*/
/*--- ISEL: Integer expression auxiliaries              ---*/
/*---------------------------------------------------------*/

/* --------------------- AMODEs --------------------- */

/* Return an AMode which computes the value of the specified
   expression, possibly also adding insns to the code list as a
   result.  The expression may only be a 32-bit one.
*/

static Bool sane_AMode ( X86AMode* am )
{
   switch (am->tag) {
      case Xam_IR:
         return
            toBool( hregClass(am->Xam.IR.reg) == HRcInt32
                    && (hregIsVirtual(am->Xam.IR.reg)
                        || am->Xam.IR.reg == hregX86_EBP()) );
      case Xam_IRRS:
         return
            toBool( hregClass(am->Xam.IRRS.base) == HRcInt32
                    && hregIsVirtual(am->Xam.IRRS.base)
                    && hregClass(am->Xam.IRRS.index) == HRcInt32
                    && hregIsVirtual(am->Xam.IRRS.index) );
      default:
         vpanic("sane_AMode: unknown x86 amode tag");
   }
}
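
/* The check enforces the invariant that any register embedded in an
   amode at this stage is a virtual one, so the register allocator can
   rewrite it later; the sole real register tolerated is %ebp, which
   Xam_IR amodes use as the fixed guest-state pointer. */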

static X86AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
{
   X86AMode* am = iselIntExpr_AMode_wrk(env, e);
   vassert(sane_AMode(am));
   return am;
}

/* DO NOT CALL THIS DIRECTLY ! */
static X86AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32);

   /* Add32( Add32(expr1, Shl32(expr2, simm)), imm32 ) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add32
       && e->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32
       && e->Iex.Binop.arg1->tag == Iex_Binop
       && e->Iex.Binop.arg1->Iex.Binop.op == Iop_Add32
       && e->Iex.Binop.arg1->Iex.Binop.arg2->tag == Iex_Binop
       && e->Iex.Binop.arg1->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
       && e->Iex.Binop.arg1
           ->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg1
           ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
      UInt shift = e->Iex.Binop.arg1
                    ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
      UInt imm32 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
      if (shift == 1 || shift == 2 || shift == 3) {
         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1->Iex.Binop.arg1);
         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg1
                                       ->Iex.Binop.arg2->Iex.Binop.arg1 );
         return X86AMode_IRRS(imm32, r1, r2, shift);
      }
   }
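
   /* For instance Add32(Add32(t1, Shl32(t2, 2)), 0x40) collapses into
      the single amode 0x40(%t1,%t2,4), i.e. base + index<<2 + 0x40,
      which the mod-r/m encoding expresses directly. */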

   /* Add32(expr1, Shl32(expr2, imm)) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add32
       && e->Iex.Binop.arg2->tag == Iex_Binop
       && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
       && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
      UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
      if (shift == 1 || shift == 2 || shift == 3) {
         HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
         HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
         return X86AMode_IRRS(0, r1, r2, shift);
      }
   }

   /* Add32(expr,i) */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_Add32
       && e->Iex.Binop.arg2->tag == Iex_Const
       && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) {
      HReg r1 = iselIntExpr_R(env,  e->Iex.Binop.arg1);
      return X86AMode_IR(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32, r1);
   }

   /* Doesn't match anything in particular.  Generate it into
      a register and use that. */
   {
      HReg r1 = iselIntExpr_R(env, e);
      return X86AMode_IR(0, r1);
   }
}


/* --------------------- RMIs --------------------- */

/* Similarly, calculate an expression into an X86RMI operand.  As with
   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */

static X86RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
{
   X86RMI* rmi = iselIntExpr_RMI_wrk(env, e);
   /* sanity checks ... */
   switch (rmi->tag) {
      case Xrmi_Imm:
         return rmi;
      case Xrmi_Reg:
         vassert(hregClass(rmi->Xrmi.Reg.reg) == HRcInt32);
         vassert(hregIsVirtual(rmi->Xrmi.Reg.reg));
         return rmi;
      case Xrmi_Mem:
         vassert(sane_AMode(rmi->Xrmi.Mem.am));
         return rmi;
      default:
         vpanic("iselIntExpr_RMI: unknown x86 RMI tag");
   }
}

/* DO NOT CALL THIS DIRECTLY ! */
static X86RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);

   /* special case: immediate */
   if (e->tag == Iex_Const) {
      UInt u;
      switch (e->Iex.Const.con->tag) {
         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
         case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
         case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
         default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
      }
      return X86RMI_Imm(u);
   }

   /* special case: 32-bit GET */
   if (e->tag == Iex_Get && ty == Ity_I32) {
      return X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
                                    hregX86_EBP()));
   }

   /* special case: 32-bit load from memory */
   if (e->tag == Iex_Load && ty == Ity_I32
       && e->Iex.Load.end == Iend_LE) {
      X86AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
      return X86RMI_Mem(am);
   }

   /* default case: calculate into a register and return that */
   {
      HReg r = iselIntExpr_R ( env, e );
      return X86RMI_Reg(r);
   }
}


/* --------------------- RIs --------------------- */

/* Calculate an expression into an X86RI operand.  As with
   iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */

static X86RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
{
   X86RI* ri = iselIntExpr_RI_wrk(env, e);
   /* sanity checks ... */
   switch (ri->tag) {
      case Xri_Imm:
         return ri;
      case Xri_Reg:
         vassert(hregClass(ri->Xri.Reg.reg) == HRcInt32);
         vassert(hregIsVirtual(ri->Xri.Reg.reg));
         return ri;
      default:
         vpanic("iselIntExpr_RI: unknown x86 RI tag");
   }
}

/* DO NOT CALL THIS DIRECTLY ! */
static X86RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);

   /* special case: immediate */
   if (e->tag == Iex_Const) {
      UInt u;
      switch (e->Iex.Const.con->tag) {
         case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
         case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
         case Ico_U8:  u = 0xFF   & (e->Iex.Const.con->Ico.U8); break;
         default: vpanic("iselIntExpr_RI.Iex_Const(x86h)");
      }
      return X86RI_Imm(u);
   }

   /* default case: calculate into a register and return that */
   {
      HReg r = iselIntExpr_R ( env, e );
      return X86RI_Reg(r);
   }
}


/* --------------------- RMs --------------------- */

/* Similarly, calculate an expression into an X86RM operand.  As with
   iselIntExpr_R, the expression can have type 32, 16 or 8 bits.  */

static X86RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
{
   X86RM* rm = iselIntExpr_RM_wrk(env, e);
   /* sanity checks ... */
   switch (rm->tag) {
      case Xrm_Reg:
         vassert(hregClass(rm->Xrm.Reg.reg) == HRcInt32);
         vassert(hregIsVirtual(rm->Xrm.Reg.reg));
         return rm;
      case Xrm_Mem:
         vassert(sane_AMode(rm->Xrm.Mem.am));
         return rm;
      default:
         vpanic("iselIntExpr_RM: unknown x86 RM tag");
   }
}

/* DO NOT CALL THIS DIRECTLY ! */
static X86RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
{
   IRType ty = typeOfIRExpr(env->type_env,e);
   vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);

   /* special case: 32-bit GET */
   if (e->tag == Iex_Get && ty == Ity_I32) {
      return X86RM_Mem(X86AMode_IR(e->Iex.Get.offset,
                                   hregX86_EBP()));
   }

   /* special case: load from memory -- not handled here; loads fall
      through to the default case and are generated into a register. */

   /* default case: calculate into a register and return that */
   {
      HReg r = iselIntExpr_R ( env, e );
      return X86RM_Reg(r);
   }
}


/* --------------------- CONDCODE --------------------- */

/* Generate code to evaluate a bit-typed expression, returning the
   condition code which holds exactly when the expression notionally
   evaluates to 1. */

static X86CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
{
   /* Uh, there's nothing we can sanity check here, unfortunately. */
   return iselCondCode_wrk(env,e);
}

/* DO NOT CALL THIS DIRECTLY ! */
static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
{
   MatchInfo mi;

   vassert(e);
   vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);

   /* var */
   if (e->tag == Iex_RdTmp) {
      HReg r32 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
      /* Test32 doesn't modify r32; so this is OK. */
      addInstr(env, X86Instr_Test32(1,X86RM_Reg(r32)));
      return Xcc_NZ;
   }

   /* Constant 1:Bit */
   if (e->tag == Iex_Const) {
      HReg r;
      vassert(e->Iex.Const.con->tag == Ico_U1);
      vassert(e->Iex.Const.con->Ico.U1 == True
              || e->Iex.Const.con->Ico.U1 == False);
      r = newVRegI(env);
      addInstr(env, X86Instr_Alu32R(Xalu_MOV,X86RMI_Imm(0),r));
      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(r),r));
      return e->Iex.Const.con->Ico.U1 ? Xcc_Z : Xcc_NZ;
   }
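
   /* The mov gives r a defined value before the xor reads it; the xor
      of r with itself then forces ZF=1, so Xcc_Z is a condition that
      always holds and Xcc_NZ one that never does -- exactly right for
      constants True and False respectively. */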

   /* Not1(e) */
   if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
      /* Generate code for the arg, and negate the test condition */
      return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
   }

   /* --- patterns rooted at: 32to1 --- */

   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_32to1) {
      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
      addInstr(env, X86Instr_Test32(1,rm));
      return Xcc_NZ;
   }

   /* --- patterns rooted at: CmpNEZ8 --- */

   /* CmpNEZ8(x) */
   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ8) {
      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
      addInstr(env, X86Instr_Test32(0xFF,rm));
      return Xcc_NZ;
   }

   /* --- patterns rooted at: CmpNEZ16 --- */

   /* CmpNEZ16(x) */
   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ16) {
      X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
      addInstr(env, X86Instr_Test32(0xFFFF,rm));
      return Xcc_NZ;
   }

   /* --- patterns rooted at: CmpNEZ32 --- */

   /* CmpNEZ32(And32(x,y)) */
   {
      DECLARE_PATTERN(p_CmpNEZ32_And32);
      DEFINE_PATTERN(p_CmpNEZ32_And32,
                     unop(Iop_CmpNEZ32, binop(Iop_And32, bind(0), bind(1))));
      if (matchIRExpr(&mi, p_CmpNEZ32_And32, e)) {
         HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
         X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
         HReg    tmp  = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR(r0, tmp));
         addInstr(env, X86Instr_Alu32R(Xalu_AND,rmi1,tmp));
         return Xcc_NZ;
      }
   }
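
   /* The And32/Or32 patterns here and just below save an instruction:
      the andl/orl that computes x AND/OR y already sets ZF, so no
      separate comparison against zero is needed afterwards. */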

   /* CmpNEZ32(Or32(x,y)) */
   {
      DECLARE_PATTERN(p_CmpNEZ32_Or32);
      DEFINE_PATTERN(p_CmpNEZ32_Or32,
                     unop(Iop_CmpNEZ32, binop(Iop_Or32, bind(0), bind(1))));
      if (matchIRExpr(&mi, p_CmpNEZ32_Or32, e)) {
         HReg    r0   = iselIntExpr_R(env, mi.bindee[0]);
         X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
         HReg    tmp  = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR(r0, tmp));
         addInstr(env, X86Instr_Alu32R(Xalu_OR,rmi1,tmp));
         return Xcc_NZ;
      }
   }

   /* CmpNEZ32(GET(..):I32) */
   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ32
       && e->Iex.Unop.arg->tag == Iex_Get) {
      X86AMode* am = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
                                 hregX86_EBP());
      addInstr(env, X86Instr_Alu32M(Xalu_CMP, X86RI_Imm(0), am));
      return Xcc_NZ;
   }

   /* CmpNEZ32(x) */
   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ32) {
      HReg    r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
      X86RMI* rmi2 = X86RMI_Imm(0);
      addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
      return Xcc_NZ;
   }

   /* --- patterns rooted at: CmpNEZ64 --- */

   /* CmpNEZ64(Or64(x,y)) */
   {
      DECLARE_PATTERN(p_CmpNEZ64_Or64);
      DEFINE_PATTERN(p_CmpNEZ64_Or64,
                     unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
      if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
         HReg    hi1, lo1, hi2, lo2;
         HReg    tmp  = newVRegI(env);
         iselInt64Expr( &hi1, &lo1, env, mi.bindee[0] );
         addInstr(env, mk_iMOVsd_RR(hi1, tmp));
         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo1),tmp));
         iselInt64Expr( &hi2, &lo2, env, mi.bindee[1] );
         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(hi2),tmp));
         addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo2),tmp));
         return Xcc_NZ;
      }
   }

   /* CmpNEZ64(x) */
   if (e->tag == Iex_Unop
       && e->Iex.Unop.op == Iop_CmpNEZ64) {
      HReg hi, lo;
      HReg tmp = newVRegI(env);
      iselInt64Expr( &hi, &lo, env, e->Iex.Unop.arg );
      addInstr(env, mk_iMOVsd_RR(hi, tmp));
      addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo), tmp));
      return Xcc_NZ;
   }
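
   /* hi|lo is zero exactly when the full 64-bit value is zero, so a
      single orl of the two halves leaves ZF in just the state that
      CmpNEZ64 needs. */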

   /* --- patterns rooted at: Cmp{EQ,NE}{8,16} --- */

   /* CmpEQ8 / CmpNE8 */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_CmpEQ8
           || e->Iex.Binop.op == Iop_CmpNE8
           || e->Iex.Binop.op == Iop_CasCmpEQ8
           || e->Iex.Binop.op == Iop_CasCmpNE8)) {
      if (isZeroU8(e->Iex.Binop.arg2)) {
         HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
         addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r1)));
         switch (e->Iex.Binop.op) {
            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
            case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
            default: vpanic("iselCondCode(x86): CmpXX8(expr,0:I8)");
         }
      } else {
         HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
         X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
         HReg    r    = newVRegI(env);
         addInstr(env, mk_iMOVsd_RR(r1,r));
         addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
         addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r)));
         switch (e->Iex.Binop.op) {
            case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
            case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
            default: vpanic("iselCondCode(x86): CmpXX8(expr,expr)");
         }
      }
   }
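
   /* Sub-word equality is done as a full-width xorl followed by a
      testl that masks to the low 8 (or, below, 16) bits: the xor is
      zero in those bits iff the operands' low bits agree, and working
      at 32 bits sidesteps x86's restriction that byte operations need
      a byte-addressable register. */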

   /* CmpEQ16 / CmpNE16 */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_CmpEQ16
           || e->Iex.Binop.op == Iop_CmpNE16
           || e->Iex.Binop.op == Iop_CasCmpEQ16
           || e->Iex.Binop.op == Iop_CasCmpNE16)) {
      HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
      X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
      HReg    r    = newVRegI(env);
      addInstr(env, mk_iMOVsd_RR(r1,r));
      addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
      addInstr(env, X86Instr_Test32(0xFFFF,X86RM_Reg(r)));
      switch (e->Iex.Binop.op) {
         case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Xcc_Z;
         case Iop_CmpNE16: case Iop_CasCmpNE16: return Xcc_NZ;
         default: vpanic("iselCondCode(x86): CmpXX16");
      }
   }

   /* CmpNE32(ccall, 32-bit constant) (--smc-check=all optimisation).
      Saves a "movl %eax, %tmp" compared to the default route. */
   if (e->tag == Iex_Binop
       && e->Iex.Binop.op == Iop_CmpNE32
       && e->Iex.Binop.arg1->tag == Iex_CCall
       && e->Iex.Binop.arg2->tag == Iex_Const) {
      IRExpr* cal = e->Iex.Binop.arg1;
      IRExpr* con = e->Iex.Binop.arg2;
      /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
      vassert(cal->Iex.CCall.retty == Ity_I32); /* else ill-typed IR */
      vassert(con->Iex.Const.con->tag == Ico_U32);
      /* Marshal args, do the call. */
      doHelperCall( env, False, NULL, cal->Iex.CCall.cee, cal->Iex.CCall.args );
      addInstr(env, X86Instr_Alu32R(Xalu_CMP,
                                    X86RMI_Imm(con->Iex.Const.con->Ico.U32),
                                    hregX86_EAX()));
      return Xcc_NZ;
   }

   /* Cmp*32*(x,y) */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_CmpEQ32
           || e->Iex.Binop.op == Iop_CmpNE32
           || e->Iex.Binop.op == Iop_CmpLT32S
           || e->Iex.Binop.op == Iop_CmpLT32U
           || e->Iex.Binop.op == Iop_CmpLE32S
           || e->Iex.Binop.op == Iop_CmpLE32U
           || e->Iex.Binop.op == Iop_CasCmpEQ32
           || e->Iex.Binop.op == Iop_CasCmpNE32)) {
      HReg    r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
      X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
      addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
      switch (e->Iex.Binop.op) {
         case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Xcc_Z;
         case Iop_CmpNE32: case Iop_CasCmpNE32: return Xcc_NZ;
         case Iop_CmpLT32S: return Xcc_L;
         case Iop_CmpLT32U: return Xcc_B;
         case Iop_CmpLE32S: return Xcc_LE;
         case Iop_CmpLE32U: return Xcc_BE;
         default: vpanic("iselCondCode(x86): CmpXX32");
      }
   }

   /* CmpNE64 */
   if (e->tag == Iex_Binop
       && (e->Iex.Binop.op == Iop_CmpNE64
           || e->Iex.Binop.op == Iop_CmpEQ64)) {
      HReg hi1, hi2, lo1, lo2;
      HReg tHi = newVRegI(env);
      HReg tLo = newVRegI(env);
      iselInt64Expr( &hi1, &lo1, env, e->Iex.Binop.arg1 );
      iselInt64Expr( &hi2, &lo2, env, e->Iex.Binop.arg2 );
      addInstr(env, mk_iMOVsd_RR(hi1, tHi));
      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(hi2), tHi));
      addInstr(env, mk_iMOVsd_RR(lo1, tLo));
      addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(lo2), tLo));
      addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(tHi), tLo));
      switch (e->Iex.Binop.op) {
         case Iop_CmpNE64: return Xcc_NZ;
         case Iop_CmpEQ64: return Xcc_Z;
         default: vpanic("iselCondCode(x86): CmpXX64");
      }
   }

   ppIRExpr(e);
   vpanic("iselCondCode");
}


/*---------------------------------------------------------*/
/*--- ISEL: Integer expressions (64 bit)                ---*/
/*---------------------------------------------------------*/

/* Compute a 64-bit value into a register pair, which is returned as
   the first two parameters.  As with iselIntExpr_R, these may be
   either real or virtual regs; in any case they must not be changed
   by subsequent code emitted by the caller.  */

static void iselInt64Expr ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
{
   iselInt64Expr_wrk(rHi, rLo, env, e);
#  if 0
   vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
#  endif
   vassert(hregClass(*rHi) == HRcInt32);
   vassert(hregIsVirtual(*rHi));
   vassert(hregClass(*rLo) == HRcInt32);
   vassert(hregIsVirtual(*rLo));
}

/* DO NOT CALL THIS DIRECTLY ! */
static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
{
   MatchInfo mi;
   HWord fn = 0; /* helper fn for most SIMD64 stuff */
   vassert(e);
   vassert(typeOfIRExpr(env->type_env,e) == Ity_I64);

   /* 64-bit literal */
   if (e->tag == Iex_Const) {
      ULong w64 = e->Iex.Const.con->Ico.U64;
      UInt  wHi = toUInt(w64 >> 32);
      UInt  wLo = toUInt(w64);
      HReg  tLo = newVRegI(env);
      HReg  tHi = newVRegI(env);
      vassert(e->Iex.Const.con->tag == Ico_U64);
      if (wLo == wHi) {
         /* Save a precious Int register in this special case. */
         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
         *rHi = tLo;
         *rLo = tLo;
      } else {
         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi));
         addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
         *rHi = tHi;
         *rLo = tLo;
      }
      return;
   }

   /* read 64-bit IRTemp */
   if (e->tag == Iex_RdTmp) {
      lookupIRTemp64( rHi, rLo, env, e->Iex.RdTmp.tmp);
      return;
   }

   /* 64-bit load */
   if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
      HReg     tLo, tHi;
      X86AMode *am0, *am4;
      vassert(e->Iex.Load.ty == Ity_I64);
      tLo = newVRegI(env);
      tHi = newVRegI(env);
      am0 = iselIntExpr_AMode(env, e->Iex.Load.addr);
      am4 = advance4(am0);
      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo ));
      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
      *rHi = tHi;
      *rLo = tLo;
      return;
   }
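
   /* Little-endian layout: the low 32 bits live at the load address
      itself and the high 32 bits at address+4, hence the advance4()
      amode for the second movl.  The same split is used by the 64-bit
      GET/GETI cases below. */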

   /* 64-bit GET */
   if (e->tag == Iex_Get) {
      X86AMode* am  = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP());
      X86AMode* am4 = advance4(am);
      HReg tLo = newVRegI(env);
      HReg tHi = newVRegI(env);
      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
      *rHi = tHi;
      *rLo = tLo;
      return;
   }

   /* 64-bit GETI */
   if (e->tag == Iex_GetI) {
      X86AMode* am
         = genGuestArrayOffset( env, e->Iex.GetI.descr,
                                     e->Iex.GetI.ix, e->Iex.GetI.bias );
      X86AMode* am4 = advance4(am);
      HReg tLo = newVRegI(env);
      HReg tHi = newVRegI(env);
      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
      addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
      *rHi = tHi;
      *rLo = tLo;
      return;
   }

   /* 64-bit Mux0X: Mux0X(g, expr, 0:I64) */
   if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.exprX)) {
      X86RM* r8;
      HReg e0Lo, e0Hi;
      HReg tLo = newVRegI(env);
      HReg tHi = newVRegI(env);
      X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
      iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
      r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
      addInstr(env, mk_iMOVsd_RR( e0Hi, tHi ) );
      addInstr(env, mk_iMOVsd_RR( e0Lo, tLo ) );
      addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
      addInstr(env, X86Instr_Test32(0xFF, r8));
      addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tHi));
      addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tLo));
      add_to_esp(env, 4);
      *rHi = tHi;
      *rLo = tLo;
      return;
   }

   /* 64-bit Mux0X: Mux0X(g, 0:I64, expr) */
   if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.expr0)) {
      X86RM* r8;
      HReg eXLo, eXHi;
      HReg tLo = newVRegI(env);
      HReg tHi = newVRegI(env);
      X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
      iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
      r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
      addInstr(env, mk_iMOVsd_RR( eXHi, tHi ) );
      addInstr(env, mk_iMOVsd_RR( eXLo, tLo ) );
      addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
      addInstr(env, X86Instr_Test32(0xFF, r8));
      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tHi));
      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tLo));
      add_to_esp(env, 4);
      *rHi = tHi;
      *rLo = tLo;
      return;
   }
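
   /* In both special cases above, a 32-bit zero is pushed so that the
      two cmovs can source it from 0(%esp); cmov has no immediate form,
      so this spares a scratch register.  add_to_esp(env, 4) then pops
      the zero again. */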

   /* 64-bit Mux0X: Mux0X(g, expr, expr) */
   if (e->tag == Iex_Mux0X) {
      X86RM* r8;
      HReg e0Lo, e0Hi, eXLo, eXHi;
      HReg tLo = newVRegI(env);
      HReg tHi = newVRegI(env);
      iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
      iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
      addInstr(env, mk_iMOVsd_RR(eXHi, tHi));
      addInstr(env, mk_iMOVsd_RR(eXLo, tLo));
      r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
      addInstr(env, X86Instr_Test32(0xFF, r8));
      /* This assumes the first cmov32 doesn't trash the condition
         codes, so they are still available for the second cmov32 */
      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Hi),tHi));
      addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Lo),tLo));
      *rHi = tHi;
      *rLo = tLo;
      return;
   }

   /* --------- BINARY ops --------- */
   if (e->tag == Iex_Binop) {
      switch (e->Iex.Binop.op) {
         /* 32 x 32 -> 64 multiply */
         case Iop_MullU32:
         case Iop_MullS32: {
            /* get one operand into %eax, and the other into a R/M.
               Need to make an educated guess about which operand is
               better off in which position. */
            HReg   tLo    = newVRegI(env);
            HReg   tHi    = newVRegI(env);
            Bool   syned  = toBool(e->Iex.Binop.op == Iop_MullS32);
            X86RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
            HReg   rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(rRight, hregX86_EAX()));
            addInstr(env, X86Instr_MulL(syned, rmLeft));
            /* Result is now in EDX:EAX.  Tell the caller. */
            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }
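
         /* i.e., schematically:
               movl  %rRight, %eax
               mull/imull rmLeft   -- widening: %edx:%eax = %eax * rmLeft
               movl  %edx, %tHi
               movl  %eax, %tLo */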

         /* 64 x 32 -> (32(rem),32(div)) division */
         case Iop_DivModU64to32:
         case Iop_DivModS64to32: {
            /* Get the 64-bit operand into edx:eax, and the other into
               any old R/M. */
            HReg sHi, sLo;
            HReg   tLo     = newVRegI(env);
            HReg   tHi     = newVRegI(env);
            Bool   syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
            X86RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
            addInstr(env, mk_iMOVsd_RR(sHi, hregX86_EDX()));
            addInstr(env, mk_iMOVsd_RR(sLo, hregX86_EAX()));
            addInstr(env, X86Instr_Div(syned, rmRight));
            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }
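
         /* divl/idivl divides %edx:%eax by the r/m operand, leaving
            the quotient in %eax and the remainder in %edx; the pair
            returned is therefore rem:div, matching the IROp's name. */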

         /* Or64/And64/Xor64 */
         case Iop_Or64:
         case Iop_And64:
         case Iop_Xor64: {
            HReg xLo, xHi, yLo, yHi;
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR
                          : e->Iex.Binop.op==Iop_And64 ? Xalu_AND
                          : Xalu_XOR;
            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
            addInstr(env, mk_iMOVsd_RR(xHi, tHi));
            addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
            addInstr(env, mk_iMOVsd_RR(xLo, tLo));
            addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* Add64/Sub64 */
         case Iop_Add64:
            if (e->Iex.Binop.arg2->tag == Iex_Const) {
               /* special case Add64(e, const) */
               ULong w64 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
               UInt  wHi = toUInt(w64 >> 32);
               UInt  wLo = toUInt(w64);
               HReg  tLo = newVRegI(env);
               HReg  tHi = newVRegI(env);
               HReg  xLo, xHi;
               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64);
               iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
               addInstr(env, mk_iMOVsd_RR(xHi, tHi));
               addInstr(env, mk_iMOVsd_RR(xLo, tLo));
               addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(wLo), tLo));
               addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Imm(wHi), tHi));
               *rHi = tHi;
               *rLo = tLo;
               return;
            }
            /* else fall through to the generic case */
         case Iop_Sub64: {
            HReg xLo, xHi, yLo, yHi;
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
            addInstr(env, mk_iMOVsd_RR(xHi, tHi));
            addInstr(env, mk_iMOVsd_RR(xLo, tLo));
            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
            if (e->Iex.Binop.op==Iop_Add64) {
               addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo));
               addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi));
            } else {
               addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
               addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
            }
            *rHi = tHi;
            *rLo = tLo;
            return;
         }
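
         /* The carry is threaded across the halves: addl/subl on the
            low words sets CF, and adcl/sbbl then folds it into the
            high words, giving a correct 64-bit add/sub in two steps. */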

         /* 32HLto64(e1,e2) */
         case Iop_32HLto64:
            *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
            *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
            return;

         /* 64-bit shifts */
         case Iop_Shl64: {
            /* We use the same ingenious scheme as gcc.  Put the value
               to be shifted into %hi:%lo, and the shift amount into
               %cl.  Then (dsts on right, a la ATT syntax):

               shldl %cl, %lo, %hi   -- make %hi be right for the
                                     -- shift amt %cl % 32
               shll  %cl, %lo        -- make %lo be right for the
                                     -- shift amt %cl % 32

               Now, if (shift amount % 64) is in the range 32 .. 63,
               we have to do a fixup, which puts the result low half
               into the result high half, and zeroes the low half:

               testl $32, %ecx

               cmovnz %lo, %hi
               movl $0, %tmp         -- sigh; need yet another reg
               cmovnz %tmp, %lo
            */
            HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
            tLo = newVRegI(env);
            tHi = newVRegI(env);
            tTemp = newVRegI(env);
            rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
            addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
            /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
               and those regs are legitimately modifiable. */
            addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi));
            addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, tLo));
            addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi));
            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         case Iop_Shr64: {
            /* We use the same ingenious scheme as gcc.  Put the value
               to be shifted into %hi:%lo, and the shift amount into
               %cl.  Then:

               shrdl %cl, %hi, %lo   -- make %lo be right for the
                                     -- shift amt %cl % 32
               shrl  %cl, %hi        -- make %hi be right for the
                                     -- shift amt %cl % 32

               Now, if (shift amount % 64) is in the range 32 .. 63,
               we have to do a fixup, which puts the result high half
               into the result low half, and zeroes the high half:

               testl $32, %ecx

               cmovnz %hi, %lo
               movl $0, %tmp         -- sigh; need yet another reg
               cmovnz %tmp, %hi
            */
            HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
            tLo = newVRegI(env);
            tHi = newVRegI(env);
            tTemp = newVRegI(env);
            rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
            iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
            addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
            addInstr(env, mk_iMOVsd_RR(sHi, tHi));
            addInstr(env, mk_iMOVsd_RR(sLo, tLo));
            /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
               and those regs are legitimately modifiable. */
            addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo));
            addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, tHi));
            addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo));
            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
            addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* F64 -> I64 */
         /* Sigh, this is an almost exact copy of the F64 -> I32/I16
            case.  Unfortunately I see no easy way to avoid the
            duplication. */
         case Iop_F64toI64S: {
            HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);

            /* Used several times ... */
            /* Careful ... this sharing is only safe because
               zero_esp/four_esp do not hold any registers which the
               register allocator could attempt to swizzle later. */
            X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
            X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());

            /* rf now holds the value to be converted; arg1 gives the
               rounding mode, encoded as per the IRRoundingMode enum.
               The first thing to do is set the FPU's rounding mode
               accordingly. */

            /* Create a space for the format conversion. */
            /* subl $8, %esp */
            sub_from_esp(env, 8);

            /* Set host rounding mode */
            set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );

            /* gistll %rf, 0(%esp) */
            addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp));

            /* movl 0(%esp), %dstLo */
            /* movl 4(%esp), %dstHi */
            addInstr(env, X86Instr_Alu32R(
                             Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
            addInstr(env, X86Instr_Alu32R(
                             Xalu_MOV, X86RMI_Mem(four_esp), tHi));

            /* Restore default FPU rounding. */
            set_FPU_rounding_default( env );

            /* addl $8, %esp */
            add_to_esp(env, 8);

            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         case Iop_Add8x8:
            fn = (HWord)h_generic_calc_Add8x8; goto binnish;
         case Iop_Add16x4:
            fn = (HWord)h_generic_calc_Add16x4; goto binnish;
         case Iop_Add32x2:
            fn = (HWord)h_generic_calc_Add32x2; goto binnish;

         case Iop_Avg8Ux8:
            fn = (HWord)h_generic_calc_Avg8Ux8; goto binnish;
         case Iop_Avg16Ux4:
            fn = (HWord)h_generic_calc_Avg16Ux4; goto binnish;

         case Iop_CmpEQ8x8:
            fn = (HWord)h_generic_calc_CmpEQ8x8; goto binnish;
         case Iop_CmpEQ16x4:
            fn = (HWord)h_generic_calc_CmpEQ16x4; goto binnish;
         case Iop_CmpEQ32x2:
            fn = (HWord)h_generic_calc_CmpEQ32x2; goto binnish;

         case Iop_CmpGT8Sx8:
            fn = (HWord)h_generic_calc_CmpGT8Sx8; goto binnish;
         case Iop_CmpGT16Sx4:
            fn = (HWord)h_generic_calc_CmpGT16Sx4; goto binnish;
         case Iop_CmpGT32Sx2:
            fn = (HWord)h_generic_calc_CmpGT32Sx2; goto binnish;

         case Iop_InterleaveHI8x8:
            fn = (HWord)h_generic_calc_InterleaveHI8x8; goto binnish;
         case Iop_InterleaveLO8x8:
            fn = (HWord)h_generic_calc_InterleaveLO8x8; goto binnish;
         case Iop_InterleaveHI16x4:
            fn = (HWord)h_generic_calc_InterleaveHI16x4; goto binnish;
         case Iop_InterleaveLO16x4:
            fn = (HWord)h_generic_calc_InterleaveLO16x4; goto binnish;
         case Iop_InterleaveHI32x2:
            fn = (HWord)h_generic_calc_InterleaveHI32x2; goto binnish;
         case Iop_InterleaveLO32x2:
            fn = (HWord)h_generic_calc_InterleaveLO32x2; goto binnish;
         case Iop_CatOddLanes16x4:
            fn = (HWord)h_generic_calc_CatOddLanes16x4; goto binnish;
         case Iop_CatEvenLanes16x4:
            fn = (HWord)h_generic_calc_CatEvenLanes16x4; goto binnish;
         case Iop_Perm8x8:
            fn = (HWord)h_generic_calc_Perm8x8; goto binnish;

         case Iop_Max8Ux8:
            fn = (HWord)h_generic_calc_Max8Ux8; goto binnish;
         case Iop_Max16Sx4:
            fn = (HWord)h_generic_calc_Max16Sx4; goto binnish;
         case Iop_Min8Ux8:
            fn = (HWord)h_generic_calc_Min8Ux8; goto binnish;
         case Iop_Min16Sx4:
            fn = (HWord)h_generic_calc_Min16Sx4; goto binnish;

         case Iop_Mul16x4:
            fn = (HWord)h_generic_calc_Mul16x4; goto binnish;
         case Iop_Mul32x2:
            fn = (HWord)h_generic_calc_Mul32x2; goto binnish;
         case Iop_MulHi16Sx4:
            fn = (HWord)h_generic_calc_MulHi16Sx4; goto binnish;
         case Iop_MulHi16Ux4:
            fn = (HWord)h_generic_calc_MulHi16Ux4; goto binnish;

         case Iop_QAdd8Sx8:
            fn = (HWord)h_generic_calc_QAdd8Sx8; goto binnish;
         case Iop_QAdd16Sx4:
            fn = (HWord)h_generic_calc_QAdd16Sx4; goto binnish;
         case Iop_QAdd8Ux8:
            fn = (HWord)h_generic_calc_QAdd8Ux8; goto binnish;
         case Iop_QAdd16Ux4:
            fn = (HWord)h_generic_calc_QAdd16Ux4; goto binnish;

         case Iop_QNarrowBin32Sto16Sx4:
            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; goto binnish;
         case Iop_QNarrowBin16Sto8Sx8:
            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; goto binnish;
         case Iop_QNarrowBin16Sto8Ux8:
            fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; goto binnish;
         case Iop_NarrowBin16to8x8:
            fn = (HWord)h_generic_calc_NarrowBin16to8x8; goto binnish;
         case Iop_NarrowBin32to16x4:
            fn = (HWord)h_generic_calc_NarrowBin32to16x4; goto binnish;

         case Iop_QSub8Sx8:
            fn = (HWord)h_generic_calc_QSub8Sx8; goto binnish;
         case Iop_QSub16Sx4:
            fn = (HWord)h_generic_calc_QSub16Sx4; goto binnish;
         case Iop_QSub8Ux8:
            fn = (HWord)h_generic_calc_QSub8Ux8; goto binnish;
         case Iop_QSub16Ux4:
            fn = (HWord)h_generic_calc_QSub16Ux4; goto binnish;

         case Iop_Sub8x8:
            fn = (HWord)h_generic_calc_Sub8x8; goto binnish;
         case Iop_Sub16x4:
            fn = (HWord)h_generic_calc_Sub16x4; goto binnish;
         case Iop_Sub32x2:
            fn = (HWord)h_generic_calc_Sub32x2; goto binnish;

         binnish: {
            /* Note: the following assumes all helpers are of
               signature
                  ULong fn ( ULong, ULong ), and they are
               not marked as regparm functions.
            */
            HReg xLo, xHi, yLo, yHi;
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
            addInstr(env, X86Instr_Push(X86RMI_Reg(yHi)));
            addInstr(env, X86Instr_Push(X86RMI_Reg(yLo)));
            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
            add_to_esp(env, 4*4);
            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }
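
         /* Stack layout at the call (cdecl, pushed right to left):
               0(%esp) xLo    4(%esp) xHi
               8(%esp) yLo   12(%esp) yHi
            i.e. two in-memory ULongs.  The ULong result comes back in
            %edx:%eax, and add_to_esp(env, 4*4) discards the four
            pushed words. */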

         case Iop_ShlN32x2:
            fn = (HWord)h_generic_calc_ShlN32x2; goto shifty;
         case Iop_ShlN16x4:
            fn = (HWord)h_generic_calc_ShlN16x4; goto shifty;
         case Iop_ShlN8x8:
            fn = (HWord)h_generic_calc_ShlN8x8;  goto shifty;
         case Iop_ShrN32x2:
            fn = (HWord)h_generic_calc_ShrN32x2; goto shifty;
         case Iop_ShrN16x4:
            fn = (HWord)h_generic_calc_ShrN16x4; goto shifty;
         case Iop_SarN32x2:
            fn = (HWord)h_generic_calc_SarN32x2; goto shifty;
         case Iop_SarN16x4:
            fn = (HWord)h_generic_calc_SarN16x4; goto shifty;
         case Iop_SarN8x8:
            fn = (HWord)h_generic_calc_SarN8x8;  goto shifty;
         shifty: {
            /* Note: the following assumes all helpers are of
               signature
                  ULong fn ( ULong, UInt ), and they are
               not marked as regparm functions.
            */
            HReg xLo, xHi;
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            X86RMI* y = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
            addInstr(env, X86Instr_Push(y));
            iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
            addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
            addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
            addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
            add_to_esp(env, 3*4);
            addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
            addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         default:
            break;
      }
   } /* if (e->tag == Iex_Binop) */


   /* --------- UNARY ops --------- */
   if (e->tag == Iex_Unop) {
      switch (e->Iex.Unop.op) {

         /* 32Sto64(e) */
         case Iop_32Sto64: {
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,tHi));
            addInstr(env, mk_iMOVsd_RR(src,tLo));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tHi));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* 32Uto64(e) */
         case Iop_32Uto64: {
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,tLo));
            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* 16Uto64(e) */
         case Iop_16Uto64: {
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
            addInstr(env, mk_iMOVsd_RR(src,tLo));
            addInstr(env, X86Instr_Alu32R(Xalu_AND,
                                          X86RMI_Imm(0xFFFF), tLo));
            addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }

         /* V128{HI}to64 */
         case Iop_V128HIto64:
         case Iop_V128to64: {
            Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0;
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
            X86AMode* esp0  = X86AMode_IR(0,     hregX86_ESP());
            X86AMode* espLO = X86AMode_IR(off,   hregX86_ESP());
            X86AMode* espHI = X86AMode_IR(off+4, hregX86_ESP());
            sub_from_esp(env, 16);
            addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
            addInstr(env, X86Instr_Alu32R( Xalu_MOV,
                                           X86RMI_Mem(espLO), tLo ));
            addInstr(env, X86Instr_Alu32R( Xalu_MOV,
                                           X86RMI_Mem(espHI), tHi ));
            add_to_esp(env, 16);
            *rHi = tHi;
            *rLo = tLo;
            return;
         }
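
         /* The vector register is dumped into a 16-byte stack slot
            and the requested 64-bit half read back as two 32-bit
            words: off = 0 selects the low half, off = 8 the high. */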

         /* could do better than this, but for now ... */
         case Iop_1Sto64: {
            HReg tLo = newVRegI(env);
            HReg tHi = newVRegI(env);
            X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
            addInstr(env, X86Instr_Set32(cond,tLo));
            addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, tLo));
            addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tLo));
            addInstr(env, mk_iMOVsd_RR(tLo, tHi));
            *rHi = tHi;
            *rLo = tLo;
            return;
         }
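
         /* Set32 leaves 0 or 1 in tLo; shifting that up to bit 31 and
            arithmetically back down smears bit 0 across the word,
            giving 0 or 0xFFFFFFFF, which is then copied into the high
            half as well. */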
   2564 
   2565          /* Not64(e) */
   2566          case Iop_Not64: {
   2567             HReg tLo = newVRegI(env);
   2568             HReg tHi = newVRegI(env);
   2569             HReg sHi, sLo;
   2570             iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg);
   2571             addInstr(env, mk_iMOVsd_RR(sHi, tHi));
   2572             addInstr(env, mk_iMOVsd_RR(sLo, tLo));
   2573             addInstr(env, X86Instr_Unary32(Xun_NOT,tHi));
   2574             addInstr(env, X86Instr_Unary32(Xun_NOT,tLo));
   2575             *rHi = tHi;
   2576             *rLo = tLo;
   2577             return;
   2578          }
   2579 
   2580          /* Left64(e) */
   2581          case Iop_Left64: {
   2582             HReg yLo, yHi;
   2583             HReg tLo = newVRegI(env);
   2584             HReg tHi = newVRegI(env);
   2585             /* yHi:yLo = arg */
   2586             iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
   2587             /* tLo = 0 - yLo, and set carry */
   2588             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tLo));
   2589             addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
   2590             /* tHi = 0 - yHi - carry */
   2591             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
   2592             addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
   2593             /* So now we have tHi:tLo = -arg.  To finish off, or 'arg'
   2594                back in, so as to give the final result
   2595                tHi:tLo = arg | -arg. */
   2596             addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yLo), tLo));
   2597             addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yHi), tHi));
   2598             *rHi = tHi;
   2599             *rLo = tLo;
   2600             return;
   2601          }
   2602 
   2603          /* --- patterns rooted at: CmpwNEZ64 --- */
   2604 
   2605          /* CmpwNEZ64(e) */
   2606          case Iop_CmpwNEZ64: {
   2607 
   2608          DECLARE_PATTERN(p_CmpwNEZ64_Or64);
   2609          DEFINE_PATTERN(p_CmpwNEZ64_Or64,
   2610                         unop(Iop_CmpwNEZ64,binop(Iop_Or64,bind(0),bind(1))));
   2611          if (matchIRExpr(&mi, p_CmpwNEZ64_Or64, e)) {
   2612             /* CmpwNEZ64(Or64(x,y)) */
   2613             HReg xHi,xLo,yHi,yLo;
   2614             HReg xBoth = newVRegI(env);
   2615             HReg merged = newVRegI(env);
   2616             HReg tmp2 = newVRegI(env);
   2617 
   2618             iselInt64Expr(&xHi,&xLo, env, mi.bindee[0]);
   2619             addInstr(env, mk_iMOVsd_RR(xHi,xBoth));
   2620             addInstr(env, X86Instr_Alu32R(Xalu_OR,
   2621                                           X86RMI_Reg(xLo),xBoth));
   2622 
   2623             iselInt64Expr(&yHi,&yLo, env, mi.bindee[1]);
   2624             addInstr(env, mk_iMOVsd_RR(yHi,merged));
   2625             addInstr(env, X86Instr_Alu32R(Xalu_OR,
   2626                                           X86RMI_Reg(yLo),merged));
   2627             addInstr(env, X86Instr_Alu32R(Xalu_OR,
   2628                                           X86RMI_Reg(xBoth),merged));
   2629 
   2630             /* tmp2 = (merged | -merged) >>s 31 */
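                    /* If merged is zero, so is -merged, and the OR and
                       shift give zero.  If merged is nonzero, at least
                       one of merged and -merged has bit 31 set (two's
                       complement), so the arithmetic shift smears a 1
                       through all 32 bits.  The same idiom recurs in
                       the general case below. */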
   2631             addInstr(env, mk_iMOVsd_RR(merged,tmp2));
   2632             addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
   2633             addInstr(env, X86Instr_Alu32R(Xalu_OR,
   2634                                           X86RMI_Reg(merged), tmp2));
   2635             addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
   2636             *rHi = tmp2;
   2637             *rLo = tmp2;
   2638             return;
   2639          } else {
   2640             /* CmpwNEZ64(e) */
   2641             HReg srcLo, srcHi;
   2642             HReg tmp1  = newVRegI(env);
   2643             HReg tmp2  = newVRegI(env);
   2644             /* srcHi:srcLo = arg */
   2645             iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg);
   2646             /* tmp1 = srcHi | srcLo */
   2647             addInstr(env, mk_iMOVsd_RR(srcHi,tmp1));
   2648             addInstr(env, X86Instr_Alu32R(Xalu_OR,
   2649                                           X86RMI_Reg(srcLo), tmp1));
   2650             /* tmp2 = (tmp1 | -tmp1) >>s 31 */
   2651             addInstr(env, mk_iMOVsd_RR(tmp1,tmp2));
   2652             addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
   2653             addInstr(env, X86Instr_Alu32R(Xalu_OR,
   2654                                           X86RMI_Reg(tmp1), tmp2));
   2655             addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
   2656             *rHi = tmp2;
   2657             *rLo = tmp2;
   2658             return;
   2659          }
   2660          }
   2661 
   2662          /* ReinterpF64asI64(e) */
   2663          /* Given an IEEE754 double, produce an I64 with the same bit
   2664             pattern. */
   2665          case Iop_ReinterpF64asI64: {
   2666             HReg rf   = iselDblExpr(env, e->Iex.Unop.arg);
   2667             HReg tLo  = newVRegI(env);
   2668             HReg tHi  = newVRegI(env);
   2669             X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   2670             X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
   2671             /* paranoia */
   2672             set_FPU_rounding_default(env);
   2673             /* subl $8, %esp */
   2674             sub_from_esp(env, 8);
   2675             /* gstD %rf, 0(%esp) */
   2676             addInstr(env,
   2677                      X86Instr_FpLdSt(False/*store*/, 8, rf, zero_esp));
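                    /* x86 is little-endian, so the low 32 bits of the
                       bit pattern now sit at 0(%esp) and the high 32
                       bits at 4(%esp). */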
   2678             /* movl 0(%esp), %tLo */
   2679             addInstr(env,
   2680                      X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
   2681             /* movl 4(%esp), %tHi */
   2682             addInstr(env,
   2683                      X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(four_esp), tHi));
   2684             /* addl $8, %esp */
   2685             add_to_esp(env, 8);
   2686             *rHi = tHi;
   2687             *rLo = tLo;
   2688             return;
   2689          }
   2690 
   2691          case Iop_CmpNEZ32x2:
   2692             fn = (HWord)h_generic_calc_CmpNEZ32x2; goto unish;
   2693          case Iop_CmpNEZ16x4:
   2694             fn = (HWord)h_generic_calc_CmpNEZ16x4; goto unish;
   2695          case Iop_CmpNEZ8x8:
   2696             fn = (HWord)h_generic_calc_CmpNEZ8x8; goto unish;
   2697          unish: {
   2698             /* Note: the following assumes all helpers
   2699                have the signature
   2700                   ULong fn ( ULong )
   2701                and are not marked as regparm functions.
   2702             */
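                    /* The two pushes lay the ULong out little-endian on
                       the stack (most significant word pushed first),
                       and the 64-bit result comes back in EDX:EAX per
                       the x86 cdecl convention. */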
   2703             HReg xLo, xHi;
   2704             HReg tLo = newVRegI(env);
   2705             HReg tHi = newVRegI(env);
   2706             iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
   2707             addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
   2708             addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
   2709             addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
   2710             add_to_esp(env, 2*4);
   2711             addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
   2712             addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
   2713             *rHi = tHi;
   2714             *rLo = tLo;
   2715             return;
   2716          }
   2717 
   2718          default:
   2719             break;
   2720       }
   2721    } /* if (e->tag == Iex_Unop) */
   2722 
   2723 
   2724    /* --------- CCALL --------- */
   2725    if (e->tag == Iex_CCall) {
   2726       HReg tLo = newVRegI(env);
   2727       HReg tHi = newVRegI(env);
   2728 
   2729       /* Marshal args, do the call, clear stack. */
   2730       doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
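              /* A 64-bit result is returned in EDX:EAX per the x86
                 calling convention; copy it into fresh vregs straight
                 away so the fixed registers stay live only briefly. */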
   2731 
   2732       addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
   2733       addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
   2734       *rHi = tHi;
   2735       *rLo = tLo;
   2736       return;
   2737    }
   2738 
   2739    ppIRExpr(e);
   2740    vpanic("iselInt64Expr");
   2741 }
   2742 
   2743 
   2744 /*---------------------------------------------------------*/
   2745 /*--- ISEL: Floating point expressions (32 bit)         ---*/
   2746 /*---------------------------------------------------------*/
   2747 
   2748 /* Nothing interesting here; really just wrappers for
   2749    64-bit stuff. */
   2750 
   2751 static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
   2752 {
   2753    HReg r = iselFltExpr_wrk( env, e );
   2754 #  if 0
   2755    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2756 #  endif
   2757    vassert(hregClass(r) == HRcFlt64); /* yes, really Flt64 */
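           /* (The x86 backend holds F32 values widened to F64 in x87
              registers, so both float widths share the HRcFlt64
              register class.) */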
   2758    vassert(hregIsVirtual(r));
   2759    return r;
   2760 }
   2761 
   2762 /* DO NOT CALL THIS DIRECTLY */
   2763 static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
   2764 {
   2765    IRType ty = typeOfIRExpr(env->type_env,e);
   2766    vassert(ty == Ity_F32);
   2767 
   2768    if (e->tag == Iex_RdTmp) {
   2769       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2770    }
   2771 
   2772    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2773       X86AMode* am;
   2774       HReg res = newVRegF(env);
   2775       vassert(e->Iex.Load.ty == Ity_F32);
   2776       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2777       addInstr(env, X86Instr_FpLdSt(True/*load*/, 4, res, am));
   2778       return res;
   2779    }
   2780 
   2781    if (e->tag == Iex_Binop
   2782        && e->Iex.Binop.op == Iop_F64toF32) {
   2783       /* Although the result is still held in a standard FPU register,
   2784          we need to round it to reflect the loss of accuracy/range
   2785          entailed in casting it to a 32-bit float. */
   2786       HReg dst = newVRegF(env);
   2787       HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
   2788       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2789       addInstr(env, X86Instr_Fp64to32(src,dst));
   2790       set_FPU_rounding_default( env );
   2791       return dst;
   2792    }
   2793 
   2794    if (e->tag == Iex_Get) {
   2795       X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
   2796                                   hregX86_EBP() );
   2797       HReg res = newVRegF(env);
   2798       addInstr(env, X86Instr_FpLdSt( True/*load*/, 4, res, am ));
   2799       return res;
   2800    }
   2801 
   2802    if (e->tag == Iex_Unop
   2803        && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
   2804       /* Given an I32, produce an IEEE754 float with the same bit
   2805          pattern. */
   2806       HReg    dst = newVRegF(env);
   2807       X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
   2808       /* paranoia */
   2809       addInstr(env, X86Instr_Push(rmi));
   2810       addInstr(env, X86Instr_FpLdSt(
   2811                        True/*load*/, 4, dst,
   2812                        X86AMode_IR(0, hregX86_ESP())));
   2813       add_to_esp(env, 4);
   2814       return dst;
   2815    }
   2816 
   2817    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
   2818       HReg rf  = iselFltExpr(env, e->Iex.Binop.arg2);
   2819       HReg dst = newVRegF(env);
   2820 
   2821       /* rf now holds the value to be rounded.  The first thing to do
   2822          is set the FPU's rounding mode accordingly. */
   2823 
   2824       /* Set host rounding mode */
   2825       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2826 
   2827       /* grndint %rf, %dst */
   2828       addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
   2829 
   2830       /* Restore default FPU rounding. */
   2831       set_FPU_rounding_default( env );
   2832 
   2833       return dst;
   2834    }
   2835 
   2836    ppIRExpr(e);
   2837    vpanic("iselFltExpr_wrk");
   2838 }
   2839 
   2840 
   2841 /*---------------------------------------------------------*/
   2842 /*--- ISEL: Floating point expressions (64 bit)         ---*/
   2843 /*---------------------------------------------------------*/
   2844 
   2845 /* Compute a 64-bit floating point value into a register, the identity
   2846    of which is returned.  As with iselIntExpr_R, the reg may be either
   2847    real or virtual; in any case it must not be changed by subsequent
   2848    code emitted by the caller.  */
   2849 
   2850 /* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
   2851 
   2852     Type                  S (1 bit)   E (11 bits)   F (52 bits)
   2853     ----                  ---------   -----------   -----------
   2854     signalling NaN        u           2047 (max)    .0uuuuu---u
   2855                                                     (with at least
   2856                                                      one 1 bit)
   2857     quiet NaN             u           2047 (max)    .1uuuuu---u
   2858 
   2859     negative infinity     1           2047 (max)    .000000---0
   2860 
   2861     positive infinity     0           2047 (max)    .000000---0
   2862 
   2863     negative zero         1           0             .000000---0
   2864 
   2865     positive zero         0           0             .000000---0
   2866 */
   2867 
   2868 static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
   2869 {
   2870    HReg r = iselDblExpr_wrk( env, e );
   2871 #  if 0
   2872    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2873 #  endif
   2874    vassert(hregClass(r) == HRcFlt64);
   2875    vassert(hregIsVirtual(r));
   2876    return r;
   2877 }
   2878 
   2879 /* DO NOT CALL THIS DIRECTLY */
   2880 static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
   2881 {
   2882    IRType ty = typeOfIRExpr(env->type_env,e);
   2883    vassert(e);
   2884    vassert(ty == Ity_F64);
   2885 
   2886    if (e->tag == Iex_RdTmp) {
   2887       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2888    }
   2889 
   2890    if (e->tag == Iex_Const) {
   2891       union { UInt u32x2[2]; ULong u64; Double f64; } u;
   2892       HReg freg = newVRegF(env);
   2893       vassert(sizeof(u) == 8);
   2894       vassert(sizeof(u.u64) == 8);
   2895       vassert(sizeof(u.f64) == 8);
   2896       vassert(sizeof(u.u32x2) == 8);
   2897 
   2898       if (e->Iex.Const.con->tag == Ico_F64) {
   2899          u.f64 = e->Iex.Const.con->Ico.F64;
   2900       }
   2901       else if (e->Iex.Const.con->tag == Ico_F64i) {
   2902          u.u64 = e->Iex.Const.con->Ico.F64i;
   2903       }
   2904       else
   2905          vpanic("iselDblExpr(x86): const");
   2906 
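              /* Materialise the constant by pushing its two 32-bit
                 halves, high word first so the value lies little-endian
                 at 0(%esp), then FP-loading the 8 bytes. */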
   2907       addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[1])));
   2908       addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[0])));
   2909       addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, freg,
   2910                                     X86AMode_IR(0, hregX86_ESP())));
   2911       add_to_esp(env, 8);
   2912       return freg;
   2913    }
   2914 
   2915    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2916       X86AMode* am;
   2917       HReg res = newVRegF(env);
   2918       vassert(e->Iex.Load.ty == Ity_F64);
   2919       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2920       addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, res, am));
   2921       return res;
   2922    }
   2923 
   2924    if (e->tag == Iex_Get) {
   2925       X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
   2926                                   hregX86_EBP() );
   2927       HReg res = newVRegF(env);
   2928       addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
   2929       return res;
   2930    }
   2931 
   2932    if (e->tag == Iex_GetI) {
   2933       X86AMode* am
   2934          = genGuestArrayOffset(
   2935               env, e->Iex.GetI.descr,
   2936                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   2937       HReg res = newVRegF(env);
   2938       addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
   2939       return res;
   2940    }
   2941 
   2942    if (e->tag == Iex_Triop) {
   2943       X86FpOp fpop = Xfp_INVALID;
   2944       switch (e->Iex.Triop.op) {
   2945          case Iop_AddF64:    fpop = Xfp_ADD; break;
   2946          case Iop_SubF64:    fpop = Xfp_SUB; break;
   2947          case Iop_MulF64:    fpop = Xfp_MUL; break;
   2948          case Iop_DivF64:    fpop = Xfp_DIV; break;
   2949          case Iop_ScaleF64:  fpop = Xfp_SCALE; break;
   2950          case Iop_Yl2xF64:   fpop = Xfp_YL2X; break;
   2951          case Iop_Yl2xp1F64: fpop = Xfp_YL2XP1; break;
   2952          case Iop_AtanF64:   fpop = Xfp_ATAN; break;
   2953          case Iop_PRemF64:   fpop = Xfp_PREM; break;
   2954          case Iop_PRem1F64:  fpop = Xfp_PREM1; break;
   2955          default: break;
   2956       }
   2957       if (fpop != Xfp_INVALID) {
   2958          HReg res  = newVRegF(env);
   2959          HReg srcL = iselDblExpr(env, e->Iex.Triop.arg2);
   2960          HReg srcR = iselDblExpr(env, e->Iex.Triop.arg3);
   2961          /* XXXROUNDINGFIXME */
   2962          /* set roundingmode here */
   2963          addInstr(env, X86Instr_FpBinary(fpop,srcL,srcR,res));
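                 /* The x87 precision-control field (53 bits, per the
                    default control word) already rounds ADD/SUB/MUL/DIV
                    results to F64, but the remaining ops here do not
                    reliably honour it, hence the explicit squeeze back
                    to F64 below. */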
   2964          if (fpop != Xfp_ADD && fpop != Xfp_SUB
   2965              && fpop != Xfp_MUL && fpop != Xfp_DIV)
   2966             roundToF64(env, res);
   2967          return res;
   2968       }
   2969    }
   2970 
   2971    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
   2972       HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
   2973       HReg dst = newVRegF(env);
   2974 
   2975       /* rf now holds the value to be rounded.  The first thing to do
   2976          is set the FPU's rounding mode accordingly. */
   2977 
   2978       /* Set host rounding mode */
   2979       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2980 
   2981       /* grndint %rf, %dst */
   2982       addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
   2983 
   2984       /* Restore default FPU rounding. */
   2985       set_FPU_rounding_default( env );
   2986 
   2987       return dst;
   2988    }
   2989 
   2990    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
   2991       HReg dst = newVRegF(env);
   2992       HReg rHi,rLo;
   2993       iselInt64Expr( &rHi, &rLo, env, e->Iex.Binop.arg2);
   2994       addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
   2995       addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
   2996 
   2997       /* Set host rounding mode */
   2998       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2999 
   3000       addInstr(env, X86Instr_FpLdStI(
   3001                        True/*load*/, 8, dst,
   3002                        X86AMode_IR(0, hregX86_ESP())));
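              /* FpLdStI(load, 8) is in effect an fildll: it converts
                 the signed 64-bit integer at 0(%esp) to floating point.
                 The conversion can round, since an I64 may need more
                 than F64's 53 significand bits, which is why the
                 caller-supplied rounding mode must be in force here. */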
   3003 
   3004       /* Restore default FPU rounding. */
   3005       set_FPU_rounding_default( env );
   3006 
   3007       add_to_esp(env, 8);
   3008       return dst;
   3009    }
   3010 
   3011    if (e->tag == Iex_Binop) {
   3012       X86FpOp fpop = Xfp_INVALID;
   3013       switch (e->Iex.Binop.op) {
   3014          case Iop_SinF64:  fpop = Xfp_SIN; break;
   3015          case Iop_CosF64:  fpop = Xfp_COS; break;
   3016          case Iop_TanF64:  fpop = Xfp_TAN; break;
   3017          case Iop_2xm1F64: fpop = Xfp_2XM1; break;
   3018          case Iop_SqrtF64: fpop = Xfp_SQRT; break;
   3019          default: break;
   3020       }
   3021       if (fpop != Xfp_INVALID) {
   3022          HReg res = newVRegF(env);
   3023          HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
   3024          /* XXXROUNDINGFIXME */
   3025          /* set roundingmode here */
   3026          addInstr(env, X86Instr_FpUnary(fpop,src,res));
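                 /* Same precision-control story as the Triop cases
                    above: SQRT honours the 53-bit setting, but
                    SIN/COS/TAN/2XM1 are computed to extended precision
                    and need the explicit rounding below. */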
   3027          if (fpop != Xfp_SQRT
   3028              && fpop != Xfp_NEG && fpop != Xfp_ABS)
   3029             roundToF64(env, res);
   3030          return res;
   3031       }
   3032    }
   3033 
   3034    if (e->tag == Iex_Unop) {
   3035       X86FpOp fpop = Xfp_INVALID;
   3036       switch (e->Iex.Unop.op) {
   3037          case Iop_NegF64:  fpop = Xfp_NEG; break;
   3038          case Iop_AbsF64:  fpop = Xfp_ABS; break;
   3039          default: break;
   3040       }
   3041       if (fpop != Xfp_INVALID) {
   3042          HReg res = newVRegF(env);
   3043          HReg src = iselDblExpr(env, e->Iex.Unop.arg);
   3044          addInstr(env, X86Instr_FpUnary(fpop,src,res));
   3045          if (fpop != Xfp_NEG && fpop != Xfp_ABS)
   3046             roundToF64(env, res);
   3047          return res;
   3048       }
   3049    }
   3050 
   3051    if (e->tag == Iex_Unop) {
   3052       switch (e->Iex.Unop.op) {
   3053          case Iop_I32StoF64: {
   3054             HReg dst = newVRegF(env);
   3055             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
   3056             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
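                    /* Every I32 value is exactly representable as an
                       F64, so no rounding can occur; setting the
                       default mode here is purely defensive. */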
   3057             set_FPU_rounding_default(env);
   3058             addInstr(env, X86Instr_FpLdStI(
   3059                              True/*load*/, 4, dst,
   3060                              X86AMode_IR(0, hregX86_ESP())));
   3061             add_to_esp(env, 4);
   3062             return dst;
   3063          }
   3064          case Iop_ReinterpI64asF64: {
   3065             /* Given an I64, produce an IEEE754 double with the same
   3066                bit pattern. */
   3067             HReg dst = newVRegF(env);
   3068             HReg rHi, rLo;
   3069             iselInt64Expr( &rHi, &rLo, env, e->Iex.Unop.arg);
   3070             /* paranoia */
   3071             set_FPU_rounding_default(env);
   3072             addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
   3073             addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
   3074             addInstr(env, X86Instr_FpLdSt(
   3075                              True/*load*/, 8, dst,
   3076                              X86AMode_IR(0, hregX86_ESP())));
   3077             add_to_esp(env, 8);
   3078             return dst;
   3079          }
   3080          case Iop_F32toF64: {
   3081             /* this is a no-op */
   3082             HReg res = iselFltExpr(env, e->Iex.Unop.arg);
   3083             return res;
   3084          }
   3085          default:
   3086             break;
   3087       }
   3088    }
   3089 
   3090    /* --------- MULTIPLEX --------- */
   3091    if (e->tag == Iex_Mux0X) {
   3092      if (ty == Ity_F64
   3093          && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
   3094         X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
   3095         HReg rX  = iselDblExpr(env, e->Iex.Mux0X.exprX);
   3096         HReg r0  = iselDblExpr(env, e->Iex.Mux0X.expr0);
   3097         HReg dst = newVRegF(env);
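                /* Start with exprX in dst, then overwrite it with expr0
                   iff the condition's low 8 bits are zero -- i.e. Mux0X
                   selects expr0 when cond == 0. */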
   3098         addInstr(env, X86Instr_FpUnary(Xfp_MOV,rX,dst));
   3099         addInstr(env, X86Instr_Test32(0xFF, r8));
   3100         addInstr(env, X86Instr_FpCMov(Xcc_Z,r0,dst));
   3101         return dst;
   3102       }
   3103    }
   3104 
   3105    ppIRExpr(e);
   3106    vpanic("iselDblExpr_wrk");
   3107 }
   3108 
   3109 
   3110 /*---------------------------------------------------------*/
   3111 /*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
   3112 /*---------------------------------------------------------*/
   3113 
   3114 static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
   3115 {
   3116    HReg r = iselVecExpr_wrk( env, e );
   3117 #  if 0
   3118    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   3119 #  endif
   3120    vassert(hregClass(r) == HRcVec128);
   3121    vassert(hregIsVirtual(r));
   3122    return r;
   3123 }
   3124 
   3125 
   3126 /* DO NOT CALL THIS DIRECTLY */
   3127 static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
   3128 {
   3129 
   3130 #  define REQUIRE_SSE1                                    \
   3131       do { if (env->hwcaps == 0/*baseline, no sse*/)      \
   3132               goto vec_fail;                              \
   3133       } while (0)
   3134 
   3135 #  define REQUIRE_SSE2                                    \
   3136       do { if (0 == (env->hwcaps & VEX_HWCAPS_X86_SSE2))  \
   3137               goto vec_fail;                              \
   3138       } while (0)
   3139 
   3140 #  define SSE2_OR_ABOVE                                   \
   3141        (env->hwcaps & VEX_HWCAPS_X86_SSE2)
   3142 
   3143    HWord     fn = 0; /* address of helper fn, if required */
   3144    MatchInfo mi;
   3145    Bool      arg1isEReg = False;
   3146    X86SseOp  op = Xsse_INVALID;
   3147    IRType    ty = typeOfIRExpr(env->type_env,e);
   3148    vassert(e);
   3149    vassert(ty == Ity_V128);
   3150 
   3151    REQUIRE_SSE1;
   3152 
   3153    if (e->tag == Iex_RdTmp) {
   3154       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   3155    }
   3156 
   3157    if (e->tag == Iex_Get) {
   3158       HReg dst = newVRegV(env);
   3159       addInstr(env, X86Instr_SseLdSt(
   3160                        True/*load*/,
   3161                        dst,
   3162                        X86AMode_IR(e->Iex.Get.offset, hregX86_EBP())
   3163                     )
   3164               );
   3165       return dst;
   3166    }
   3167 
   3168    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   3169       HReg      dst = newVRegV(env);
   3170       X86AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
   3171       addInstr(env, X86Instr_SseLdSt( True/*load*/, dst, am ));
   3172       return dst;
   3173    }
   3174 
   3175    if (e->tag == Iex_Const) {
   3176       HReg dst = newVRegV(env);
   3177       vassert(e->Iex.Const.con->tag == Ico_V128);
   3178       addInstr(env, X86Instr_SseConst(e->Iex.Const.con->Ico.V128, dst));
   3179       return dst;
   3180    }
   3181 
   3182    if (e->tag == Iex_Unop) {
   3183 
   3184    if (SSE2_OR_ABOVE) {
   3185       /* 64UtoV128(LDle:I64(addr)) */
   3186       DECLARE_PATTERN(p_zwiden_load64);
   3187       DEFINE_PATTERN(p_zwiden_load64,
   3188                      unop(Iop_64UtoV128,
   3189                           IRExpr_Load(Iend_LE,Ity_I64,bind(0))));
   3190       if (matchIRExpr(&mi, p_zwiden_load64, e)) {
   3191          X86AMode* am = iselIntExpr_AMode(env, mi.bindee[0]);
   3192          HReg dst = newVRegV(env);
   3193          addInstr(env, X86Instr_SseLdzLO(8, dst, am));
   3194          return dst;
   3195       }
   3196    }
   3197 
   3198    switch (e->Iex.Unop.op) {
   3199 
   3200       case Iop_NotV128: {
   3201          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3202          return do_sse_Not128(env, arg);
   3203       }
   3204 
   3205       case Iop_CmpNEZ64x2: {
   3206          /* We can use SSE2 instructions for this. */
   3207          /* Ideally, we want to do a 64Ix2 comparison against zero of
   3208             the operand.  Problem is no such insn exists.  Solution
   3209             therefore is to do a 32Ix4 comparison instead, and bitwise-
   3210             negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
   3211             let the not'd result of this initial comparison be a:b:c:d.
   3212             What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
   3213             pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
   3214             giving the required result.
   3215 
   3216             The required selection sequence is 2,3,0,1, which
   3217             according to Intel's documentation means the pshufd
   3218             literal value is 0xB1, that is,
   3219             (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
   3220          */
   3221          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
   3222          HReg tmp  = newVRegV(env);
   3223          HReg dst  = newVRegV(env);
   3224          REQUIRE_SSE2;
   3225          addInstr(env, X86Instr_SseReRg(Xsse_XOR, tmp, tmp));
   3226          addInstr(env, X86Instr_SseReRg(Xsse_CMPEQ32, arg, tmp));
   3227          tmp = do_sse_Not128(env, tmp);
   3228          addInstr(env, X86Instr_SseShuf(0xB1, tmp, dst));
   3229          addInstr(env, X86Instr_SseReRg(Xsse_OR, tmp, dst));
   3230          return dst;
   3231       }
   3232 
   3233       case Iop_CmpNEZ32x4: {
   3234          /* Sigh, we have to generate lousy code since this has to
   3235             work on SSE1 hosts */
   3236          /* basically, the idea is: for each lane:
   3237                movl lane, %r ; negl %r   (now CF = lane==0 ? 0 : 1)
   3238                sbbl %r, %r               (now %r = 1Sto32(CF))
   3239                movl %r, lane
   3240          */
   3241          Int       i;
   3242          X86AMode* am;
   3243          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3244          HReg      arg  = iselVecExpr(env, e->Iex.Unop.arg);
   3245          HReg      dst  = newVRegV(env);
   3246          HReg      r32  = newVRegI(env);
   3247          sub_from_esp(env, 16);
   3248          addInstr(env, X86Instr_SseLdSt(False/*store*/, arg, esp0));
   3249          for (i = 0; i < 4; i++) {
   3250             am = X86AMode_IR(i*4, hregX86_ESP());
   3251             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), r32));
   3252             addInstr(env, X86Instr_Unary32(Xun_NEG, r32));
   3253             addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(r32), r32));
   3254             addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r32), am));
   3255          }
   3256          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
   3257          add_to_esp(env, 16);
   3258          return dst;
   3259       }
   3260 
   3261       case Iop_CmpNEZ8x16:
   3262       case Iop_CmpNEZ16x8: {
   3263          /* We can use SSE2 instructions for this. */
   3264          HReg arg;
   3265          HReg vec0 = newVRegV(env);
   3266          HReg vec1 = newVRegV(env);
   3267          HReg dst  = newVRegV(env);
   3268          X86SseOp cmpOp
   3269             = e->Iex.Unop.op==Iop_CmpNEZ16x8 ? Xsse_CMPEQ16
   3270                                              : Xsse_CMPEQ8;
   3271          REQUIRE_SSE2;
   3272          addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec0, vec0));
   3273          addInstr(env, mk_vMOVsd_RR(vec0, vec1));
   3274          addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, vec1, vec1));
   3275          /* defer arg computation to here so as to give CMPEQF as long
   3276             as possible to complete */
   3277          arg = iselVecExpr(env, e->Iex.Unop.arg);
   3278          /* vec0 is all 0s; vec1 is all 1s */
   3279          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3280          /* 16x8 or 8x16 comparison == */
   3281          addInstr(env, X86Instr_SseReRg(cmpOp, vec0, dst));
   3282          /* invert result */
   3283          addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec1, dst));
   3284          return dst;
   3285       }
   3286 
   3287       case Iop_Recip32Fx4: op = Xsse_RCPF;   goto do_32Fx4_unary;
   3288       case Iop_RSqrt32Fx4: op = Xsse_RSQRTF; goto do_32Fx4_unary;
   3289       case Iop_Sqrt32Fx4:  op = Xsse_SQRTF;  goto do_32Fx4_unary;
   3290       do_32Fx4_unary:
   3291       {
   3292          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3293          HReg dst = newVRegV(env);
   3294          addInstr(env, X86Instr_Sse32Fx4(op, arg, dst));
   3295          return dst;
   3296       }
   3297 
   3298       case Iop_Recip64Fx2: op = Xsse_RCPF;   goto do_64Fx2_unary;
   3299       case Iop_RSqrt64Fx2: op = Xsse_RSQRTF; goto do_64Fx2_unary;
   3300       case Iop_Sqrt64Fx2:  op = Xsse_SQRTF;  goto do_64Fx2_unary;
   3301       do_64Fx2_unary:
   3302       {
   3303          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3304          HReg dst = newVRegV(env);
   3305          REQUIRE_SSE2;
   3306          addInstr(env, X86Instr_Sse64Fx2(op, arg, dst));
   3307          return dst;
   3308       }
   3309 
   3310       case Iop_Recip32F0x4: op = Xsse_RCPF;   goto do_32F0x4_unary;
   3311       case Iop_RSqrt32F0x4: op = Xsse_RSQRTF; goto do_32F0x4_unary;
   3312       case Iop_Sqrt32F0x4:  op = Xsse_SQRTF;  goto do_32F0x4_unary;
   3313       do_32F0x4_unary:
   3314       {
   3315          /* A bit subtle.  We have to copy the arg to the result
   3316             register first, because actually doing the SSE scalar insn
   3317             leaves the upper 3/4 of the destination register
   3318             unchanged.  Whereas the required semantics of these
   3319             primops is that the upper 3/4 is simply copied in from the
   3320             argument. */
   3321          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3322          HReg dst = newVRegV(env);
   3323          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3324          addInstr(env, X86Instr_Sse32FLo(op, arg, dst));
   3325          return dst;
   3326       }
   3327 
   3328       case Iop_Recip64F0x2: op = Xsse_RCPF;   goto do_64F0x2_unary;
   3329       case Iop_RSqrt64F0x2: op = Xsse_RSQRTF; goto do_64F0x2_unary;
   3330       case Iop_Sqrt64F0x2:  op = Xsse_SQRTF;  goto do_64F0x2_unary;
   3331       do_64F0x2_unary:
   3332       {
   3333          /* A bit subtle.  We have to copy the arg to the result
   3334             register first, because actually doing the SSE scalar insn
   3335             leaves the upper half of the destination register
   3336             unchanged.  Whereas the required semantics of these
   3337             primops is that the upper half is simply copied in from the
   3338             argument. */
   3339          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3340          HReg dst = newVRegV(env);
   3341          REQUIRE_SSE2;
   3342          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3343          addInstr(env, X86Instr_Sse64FLo(op, arg, dst));
   3344          return dst;
   3345       }
   3346 
   3347       case Iop_32UtoV128: {
   3348          HReg      dst  = newVRegV(env);
   3349          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3350          X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
   3351          addInstr(env, X86Instr_Push(rmi));
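                 /* SseLdzLO(4, ..) fills the low 32 bits of dst from
                    0(%esp) and zeroes the upper 96 -- exactly the
                    32UtoV128 semantics. */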
   3352          addInstr(env, X86Instr_SseLdzLO(4, dst, esp0));
   3353          add_to_esp(env, 4);
   3354          return dst;
   3355       }
   3356 
   3357       case Iop_64UtoV128: {
   3358          HReg      rHi, rLo;
   3359          HReg      dst  = newVRegV(env);
   3360          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3361          iselInt64Expr(&rHi, &rLo, env, e->Iex.Unop.arg);
   3362          addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
   3363          addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
   3364          addInstr(env, X86Instr_SseLdzLO(8, dst, esp0));
   3365          add_to_esp(env, 8);
   3366          return dst;
   3367       }
   3368 
   3369       default:
   3370          break;
   3371    } /* switch (e->Iex.Unop.op) */
   3372    } /* if (e->tag == Iex_Unop) */
   3373 
   3374    if (e->tag == Iex_Binop) {
   3375    switch (e->Iex.Binop.op) {
   3376 
   3377       case Iop_SetV128lo32: {
   3378          HReg dst = newVRegV(env);
   3379          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3380          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3381          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
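                 /* Round-trip via the stack: spill the vector,
                    overwrite its low 32 bits with srcI, and reload the
                    whole 128 bits. */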
   3382          sub_from_esp(env, 16);
   3383          addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
   3384          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcI), esp0));
   3385          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
   3386          add_to_esp(env, 16);
   3387          return dst;
   3388       }
   3389 
   3390       case Iop_SetV128lo64: {
   3391          HReg dst = newVRegV(env);
   3392          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3393          HReg srcIhi, srcIlo;
   3394          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3395          X86AMode* esp4 = advance4(esp0);
   3396          iselInt64Expr(&srcIhi, &srcIlo, env, e->Iex.Binop.arg2);
   3397          sub_from_esp(env, 16);
   3398          addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
   3399          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIlo), esp0));
   3400          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIhi), esp4));
   3401          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
   3402          add_to_esp(env, 16);
   3403          return dst;
   3404       }
   3405 
   3406       case Iop_64HLtoV128: {
   3407          HReg r3, r2, r1, r0;
   3408          X86AMode* esp0  = X86AMode_IR(0, hregX86_ESP());
   3409          X86AMode* esp4  = advance4(esp0);
   3410          X86AMode* esp8  = advance4(esp4);
   3411          X86AMode* esp12 = advance4(esp8);
   3412          HReg dst = newVRegV(env);
   3413          /* do this via the stack (easy, convenient, etc) */
   3414          sub_from_esp(env, 16);
   3415          /* Do the less significant 64 bits */
   3416          iselInt64Expr(&r1, &r0, env, e->Iex.Binop.arg2);
   3417          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r0), esp0));
   3418          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r1), esp4));
   3419          /* Do the more significant 64 bits */
   3420          iselInt64Expr(&r3, &r2, env, e->Iex.Binop.arg1);
   3421          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r2), esp8));
   3422          addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r3), esp12));
   3423          /* Fetch result back from stack. */
   3424          addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
   3425          add_to_esp(env, 16);
   3426          return dst;
   3427       }
   3428 
   3429       case Iop_CmpEQ32Fx4: op = Xsse_CMPEQF; goto do_32Fx4;
   3430       case Iop_CmpLT32Fx4: op = Xsse_CMPLTF; goto do_32Fx4;
   3431       case Iop_CmpLE32Fx4: op = Xsse_CMPLEF; goto do_32Fx4;
   3432       case Iop_CmpUN32Fx4: op = Xsse_CMPUNF; goto do_32Fx4;
   3433       case Iop_Add32Fx4:   op = Xsse_ADDF;   goto do_32Fx4;
   3434       case Iop_Div32Fx4:   op = Xsse_DIVF;   goto do_32Fx4;
   3435       case Iop_Max32Fx4:   op = Xsse_MAXF;   goto do_32Fx4;
   3436       case Iop_Min32Fx4:   op = Xsse_MINF;   goto do_32Fx4;
   3437       case Iop_Mul32Fx4:   op = Xsse_MULF;   goto do_32Fx4;
   3438       case Iop_Sub32Fx4:   op = Xsse_SUBF;   goto do_32Fx4;
   3439       do_32Fx4:
   3440       {
   3441          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3442          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3443          HReg dst = newVRegV(env);
   3444          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3445          addInstr(env, X86Instr_Sse32Fx4(op, argR, dst));
   3446          return dst;
   3447       }
   3448 
   3449       case Iop_CmpEQ64Fx2: op = Xsse_CMPEQF; goto do_64Fx2;
   3450       case Iop_CmpLT64Fx2: op = Xsse_CMPLTF; goto do_64Fx2;
   3451       case Iop_CmpLE64Fx2: op = Xsse_CMPLEF; goto do_64Fx2;
   3452       case Iop_CmpUN64Fx2: op = Xsse_CMPUNF; goto do_64Fx2;
   3453       case Iop_Add64Fx2:   op = Xsse_ADDF;   goto do_64Fx2;
   3454       case Iop_Div64Fx2:   op = Xsse_DIVF;   goto do_64Fx2;
   3455       case Iop_Max64Fx2:   op = Xsse_MAXF;   goto do_64Fx2;
   3456       case Iop_Min64Fx2:   op = Xsse_MINF;   goto do_64Fx2;
   3457       case Iop_Mul64Fx2:   op = Xsse_MULF;   goto do_64Fx2;
   3458       case Iop_Sub64Fx2:   op = Xsse_SUBF;   goto do_64Fx2;
   3459       do_64Fx2:
   3460       {
   3461          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3462          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3463          HReg dst = newVRegV(env);
   3464          REQUIRE_SSE2;
   3465          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3466          addInstr(env, X86Instr_Sse64Fx2(op, argR, dst));
   3467          return dst;
   3468       }
   3469 
   3470       case Iop_CmpEQ32F0x4: op = Xsse_CMPEQF; goto do_32F0x4;
   3471       case Iop_CmpLT32F0x4: op = Xsse_CMPLTF; goto do_32F0x4;
   3472       case Iop_CmpLE32F0x4: op = Xsse_CMPLEF; goto do_32F0x4;
   3473       case Iop_CmpUN32F0x4: op = Xsse_CMPUNF; goto do_32F0x4;
   3474       case Iop_Add32F0x4:   op = Xsse_ADDF;   goto do_32F0x4;
   3475       case Iop_Div32F0x4:   op = Xsse_DIVF;   goto do_32F0x4;
   3476       case Iop_Max32F0x4:   op = Xsse_MAXF;   goto do_32F0x4;
   3477       case Iop_Min32F0x4:   op = Xsse_MINF;   goto do_32F0x4;
   3478       case Iop_Mul32F0x4:   op = Xsse_MULF;   goto do_32F0x4;
   3479       case Iop_Sub32F0x4:   op = Xsse_SUBF;   goto do_32F0x4;
   3480       do_32F0x4: {
   3481          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3482          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3483          HReg dst = newVRegV(env);
   3484          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3485          addInstr(env, X86Instr_Sse32FLo(op, argR, dst));
   3486          return dst;
   3487       }
   3488 
   3489       case Iop_CmpEQ64F0x2: op = Xsse_CMPEQF; goto do_64F0x2;
   3490       case Iop_CmpLT64F0x2: op = Xsse_CMPLTF; goto do_64F0x2;
   3491       case Iop_CmpLE64F0x2: op = Xsse_CMPLEF; goto do_64F0x2;
   3492       case Iop_CmpUN64F0x2: op = Xsse_CMPUNF; goto do_64F0x2;
   3493       case Iop_Add64F0x2:   op = Xsse_ADDF;   goto do_64F0x2;
   3494       case Iop_Div64F0x2:   op = Xsse_DIVF;   goto do_64F0x2;
   3495       case Iop_Max64F0x2:   op = Xsse_MAXF;   goto do_64F0x2;
   3496       case Iop_Min64F0x2:   op = Xsse_MINF;   goto do_64F0x2;
   3497       case Iop_Mul64F0x2:   op = Xsse_MULF;   goto do_64F0x2;
   3498       case Iop_Sub64F0x2:   op = Xsse_SUBF;   goto do_64F0x2;
   3499       do_64F0x2: {
   3500          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3501          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3502          HReg dst = newVRegV(env);
   3503          REQUIRE_SSE2;
   3504          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3505          addInstr(env, X86Instr_Sse64FLo(op, argR, dst));
   3506          return dst;
   3507       }
   3508 
   3509       case Iop_QNarrowBin32Sto16Sx8:
   3510          op = Xsse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
   3511       case Iop_QNarrowBin16Sto8Sx16:
   3512          op = Xsse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
   3513       case Iop_QNarrowBin16Sto8Ux16:
   3514          op = Xsse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
   3515 
   3516       case Iop_InterleaveHI8x16:
   3517          op = Xsse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
   3518       case Iop_InterleaveHI16x8:
   3519          op = Xsse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
   3520       case Iop_InterleaveHI32x4:
   3521          op = Xsse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
   3522       case Iop_InterleaveHI64x2:
   3523          op = Xsse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
   3524 
   3525       case Iop_InterleaveLO8x16:
   3526          op = Xsse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
   3527       case Iop_InterleaveLO16x8:
   3528          op = Xsse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
   3529       case Iop_InterleaveLO32x4:
   3530          op = Xsse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
   3531       case Iop_InterleaveLO64x2:
   3532          op = Xsse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
   3533 
   3534       case Iop_AndV128:    op = Xsse_AND;      goto do_SseReRg;
   3535       case Iop_OrV128:     op = Xsse_OR;       goto do_SseReRg;
   3536       case Iop_XorV128:    op = Xsse_XOR;      goto do_SseReRg;
   3537       case Iop_Add8x16:    op = Xsse_ADD8;     goto do_SseReRg;
   3538       case Iop_Add16x8:    op = Xsse_ADD16;    goto do_SseReRg;
   3539       case Iop_Add32x4:    op = Xsse_ADD32;    goto do_SseReRg;
   3540       case Iop_Add64x2:    op = Xsse_ADD64;    goto do_SseReRg;
   3541       case Iop_QAdd8Sx16:  op = Xsse_QADD8S;   goto do_SseReRg;
   3542       case Iop_QAdd16Sx8:  op = Xsse_QADD16S;  goto do_SseReRg;
   3543       case Iop_QAdd8Ux16:  op = Xsse_QADD8U;   goto do_SseReRg;
   3544       case Iop_QAdd16Ux8:  op = Xsse_QADD16U;  goto do_SseReRg;
   3545       case Iop_Avg8Ux16:   op = Xsse_AVG8U;    goto do_SseReRg;
   3546       case Iop_Avg16Ux8:   op = Xsse_AVG16U;   goto do_SseReRg;
   3547       case Iop_CmpEQ8x16:  op = Xsse_CMPEQ8;   goto do_SseReRg;
   3548       case Iop_CmpEQ16x8:  op = Xsse_CMPEQ16;  goto do_SseReRg;
   3549       case Iop_CmpEQ32x4:  op = Xsse_CMPEQ32;  goto do_SseReRg;
   3550       case Iop_CmpGT8Sx16: op = Xsse_CMPGT8S;  goto do_SseReRg;
   3551       case Iop_CmpGT16Sx8: op = Xsse_CMPGT16S; goto do_SseReRg;
   3552       case Iop_CmpGT32Sx4: op = Xsse_CMPGT32S; goto do_SseReRg;
   3553       case Iop_Max16Sx8:   op = Xsse_MAX16S;   goto do_SseReRg;
   3554       case Iop_Max8Ux16:   op = Xsse_MAX8U;    goto do_SseReRg;
   3555       case Iop_Min16Sx8:   op = Xsse_MIN16S;   goto do_SseReRg;
   3556       case Iop_Min8Ux16:   op = Xsse_MIN8U;    goto do_SseReRg;
   3557       case Iop_MulHi16Ux8: op = Xsse_MULHI16U; goto do_SseReRg;
   3558       case Iop_MulHi16Sx8: op = Xsse_MULHI16S; goto do_SseReRg;
   3559       case Iop_Mul16x8:    op = Xsse_MUL16;    goto do_SseReRg;
   3560       case Iop_Sub8x16:    op = Xsse_SUB8;     goto do_SseReRg;
   3561       case Iop_Sub16x8:    op = Xsse_SUB16;    goto do_SseReRg;
   3562       case Iop_Sub32x4:    op = Xsse_SUB32;    goto do_SseReRg;
   3563       case Iop_Sub64x2:    op = Xsse_SUB64;    goto do_SseReRg;
   3564       case Iop_QSub8Sx16:  op = Xsse_QSUB8S;   goto do_SseReRg;
   3565       case Iop_QSub16Sx8:  op = Xsse_QSUB16S;  goto do_SseReRg;
   3566       case Iop_QSub8Ux16:  op = Xsse_QSUB8U;   goto do_SseReRg;
   3567       case Iop_QSub16Ux8:  op = Xsse_QSUB16U;  goto do_SseReRg;
   3568       do_SseReRg: {
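                 /* arg1isEReg is set for the non-commutative cases
                    (packs and unpacks), where IR arg1 must become the E
                    (second) operand of the x86 insn, so arg2 seeds dst
                    instead. */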
   3569          HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
   3570          HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
   3571          HReg dst = newVRegV(env);
   3572          if (op != Xsse_OR && op != Xsse_AND && op != Xsse_XOR)
   3573             REQUIRE_SSE2;
   3574          if (arg1isEReg) {
   3575             addInstr(env, mk_vMOVsd_RR(arg2, dst));
   3576             addInstr(env, X86Instr_SseReRg(op, arg1, dst));
   3577          } else {
   3578             addInstr(env, mk_vMOVsd_RR(arg1, dst));
   3579             addInstr(env, X86Instr_SseReRg(op, arg2, dst));
   3580          }
   3581          return dst;
   3582       }
   3583 
   3584       case Iop_ShlN16x8: op = Xsse_SHL16; goto do_SseShift;
   3585       case Iop_ShlN32x4: op = Xsse_SHL32; goto do_SseShift;
   3586       case Iop_ShlN64x2: op = Xsse_SHL64; goto do_SseShift;
   3587       case Iop_SarN16x8: op = Xsse_SAR16; goto do_SseShift;
   3588       case Iop_SarN32x4: op = Xsse_SAR32; goto do_SseShift;
   3589       case Iop_ShrN16x8: op = Xsse_SHR16; goto do_SseShift;
   3590       case Iop_ShrN32x4: op = Xsse_SHR32; goto do_SseShift;
   3591       case Iop_ShrN64x2: op = Xsse_SHR64; goto do_SseShift;
   3592       do_SseShift: {
   3593          HReg      greg = iselVecExpr(env, e->Iex.Binop.arg1);
   3594          X86RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   3595          X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
   3596          HReg      ereg = newVRegV(env);
   3597          HReg      dst  = newVRegV(env);
   3598          REQUIRE_SSE2;
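                 /* The SSE2 shift insns take their count from the low
                    64 bits of an XMM register, so build a 128-bit value
                    on the stack: the 32-bit count topped up with twelve
                    bytes of zeroes. */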
   3599          addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
   3600          addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
   3601          addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
   3602          addInstr(env, X86Instr_Push(rmi));
   3603          addInstr(env, X86Instr_SseLdSt(True/*load*/, ereg, esp0));
   3604          addInstr(env, mk_vMOVsd_RR(greg, dst));
   3605          addInstr(env, X86Instr_SseReRg(op, ereg, dst));
   3606          add_to_esp(env, 16);
   3607          return dst;
   3608       }
   3609 
   3610       case Iop_NarrowBin32to16x8:
   3611          fn = (HWord)h_generic_calc_NarrowBin32to16x8;
   3612          goto do_SseAssistedBinary;
   3613       case Iop_NarrowBin16to8x16:
   3614          fn = (HWord)h_generic_calc_NarrowBin16to8x16;
   3615          goto do_SseAssistedBinary;
   3616       do_SseAssistedBinary: {
   3617          /* As with the amd64 case (where this is copied from) we
   3618             generate pretty bad code. */
   3619          vassert(fn != 0);
   3620          HReg dst = newVRegV(env);
   3621          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3622          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3623          HReg argp = newVRegI(env);
   3624          /* subl $112, %esp         -- make a space */
   3625          sub_from_esp(env, 112);
   3626          /* leal 48(%esp), %r_argp  -- point into it */
   3627          addInstr(env, X86Instr_Lea32(X86AMode_IR(48, hregX86_ESP()),
   3628                                       argp));
   3629          /* andl $-16, %r_argp      -- 16-align the pointer */
   3630          addInstr(env, X86Instr_Alu32R(Xalu_AND,
   3631                                        X86RMI_Imm( ~(UInt)15 ),
   3632                                        argp));
   3633          /* Prepare 3 arg regs:
   3634             leal  0(%r_argp), %eax
   3635             leal 16(%r_argp), %edx
   3636             leal 32(%r_argp), %ecx
   3637          */
   3638          addInstr(env, X86Instr_Lea32(X86AMode_IR(0, argp),
   3639                                       hregX86_EAX()));
   3640          addInstr(env, X86Instr_Lea32(X86AMode_IR(16, argp),
   3641                                       hregX86_EDX()));
   3642          addInstr(env, X86Instr_Lea32(X86AMode_IR(32, argp),
   3643                                       hregX86_ECX()));
   3644          /* Store the two args, at (%edx) and (%ecx):
   3645             movupd  %argL, 0(%edx)
   3646             movupd  %argR, 0(%ecx)
   3647          */
   3648          addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argL,
   3649                                         X86AMode_IR(0, hregX86_EDX())));
   3650          addInstr(env, X86Instr_SseLdSt(False/*!isLoad*/, argR,
   3651                                         X86AMode_IR(0, hregX86_ECX())));
   3652          /* call the helper */
   3653          addInstr(env, X86Instr_Call( Xcc_ALWAYS, (Addr32)fn, 3 ));
   3654          /* fetch the result from memory, using %r_argp, which the
   3655             register allocator will keep alive across the call. */
   3656          addInstr(env, X86Instr_SseLdSt(True/*isLoad*/, dst,
   3657                                         X86AMode_IR(0, argp)));
   3658          /* and finally, clear the space */
   3659          add_to_esp(env, 112);
   3660          return dst;
   3661       }
   3662 
   3663       default:
   3664          break;
   3665    } /* switch (e->Iex.Binop.op) */
   3666    } /* if (e->tag == Iex_Binop) */
   3667 
   3668    if (e->tag == Iex_Mux0X) {
   3669       X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
   3670       HReg rX  = iselVecExpr(env, e->Iex.Mux0X.exprX);
   3671       HReg r0  = iselVecExpr(env, e->Iex.Mux0X.expr0);
   3672       HReg dst = newVRegV(env);
   3673       addInstr(env, mk_vMOVsd_RR(rX,dst));
   3674       addInstr(env, X86Instr_Test32(0xFF, r8));
   3675       addInstr(env, X86Instr_SseCMov(Xcc_Z,r0,dst));
   3676       return dst;
   3677    }
   3678 
   3679    vec_fail:
   3680    vex_printf("iselVecExpr (hwcaps = %s): can't reduce\n",
   3681               LibVEX_ppVexHwCaps(VexArchX86,env->hwcaps));
   3682    ppIRExpr(e);
   3683    vpanic("iselVecExpr_wrk");
   3684 
   3685 #  undef REQUIRE_SSE1
   3686 #  undef REQUIRE_SSE2
   3687 #  undef SSE2_OR_ABOVE
   3688 }
   3689 
   3690 
   3691 /*---------------------------------------------------------*/
   3692 /*--- ISEL: Statements                                  ---*/
   3693 /*---------------------------------------------------------*/
   3694 
   3695 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
   3696 {
   3697    if (vex_traceflags & VEX_TRACE_VCODE) {
   3698       vex_printf("\n-- ");
   3699       ppIRStmt(stmt);
   3700       vex_printf("\n");
   3701    }
   3702 
   3703    switch (stmt->tag) {
   3704 
   3705    /* --------- STORE --------- */
   3706    case Ist_Store: {
   3707       IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
   3708       IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
   3709       IREndness end   = stmt->Ist.Store.end;
   3710 
   3711       if (tya != Ity_I32 || end != Iend_LE)
   3712          goto stmt_fail;
   3713 
   3714       if (tyd == Ity_I32) {
   3715          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3716          X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
   3717          addInstr(env, X86Instr_Alu32M(Xalu_MOV,ri,am));
   3718          return;
   3719       }
   3720       if (tyd == Ity_I8 || tyd == Ity_I16) {
   3721          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3722          HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
   3723          addInstr(env, X86Instr_Store( toUChar(tyd==Ity_I8 ? 1 : 2),
   3724                                        r,am ));
   3725          return;
   3726       }
   3727       if (tyd == Ity_F64) {
   3728          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3729          HReg r = iselDblExpr(env, stmt->Ist.Store.data);
   3730          addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, r, am));
   3731          return;
   3732       }
   3733       if (tyd == Ity_F32) {
   3734          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3735          HReg r = iselFltExpr(env, stmt->Ist.Store.data);
   3736          addInstr(env, X86Instr_FpLdSt(False/*store*/, 4, r, am));
   3737          return;
   3738       }
   3739       if (tyd == Ity_I64) {
   3740          HReg vHi, vLo, rA;
   3741          iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Store.data);
   3742          rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
   3743          addInstr(env, X86Instr_Alu32M(
   3744                           Xalu_MOV, X86RI_Reg(vLo), X86AMode_IR(0, rA)));
   3745          addInstr(env, X86Instr_Alu32M(
   3746                           Xalu_MOV, X86RI_Reg(vHi), X86AMode_IR(4, rA)));
   3747          return;
   3748       }
   3749       if (tyd == Ity_V128) {
   3750          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3751          HReg r = iselVecExpr(env, stmt->Ist.Store.data);
   3752          addInstr(env, X86Instr_SseLdSt(False/*store*/, r, am));
   3753          return;
   3754       }
   3755       break;
   3756    }
   3757 
   3758    /* --------- PUT --------- */
   3759    case Ist_Put: {
   3760       IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
   3761       if (ty == Ity_I32) {
   3762          /* We're going to write to memory, so compute the RHS into an
   3763             X86RI. */
   3764          X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
   3765          addInstr(env,
   3766                   X86Instr_Alu32M(
   3767                      Xalu_MOV,
   3768                      ri,
   3769                      X86AMode_IR(stmt->Ist.Put.offset,hregX86_EBP())
   3770                  ));
   3771          return;
   3772       }
   3773       if (ty == Ity_I8 || ty == Ity_I16) {
   3774          HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
   3775          addInstr(env, X86Instr_Store(
   3776                           toUChar(ty==Ity_I8 ? 1 : 2),
   3777                           r,
   3778                           X86AMode_IR(stmt->Ist.Put.offset,
   3779                                       hregX86_EBP())));
   3780          return;
   3781       }
   3782       if (ty == Ity_I64) {
   3783          HReg vHi, vLo;
   3784          X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
   3785          X86AMode* am4 = advance4(am);
   3786          iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Put.data);
   3787          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vLo), am ));
   3788          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vHi), am4 ));
   3789          return;
   3790       }
   3791       if (ty == Ity_V128) {
   3792          HReg      vec = iselVecExpr(env, stmt->Ist.Put.data);
   3793          X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
   3794          addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, am));
   3795          return;
   3796       }
   3797       if (ty == Ity_F32) {
   3798          HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
   3799          X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
   3800          set_FPU_rounding_default(env); /* paranoia */
   3801          addInstr(env, X86Instr_FpLdSt( False/*store*/, 4, f32, am ));
   3802          return;
   3803       }
   3804       if (ty == Ity_F64) {
   3805          HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
   3806          X86AMode* am  = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
   3807          set_FPU_rounding_default(env); /* paranoia */
   3808          addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, f64, am ));
   3809          return;
   3810       }
   3811       break;
   3812    }
   3813 
   3814    /* --------- Indexed PUT --------- */
   3815    case Ist_PutI: {
   3816       X86AMode* am
   3817          = genGuestArrayOffset(
   3818               env, stmt->Ist.PutI.descr,
   3819                    stmt->Ist.PutI.ix, stmt->Ist.PutI.bias );
   3820 
   3821       IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.PutI.data);
   3822       if (ty == Ity_F64) {
   3823          HReg val = iselDblExpr(env, stmt->Ist.PutI.data);
   3824          addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, val, am ));
   3825          return;
   3826       }
   3827       if (ty == Ity_I8) {
   3828          HReg r = iselIntExpr_R(env, stmt->Ist.PutI.data);
   3829          addInstr(env, X86Instr_Store( 1, r, am ));
   3830          return;
   3831       }
   3832       if (ty == Ity_I32) {
   3833          HReg r = iselIntExpr_R(env, stmt->Ist.PutI.data);
   3834          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(r), am ));
   3835          return;
   3836       }
   3837       if (ty == Ity_I64) {
   3838          HReg rHi, rLo;
   3839          X86AMode* am4 = advance4(am);
   3840          iselInt64Expr(&rHi, &rLo, env, stmt->Ist.PutI.data);
   3841          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rLo), am ));
   3842          addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rHi), am4 ));
   3843          return;
   3844       }
   3845       break;
   3846    }
   3847 
   3848    /* --------- TMP --------- */
   3849    case Ist_WrTmp: {
   3850       IRTemp tmp = stmt->Ist.WrTmp.tmp;
   3851       IRType ty = typeOfIRTemp(env->type_env, tmp);
   3852 
    3853       /* optimisation: if stmt->Ist.WrTmp.data is Add32(..,..),
    3854          compute it into an AMode and then use LEA.  This usually
    3855          produces fewer instructions, often because (for
    3856          memcheck-created IR) we get t = address-expression, with
    3857          t later used twice, and so doing this naturally turns the
    3858          address expression back into an X86 amode. */
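               /* For example (a sketch, not real tool output):
                  t = Add32(t7,0x10:I32) can be done as a single
                     leal 0x10(%vr_t7),%vr_t
                  instead of a move followed by an add. */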
   3859       if (ty == Ity_I32
   3860           && stmt->Ist.WrTmp.data->tag == Iex_Binop
   3861           && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add32) {
   3862          X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
   3863          HReg dst = lookupIRTemp(env, tmp);
   3864          if (am->tag == Xam_IR && am->Xam.IR.imm == 0) {
   3865             /* Hmm, iselIntExpr_AMode wimped out and just computed the
   3866                value into a register.  Just emit a normal reg-reg move
   3867                so reg-alloc can coalesce it away in the usual way. */
   3868             HReg src = am->Xam.IR.reg;
   3869             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst));
   3870          } else {
   3871             addInstr(env, X86Instr_Lea32(am,dst));
   3872          }
   3873          return;
   3874       }
   3875 
   3876       if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
   3877          X86RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
   3878          HReg dst = lookupIRTemp(env, tmp);
   3879          addInstr(env, X86Instr_Alu32R(Xalu_MOV,rmi,dst));
   3880          return;
   3881       }
   3882       if (ty == Ity_I64) {
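                  /* 64-bit tmps live in a pair of 32-bit vregs;
                     copy each half into place. */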
   3883          HReg rHi, rLo, dstHi, dstLo;
   3884          iselInt64Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
   3885          lookupIRTemp64( &dstHi, &dstLo, env, tmp);
   3886          addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
   3887          addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
   3888          return;
   3889       }
   3890       if (ty == Ity_I1) {
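                  /* Compute the condition into the flags, then
                     materialise it as 0 or 1 in dst. */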
   3891          X86CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
   3892          HReg dst = lookupIRTemp(env, tmp);
   3893          addInstr(env, X86Instr_Set32(cond, dst));
   3894          return;
   3895       }
   3896       if (ty == Ity_F64) {
   3897          HReg dst = lookupIRTemp(env, tmp);
   3898          HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
   3899          addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
   3900          return;
   3901       }
   3902       if (ty == Ity_F32) {
   3903          HReg dst = lookupIRTemp(env, tmp);
   3904          HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
   3905          addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
   3906          return;
   3907       }
   3908       if (ty == Ity_V128) {
   3909          HReg dst = lookupIRTemp(env, tmp);
   3910          HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
   3911          addInstr(env, mk_vMOVsd_RR(src,dst));
   3912          return;
   3913       }
   3914       break;
   3915    }
   3916 
   3917    /* --------- Call to DIRTY helper --------- */
   3918    case Ist_Dirty: {
   3919       IRType   retty;
   3920       IRDirty* d = stmt->Ist.Dirty.details;
   3921       Bool     passBBP = False;
   3922 
   3923       if (d->nFxState == 0)
   3924          vassert(!d->needsBBP);
   3925 
   3926       passBBP = toBool(d->nFxState > 0 && d->needsBBP);
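               /* needsBBP means the helper wants the baseblock
                  (guest state) pointer passed as a hidden first
                  argument; doHelperCall does that when passBBP
                  is True. */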
   3927 
   3928       /* Marshal args, do the call, clear stack. */
   3929       doHelperCall( env, passBBP, d->guard, d->cee, d->args );
   3930 
   3931       /* Now figure out what to do with the returned value, if any. */
   3932       if (d->tmp == IRTemp_INVALID)
   3933          /* No return value.  Nothing to do. */
   3934          return;
   3935 
   3936       retty = typeOfIRTemp(env->type_env, d->tmp);
   3937       if (retty == Ity_I64) {
   3938          HReg dstHi, dstLo;
   3939          /* The returned value is in %edx:%eax.  Park it in the
   3940             register-pair associated with tmp. */
   3941          lookupIRTemp64( &dstHi, &dstLo, env, d->tmp);
   3942          addInstr(env, mk_iMOVsd_RR(hregX86_EDX(),dstHi) );
   3943          addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dstLo) );
   3944          return;
   3945       }
   3946       if (retty == Ity_I32 || retty == Ity_I16 || retty == Ity_I8) {
   3947          /* The returned value is in %eax.  Park it in the register
   3948             associated with tmp. */
   3949          HReg dst = lookupIRTemp(env, d->tmp);
   3950          addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dst) );
   3951          return;
   3952       }
   3953       break;
   3954    }
   3955 
   3956    /* --------- MEM FENCE --------- */
   3957    case Ist_MBE:
   3958       switch (stmt->Ist.MBE.event) {
   3959          case Imbe_Fence:
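                     /* hwcaps is passed through so the emitter can,
                        presumably, use mfence where SSE2 is available
                        and a locked-instruction fallback otherwise. */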
   3960             addInstr(env, X86Instr_MFence(env->hwcaps));
   3961             return;
   3962          default:
   3963             break;
   3964       }
   3965       break;
   3966 
   3967    /* --------- ACAS --------- */
   3968    case Ist_CAS:
   3969       if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
   3970          /* "normal" singleton CAS */
   3971          UChar  sz;
   3972          IRCAS* cas = stmt->Ist.CAS.details;
   3973          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
   3974          /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
   3975          X86AMode* am = iselIntExpr_AMode(env, cas->addr);
   3976          HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
   3977          HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
   3978          HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
   3979          vassert(cas->expdHi == NULL);
   3980          vassert(cas->dataHi == NULL);
   3981          addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
   3982          addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
   3983          addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
   3984          switch (ty) {
   3985             case Ity_I32: sz = 4; break;
   3986             case Ity_I16: sz = 2; break;
   3987             case Ity_I8:  sz = 1; break;
   3988             default: goto unhandled_cas;
   3989          }
   3990          addInstr(env, X86Instr_ACAS(am, sz));
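                  /* If the compare failed (ZF clear), %eax holds the
                     value actually observed at the address; copy it
                     to rOldLo so oldLo is right in both outcomes.  On
                     success %eax still equals the expected value,
                     which rOldLo already holds. */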
   3991          addInstr(env,
   3992                   X86Instr_CMov32(Xcc_NZ,
   3993                                   X86RM_Reg(hregX86_EAX()), rOldLo));
   3994          return;
   3995       } else {
   3996          /* double CAS */
   3997          IRCAS* cas = stmt->Ist.CAS.details;
   3998          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
   3999          /* only 32-bit allowed in this case */
   4000          /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
   4001          /* get: cas->expdHi into %edx, and cas->dataHi into %ecx */
   4002          X86AMode* am = iselIntExpr_AMode(env, cas->addr);
   4003          HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
   4004          HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
   4005          HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
   4006          HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
   4007          HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
   4008          HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
   4009          if (ty != Ity_I32)
   4010             goto unhandled_cas;
   4011          addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
   4012          addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
   4013          addInstr(env, mk_iMOVsd_RR(rExpdHi, hregX86_EDX()));
   4014          addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
   4015          addInstr(env, mk_iMOVsd_RR(rDataHi, hregX86_ECX()));
   4016          addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
   4017          addInstr(env, X86Instr_DACAS(am));
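                  /* As in the single case: on failure the observed
                     64-bit value is left in %edx:%eax, so copy both
                     halves into the old-value temps. */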
   4018          addInstr(env,
   4019                   X86Instr_CMov32(Xcc_NZ,
   4020                                   X86RM_Reg(hregX86_EDX()), rOldHi));
   4021          addInstr(env,
   4022                   X86Instr_CMov32(Xcc_NZ,
   4023                                   X86RM_Reg(hregX86_EAX()), rOldLo));
   4024          return;
   4025       }
   4026       unhandled_cas:
   4027       break;
   4028 
   4029    /* --------- INSTR MARK --------- */
   4030    /* Doesn't generate any executable code ... */
   4031    case Ist_IMark:
    4032       return;
   4033 
   4034    /* --------- NO-OP --------- */
   4035    /* Fairly self-explanatory, wouldn't you say? */
   4036    case Ist_NoOp:
    4037       return;
   4038 
   4039    /* --------- EXIT --------- */
   4040    case Ist_Exit: {
   4041       X86RI*      dst;
   4042       X86CondCode cc;
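               /* Guest addresses are 32 bits on x86, so the exit
                  target must be a 32-bit constant. */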
   4043       if (stmt->Ist.Exit.dst->tag != Ico_U32)
   4044          vpanic("isel_x86: Ist_Exit: dst is not a 32-bit value");
   4045       dst = iselIntExpr_RI(env, IRExpr_Const(stmt->Ist.Exit.dst));
   4046       cc  = iselCondCode(env,stmt->Ist.Exit.guard);
   4047       addInstr(env, X86Instr_Goto(stmt->Ist.Exit.jk, cc, dst));
   4048       return;
   4049    }
   4050 
   4051    default: break;
   4052    }
   4053   stmt_fail:
   4054    ppIRStmt(stmt);
   4055    vpanic("iselStmt");
   4056 }
   4057 
   4058 
   4059 /*---------------------------------------------------------*/
   4060 /*--- ISEL: Basic block terminators (Nexts)             ---*/
   4061 /*---------------------------------------------------------*/
   4062 
   4063 static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
   4064 {
   4065    X86RI* ri;
   4066    if (vex_traceflags & VEX_TRACE_VCODE) {
   4067       vex_printf("\n-- goto {");
   4068       ppIRJumpKind(jk);
   4069       vex_printf("} ");
   4070       ppIRExpr(next);
   4071       vex_printf("\n");
   4072    }
   4073    ri = iselIntExpr_RI(env, next);
   4074    addInstr(env, X86Instr_Goto(jk, Xcc_ALWAYS,ri));
   4075 }
   4076 
   4077 
   4078 /*---------------------------------------------------------*/
   4079 /*--- Insn selector top-level                           ---*/
   4080 /*---------------------------------------------------------*/
   4081 
   4082 /* Translate an entire SB to x86 code. */
   4083 
   4084 HInstrArray* iselSB_X86 ( IRSB* bb, VexArch      arch_host,
   4085                                     VexArchInfo* archinfo_host,
   4086                                     VexAbiInfo*  vbi/*UNUSED*/ )
   4087 {
   4088    Int      i, j;
   4089    HReg     hreg, hregHI;
   4090    ISelEnv* env;
   4091    UInt     hwcaps_host = archinfo_host->hwcaps;
   4092 
   4093    /* sanity ... */
   4094    vassert(arch_host == VexArchX86);
   4095    vassert(0 == (hwcaps_host
   4096                  & ~(VEX_HWCAPS_X86_SSE1
   4097                      | VEX_HWCAPS_X86_SSE2
   4098                      | VEX_HWCAPS_X86_SSE3
   4099                      | VEX_HWCAPS_X86_LZCNT)));
   4100 
   4101    /* Make up an initial environment to use. */
   4102    env = LibVEX_Alloc(sizeof(ISelEnv));
   4103    env->vreg_ctr = 0;
   4104 
   4105    /* Set up output code array. */
   4106    env->code = newHInstrArray();
   4107 
   4108    /* Copy BB's type env. */
   4109    env->type_env = bb->tyenv;
   4110 
   4111    /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
   4112       change as we go along. */
   4113    env->n_vregmap = bb->tyenv->types_used;
   4114    env->vregmap   = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   4115    env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   4116 
   4117    /* and finally ... */
   4118    env->hwcaps = hwcaps_host;
   4119 
   4120    /* For each IR temporary, allocate a suitably-kinded virtual
   4121       register. */
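            /* Note that an I64 takes two Int32 vregs (high half in
               vregmapHI), and F32 shares the F64 case since the
               backend holds F32 values in 64-bit float registers. */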
   4122    j = 0;
   4123    for (i = 0; i < env->n_vregmap; i++) {
   4124       hregHI = hreg = INVALID_HREG;
   4125       switch (bb->tyenv->types[i]) {
   4126          case Ity_I1:
   4127          case Ity_I8:
   4128          case Ity_I16:
   4129          case Ity_I32:  hreg   = mkHReg(j++, HRcInt32, True); break;
   4130          case Ity_I64:  hreg   = mkHReg(j++, HRcInt32, True);
   4131                         hregHI = mkHReg(j++, HRcInt32, True); break;
   4132          case Ity_F32:
   4133          case Ity_F64:  hreg   = mkHReg(j++, HRcFlt64, True); break;
   4134          case Ity_V128: hreg   = mkHReg(j++, HRcVec128, True); break;
   4135          default: ppIRType(bb->tyenv->types[i]);
   4136                   vpanic("iselBB: IRTemp type");
    4137                   vpanic("iselSB_X86: IRTemp type");
   4138       env->vregmap[i]   = hreg;
   4139       env->vregmapHI[i] = hregHI;
   4140    }
   4141    env->vreg_ctr = j;
   4142 
   4143    /* Ok, finally we can iterate over the statements. */
   4144    for (i = 0; i < bb->stmts_used; i++)
   4145       iselStmt(env,bb->stmts[i]);
   4146 
   4147    iselNext(env,bb->next,bb->jumpkind);
   4148 
   4149    /* record the number of vregs we used. */
   4150    env->code->n_vregs = env->vreg_ctr;
   4151    return env->code;
   4152 }
   4153 
   4154 
   4155 /*---------------------------------------------------------------*/
   4156 /*--- end                                     host_x86_isel.c ---*/
   4157 /*---------------------------------------------------------------*/
   4158