      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                                 host_amd64_isel.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2011 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 #include "libvex_basictypes.h"
     37 #include "libvex_ir.h"
     38 #include "libvex.h"
     39 
     40 #include "ir_match.h"
     41 #include "main_util.h"
     42 #include "main_globals.h"
     43 #include "host_generic_regs.h"
     44 #include "host_generic_simd64.h"
     45 #include "host_generic_simd128.h"
     46 #include "host_amd64_defs.h"
     47 
     48 
     49 /*---------------------------------------------------------*/
     50 /*--- x87/SSE control word stuff                        ---*/
     51 /*---------------------------------------------------------*/
     52 
     53 /* Vex-generated code expects to run with the FPU set as follows: all
     54    exceptions masked, round-to-nearest, precision = 53 bits.  This
     55    corresponds to a FPU control word value of 0x027F.
     56 
     57    Similarly the SSE control word (%mxcsr) should be 0x1F80.
     58 
      59    %fpucw and %mxcsr should have these values on entry to
      60    Vex-generated code, and those values should be unchanged
      61    at exit.
     62 */
     63 
     64 #define DEFAULT_FPUCW 0x027F
     65 
     66 #define DEFAULT_MXCSR 0x1F80
     67 
     68 /* debugging only, do not use */
     69 /* define DEFAULT_FPUCW 0x037F */
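
         /* For reference: in the x87 control word, bits 0..5 are the six
            exception mask bits, bits 8..9 select precision (10b = 53-bit
            mantissa) and bits 10..11 select rounding (00b = nearest), so
            0x027F means all-masked, 53-bit precision, round-to-nearest.
            In %mxcsr, bits 7..12 are the exception masks and bits 13..14
            the rounding control, so 0x1F80 encodes the same policy. */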
     70 
     71 
     72 /*---------------------------------------------------------*/
     73 /*--- misc helpers                                      ---*/
     74 /*---------------------------------------------------------*/
     75 
     76 /* These are duplicated in guest-amd64/toIR.c */
     77 static IRExpr* unop ( IROp op, IRExpr* a )
     78 {
     79    return IRExpr_Unop(op, a);
     80 }
     81 
     82 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
     83 {
     84    return IRExpr_Binop(op, a1, a2);
     85 }
     86 
     87 static IRExpr* bind ( Int binder )
     88 {
     89    return IRExpr_Binder(binder);
     90 }
     91 
     92 
     93 /*---------------------------------------------------------*/
     94 /*--- ISelEnv                                           ---*/
     95 /*---------------------------------------------------------*/
     96 
     97 /* This carries around:
     98 
     99    - A mapping from IRTemp to IRType, giving the type of any IRTemp we
    100      might encounter.  This is computed before insn selection starts,
    101      and does not change.
    102 
    103    - A mapping from IRTemp to HReg.  This tells the insn selector
    104      which virtual register is associated with each IRTemp
    105      temporary.  This is computed before insn selection starts, and
    106      does not change.  We expect this mapping to map precisely the
    107      same set of IRTemps as the type mapping does.
    108 
    109         - vregmap   holds the primary register for the IRTemp.
    110         - vregmapHI is only used for 128-bit integer-typed
    111              IRTemps.  It holds the identity of a second
    112              64-bit virtual HReg, which holds the high half
    113              of the value.
    114 
    115    - The code array, that is, the insns selected so far.
    116 
    117    - A counter, for generating new virtual registers.
    118 
    119    - The host subarchitecture we are selecting insns for.
    120      This is set at the start and does not change.
    121 
    122    Note, this is all host-independent.  (JRS 20050201: well, kinda
    123    ... not completely.  Compare with ISelEnv for X86.)
    124 */
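
         /* For instance, an Ity_I128 IRTemp numbered 5 would be carried in
            two 64-bit vregs: vregmap[5] for the low half and vregmapHI[5]
            for the high half; ordinary 64-bit-or-narrower temps use
            vregmap alone. */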
    125 
    126 typedef
    127    struct {
    128       IRTypeEnv*   type_env;
    129 
    130       HReg*        vregmap;
    131       HReg*        vregmapHI;
    132       Int          n_vregmap;
    133 
    134       HInstrArray* code;
    135 
    136       Int          vreg_ctr;
    137 
    138       UInt         hwcaps;
    139    }
    140    ISelEnv;
    141 
    142 
    143 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
    144 {
    145    vassert(tmp >= 0);
    146    vassert(tmp < env->n_vregmap);
    147    return env->vregmap[tmp];
    148 }
    149 
    150 static void lookupIRTemp128 ( HReg* vrHI, HReg* vrLO,
    151                               ISelEnv* env, IRTemp tmp )
    152 {
    153    vassert(tmp >= 0);
    154    vassert(tmp < env->n_vregmap);
    155    vassert(env->vregmapHI[tmp] != INVALID_HREG);
    156    *vrLO = env->vregmap[tmp];
    157    *vrHI = env->vregmapHI[tmp];
    158 }
    159 
    160 static void addInstr ( ISelEnv* env, AMD64Instr* instr )
    161 {
    162    addHInstr(env->code, instr);
    163    if (vex_traceflags & VEX_TRACE_VCODE) {
    164       ppAMD64Instr(instr, True);
    165       vex_printf("\n");
    166    }
    167 }
    168 
    169 static HReg newVRegI ( ISelEnv* env )
    170 {
    171    HReg reg = mkHReg(env->vreg_ctr, HRcInt64, True/*virtual reg*/);
    172    env->vreg_ctr++;
    173    return reg;
    174 }
    175 
    176 //.. static HReg newVRegF ( ISelEnv* env )
    177 //.. {
    178 //..    HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/);
    179 //..    env->vreg_ctr++;
    180 //..    return reg;
    181 //.. }
    182 
    183 static HReg newVRegV ( ISelEnv* env )
    184 {
    185    HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
    186    env->vreg_ctr++;
    187    return reg;
    188 }
    189 
    190 
    191 /*---------------------------------------------------------*/
    192 /*--- ISEL: Forward declarations                        ---*/
    193 /*---------------------------------------------------------*/
    194 
     195 /* These are organised as iselXXX and iselXXX_wrk pairs.  The
     196    iselXXX_wrk functions do the real work, but are not to be called
     197    directly.  For each XXX, iselXXX calls its iselXXX_wrk counterpart,
     198    then checks that all returned registers are virtual.  You should
     199    not call the _wrk version directly.
    200 */
    201 static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
    202 static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );
    203 
    204 static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, IRExpr* e );
    205 static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, IRExpr* e );
    206 
    207 static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, IRExpr* e );
    208 static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, IRExpr* e );
    209 
    210 static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, IRExpr* e );
    211 static HReg          iselIntExpr_R       ( ISelEnv* env, IRExpr* e );
    212 
    213 static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
    214 static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );
    215 
    216 static void          iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
    217                                           ISelEnv* env, IRExpr* e );
    218 static void          iselInt128Expr     ( HReg* rHi, HReg* rLo,
    219                                           ISelEnv* env, IRExpr* e );
    220 
    221 static AMD64CondCode iselCondCode_wrk    ( ISelEnv* env, IRExpr* e );
    222 static AMD64CondCode iselCondCode        ( ISelEnv* env, IRExpr* e );
    223 
    224 static HReg          iselDblExpr_wrk     ( ISelEnv* env, IRExpr* e );
    225 static HReg          iselDblExpr         ( ISelEnv* env, IRExpr* e );
    226 
    227 static HReg          iselFltExpr_wrk     ( ISelEnv* env, IRExpr* e );
    228 static HReg          iselFltExpr         ( ISelEnv* env, IRExpr* e );
    229 
    230 static HReg          iselVecExpr_wrk     ( ISelEnv* env, IRExpr* e );
    231 static HReg          iselVecExpr         ( ISelEnv* env, IRExpr* e );
    232 
    233 
    234 /*---------------------------------------------------------*/
    235 /*--- ISEL: Misc helpers                                ---*/
    236 /*---------------------------------------------------------*/
    237 
    238 static Bool sane_AMode ( AMD64AMode* am )
    239 {
    240    switch (am->tag) {
    241       case Aam_IR:
    242          return
    243             toBool( hregClass(am->Aam.IR.reg) == HRcInt64
    244                     && (hregIsVirtual(am->Aam.IR.reg)
    245                         || am->Aam.IR.reg == hregAMD64_RBP()) );
    246       case Aam_IRRS:
    247          return
    248             toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
    249                     && hregIsVirtual(am->Aam.IRRS.base)
    250                     && hregClass(am->Aam.IRRS.index) == HRcInt64
    251                     && hregIsVirtual(am->Aam.IRRS.index) );
    252       default:
    253         vpanic("sane_AMode: unknown amd64 amode tag");
    254    }
    255 }
    256 
    257 
    258 /* Can the lower 32 bits be signedly widened to produce the whole
    259    64-bit value?  In other words, are the top 33 bits either all 0 or
    260    all 1 ? */
    261 static Bool fitsIn32Bits ( ULong x )
    262 {
    263    Long y0 = (Long)x;
    264    Long y1 = y0;
    265    y1 <<= 32;
    266    y1 >>=/*s*/ 32;
    267    return toBool(x == y1);
    268 }
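
         /* Example: fitsIn32Bits(0xFFFFFFFF80000000ULL) is True, since
            sign-extending the low 32 bits (0x80000000) recreates the full
            value, whereas fitsIn32Bits(0x0000000080000000ULL) is False. */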
    269 
    270 /* Is this a 64-bit zero expression? */
    271 
    272 static Bool isZeroU64 ( IRExpr* e )
    273 {
    274    return e->tag == Iex_Const
    275           && e->Iex.Const.con->tag == Ico_U64
    276           && e->Iex.Const.con->Ico.U64 == 0ULL;
    277 }
    278 
    279 static Bool isZeroU32 ( IRExpr* e )
    280 {
    281    return e->tag == Iex_Const
    282           && e->Iex.Const.con->tag == Ico_U32
    283           && e->Iex.Const.con->Ico.U32 == 0;
    284 }
    285 
     286 /* Make an int reg-reg move. */
    287 
    288 static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
    289 {
    290    vassert(hregClass(src) == HRcInt64);
    291    vassert(hregClass(dst) == HRcInt64);
    292    return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
    293 }
    294 
    295 /* Make a vector reg-reg move. */
    296 
    297 static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
    298 {
    299    vassert(hregClass(src) == HRcVec128);
    300    vassert(hregClass(dst) == HRcVec128);
    301    return AMD64Instr_SseReRg(Asse_MOV, src, dst);
    302 }
    303 
    304 /* Advance/retreat %rsp by n. */
    305 
    306 static void add_to_rsp ( ISelEnv* env, Int n )
    307 {
    308    vassert(n > 0 && n < 256 && (n%8) == 0);
    309    addInstr(env,
    310             AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
    311                                         hregAMD64_RSP()));
    312 }
    313 
    314 static void sub_from_rsp ( ISelEnv* env, Int n )
    315 {
    316    vassert(n > 0 && n < 256 && (n%8) == 0);
    317    addInstr(env,
    318             AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
    319                                         hregAMD64_RSP()));
    320 }
    321 
    322 /* Push 64-bit constants on the stack. */
    323 static void push_uimm64( ISelEnv* env, ULong uimm64 )
    324 {
    325    /* If uimm64 can be expressed as the sign extension of its
    326       lower 32 bits, we can do it the easy way. */
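            /* For instance, 0xFFFFFFFFFFFFFFFFULL takes the single-push
               path (pushq sign-extends its 32-bit immediate), whereas
               0x8000000000000000ULL must go via a 64-bit immediate load
               into a temporary. */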
    327    Long simm64 = (Long)uimm64;
    328    if ( simm64 == ((simm64 << 32) >> 32) ) {
    329       addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
    330    } else {
    331       HReg tmp = newVRegI(env);
    332       addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
    333       addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
    334    }
    335 }
    336 
    337 //.. /* Given an amode, return one which references 4 bytes further
    338 //..    along. */
    339 //..
    340 //.. static X86AMode* advance4 ( X86AMode* am )
    341 //.. {
    342 //..    X86AMode* am4 = dopyX86AMode(am);
    343 //..    switch (am4->tag) {
    344 //..       case Xam_IRRS:
    345 //..          am4->Xam.IRRS.imm += 4; break;
    346 //..       case Xam_IR:
    347 //..          am4->Xam.IR.imm += 4; break;
    348 //..       default:
    349 //..          vpanic("advance4(x86,host)");
    350 //..    }
    351 //..    return am4;
    352 //.. }
    353 //..
    354 //..
    355 //.. /* Push an arg onto the host stack, in preparation for a call to a
    356 //..    helper function of some kind.  Returns the number of 32-bit words
    357 //..    pushed. */
    358 //..
    359 //.. static Int pushArg ( ISelEnv* env, IRExpr* arg )
    360 //.. {
    361 //..    IRType arg_ty = typeOfIRExpr(env->type_env, arg);
    362 //..    if (arg_ty == Ity_I32) {
    363 //..       addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg)));
    364 //..       return 1;
    365 //..    } else
    366 //..    if (arg_ty == Ity_I64) {
    367 //..       HReg rHi, rLo;
    368 //..       iselInt64Expr(&rHi, &rLo, env, arg);
    369 //..       addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
    370 //..       addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
    371 //..       return 2;
    372 //..    }
    373 //..    ppIRExpr(arg);
    374 //..    vpanic("pushArg(x86): can't handle arg of this type");
    375 //.. }
    376 
    377 
    378 /* Used only in doHelperCall.  If possible, produce a single
    379    instruction which computes 'e' into 'dst'.  If not possible, return
    380    NULL. */
    381 
    382 static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
    383                                                     HReg     dst,
    384                                                     IRExpr*  e )
    385 {
    386    vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
    387 
    388    if (e->tag == Iex_Const) {
    389       vassert(e->Iex.Const.con->tag == Ico_U64);
    390       if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
    391          return AMD64Instr_Alu64R(
    392                    Aalu_MOV,
    393                    AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
    394                    dst
    395                 );
    396       } else {
    397          return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
    398       }
    399    }
    400 
    401    if (e->tag == Iex_RdTmp) {
    402       HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
    403       return mk_iMOVsd_RR(src, dst);
    404    }
    405 
    406    if (e->tag == Iex_Get) {
    407       vassert(e->Iex.Get.ty == Ity_I64);
    408       return AMD64Instr_Alu64R(
    409                 Aalu_MOV,
    410                 AMD64RMI_Mem(
    411                    AMD64AMode_IR(e->Iex.Get.offset,
    412                                  hregAMD64_RBP())),
    413                 dst);
    414    }
    415 
    416    if (e->tag == Iex_Unop
    417        && e->Iex.Unop.op == Iop_32Uto64
    418        && e->Iex.Unop.arg->tag == Iex_RdTmp) {
    419       HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
    420       return AMD64Instr_MovxLQ(False, src, dst);
    421    }
    422 
    423    if (0) { ppIRExpr(e); vex_printf("\n"); }
    424 
    425    return NULL;
    426 }
    427 
    428 
    429 /* Do a complete function call.  guard is a Ity_Bit expression
    430    indicating whether or not the call happens.  If guard==NULL, the
    431    call is unconditional. */
    432 
    433 static
    434 void doHelperCall ( ISelEnv* env,
    435                     Bool passBBP,
    436                     IRExpr* guard, IRCallee* cee, IRExpr** args )
    437 {
    438    AMD64CondCode cc;
    439    HReg          argregs[6];
    440    HReg          tmpregs[6];
    441    AMD64Instr*   fastinstrs[6];
    442    Int           n_args, i, argreg;
    443 
    444    /* Marshal args for a call and do the call.
    445 
    446       If passBBP is True, %rbp (the baseblock pointer) is to be passed
    447       as the first arg.
    448 
    449       This function only deals with a tiny set of possibilities, which
    450       cover all helpers in practice.  The restrictions are that only
    451       arguments in registers are supported, hence only 6x64 integer
    452       bits in total can be passed.  In fact the only supported arg
    453       type is I64.
    454 
    455       Generating code which is both efficient and correct when
    456       parameters are to be passed in registers is difficult, for the
    457       reasons elaborated in detail in comments attached to
    458       doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
    459       of the method described in those comments.
    460 
    461       The problem is split into two cases: the fast scheme and the
    462       slow scheme.  In the fast scheme, arguments are computed
    463       directly into the target (real) registers.  This is only safe
    464       when we can be sure that computation of each argument will not
    465       trash any real registers set by computation of any other
    466       argument.
    467 
    468       In the slow scheme, all args are first computed into vregs, and
    469       once they are all done, they are moved to the relevant real
    470       regs.  This always gives correct code, but it also gives a bunch
    471       of vreg-to-rreg moves which are usually redundant but are hard
    472       for the register allocator to get rid of.
    473 
    474       To decide which scheme to use, all argument expressions are
    475       first examined.  If they are all so simple that it is clear they
    476       will be evaluated without use of any fixed registers, use the
    477       fast scheme, else use the slow scheme.  Note also that only
    478       unconditional calls may use the fast scheme, since having to
    479       compute a condition expression could itself trash real
    480       registers.
    481 
    482       Note this requires being able to examine an expression and
    483       determine whether or not evaluation of it might use a fixed
    484       register.  That requires knowledge of how the rest of this insn
    485       selector works.  Currently just the following 3 are regarded as
    486       safe -- hopefully they cover the majority of arguments in
    487       practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
    488    */
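
            /* Example (illustrative): a call whose args are all of the form
               t17, 0x20:I64 or GET:I64(160) can use the fast scheme, since
               each is computable by a single instruction directly into its
               target register; an arg such as Add64(t17,t18) makes
               iselIntExpr_single_instruction return NULL and so forces the
               slow scheme. */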
    489 
    490    /* Note that the cee->regparms field is meaningless on AMD64 host
    491       (since there is only one calling convention) and so we always
    492       ignore it. */
    493 
    494    n_args = 0;
    495    for (i = 0; args[i]; i++)
    496       n_args++;
    497 
    498    if (6 < n_args + (passBBP ? 1 : 0))
    499       vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
    500 
    501    argregs[0] = hregAMD64_RDI();
    502    argregs[1] = hregAMD64_RSI();
    503    argregs[2] = hregAMD64_RDX();
    504    argregs[3] = hregAMD64_RCX();
    505    argregs[4] = hregAMD64_R8();
    506    argregs[5] = hregAMD64_R9();
    507 
    508    tmpregs[0] = tmpregs[1] = tmpregs[2] =
    509    tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
    510 
    511    fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
    512    fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
    513 
    514    /* First decide which scheme (slow or fast) is to be used.  First
    515       assume the fast scheme, and select slow if any contraindications
    516       (wow) appear. */
    517 
    518    if (guard) {
    519       if (guard->tag == Iex_Const
    520           && guard->Iex.Const.con->tag == Ico_U1
    521           && guard->Iex.Const.con->Ico.U1 == True) {
    522          /* unconditional */
    523       } else {
    524          /* Not manifestly unconditional -- be conservative. */
    525          goto slowscheme;
    526       }
    527    }
    528 
    529    /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
    530       use the slow scheme.  Because this is tentative, we can't call
     531       addInstr (that is, commit to) any instructions until we've
     532       handled all the arguments.  So park the resulting instructions
    533       in a buffer and emit that if we're successful. */
    534 
    535    /* FAST SCHEME */
    536    argreg = 0;
    537    if (passBBP) {
    538       fastinstrs[argreg] = mk_iMOVsd_RR( hregAMD64_RBP(), argregs[argreg]);
    539       argreg++;
    540    }
    541 
    542    for (i = 0; i < n_args; i++) {
    543       vassert(argreg < 6);
    544       vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
    545       fastinstrs[argreg]
    546          = iselIntExpr_single_instruction( env, argregs[argreg], args[i] );
    547       if (fastinstrs[argreg] == NULL)
    548          goto slowscheme;
    549       argreg++;
    550    }
    551 
    552    /* Looks like we're in luck.  Emit the accumulated instructions and
    553       move on to doing the call itself. */
    554    vassert(argreg <= 6);
    555    for (i = 0; i < argreg; i++)
    556       addInstr(env, fastinstrs[i]);
    557 
    558    /* Fast scheme only applies for unconditional calls.  Hence: */
    559    cc = Acc_ALWAYS;
    560 
    561    goto handle_call;
    562 
    563 
    564    /* SLOW SCHEME; move via temporaries */
    565   slowscheme:
    566 #if 0
    567 if (n_args > 0) {for (i = 0; args[i]; i++) {
    568 ppIRExpr(args[i]); vex_printf(" "); }
    569 vex_printf("\n");}
    570 #endif
    571    argreg = 0;
    572 
    573    if (passBBP) {
    574       /* This is pretty stupid; better to move directly to rdi
    575          after the rest of the args are done. */
    576       tmpregs[argreg] = newVRegI(env);
    577       addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[argreg]));
    578       argreg++;
    579    }
    580 
    581    for (i = 0; i < n_args; i++) {
    582       vassert(argreg < 6);
    583       vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
    584       tmpregs[argreg] = iselIntExpr_R(env, args[i]);
    585       argreg++;
    586    }
    587 
    588    /* Now we can compute the condition.  We can't do it earlier
    589       because the argument computations could trash the condition
    590       codes.  Be a bit clever to handle the common case where the
    591       guard is 1:Bit. */
    592    cc = Acc_ALWAYS;
    593    if (guard) {
    594       if (guard->tag == Iex_Const
    595           && guard->Iex.Const.con->tag == Ico_U1
    596           && guard->Iex.Const.con->Ico.U1 == True) {
    597          /* unconditional -- do nothing */
    598       } else {
    599          cc = iselCondCode( env, guard );
    600       }
    601    }
    602 
    603    /* Move the args to their final destinations. */
    604    for (i = 0; i < argreg; i++) {
    605       /* None of these insns, including any spill code that might
    606          be generated, may alter the condition codes. */
    607       addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
    608    }
    609 
    610 
    611    /* Finally, the call itself. */
    612   handle_call:
    613    addInstr(env, AMD64Instr_Call(
    614                     cc,
    615                     Ptr_to_ULong(cee->addr),
    616                     n_args + (passBBP ? 1 : 0)
    617                  )
    618    );
    619 }
    620 
    621 
    622 /* Given a guest-state array descriptor, an index expression and a
    623    bias, generate an AMD64AMode holding the relevant guest state
    624    offset. */
    625 
    626 static
    627 AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
    628                                   IRExpr* off, Int bias )
    629 {
    630    HReg tmp, roff;
    631    Int  elemSz = sizeofIRType(descr->elemTy);
    632    Int  nElems = descr->nElems;
    633 
    634    /* Throw out any cases not generated by an amd64 front end.  In
    635       theory there might be a day where we need to handle them -- if
    636       we ever run non-amd64-guest on amd64 host. */
    637 
    638    if (nElems != 8 || (elemSz != 1 && elemSz != 8))
    639       vpanic("genGuestArrayOffset(amd64 host)");
    640 
    641    /* Compute off into a reg, %off.  Then return:
    642 
    643          movq %off, %tmp
    644          addq $bias, %tmp  (if bias != 0)
     645          andq $7, %tmp
    646          ... base(%rbp, %tmp, shift) ...
    647    */
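            /* Illustratively, for an 8-element array of 8-byte elements the
               amode built below resolves to
                  %rbp + descr->base + 8 * ((off + bias) & 7)
               i.e. the guest-state slot for the wrapped index. */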
    648    tmp  = newVRegI(env);
    649    roff = iselIntExpr_R(env, off);
    650    addInstr(env, mk_iMOVsd_RR(roff, tmp));
    651    if (bias != 0) {
    652       /* Make sure the bias is sane, in the sense that there are
    653          no significant bits above bit 30 in it. */
    654       vassert(-10000 < bias && bias < 10000);
    655       addInstr(env,
    656                AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
    657    }
    658    addInstr(env,
    659             AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
    660    vassert(elemSz == 1 || elemSz == 8);
    661    return
    662       AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
    663                                     elemSz==8 ? 3 : 0);
    664 }
    665 
    666 
    667 /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
    668 static
    669 void set_SSE_rounding_default ( ISelEnv* env )
    670 {
    671    /* pushq $DEFAULT_MXCSR
    672       ldmxcsr 0(%rsp)
    673       addq $8, %rsp
    674    */
    675    AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
    676    addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
    677    addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
    678    add_to_rsp(env, 8);
    679 }
    680 
    681 /* Mess with the FPU's rounding mode: set to the default rounding mode
    682    (DEFAULT_FPUCW). */
    683 static
    684 void set_FPU_rounding_default ( ISelEnv* env )
    685 {
    686    /* movq $DEFAULT_FPUCW, -8(%rsp)
     687       fldcw -8(%rsp)
    688    */
    689    AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
    690    addInstr(env, AMD64Instr_Alu64M(
    691                     Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
    692    addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
    693 }
    694 
    695 
    696 /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
    697    expression denoting a value in the range 0 .. 3, indicating a round
    698    mode encoded as per type IRRoundingMode.  Set the SSE machinery to
    699    have the same rounding.
    700 */
    701 static
    702 void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
    703 {
    704    /* Note: this sequence only makes sense because DEFAULT_MXCSR has
    705       both rounding bits == 0.  If that wasn't the case, we couldn't
    706       create a new rounding field simply by ORing the new value into
    707       place. */
    708 
    709    /* movq $3, %reg
    710       andq [[mode]], %reg  -- shouldn't be needed; paranoia
    711       shlq $13, %reg
    712       orq $DEFAULT_MXCSR, %reg
    713       pushq %reg
     714       ldmxcsr 0(%rsp)
    715       addq $8, %rsp
    716    */
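            /* E.g. (illustrative) mode Irrm_PosINF (2) gives 2 << 13 = 0x4000,
               and 0x4000 | DEFAULT_MXCSR = 0x5F80: round towards +infinity
               with all SSE exceptions still masked. */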
    717    HReg        reg      = newVRegI(env);
    718    AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
    719    addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
    720    addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
    721                                    iselIntExpr_RMI(env, mode), reg));
    722    addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
    723    addInstr(env, AMD64Instr_Alu64R(
    724                     Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
    725    addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
    726    addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
    727    add_to_rsp(env, 8);
    728 }
    729 
    730 
    731 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
    732    expression denoting a value in the range 0 .. 3, indicating a round
    733    mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
    734    the same rounding.
    735 */
    736 static
    737 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
    738 {
    739    HReg rrm  = iselIntExpr_R(env, mode);
    740    HReg rrm2 = newVRegI(env);
    741    AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
    742 
    743    /* movq  %rrm, %rrm2
    744       andq  $3, %rrm2   -- shouldn't be needed; paranoia
    745       shlq  $10, %rrm2
    746       orq   $DEFAULT_FPUCW, %rrm2
    747       movq  %rrm2, -8(%rsp)
     748       fldcw -8(%rsp)
    749    */
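            /* E.g. (illustrative) mode Irrm_ZERO (3) gives 3 << 10 = 0x0C00,
               and 0x0C00 | DEFAULT_FPUCW = 0x0E7F: x87 round-towards-zero
               with all exceptions still masked. */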
    750    addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
    751    addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
    752    addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
    753    addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
    754                                    AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
    755    addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
    756                                    AMD64RI_Reg(rrm2), m8_rsp));
    757    addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
    758 }
    759 
    760 
    761 /* Generate all-zeroes into a new vector register.
    762 */
    763 static HReg generate_zeroes_V128 ( ISelEnv* env )
    764 {
    765    HReg dst = newVRegV(env);
    766    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
    767    return dst;
    768 }
    769 
    770 /* Generate all-ones into a new vector register.
    771 */
    772 static HReg generate_ones_V128 ( ISelEnv* env )
    773 {
    774    HReg dst = newVRegV(env);
    775    addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
    776    return dst;
    777 }
    778 
    779 
    780 /* Generate !src into a new vector register.  Amazing that there isn't
    781    a less crappy way to do this.
    782 */
    783 static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
    784 {
    785    HReg dst = generate_ones_V128(env);
    786    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
    787    return dst;
    788 }
    789 
    790 
    791 /* Expand the given byte into a 64-bit word, by cloning each bit
    792    8 times. */
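         /* Example: bitmask8_to_bytemask64(0x81) == 0xFF000000000000FFULL,
            since bits 0 and 7 of the input select bytes 0 and 7 of the
            result. */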
    793 static ULong bitmask8_to_bytemask64 ( UShort w8 )
    794 {
    795    vassert(w8 == (w8 & 0xFF));
    796    ULong w64 = 0;
    797    Int i;
    798    for (i = 0; i < 8; i++) {
    799       if (w8 & (1<<i))
    800          w64 |= (0xFFULL << (8 * i));
    801    }
    802    return w64;
    803 }
    804 
    805 
    806 //.. /* Round an x87 FPU value to 53-bit-mantissa precision, to be used
    807 //..    after most non-simple FPU operations (simple = +, -, *, / and
    808 //..    sqrt).
    809 //..
    810 //..    This could be done a lot more efficiently if needed, by loading
    811 //..    zero and adding it to the value to be rounded (fldz ; faddp?).
    812 //.. */
    813 //.. static void roundToF64 ( ISelEnv* env, HReg reg )
    814 //.. {
    815 //..    X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
    816 //..    sub_from_esp(env, 8);
    817 //..    addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp));
    818 //..    addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp));
    819 //..    add_to_esp(env, 8);
    820 //.. }
    821 
    822 
    823 /*---------------------------------------------------------*/
    824 /*--- ISEL: Integer expressions (64/32/16/8 bit)        ---*/
    825 /*---------------------------------------------------------*/
    826 
    827 /* Select insns for an integer-typed expression, and add them to the
    828    code list.  Return a reg holding the result.  This reg will be a
    829    virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
    830    want to modify it, ask for a new vreg, copy it in there, and modify
    831    the copy.  The register allocator will do its best to map both
    832    vregs to the same real register, so the copies will often disappear
    833    later in the game.
    834 
    835    This should handle expressions of 64, 32, 16 and 8-bit type.  All
    836    results are returned in a 64-bit register.  For 32-, 16- and 8-bit
     837    expressions, the upper 32/48/56 bits are arbitrary, so you should
    838    mask or sign extend partial values if necessary.
    839 */
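
         /* For instance, a caller that receives an Ity_I16 result and needs
            the full 64-bit value must first mask with 0xFFFF or sign-extend,
            much as the Iop_Shr16 and Iop_Sar16 cases below widen their
            operand before shifting. */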
    840 
    841 static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
    842 {
    843    HReg r = iselIntExpr_R_wrk(env, e);
    844    /* sanity checks ... */
    845 #  if 0
    846    vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
    847 #  endif
    848    vassert(hregClass(r) == HRcInt64);
    849    vassert(hregIsVirtual(r));
    850    return r;
    851 }
    852 
    853 /* DO NOT CALL THIS DIRECTLY ! */
    854 static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
    855 {
    856    /* Used for unary/binary SIMD64 ops. */
    857    HWord fn = 0;
    858    Bool second_is_UInt;
    859 
    860    MatchInfo mi;
    861    DECLARE_PATTERN(p_1Uto8_64to1);
    862    DECLARE_PATTERN(p_LDle8_then_8Uto64);
    863    DECLARE_PATTERN(p_LDle16_then_16Uto64);
    864 
    865    IRType ty = typeOfIRExpr(env->type_env,e);
    866    switch (ty) {
    867       case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
    868       default: vassert(0);
    869    }
    870 
    871    switch (e->tag) {
    872 
    873    /* --------- TEMP --------- */
    874    case Iex_RdTmp: {
    875       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
    876    }
    877 
    878    /* --------- LOAD --------- */
    879    case Iex_Load: {
    880       HReg dst = newVRegI(env);
    881       AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
    882 
    883       /* We can't handle big-endian loads, nor load-linked. */
    884       if (e->Iex.Load.end != Iend_LE)
    885          goto irreducible;
    886 
    887       if (ty == Ity_I64) {
    888          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
    889                                          AMD64RMI_Mem(amode), dst) );
    890          return dst;
    891       }
    892       if (ty == Ity_I32) {
    893          addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
    894          return dst;
    895       }
    896       if (ty == Ity_I16) {
    897          addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
    898          return dst;
    899       }
    900       if (ty == Ity_I8) {
    901          addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
    902          return dst;
    903       }
    904       break;
    905    }
    906 
    907    /* --------- BINARY OP --------- */
    908    case Iex_Binop: {
    909       AMD64AluOp   aluOp;
    910       AMD64ShiftOp shOp;
    911 
    912       /* Pattern: Sub64(0,x) */
    913       /*     and: Sub32(0,x) */
    914       if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
    915           || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
    916          HReg dst = newVRegI(env);
    917          HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
    918          addInstr(env, mk_iMOVsd_RR(reg,dst));
    919          addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
    920          return dst;
    921       }
    922 
    923       /* Is it an addition or logical style op? */
    924       switch (e->Iex.Binop.op) {
    925          case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
    926             aluOp = Aalu_ADD; break;
    927          case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
    928             aluOp = Aalu_SUB; break;
    929          case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
    930             aluOp = Aalu_AND; break;
    931          case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64:
    932             aluOp = Aalu_OR; break;
    933          case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
    934             aluOp = Aalu_XOR; break;
    935          case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
    936             aluOp = Aalu_MUL; break;
    937          default:
    938             aluOp = Aalu_INVALID; break;
    939       }
    940       /* For commutative ops we assume any literal
    941          values are on the second operand. */
    942       if (aluOp != Aalu_INVALID) {
    943          HReg dst      = newVRegI(env);
    944          HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
    945          AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
    946          addInstr(env, mk_iMOVsd_RR(reg,dst));
    947          addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
    948          return dst;
    949       }
    950 
    951       /* Perhaps a shift op? */
    952       switch (e->Iex.Binop.op) {
    953          case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
    954             shOp = Ash_SHL; break;
    955          case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
    956             shOp = Ash_SHR; break;
    957          case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
    958             shOp = Ash_SAR; break;
    959          default:
    960             shOp = Ash_INVALID; break;
    961       }
    962       if (shOp != Ash_INVALID) {
    963          HReg dst = newVRegI(env);
    964 
    965          /* regL = the value to be shifted */
    966          HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
    967          addInstr(env, mk_iMOVsd_RR(regL,dst));
    968 
    969          /* Do any necessary widening for 32/16/8 bit operands */
    970          switch (e->Iex.Binop.op) {
    971             case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
    972                break;
    973             case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
    974                break;
    975             case Iop_Shr8:
    976                addInstr(env, AMD64Instr_Alu64R(
    977                                 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
    978                break;
    979             case Iop_Shr16:
    980                addInstr(env, AMD64Instr_Alu64R(
    981                                 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
    982                break;
    983             case Iop_Shr32:
    984                addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
    985                break;
    986             case Iop_Sar8:
    987                addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
    988                addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
    989                break;
    990             case Iop_Sar16:
    991                addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
    992                addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
    993                break;
    994             case Iop_Sar32:
    995                addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
    996                break;
    997             default:
    998                ppIROp(e->Iex.Binop.op);
    999                vassert(0);
   1000          }
   1001 
   1002          /* Now consider the shift amount.  If it's a literal, we
   1003             can do a much better job than the general case. */
   1004          if (e->Iex.Binop.arg2->tag == Iex_Const) {
   1005             /* assert that the IR is well-typed */
   1006             Int nshift;
   1007             vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
   1008             nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
   1009             vassert(nshift >= 0);
   1010             if (nshift > 0)
   1011                /* Can't allow nshift==0 since that means %cl */
   1012                addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
   1013          } else {
   1014             /* General case; we have to force the amount into %cl. */
   1015             HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1016             addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
   1017             addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
   1018          }
   1019          return dst;
   1020       }
   1021 
   1022       /* Deal with 64-bit SIMD binary ops */
   1023       second_is_UInt = False;
   1024       switch (e->Iex.Binop.op) {
   1025          case Iop_Add8x8:
   1026             fn = (HWord)h_generic_calc_Add8x8; break;
   1027          case Iop_Add16x4:
   1028             fn = (HWord)h_generic_calc_Add16x4; break;
   1029          case Iop_Add32x2:
   1030             fn = (HWord)h_generic_calc_Add32x2; break;
   1031 
   1032          case Iop_Avg8Ux8:
   1033             fn = (HWord)h_generic_calc_Avg8Ux8; break;
   1034          case Iop_Avg16Ux4:
   1035             fn = (HWord)h_generic_calc_Avg16Ux4; break;
   1036 
   1037          case Iop_CmpEQ8x8:
   1038             fn = (HWord)h_generic_calc_CmpEQ8x8; break;
   1039          case Iop_CmpEQ16x4:
   1040             fn = (HWord)h_generic_calc_CmpEQ16x4; break;
   1041          case Iop_CmpEQ32x2:
   1042             fn = (HWord)h_generic_calc_CmpEQ32x2; break;
   1043 
   1044          case Iop_CmpGT8Sx8:
   1045             fn = (HWord)h_generic_calc_CmpGT8Sx8; break;
   1046          case Iop_CmpGT16Sx4:
   1047             fn = (HWord)h_generic_calc_CmpGT16Sx4; break;
   1048          case Iop_CmpGT32Sx2:
   1049             fn = (HWord)h_generic_calc_CmpGT32Sx2; break;
   1050 
   1051          case Iop_InterleaveHI8x8:
   1052             fn = (HWord)h_generic_calc_InterleaveHI8x8; break;
   1053          case Iop_InterleaveLO8x8:
   1054             fn = (HWord)h_generic_calc_InterleaveLO8x8; break;
   1055          case Iop_InterleaveHI16x4:
   1056             fn = (HWord)h_generic_calc_InterleaveHI16x4; break;
   1057          case Iop_InterleaveLO16x4:
   1058             fn = (HWord)h_generic_calc_InterleaveLO16x4; break;
   1059          case Iop_InterleaveHI32x2:
   1060             fn = (HWord)h_generic_calc_InterleaveHI32x2; break;
   1061          case Iop_InterleaveLO32x2:
   1062             fn = (HWord)h_generic_calc_InterleaveLO32x2; break;
   1063          case Iop_CatOddLanes16x4:
   1064             fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
   1065          case Iop_CatEvenLanes16x4:
   1066             fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
   1067          case Iop_Perm8x8:
   1068             fn = (HWord)h_generic_calc_Perm8x8; break;
   1069 
   1070          case Iop_Max8Ux8:
   1071             fn = (HWord)h_generic_calc_Max8Ux8; break;
   1072          case Iop_Max16Sx4:
   1073             fn = (HWord)h_generic_calc_Max16Sx4; break;
   1074          case Iop_Min8Ux8:
   1075             fn = (HWord)h_generic_calc_Min8Ux8; break;
   1076          case Iop_Min16Sx4:
   1077             fn = (HWord)h_generic_calc_Min16Sx4; break;
   1078 
   1079          case Iop_Mul16x4:
   1080             fn = (HWord)h_generic_calc_Mul16x4; break;
   1081          case Iop_Mul32x2:
   1082             fn = (HWord)h_generic_calc_Mul32x2; break;
   1083          case Iop_MulHi16Sx4:
   1084             fn = (HWord)h_generic_calc_MulHi16Sx4; break;
   1085          case Iop_MulHi16Ux4:
   1086             fn = (HWord)h_generic_calc_MulHi16Ux4; break;
   1087 
   1088          case Iop_QAdd8Sx8:
   1089             fn = (HWord)h_generic_calc_QAdd8Sx8; break;
   1090          case Iop_QAdd16Sx4:
   1091             fn = (HWord)h_generic_calc_QAdd16Sx4; break;
   1092          case Iop_QAdd8Ux8:
   1093             fn = (HWord)h_generic_calc_QAdd8Ux8; break;
   1094          case Iop_QAdd16Ux4:
   1095             fn = (HWord)h_generic_calc_QAdd16Ux4; break;
   1096 
   1097          case Iop_QNarrowBin32Sto16Sx4:
   1098             fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
   1099          case Iop_QNarrowBin16Sto8Sx8:
   1100             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
   1101          case Iop_QNarrowBin16Sto8Ux8:
   1102             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
   1103          case Iop_NarrowBin16to8x8:
   1104             fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
   1105          case Iop_NarrowBin32to16x4:
   1106             fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
   1107 
   1108          case Iop_QSub8Sx8:
   1109             fn = (HWord)h_generic_calc_QSub8Sx8; break;
   1110          case Iop_QSub16Sx4:
   1111             fn = (HWord)h_generic_calc_QSub16Sx4; break;
   1112          case Iop_QSub8Ux8:
   1113             fn = (HWord)h_generic_calc_QSub8Ux8; break;
   1114          case Iop_QSub16Ux4:
   1115             fn = (HWord)h_generic_calc_QSub16Ux4; break;
   1116 
   1117          case Iop_Sub8x8:
   1118             fn = (HWord)h_generic_calc_Sub8x8; break;
   1119          case Iop_Sub16x4:
   1120             fn = (HWord)h_generic_calc_Sub16x4; break;
   1121          case Iop_Sub32x2:
   1122             fn = (HWord)h_generic_calc_Sub32x2; break;
   1123 
   1124          case Iop_ShlN32x2:
   1125             fn = (HWord)h_generic_calc_ShlN32x2;
   1126             second_is_UInt = True;
   1127             break;
   1128          case Iop_ShlN16x4:
   1129             fn = (HWord)h_generic_calc_ShlN16x4;
   1130             second_is_UInt = True;
   1131             break;
   1132          case Iop_ShlN8x8:
   1133             fn = (HWord)h_generic_calc_ShlN8x8;
   1134             second_is_UInt = True;
   1135             break;
   1136          case Iop_ShrN32x2:
   1137             fn = (HWord)h_generic_calc_ShrN32x2;
   1138             second_is_UInt = True;
   1139             break;
   1140          case Iop_ShrN16x4:
   1141             fn = (HWord)h_generic_calc_ShrN16x4;
   1142             second_is_UInt = True;
   1143             break;
   1144          case Iop_SarN32x2:
   1145             fn = (HWord)h_generic_calc_SarN32x2;
   1146             second_is_UInt = True;
   1147             break;
   1148          case Iop_SarN16x4:
   1149             fn = (HWord)h_generic_calc_SarN16x4;
   1150             second_is_UInt = True;
   1151             break;
   1152          case Iop_SarN8x8:
   1153             fn = (HWord)h_generic_calc_SarN8x8;
   1154             second_is_UInt = True;
   1155             break;
   1156 
   1157          default:
   1158             fn = (HWord)0; break;
   1159       }
   1160       if (fn != (HWord)0) {
   1161          /* Note: the following assumes all helpers are of signature
   1162                ULong fn ( ULong, ULong ), and they are
   1163             not marked as regparm functions.
   1164          */
   1165          HReg dst  = newVRegI(env);
   1166          HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1167          HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1168          if (second_is_UInt)
   1169             addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
   1170          addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
   1171          addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
   1172          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2 ));
   1173          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
   1174          return dst;
   1175       }
   1176 
   1177       /* Handle misc other ops. */
   1178 
   1179       if (e->Iex.Binop.op == Iop_Max32U) {
   1180          HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1181          HReg dst  = newVRegI(env);
   1182          HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1183          addInstr(env, mk_iMOVsd_RR(src1, dst));
   1184          addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
   1185          addInstr(env, AMD64Instr_CMov64(Acc_B, AMD64RM_Reg(src2), dst));
   1186          return dst;
   1187       }
   1188 
   1189       if (e->Iex.Binop.op == Iop_DivModS64to32
   1190           || e->Iex.Binop.op == Iop_DivModU64to32) {
   1191          /* 64 x 32 -> (32(rem),32(div)) division */
   1192          /* Get the 64-bit operand into edx:eax, and the other into
   1193             any old R/M. */
   1194          HReg      rax     = hregAMD64_RAX();
   1195          HReg      rdx     = hregAMD64_RDX();
   1196          HReg      dst     = newVRegI(env);
   1197          Bool      syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
   1198          AMD64RM*  rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
   1199          /* Compute the left operand into a reg, and then
   1200             put the top half in edx and the bottom in eax. */
   1201          HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1202          addInstr(env, mk_iMOVsd_RR(left64, rdx));
   1203          addInstr(env, mk_iMOVsd_RR(left64, rax));
   1204          addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
   1205          addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
    1206          addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
    1207          addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
   1208          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
   1209          addInstr(env, mk_iMOVsd_RR(rax, dst));
   1210          addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
   1211          return dst;
   1212       }
   1213 
   1214       if (e->Iex.Binop.op == Iop_32HLto64) {
   1215          HReg hi32  = newVRegI(env);
   1216          HReg lo32  = newVRegI(env);
   1217          HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1218          HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1219          addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
   1220          addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
   1221          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
    1222          addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
   1223          addInstr(env, AMD64Instr_Alu64R(
   1224                           Aalu_OR, AMD64RMI_Reg(lo32), hi32));
   1225          return hi32;
   1226       }
   1227 
   1228       if (e->Iex.Binop.op == Iop_16HLto32) {
   1229          HReg hi16  = newVRegI(env);
   1230          HReg lo16  = newVRegI(env);
   1231          HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1232          HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1233          addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
   1234          addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
   1235          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
   1236          addInstr(env, AMD64Instr_Alu64R(
   1237                           Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
   1238          addInstr(env, AMD64Instr_Alu64R(
   1239                           Aalu_OR, AMD64RMI_Reg(lo16), hi16));
   1240          return hi16;
   1241       }
   1242 
   1243       if (e->Iex.Binop.op == Iop_8HLto16) {
   1244          HReg hi8  = newVRegI(env);
   1245          HReg lo8  = newVRegI(env);
   1246          HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1247          HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1248          addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
   1249          addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
   1250          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
   1251          addInstr(env, AMD64Instr_Alu64R(
   1252                           Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
   1253          addInstr(env, AMD64Instr_Alu64R(
   1254                           Aalu_OR, AMD64RMI_Reg(lo8), hi8));
   1255          return hi8;
   1256       }
   1257 
   1258       if (e->Iex.Binop.op == Iop_MullS32
   1259           || e->Iex.Binop.op == Iop_MullS16
   1260           || e->Iex.Binop.op == Iop_MullS8
   1261           || e->Iex.Binop.op == Iop_MullU32
   1262           || e->Iex.Binop.op == Iop_MullU16
   1263           || e->Iex.Binop.op == Iop_MullU8) {
   1264          HReg a32   = newVRegI(env);
   1265          HReg b32   = newVRegI(env);
   1266          HReg a32s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1267          HReg b32s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1268          Int          shift  = 0;
   1269          AMD64ShiftOp shr_op = Ash_SHR;
   1270          switch (e->Iex.Binop.op) {
   1271             case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
   1272             case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
   1273             case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
   1274             case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
   1275             case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
   1276             case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
   1277             default: vassert(0);
   1278          }
   1279 
   1280          addInstr(env, mk_iMOVsd_RR(a32s, a32));
   1281          addInstr(env, mk_iMOVsd_RR(b32s, b32));
   1282          addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
   1283          addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
   1284          addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
   1285          addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
   1286          addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
   1287          return b32;
   1288       }
   1289 
   1290       if (e->Iex.Binop.op == Iop_CmpF64) {
   1291          HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
   1292          HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
   1293          HReg dst = newVRegI(env);
   1294          addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
   1295          /* Mask out irrelevant parts of the result so as to conform
   1296             to the CmpF64 definition. */
   1297          addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
   1298          return dst;
   1299       }
   1300 
   1301       if (e->Iex.Binop.op == Iop_F64toI32S
   1302           || e->Iex.Binop.op == Iop_F64toI64S) {
   1303          Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
   1304          HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
   1305          HReg dst = newVRegI(env);
   1306          set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
   1307          addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
   1308          set_SSE_rounding_default(env);
   1309          return dst;
   1310       }
   1311 
   1312 //..       if (e->Iex.Binop.op == Iop_F64toI32 || e->Iex.Binop.op == Iop_F64toI16) {
   1313 //..          Int  sz  = e->Iex.Binop.op == Iop_F64toI16 ? 2 : 4;
   1314 //..          HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
   1315 //..          HReg dst = newVRegI(env);
   1316 //..
   1317 //..          /* Used several times ... */
   1318 //..          X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   1319 //..
   1320 //..          /* rf now holds the value to be converted, and rrm holds the
   1321 //.. 	    rounding mode value, encoded as per the IRRoundingMode
   1322 //.. 	    enum.  The first thing to do is set the FPU's rounding
   1323 //.. 	    mode accordingly. */
   1324 //..
   1325 //..          /* Create a space for the format conversion. */
   1326 //..          /* subl $4, %esp */
   1327 //..          sub_from_esp(env, 4);
   1328 //..
   1329 //.. 	 /* Set host rounding mode */
   1330 //.. 	 set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   1331 //..
   1332 //..          /* gistw/l %rf, 0(%esp) */
   1333 //..          addInstr(env, X86Instr_FpLdStI(False/*store*/, sz, rf, zero_esp));
   1334 //..
   1335 //..          if (sz == 2) {
   1336 //..             /* movzwl 0(%esp), %dst */
   1337 //..             addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst));
   1338 //..          } else {
   1339 //..             /* movl 0(%esp), %dst */
   1340 //..             vassert(sz == 4);
   1341 //..             addInstr(env, X86Instr_Alu32R(
   1342 //..                              Xalu_MOV, X86RMI_Mem(zero_esp), dst));
   1343 //..          }
   1344 //..
   1345 //.. 	 /* Restore default FPU rounding. */
   1346 //..          set_FPU_rounding_default( env );
   1347 //..
   1348 //..          /* addl $4, %esp */
   1349 //.. 	 add_to_esp(env, 4);
   1350 //..          return dst;
   1351 //..       }
   1352 //..
   1353 //..       /* C3210 flags following FPU partial remainder (fprem), both
   1354 //..          IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
   1355 //..       if (e->Iex.Binop.op == Iop_PRemC3210F64
   1356 //..           || e->Iex.Binop.op == Iop_PRem1C3210F64) {
   1357 //..          HReg junk = newVRegF(env);
   1358 //..          HReg dst  = newVRegI(env);
   1359 //..          HReg srcL = iselDblExpr(env, e->Iex.Binop.arg1);
   1360 //..          HReg srcR = iselDblExpr(env, e->Iex.Binop.arg2);
   1361 //..          addInstr(env, X86Instr_FpBinary(
   1362 //..                            e->Iex.Binop.op==Iop_PRemC3210F64
   1363 //..                               ? Xfp_PREM : Xfp_PREM1,
   1364 //..                            srcL,srcR,junk
   1365 //..                  ));
   1366 //..          /* The previous pseudo-insn will have left the FPU's C3210
   1367 //..             flags set correctly.  So bag them. */
   1368 //..          addInstr(env, X86Instr_FpStSW_AX());
   1369 //..          addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
   1370 //.. 	 addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
   1371 //..          return dst;
   1372 //..       }
   1373 
   1374       break;
   1375    }
   1376 
   1377    /* --------- UNARY OP --------- */
   1378    case Iex_Unop: {
   1379 
   1380       /* 1Uto8(64to1(expr64)) */
   1381       {
   1382          DEFINE_PATTERN( p_1Uto8_64to1,
   1383                          unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
   1384          if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
   1385             IRExpr* expr64 = mi.bindee[0];
   1386             HReg    dst    = newVRegI(env);
   1387             HReg    src    = iselIntExpr_R(env, expr64);
   1388             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1389             addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   1390                                             AMD64RMI_Imm(1), dst));
   1391             return dst;
   1392          }
   1393       }
   1394 
   1395       /* 8Uto64(LDle(expr64)) */
   1396       {
   1397          DEFINE_PATTERN(p_LDle8_then_8Uto64,
   1398                         unop(Iop_8Uto64,
   1399                              IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
   1400          if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
   1401             HReg dst = newVRegI(env);
   1402             AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
   1403             addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
   1404             return dst;
   1405          }
   1406       }
   1407 
   1408       /* 16Uto64(LDle(expr64)) */
   1409       {
   1410          DEFINE_PATTERN(p_LDle16_then_16Uto64,
   1411                         unop(Iop_16Uto64,
   1412                              IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
   1413          if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
   1414             HReg dst = newVRegI(env);
   1415             AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
   1416             addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
   1417             return dst;
   1418          }
   1419       }
   1420 
   1421       /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
   1422          Use 32 bit arithmetic and let the default zero-extend rule
   1423          do the 32Uto64 for free. */
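              /* (On amd64, a 32-bit ALU operation already zeroes bits 63:32
                 of its destination register, which is what makes the
                 zero-extension free.) */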
   1424       if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
   1425          IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
   1426          IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
   1427          IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
   1428          AMD64AluOp aluOp = Aalu_INVALID;
   1429          switch (opi) {
   1430             case Iop_Add32: aluOp = Aalu_ADD; break;
   1431             case Iop_Sub32: aluOp = Aalu_SUB; break;
   1432             case Iop_And32: aluOp = Aalu_AND; break;
   1433             case Iop_Or32:  aluOp = Aalu_OR;  break;
   1434             case Iop_Xor32: aluOp = Aalu_XOR; break;
   1435             default: break;
   1436          }
   1437          if (aluOp != Aalu_INVALID) {
   1438             /* For commutative ops we assume any literal values are on
   1439                the second operand. */
   1440             HReg dst      = newVRegI(env);
   1441             HReg reg      = iselIntExpr_R(env, argL);
   1442             AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
   1443             addInstr(env, mk_iMOVsd_RR(reg,dst));
   1444             addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
   1445             return dst;
   1446          }
   1447          /* just fall through to normal handling for Iop_32Uto64 */
   1448       }
   1449 
   1450       /* Fallback cases */
   1451       switch (e->Iex.Unop.op) {
   1452          case Iop_32Uto64:
   1453          case Iop_32Sto64: {
   1454             HReg dst = newVRegI(env);
   1455             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1456             addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
   1457                                             src, dst) );
   1458             return dst;
   1459          }
   1460          case Iop_128HIto64: {
   1461             HReg rHi, rLo;
   1462             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
   1463             return rHi; /* and abandon rLo */
   1464          }
   1465          case Iop_128to64: {
   1466             HReg rHi, rLo;
   1467             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
   1468             return rLo; /* and abandon rHi */
   1469          }
   1470          case Iop_8Uto16:
   1471          case Iop_8Uto32:
   1472          case Iop_8Uto64:
   1473          case Iop_16Uto64:
   1474          case Iop_16Uto32: {
   1475             HReg dst     = newVRegI(env);
   1476             HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
   1477             Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
   1478                                    || e->Iex.Unop.op==Iop_16Uto64 );
   1479             UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
   1480             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1481             addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   1482                                             AMD64RMI_Imm(mask), dst));
   1483             return dst;
   1484          }
   1485          case Iop_8Sto16:
   1486          case Iop_8Sto64:
   1487          case Iop_8Sto32:
   1488          case Iop_16Sto32:
   1489          case Iop_16Sto64: {
   1490             HReg dst     = newVRegI(env);
   1491             HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
   1492             Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
   1493                                    || e->Iex.Unop.op==Iop_16Sto64 );
   1494             UInt amt     = srcIs16 ? 48 : 56;
   1495             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1496             addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
   1497             addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
   1498             return dst;
   1499          }
   1500          case Iop_Not8:
   1501          case Iop_Not16:
   1502          case Iop_Not32:
   1503          case Iop_Not64: {
   1504             HReg dst = newVRegI(env);
   1505             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1506             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1507             addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
   1508             return dst;
   1509          }
   1510 //..          case Iop_64HIto32: {
   1511 //..             HReg rHi, rLo;
   1512 //..             iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
   1513 //..             return rHi; /* and abandon rLo .. poor wee thing :-) */
   1514 //..          }
   1515 //..          case Iop_64to32: {
   1516 //..             HReg rHi, rLo;
   1517 //..             iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
   1518 //..             return rLo; /* similar stupid comment to the above ... */
   1519 //..          }
   1520          case Iop_16HIto8:
   1521          case Iop_32HIto16:
   1522          case Iop_64HIto32: {
   1523             HReg dst  = newVRegI(env);
   1524             HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
   1525             Int shift = 0;
   1526             switch (e->Iex.Unop.op) {
   1527                case Iop_16HIto8:  shift = 8;  break;
   1528                case Iop_32HIto16: shift = 16; break;
   1529                case Iop_64HIto32: shift = 32; break;
   1530                default: vassert(0);
   1531             }
   1532             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1533             addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
   1534             return dst;
   1535          }
   1536          case Iop_1Uto64:
   1537          case Iop_1Uto32:
   1538          case Iop_1Uto8: {
   1539             HReg dst           = newVRegI(env);
   1540             AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
   1541             addInstr(env, AMD64Instr_Set64(cond,dst));
   1542             return dst;
   1543          }
   1544          case Iop_1Sto8:
   1545          case Iop_1Sto16:
   1546          case Iop_1Sto32:
   1547          case Iop_1Sto64: {
   1548             /* could do better than this, but for now ... */
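                    /* Set64 materialises the condition as 0 or 1; shifting
                       left and then arithmetically right by 63 smears bit 0
                       across the register, i.e. sign-extends the 1-bit value
                       to 0 or all-ones. */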
   1549             HReg dst           = newVRegI(env);
   1550             AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
   1551             addInstr(env, AMD64Instr_Set64(cond,dst));
   1552             addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
   1553             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
   1554             return dst;
   1555          }
   1556          case Iop_Ctz64: {
   1557             /* Count trailing zeroes, implemented by amd64 'bsfq' */
   1558             HReg dst = newVRegI(env);
   1559             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1560             addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
   1561             return dst;
   1562          }
   1563          case Iop_Clz64: {
   1564             /* Count leading zeroes.  Do 'bsrq' to establish the index
   1565                of the highest set bit, and subtract that value from
   1566                63. */
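                    /* Roughly:  bsrq %src,%tmp ; movq $63,%dst ; subq %tmp,%dst */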
   1567             HReg tmp = newVRegI(env);
   1568             HReg dst = newVRegI(env);
   1569             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1570             addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
   1571             addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
   1572                                             AMD64RMI_Imm(63), dst));
   1573             addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
   1574                                             AMD64RMI_Reg(tmp), dst));
   1575             return dst;
   1576          }
   1577 
   1578          case Iop_CmpwNEZ64: {
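                    /* Computes (src | -src) >>s 63.  For nonzero src the top
                       bit of (src | -src) is set, so the arithmetic shift
                       gives all-ones; for zero src it gives 0, which is the
                       required CmpwNEZ64 result. */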
   1579             HReg dst = newVRegI(env);
   1580             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1581             addInstr(env, mk_iMOVsd_RR(src,dst));
   1582             addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
   1583             addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
   1584                                             AMD64RMI_Reg(src), dst));
   1585             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
   1586             return dst;
   1587          }
   1588 
   1589          case Iop_CmpwNEZ32: {
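                    /* Same (x | -x) >>s 63 trick as CmpwNEZ64, applied after
                       zero-extending the 32-bit value so that the top-bit
                       reasoning still holds in 64 bits. */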
   1590             HReg src = newVRegI(env);
   1591             HReg dst = newVRegI(env);
   1592             HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
   1593             addInstr(env, mk_iMOVsd_RR(pre,src));
   1594             addInstr(env, AMD64Instr_MovxLQ(False, src, src));
   1595             addInstr(env, mk_iMOVsd_RR(src,dst));
   1596             addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
   1597             addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
   1598                                             AMD64RMI_Reg(src), dst));
   1599             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
   1600             return dst;
   1601          }
   1602 
   1603          case Iop_Left8:
   1604          case Iop_Left16:
   1605          case Iop_Left32:
   1606          case Iop_Left64: {
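                    /* dst = src | -src, done with NEG followed by OR; this is
                       the defined meaning of the Iop_LeftN operations. */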
   1607             HReg dst = newVRegI(env);
   1608             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1609             addInstr(env, mk_iMOVsd_RR(src, dst));
   1610             addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
   1611             addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
   1612             return dst;
   1613          }
   1614 
   1615          case Iop_V128to32: {
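                    /* Park the vector in the 16 bytes just below %rsp and
                       reload the low 32 bits zero-extended; the slot is only
                       live across these two instructions. */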
   1616             HReg        dst     = newVRegI(env);
   1617             HReg        vec     = iselVecExpr(env, e->Iex.Unop.arg);
   1618             AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
   1619             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
   1620             addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
   1621             return dst;
   1622          }
   1623 
   1624          /* V128{HI}to64 */
   1625          case Iop_V128HIto64:
   1626          case Iop_V128to64: {
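                    /* Spill the whole vector to a temporary 16-byte stack
                       slot and reload whichever 64-bit half was requested
                       (offset 8 for the high half, 0 for the low half). */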
   1627             Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0;
   1628             HReg dst = newVRegI(env);
   1629             HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
   1630             AMD64AMode* rsp0 = AMD64AMode_IR(0,   hregAMD64_RSP());
   1631             AMD64AMode* rspN = AMD64AMode_IR(off, hregAMD64_RSP());
   1632             sub_from_rsp(env, 16);
   1633             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp0));
   1634             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
   1635                                              AMD64RMI_Mem(rspN), dst ));
   1636             add_to_rsp(env, 16);
   1637             return dst;
   1638          }
   1639 
   1640          /* ReinterpF64asI64(e) */
   1641          /* Given an IEEE754 double, produce an I64 with the same bit
   1642             pattern. */
   1643          case Iop_ReinterpF64asI64: {
   1644             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   1645             HReg        dst    = newVRegI(env);
   1646             HReg        src    = iselDblExpr(env, e->Iex.Unop.arg);
   1647             /* paranoia */
   1648             set_SSE_rounding_default(env);
   1649             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
   1650             addInstr(env, AMD64Instr_Alu64R(
   1651                              Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
   1652             return dst;
   1653          }
   1654 
   1655          /* ReinterpF32asI32(e) */
   1656          /* Given an IEEE754 single, produce an I64 with the same bit
   1657             pattern in the lower half. */
   1658          case Iop_ReinterpF32asI32: {
   1659             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   1660             HReg        dst    = newVRegI(env);
   1661             HReg        src    = iselFltExpr(env, e->Iex.Unop.arg);
   1662             /* paranoia */
   1663             set_SSE_rounding_default(env);
   1664             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
   1665             addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
   1666             return dst;
   1667          }
   1668 
   1669          case Iop_16to8:
   1670          case Iop_32to8:
   1671          case Iop_64to8:
   1672          case Iop_32to16:
   1673          case Iop_64to16:
   1674          case Iop_64to32:
   1675             /* These are no-ops. */
   1676             return iselIntExpr_R(env, e->Iex.Unop.arg);
   1677 
   1678          default:
   1679             break;
   1680       }
   1681 
   1682       /* Deal with unary 64-bit SIMD ops. */
   1683       switch (e->Iex.Unop.op) {
   1684          case Iop_CmpNEZ32x2:
   1685             fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
   1686          case Iop_CmpNEZ16x4:
   1687             fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
   1688          case Iop_CmpNEZ8x8:
   1689             fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
   1690          default:
   1691             fn = (HWord)0; break;
   1692       }
   1693       if (fn != (HWord)0) {
   1694          /* Note: the following assumes all helpers are of
   1695             signature
   1696                ULong fn ( ULong ), and they are
   1697             not marked as regparm functions.
   1698          */
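                 /* The single argument is passed in %rdi and the result comes
                    back in %rax, per the normal amd64 calling convention. */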
   1699          HReg dst = newVRegI(env);
   1700          HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
   1701          addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
   1702          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1 ));
   1703          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
   1704          return dst;
   1705       }
   1706 
   1707       break;
   1708    }
   1709 
   1710    /* --------- GET --------- */
   1711    case Iex_Get: {
   1712       if (ty == Ity_I64) {
   1713          HReg dst = newVRegI(env);
   1714          addInstr(env, AMD64Instr_Alu64R(
   1715                           Aalu_MOV,
   1716                           AMD64RMI_Mem(
   1717                              AMD64AMode_IR(e->Iex.Get.offset,
   1718                                            hregAMD64_RBP())),
   1719                           dst));
   1720          return dst;
   1721       }
   1722       if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
   1723          HReg dst = newVRegI(env);
   1724          addInstr(env, AMD64Instr_LoadEX(
   1725                           toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
   1726                           False,
   1727                           AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
   1728                           dst));
   1729          return dst;
   1730       }
   1731       break;
   1732    }
   1733 
   1734    case Iex_GetI: {
   1735       AMD64AMode* am
   1736          = genGuestArrayOffset(
   1737               env, e->Iex.GetI.descr,
   1738                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   1739       HReg dst = newVRegI(env);
   1740       if (ty == Ity_I8) {
   1741          addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
   1742          return dst;
   1743       }
   1744       if (ty == Ity_I64) {
   1745          addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
   1746          return dst;
   1747       }
   1748       break;
   1749    }
   1750 
   1751    /* --------- CCALL --------- */
   1752    case Iex_CCall: {
   1753       HReg    dst = newVRegI(env);
   1754       vassert(ty == e->Iex.CCall.retty);
   1755 
   1756       /* be very restrictive for now.  Only 64-bit ints allowed
   1757          for args, and 64 or 32 bits for return type. */
   1758       if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
   1759          goto irreducible;
   1760 
   1761       /* Marshal args, do the call. */
   1762       doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
   1763 
   1764       /* Move to dst, and zero out the top 32 bits if the result type is
   1765          Ity_I32.  Probably overkill, but still .. */
   1766       if (e->Iex.CCall.retty == Ity_I64)
   1767          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
   1768       else
   1769          addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
   1770 
   1771       return dst;
   1772    }
   1773 
   1774    /* --------- LITERAL --------- */
   1775    /* 64/32/16/8-bit literals */
   1776    case Iex_Const:
   1777       if (ty == Ity_I64) {
   1778          HReg r = newVRegI(env);
   1779          addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
   1780          return r;
   1781       } else {
   1782          AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
   1783          HReg      r   = newVRegI(env);
   1784          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
   1785          return r;
   1786       }
   1787 
   1788    /* --------- MULTIPLEX --------- */
   1789    case Iex_Mux0X: {
   1790      if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
   1791          && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
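                /* Mux0X selects expr0 when the condition is zero, so start
                   with exprX in dst and conditionally overwrite it with expr0
                   if the low 8 bits of the condition test as zero. */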
   1792         HReg     r8;
   1793         HReg     rX  = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
   1794         AMD64RM* r0  = iselIntExpr_RM(env, e->Iex.Mux0X.expr0);
   1795         HReg dst = newVRegI(env);
   1796         addInstr(env, mk_iMOVsd_RR(rX,dst));
   1797         r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
   1798         addInstr(env, AMD64Instr_Test64(0xFF, r8));
   1799         addInstr(env, AMD64Instr_CMov64(Acc_Z,r0,dst));
   1800         return dst;
   1801       }
   1802       break;
   1803    }
   1804 
   1805    /* --------- TERNARY OP --------- */
   1806    case Iex_Triop: {
   1807       /* C3210 flags following FPU partial remainder (fprem), both
   1808          IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
   1809       if (e->Iex.Triop.op == Iop_PRemC3210F64
   1810           || e->Iex.Triop.op == Iop_PRem1C3210F64) {
   1811          AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   1812          HReg        arg1   = iselDblExpr(env, e->Iex.Triop.arg2);
   1813          HReg        arg2   = iselDblExpr(env, e->Iex.Triop.arg3);
   1814          HReg        dst    = newVRegI(env);
   1815          addInstr(env, AMD64Instr_A87Free(2));
   1816 
   1817          /* one arg -> top of x87 stack */
   1818          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
   1819          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   1820 
   1821          /* other arg -> top of x87 stack */
   1822          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
   1823          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   1824 
   1825          switch (e->Iex.Triop.op) {
   1826             case Iop_PRemC3210F64:
   1827                addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
   1828                break;
   1829             case Iop_PRem1C3210F64:
   1830                addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
   1831                break;
   1832             default:
   1833                vassert(0);
   1834          }
   1835          /* Ignore the result, and instead make off with the FPU's
   1836             C3210 flags (in the status word). */
   1837          addInstr(env, AMD64Instr_A87StSW(m8_rsp));
   1838          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
   1839          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
   1840          return dst;
   1841       }
   1842       break;
   1843    }
   1844 
   1845    default:
   1846    break;
   1847    } /* switch (e->tag) */
   1848 
   1849    /* We get here if no pattern matched. */
   1850   irreducible:
   1851    ppIRExpr(e);
   1852    vpanic("iselIntExpr_R(amd64): cannot reduce tree");
   1853 }
   1854 
   1855 
   1856 /*---------------------------------------------------------*/
   1857 /*--- ISEL: Integer expression auxiliaries              ---*/
   1858 /*---------------------------------------------------------*/
   1859 
   1860 /* --------------------- AMODEs --------------------- */
   1861 
   1862 /* Return an AMode which computes the value of the specified
   1863    expression, possibly also adding insns to the code list as a
   1864    result.  The expression may only be a 32-bit one.
   1865    result.  The expression may only be a 64-bit one.
   1866 
   1867 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
   1868 {
   1869    AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
   1870    vassert(sane_AMode(am));
   1871    return am;
   1872 }
   1873 
   1874 /* DO NOT CALL THIS DIRECTLY ! */
   1875 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
   1876 {
   1877    MatchInfo mi;
   1878    DECLARE_PATTERN(p_complex);
   1879    IRType ty = typeOfIRExpr(env->type_env,e);
   1880    vassert(ty == Ity_I64);
   1881 
   1882    /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
   1883    /*              bind0        bind1  bind2   bind3   */
   1884    DEFINE_PATTERN(p_complex,
   1885       binop( Iop_Add64,
   1886              binop( Iop_Add64,
   1887                     bind(0),
   1888                     binop(Iop_Shl64, bind(1), bind(2))
   1889                   ),
   1890              bind(3)
   1891            )
   1892    );
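           /* This is the full base + index*scale + displacement addressing
              form, with the scale restricted to 1, 2, 4 or 8 (shift 0 .. 3)
              and the displacement required to fit in 32 bits. */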
   1893    if (matchIRExpr(&mi, p_complex, e)) {
   1894       IRExpr* expr1  = mi.bindee[0];
   1895       IRExpr* expr2  = mi.bindee[1];
   1896       IRExpr* imm8   = mi.bindee[2];
   1897       IRExpr* simm32 = mi.bindee[3];
   1898       if (imm8->tag == Iex_Const
   1899           && imm8->Iex.Const.con->tag == Ico_U8
   1900           && imm8->Iex.Const.con->Ico.U8 < 4
   1901           /* imm8 is OK, now check simm32 */
   1902           && simm32->tag == Iex_Const
   1903           && simm32->Iex.Const.con->tag == Ico_U64
   1904           && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
   1905          UInt shift = imm8->Iex.Const.con->Ico.U8;
   1906          UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
   1907          HReg r1 = iselIntExpr_R(env, expr1);
   1908          HReg r2 = iselIntExpr_R(env, expr2);
   1909          vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
   1910          return AMD64AMode_IRRS(offset, r1, r2, shift);
   1911       }
   1912    }
   1913 
   1914    /* Add64(expr1, Shl64(expr2, imm)) */
   1915    if (e->tag == Iex_Binop
   1916        && e->Iex.Binop.op == Iop_Add64
   1917        && e->Iex.Binop.arg2->tag == Iex_Binop
   1918        && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
   1919        && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
   1920        && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
   1921       UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
   1922       if (shift == 1 || shift == 2 || shift == 3) {
   1923          HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1924          HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
   1925          return AMD64AMode_IRRS(0, r1, r2, shift);
   1926       }
   1927    }
   1928 
   1929    /* Add64(expr,i) */
   1930    if (e->tag == Iex_Binop
   1931        && e->Iex.Binop.op == Iop_Add64
   1932        && e->Iex.Binop.arg2->tag == Iex_Const
   1933        && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
   1934        && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
   1935       HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1936       return AMD64AMode_IR(
   1937                 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
   1938                 r1
   1939              );
   1940    }
   1941 
   1942    /* Doesn't match anything in particular.  Generate it into
   1943       a register and use that. */
   1944    {
   1945       HReg r1 = iselIntExpr_R(env, e);
   1946       return AMD64AMode_IR(0, r1);
   1947    }
   1948 }
   1949 
   1950 
   1951 /* --------------------- RMIs --------------------- */
   1952 
   1953 /* Similarly, calculate an expression into an AMD64RMI operand.  As with
   1954    iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits.  */
   1955 
   1956 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
   1957 {
   1958    AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
   1959    /* sanity checks ... */
   1960    switch (rmi->tag) {
   1961       case Armi_Imm:
   1962          return rmi;
   1963       case Armi_Reg:
   1964          vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
   1965          vassert(hregIsVirtual(rmi->Armi.Reg.reg));
   1966          return rmi;
   1967       case Armi_Mem:
   1968          vassert(sane_AMode(rmi->Armi.Mem.am));
   1969          return rmi;
   1970       default:
   1971          vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
   1972    }
   1973 }
   1974 
   1975 /* DO NOT CALL THIS DIRECTLY ! */
   1976 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
   1977 {
   1978    IRType ty = typeOfIRExpr(env->type_env,e);
   1979    vassert(ty == Ity_I64 || ty == Ity_I32
   1980            || ty == Ity_I16 || ty == Ity_I8);
   1981 
   1982    /* special case: immediate 64/32/16/8 */
   1983    if (e->tag == Iex_Const) {
   1984       switch (e->Iex.Const.con->tag) {
   1985         case Ico_U64:
   1986            if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
   1987               return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
   1988            }
   1989            break;
   1990          case Ico_U32:
   1991             return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32);
   1992          case Ico_U16:
   1993             return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
   1994          case Ico_U8:
   1995             return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
   1996          default:
   1997             vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
   1998       }
   1999    }
   2000 
   2001    /* special case: 64-bit GET */
   2002    if (e->tag == Iex_Get && ty == Ity_I64) {
   2003       return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
   2004                                         hregAMD64_RBP()));
   2005    }
   2006 
   2007    /* special case: 64-bit load from memory */
   2008    if (e->tag == Iex_Load && ty == Ity_I64
   2009        && e->Iex.Load.end == Iend_LE) {
   2010       AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2011       return AMD64RMI_Mem(am);
   2012    }
   2013 
   2014    /* default case: calculate into a register and return that */
   2015    {
   2016       HReg r = iselIntExpr_R ( env, e );
   2017       return AMD64RMI_Reg(r);
   2018    }
   2019 }
   2020 
   2021 
   2022 /* --------------------- RIs --------------------- */
   2023 
   2024 /* Calculate an expression into an AMD64RI operand.  As with
   2025    iselIntExpr_R, the expression can have type 64, 32, 16 or 8
   2026    bits. */
   2027 
   2028 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
   2029 {
   2030    AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
   2031    /* sanity checks ... */
   2032    switch (ri->tag) {
   2033       case Ari_Imm:
   2034          return ri;
   2035       case Ari_Reg:
   2036          vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
   2037          vassert(hregIsVirtual(ri->Ari.Reg.reg));
   2038          return ri;
   2039       default:
   2040          vpanic("iselIntExpr_RI: unknown amd64 RI tag");
   2041    }
   2042 }
   2043 
   2044 /* DO NOT CALL THIS DIRECTLY ! */
   2045 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
   2046 {
   2047    IRType ty = typeOfIRExpr(env->type_env,e);
   2048    vassert(ty == Ity_I64 || ty == Ity_I32
   2049            || ty == Ity_I16 || ty == Ity_I8);
   2050 
   2051    /* special case: immediate */
   2052    if (e->tag == Iex_Const) {
   2053       switch (e->Iex.Const.con->tag) {
   2054         case Ico_U64:
   2055            if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
   2056               return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
   2057            }
   2058            break;
   2059          case Ico_U32:
   2060             return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
   2061          case Ico_U16:
   2062             return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
   2063          case Ico_U8:
   2064             return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
   2065          default:
   2066             vpanic("iselIntExpr_RI.Iex_Const(amd64)");
   2067       }
   2068    }
   2069 
   2070    /* default case: calculate into a register and return that */
   2071    {
   2072       HReg r = iselIntExpr_R ( env, e );
   2073       return AMD64RI_Reg(r);
   2074    }
   2075 }
   2076 
   2077 
   2078 /* --------------------- RMs --------------------- */
   2079 
   2080 /* Similarly, calculate an expression into an AMD64RM operand.  As
   2081    with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
   2082    bits.  */
   2083 
   2084 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
   2085 {
   2086    AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
   2087    /* sanity checks ... */
   2088    switch (rm->tag) {
   2089       case Arm_Reg:
   2090          vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
   2091          vassert(hregIsVirtual(rm->Arm.Reg.reg));
   2092          return rm;
   2093       case Arm_Mem:
   2094          vassert(sane_AMode(rm->Arm.Mem.am));
   2095          return rm;
   2096       default:
   2097          vpanic("iselIntExpr_RM: unknown amd64 RM tag");
   2098    }
   2099 }
   2100 
   2101 /* DO NOT CALL THIS DIRECTLY ! */
   2102 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
   2103 {
   2104    IRType ty = typeOfIRExpr(env->type_env,e);
   2105    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
   2106 
   2107    /* special case: 64-bit GET */
   2108    if (e->tag == Iex_Get && ty == Ity_I64) {
   2109       return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
   2110                                        hregAMD64_RBP()));
   2111    }
   2112 
   2113    /* special case: load from memory */
   2114 
   2115    /* default case: calculate into a register and return that */
   2116    {
   2117       HReg r = iselIntExpr_R ( env, e );
   2118       return AMD64RM_Reg(r);
   2119    }
   2120 }
   2121 
   2122 
   2123 /* --------------------- CONDCODE --------------------- */
   2124 
   2125 /* Generate code to evaluate a bit-typed expression, returning the
   2126    condition code which holds exactly when the expression would
   2127    notionally have evaluated to 1. */
   2128 
   2129 static AMD64CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
   2130 {
   2131    /* Uh, there's nothing we can sanity check here, unfortunately. */
   2132    return iselCondCode_wrk(env,e);
   2133 }
   2134 
   2135 /* DO NOT CALL THIS DIRECTLY ! */
   2136 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
   2137 {
   2138    MatchInfo mi;
   2139 
   2140    vassert(e);
   2141    vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
   2142 
   2143    /* var */
   2144    if (e->tag == Iex_RdTmp) {
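              /* Mask the value down to its low bit; the AND sets Z iff that
                 bit is 0, so Acc_NZ holds exactly when the bit is 1. */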
   2145       HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2146       HReg dst = newVRegI(env);
   2147       addInstr(env, mk_iMOVsd_RR(r64,dst));
   2148       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
   2149       return Acc_NZ;
   2150    }
   2151 
   2152    /* Constant 1:Bit */
   2153    if (e->tag == Iex_Const) {
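              /* r ^ r is always zero, so the XOR forces Z to be set; Acc_Z is
                 therefore an always-true condition (used for a True constant)
                 and Acc_NZ an always-false one (used for False). */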
   2154       HReg r;
   2155       vassert(e->Iex.Const.con->tag == Ico_U1);
   2156       vassert(e->Iex.Const.con->Ico.U1 == True
   2157               || e->Iex.Const.con->Ico.U1 == False);
   2158       r = newVRegI(env);
   2159       addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
   2160       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
   2161       return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
   2162    }
   2163 
   2164    /* Not1(...) */
   2165    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
   2166       /* Generate code for the arg, and negate the test condition */
   2167       return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
   2168    }
   2169 
   2170    /* --- patterns rooted at: 64to1 --- */
   2171 
   2172    /* 64to1 */
   2173    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
   2174       HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
   2175       addInstr(env, AMD64Instr_Test64(1,reg));
   2176       return Acc_NZ;
   2177    }
   2178 
   2179    /* --- patterns rooted at: CmpNEZ8 --- */
   2180 
   2181    /* CmpNEZ8(x) */
   2182    if (e->tag == Iex_Unop
   2183        && e->Iex.Unop.op == Iop_CmpNEZ8) {
   2184       HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
   2185       addInstr(env, AMD64Instr_Test64(0xFF,r));
   2186       return Acc_NZ;
   2187    }
   2188 
   2189    /* --- patterns rooted at: CmpNEZ16 --- */
   2190 
   2191    /* CmpNEZ16(x) */
   2192    if (e->tag == Iex_Unop
   2193        && e->Iex.Unop.op == Iop_CmpNEZ16) {
   2194       HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
   2195       addInstr(env, AMD64Instr_Test64(0xFFFF,r));
   2196       return Acc_NZ;
   2197    }
   2198 
   2199    /* --- patterns rooted at: CmpNEZ32 --- */
   2200 
   2201    /* CmpNEZ32(x) */
   2202    if (e->tag == Iex_Unop
   2203        && e->Iex.Unop.op == Iop_CmpNEZ32) {
   2204       HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
   2205       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
   2206       addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
   2207       return Acc_NZ;
   2208    }
   2209 
   2210    /* --- patterns rooted at: CmpNEZ64 --- */
   2211 
   2212    /* CmpNEZ64(Or64(x,y)) */
   2213    {
   2214       DECLARE_PATTERN(p_CmpNEZ64_Or64);
   2215       DEFINE_PATTERN(p_CmpNEZ64_Or64,
   2216                      unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
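              /* Fold the Or64 into the flag-setting OR itself: Z is set iff
                 both operands are zero, so Acc_NZ directly gives
                 CmpNEZ64(Or64(x,y)). */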
   2217       if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
   2218          HReg      r0   = iselIntExpr_R(env, mi.bindee[0]);
   2219          AMD64RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
   2220          HReg      tmp  = newVRegI(env);
   2221          addInstr(env, mk_iMOVsd_RR(r0, tmp));
   2222          addInstr(env, AMD64Instr_Alu64R(Aalu_OR,rmi1,tmp));
   2223          return Acc_NZ;
   2224       }
   2225    }
   2226 
   2227    /* CmpNEZ64(x) */
   2228    if (e->tag == Iex_Unop
   2229        && e->Iex.Unop.op == Iop_CmpNEZ64) {
   2230       HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
   2231       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
   2232       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
   2233       return Acc_NZ;
   2234    }
   2235 
   2236    /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
   2237 
   2238    /* CmpEQ8 / CmpNE8 */
   2239    if (e->tag == Iex_Binop
   2240        && (e->Iex.Binop.op == Iop_CmpEQ8
   2241            || e->Iex.Binop.op == Iop_CmpNE8
   2242            || e->Iex.Binop.op == Iop_CasCmpEQ8
   2243            || e->Iex.Binop.op == Iop_CasCmpNE8)) {
   2244       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2245       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2246       HReg      r    = newVRegI(env);
   2247       addInstr(env, mk_iMOVsd_RR(r1,r));
   2248       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
   2249       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
   2250       switch (e->Iex.Binop.op) {
   2251          case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
   2252          case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
   2253          default: vpanic("iselCondCode(amd64): CmpXX8");
   2254       }
   2255    }
   2256 
   2257    /* CmpEQ16 / CmpNE16 */
   2258    if (e->tag == Iex_Binop
   2259        && (e->Iex.Binop.op == Iop_CmpEQ16
   2260            || e->Iex.Binop.op == Iop_CmpNE16
   2261            || e->Iex.Binop.op == Iop_CasCmpEQ16
   2262            || e->Iex.Binop.op == Iop_CasCmpNE16)) {
   2263       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2264       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2265       HReg      r    = newVRegI(env);
   2266       addInstr(env, mk_iMOVsd_RR(r1,r));
   2267       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
   2268       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
   2269       switch (e->Iex.Binop.op) {
   2270          case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
   2271          case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
   2272          default: vpanic("iselCondCode(amd64): CmpXX16");
   2273       }
   2274    }
   2275 
   2276    /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
   2277       Saves a "movq %rax, %tmp" compared to the default route. */
   2278    if (e->tag == Iex_Binop
   2279        && e->Iex.Binop.op == Iop_CmpNE64
   2280        && e->Iex.Binop.arg1->tag == Iex_CCall
   2281        && e->Iex.Binop.arg2->tag == Iex_Const) {
   2282       IRExpr* cal = e->Iex.Binop.arg1;
   2283       IRExpr* con = e->Iex.Binop.arg2;
   2284       HReg    tmp = newVRegI(env);
   2285       /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
   2286       vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
   2287       vassert(con->Iex.Const.con->tag == Ico_U64);
   2288       /* Marshal args, do the call. */
   2289       doHelperCall( env, False, NULL, cal->Iex.CCall.cee, cal->Iex.CCall.args );
   2290       addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
   2291       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
   2292                                       AMD64RMI_Reg(hregAMD64_RAX()), tmp));
   2293       return Acc_NZ;
   2294    }
   2295 
   2296    /* Cmp*64*(x,y) */
   2297    if (e->tag == Iex_Binop
   2298        && (e->Iex.Binop.op == Iop_CmpEQ64
   2299            || e->Iex.Binop.op == Iop_CmpNE64
   2300            || e->Iex.Binop.op == Iop_CmpLT64S
   2301            || e->Iex.Binop.op == Iop_CmpLT64U
   2302            || e->Iex.Binop.op == Iop_CmpLE64S
   2303            || e->Iex.Binop.op == Iop_CmpLE64U
   2304            || e->Iex.Binop.op == Iop_CasCmpEQ64
   2305            || e->Iex.Binop.op == Iop_CasCmpNE64)) {
   2306       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2307       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2308       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
   2309       switch (e->Iex.Binop.op) {
   2310          case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
   2311          case Iop_CmpNE64: case Iop_CasCmpNE64: return Acc_NZ;
   2312          case Iop_CmpLT64S: return Acc_L;
   2313          case Iop_CmpLT64U: return Acc_B;
   2314          case Iop_CmpLE64S: return Acc_LE;
   2315          case Iop_CmpLE64U: return Acc_BE;
   2316          default: vpanic("iselCondCode(amd64): CmpXX64");
   2317       }
   2318    }
   2319 
   2320    /* Cmp*32*(x,y) */
   2321    if (e->tag == Iex_Binop
   2322        && (e->Iex.Binop.op == Iop_CmpEQ32
   2323            || e->Iex.Binop.op == Iop_CmpNE32
   2324            || e->Iex.Binop.op == Iop_CmpLT32S
   2325            || e->Iex.Binop.op == Iop_CmpLT32U
   2326            || e->Iex.Binop.op == Iop_CmpLE32S
   2327            || e->Iex.Binop.op == Iop_CmpLE32U
   2328            || e->Iex.Binop.op == Iop_CasCmpEQ32
   2329            || e->Iex.Binop.op == Iop_CasCmpNE32)) {
   2330       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2331       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2332       addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
   2333       switch (e->Iex.Binop.op) {
   2334          case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
   2335          case Iop_CmpNE32: case Iop_CasCmpNE32: return Acc_NZ;
   2336          case Iop_CmpLT32S: return Acc_L;
   2337          case Iop_CmpLT32U: return Acc_B;
   2338          case Iop_CmpLE32S: return Acc_LE;
   2339          case Iop_CmpLE32U: return Acc_BE;
   2340          default: vpanic("iselCondCode(amd64): CmpXX32");
   2341       }
   2342    }
   2343 
   2344    ppIRExpr(e);
   2345    vpanic("iselCondCode(amd64)");
   2346 }
   2347 
   2348 
   2349 /*---------------------------------------------------------*/
   2350 /*--- ISEL: Integer expressions (128 bit)               ---*/
   2351 /*---------------------------------------------------------*/
   2352 
   2353 /* Compute a 128-bit value into a register pair, which is returned as
   2354    the first two parameters.  As with iselIntExpr_R, these may be
   2355    either real or virtual regs; in any case they must not be changed
   2356    by subsequent code emitted by the caller.  */
   2357 
   2358 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
   2359                              ISelEnv* env, IRExpr* e )
   2360 {
   2361    iselInt128Expr_wrk(rHi, rLo, env, e);
   2362 #  if 0
   2363    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2364 #  endif
   2365    vassert(hregClass(*rHi) == HRcInt64);
   2366    vassert(hregIsVirtual(*rHi));
   2367    vassert(hregClass(*rLo) == HRcInt64);
   2368    vassert(hregIsVirtual(*rLo));
   2369 }
   2370 
   2371 /* DO NOT CALL THIS DIRECTLY ! */
   2372 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
   2373                                  ISelEnv* env, IRExpr* e )
   2374 {
   2375 //..    HWord fn = 0; /* helper fn for most SIMD64 stuff */
   2376    vassert(e);
   2377    vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
   2378 
   2379 //..    /* 64-bit literal */
   2380 //..    if (e->tag == Iex_Const) {
   2381 //..       ULong w64 = e->Iex.Const.con->Ico.U64;
   2382 //..       UInt  wHi = ((UInt)(w64 >> 32)) & 0xFFFFFFFF;
   2383 //..       UInt  wLo = ((UInt)w64) & 0xFFFFFFFF;
   2384 //..       HReg  tLo = newVRegI(env);
   2385 //..       HReg  tHi = newVRegI(env);
   2386 //..       vassert(e->Iex.Const.con->tag == Ico_U64);
   2387 //..       addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi));
   2388 //..       addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
   2389 //..       *rHi = tHi;
   2390 //..       *rLo = tLo;
   2391 //..       return;
   2392 //..    }
   2393 
   2394    /* read 128-bit IRTemp */
   2395    if (e->tag == Iex_RdTmp) {
   2396       lookupIRTemp128( rHi, rLo, env, e->Iex.RdTmp.tmp);
   2397       return;
   2398    }
   2399 
   2400 //..    /* 64-bit load */
   2401 //..    if (e->tag == Iex_LDle) {
   2402 //..       HReg     tLo, tHi;
   2403 //..       X86AMode *am0, *am4;
   2404 //..       vassert(e->Iex.LDle.ty == Ity_I64);
   2405 //..       tLo = newVRegI(env);
   2406 //..       tHi = newVRegI(env);
   2407 //..       am0 = iselIntExpr_AMode(env, e->Iex.LDle.addr);
   2408 //..       am4 = advance4(am0);
   2409 //..       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo ));
   2410 //..       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
   2411 //..       *rHi = tHi;
   2412 //..       *rLo = tLo;
   2413 //..       return;
   2414 //..    }
   2415 //..
   2416 //..    /* 64-bit GET */
   2417 //..    if (e->tag == Iex_Get) {
   2418 //..       X86AMode* am  = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP());
   2419 //..       X86AMode* am4 = advance4(am);
   2420 //..       HReg tLo = newVRegI(env);
   2421 //..       HReg tHi = newVRegI(env);
   2422 //..       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
   2423 //..       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
   2424 //..       *rHi = tHi;
   2425 //..       *rLo = tLo;
   2426 //..       return;
   2427 //..    }
   2428 //..
   2429 //..    /* 64-bit GETI */
   2430 //..    if (e->tag == Iex_GetI) {
   2431 //..       X86AMode* am
   2432 //..          = genGuestArrayOffset( env, e->Iex.GetI.descr,
   2433 //..                                      e->Iex.GetI.ix, e->Iex.GetI.bias );
   2434 //..       X86AMode* am4 = advance4(am);
   2435 //..       HReg tLo = newVRegI(env);
   2436 //..       HReg tHi = newVRegI(env);
   2437 //..       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
   2438 //..       addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
   2439 //..       *rHi = tHi;
   2440 //..       *rLo = tLo;
   2441 //..       return;
   2442 //..    }
   2443 //..
   2444 //..    /* 64-bit Mux0X */
   2445 //..    if (e->tag == Iex_Mux0X) {
   2446 //..       HReg e0Lo, e0Hi, eXLo, eXHi, r8;
   2447 //..       HReg tLo = newVRegI(env);
   2448 //..       HReg tHi = newVRegI(env);
   2449 //..       iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
   2450 //..       iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
   2451 //..       addInstr(env, mk_iMOVsd_RR(eXHi, tHi));
   2452 //..       addInstr(env, mk_iMOVsd_RR(eXLo, tLo));
   2453 //..       r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
   2454 //..       addInstr(env, X86Instr_Test32(X86RI_Imm(0xFF), X86RM_Reg(r8)));
   2455 //..       /* This assumes the first cmov32 doesn't trash the condition
   2456 //..          codes, so they are still available for the second cmov32 */
   2457 //..       addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Hi),tHi));
   2458 //..       addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Lo),tLo));
   2459 //..       *rHi = tHi;
   2460 //..       *rLo = tLo;
   2461 //..       return;
   2462 //..    }
   2463 
   2464    /* --------- BINARY ops --------- */
   2465    if (e->tag == Iex_Binop) {
   2466       switch (e->Iex.Binop.op) {
   2467          /* 64 x 64 -> 128 multiply */
   2468          case Iop_MullU64:
   2469          case Iop_MullS64: {
   2470             /* get one operand into %rax, and the other into an R/M.
   2471                Need to make an educated guess about which operand is
   2472                better off in which position. */
   2473             HReg     tLo    = newVRegI(env);
   2474             HReg     tHi    = newVRegI(env);
   2475             Bool     syned  = toBool(e->Iex.Binop.op == Iop_MullS64);
   2476             AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
   2477             HReg     rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2478             addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
   2479             addInstr(env, AMD64Instr_MulL(syned, rmLeft));
   2480             /* Result is now in RDX:RAX.  Tell the caller. */
   2481             addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
   2482             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
   2483             *rHi = tHi;
   2484             *rLo = tLo;
   2485             return;
   2486          }
   2487 
   2488          /* 128 x 64 -> (64(rem),64(div)) division */
   2489          case Iop_DivModU128to64:
   2490          case Iop_DivModS128to64: {
   2491             /* Get the 128-bit operand into rdx:rax, and the other into
   2492                any old R/M. */
   2493             HReg sHi, sLo;
   2494             HReg     tLo     = newVRegI(env);
   2495             HReg     tHi     = newVRegI(env);
   2496             Bool     syned   = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
   2497             AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
   2498             iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
   2499             addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
   2500             addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
   2501             addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
   2502             addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
   2503             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
   2504             *rHi = tHi;
   2505             *rLo = tLo;
   2506             return;
   2507          }
   2508 
   2509          /* 64HLto128(e1,e2) */
   2510          case Iop_64HLto128:
   2511             *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2512             *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2513             return;
   2514 
   2515 //..          /* Or64/And64/Xor64 */
   2516 //..          case Iop_Or64:
   2517 //..          case Iop_And64:
   2518 //..          case Iop_Xor64: {
   2519 //..             HReg xLo, xHi, yLo, yHi;
   2520 //..             HReg tLo = newVRegI(env);
   2521 //..             HReg tHi = newVRegI(env);
   2522 //..             X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR
   2523 //..                           : e->Iex.Binop.op==Iop_And64 ? Xalu_AND
   2524 //..                           : Xalu_XOR;
   2525 //..             iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
   2526 //..             addInstr(env, mk_iMOVsd_RR(xHi, tHi));
   2527 //..             addInstr(env, mk_iMOVsd_RR(xLo, tLo));
   2528 //..             iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
   2529 //..             addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
   2530 //..             addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
   2531 //..             *rHi = tHi;
   2532 //..             *rLo = tLo;
   2533 //..             return;
   2534 //..          }
   2535 //..
   2536 //..          /* Add64/Sub64 */
   2537 //..          case Iop_Add64:
   2538 //..          case Iop_Sub64: {
   2539 //..             HReg xLo, xHi, yLo, yHi;
   2540 //..             HReg tLo = newVRegI(env);
   2541 //..             HReg tHi = newVRegI(env);
   2542 //..             iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
   2543 //..             addInstr(env, mk_iMOVsd_RR(xHi, tHi));
   2544 //..             addInstr(env, mk_iMOVsd_RR(xLo, tLo));
   2545 //..             iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
   2546 //..             if (e->Iex.Binop.op==Iop_Add64) {
   2547 //..                addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo));
   2548 //..                addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi));
   2549 //..             } else {
   2550 //..                addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
   2551 //..                addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
   2552 //..             }
   2553 //..             *rHi = tHi;
   2554 //..             *rLo = tLo;
   2555 //..             return;
   2556 //..          }
   2557 //..
   2558 //..          /* 32HLto64(e1,e2) */
   2559 //..          case Iop_32HLto64:
   2560 //..             *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2561 //..             *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2562 //..             return;
   2563 //..
   2564 //..          /* 64-bit shifts */
   2565 //..          case Iop_Shl64: {
   2566 //..             /* We use the same ingenious scheme as gcc.  Put the value
   2567 //..                to be shifted into %hi:%lo, and the shift amount into
   2568 //..                %cl.  Then (dsts on right, a la ATT syntax):
   2569 //..
   2570 //..                shldl %cl, %lo, %hi   -- make %hi be right for the
   2571 //..                                      -- shift amt %cl % 32
   2572 //..                shll  %cl, %lo        -- make %lo be right for the
   2573 //..                                      -- shift amt %cl % 32
   2574 //..
   2575 //..                Now, if (shift amount % 64) is in the range 32 .. 63,
   2576 //..                we have to do a fixup, which puts the result low half
   2577 //..                into the result high half, and zeroes the low half:
   2578 //..
   2579 //..                testl $32, %ecx
   2580 //..
   2581 //..                cmovnz %lo, %hi
   2582 //..                movl $0, %tmp         -- sigh; need yet another reg
   2583 //..                cmovnz %tmp, %lo
   2584 //..             */
   2585 //..             HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
   2586 //..             tLo = newVRegI(env);
   2587 //..             tHi = newVRegI(env);
   2588 //..             tTemp = newVRegI(env);
   2589 //..             rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2590 //..             iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
   2591 //..             addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
   2592 //..             addInstr(env, mk_iMOVsd_RR(sHi, tHi));
   2593 //..             addInstr(env, mk_iMOVsd_RR(sLo, tLo));
   2594 //..             /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
   2595 //..                and those regs are legitimately modifiable. */
   2596 //..             addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi));
   2597 //..             addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, X86RM_Reg(tLo)));
   2598 //..             addInstr(env, X86Instr_Test32(X86RI_Imm(32),
   2599 //..                           X86RM_Reg(hregX86_ECX())));
   2600 //..             addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi));
   2601 //..             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
   2602 //..             addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo));
   2603 //..             *rHi = tHi;
   2604 //..             *rLo = tLo;
   2605 //..             return;
   2606 //..          }
   2607 //..
   2608 //..          case Iop_Shr64: {
   2609 //..             /* We use the same ingenious scheme as gcc.  Put the value
   2610 //..                to be shifted into %hi:%lo, and the shift amount into
   2611 //..                %cl.  Then:
   2612 //..
   2613 //..                shrdl %cl, %hi, %lo   -- make %lo be right for the
   2614 //..                                      -- shift amt %cl % 32
   2615 //..                shrl  %cl, %hi        -- make %hi be right for the
   2616 //..                                      -- shift amt %cl % 32
   2617 //..
   2618 //..                Now, if (shift amount % 64) is in the range 32 .. 63,
   2619 //..                we have to do a fixup, which puts the result high half
   2620 //..                into the result low half, and zeroes the high half:
   2621 //..
   2622 //..                testl $32, %ecx
   2623 //..
   2624 //..                cmovnz %hi, %lo
   2625 //..                movl $0, %tmp         -- sigh; need yet another reg
   2626 //..                cmovnz %tmp, %hi
   2627 //..             */
   2628 //..             HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
   2629 //..             tLo = newVRegI(env);
   2630 //..             tHi = newVRegI(env);
   2631 //..             tTemp = newVRegI(env);
   2632 //..             rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2633 //..             iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
   2634 //..             addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
   2635 //..             addInstr(env, mk_iMOVsd_RR(sHi, tHi));
   2636 //..             addInstr(env, mk_iMOVsd_RR(sLo, tLo));
   2637 //..             /* Ok.  Now shift amt is in %ecx, and value is in tHi/tLo
   2638 //..                and those regs are legitimately modifiable. */
   2639 //..             addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo));
   2640 //..             addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, X86RM_Reg(tHi)));
   2641 //..             addInstr(env, X86Instr_Test32(X86RI_Imm(32),
   2642 //..                           X86RM_Reg(hregX86_ECX())));
   2643 //..             addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo));
   2644 //..             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
   2645 //..             addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi));
   2646 //..             *rHi = tHi;
   2647 //..             *rLo = tLo;
   2648 //..             return;
   2649 //..          }
   2650 //..
   2651 //..          /* F64 -> I64 */
   2652 //..          /* Sigh, this is an almost exact copy of the F64 -> I32/I16
   2653 //..             case.  Unfortunately I see no easy way to avoid the
   2654 //..             duplication. */
   2655 //..          case Iop_F64toI64: {
   2656 //..             HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
   2657 //..             HReg tLo = newVRegI(env);
   2658 //..             HReg tHi = newVRegI(env);
   2659 //..
   2660 //..             /* Used several times ... */
   2661 //..             /* Careful ... this sharing is only safe because
   2662 //.. 	       zero_esp/four_esp do not hold any registers which the
   2663 //.. 	       register allocator could attempt to swizzle later. */
   2664 //..             X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
   2665 //..             X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
   2666 //..
   2667 //..             /* rf now holds the value to be converted, and rrm holds
   2668 //..                the rounding mode value, encoded as per the
   2669 //..                IRRoundingMode enum.  The first thing to do is set the
   2670 //..                FPU's rounding mode accordingly. */
   2671 //..
   2672 //..             /* Create a space for the format conversion. */
   2673 //..             /* subl $8, %esp */
   2674 //..             sub_from_esp(env, 8);
   2675 //..
   2676 //..             /* Set host rounding mode */
   2677 //..             set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2678 //..
   2679 //..             /* gistll %rf, 0(%esp) */
   2680 //..             addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp));
   2681 //..
   2682 //..             /* movl 0(%esp), %dstLo */
   2683 //..             /* movl 4(%esp), %dstHi */
   2684 //..             addInstr(env, X86Instr_Alu32R(
   2685 //..                              Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
   2686 //..             addInstr(env, X86Instr_Alu32R(
   2687 //..                              Xalu_MOV, X86RMI_Mem(four_esp), tHi));
   2688 //..
   2689 //..             /* Restore default FPU rounding. */
   2690 //..             set_FPU_rounding_default( env );
   2691 //..
   2692 //..             /* addl $8, %esp */
   2693 //..             add_to_esp(env, 8);
   2694 //..
   2695 //..             *rHi = tHi;
   2696 //..             *rLo = tLo;
   2697 //..             return;
   2698 //..          }
   2699 //..
   2700          default:
   2701             break;
   2702       }
   2703    } /* if (e->tag == Iex_Binop) */
   2704 
   2705 
   2706 //..    /* --------- UNARY ops --------- */
   2707 //..    if (e->tag == Iex_Unop) {
   2708 //..       switch (e->Iex.Unop.op) {
   2709 //..
   2710 //..          /* 32Sto64(e) */
   2711 //..          case Iop_32Sto64: {
   2712 //..             HReg tLo = newVRegI(env);
   2713 //..             HReg tHi = newVRegI(env);
   2714 //..             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   2715 //..             addInstr(env, mk_iMOVsd_RR(src,tHi));
   2716 //..             addInstr(env, mk_iMOVsd_RR(src,tLo));
   2717 //..             addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, X86RM_Reg(tHi)));
   2718 //..             *rHi = tHi;
   2719 //..             *rLo = tLo;
   2720 //..             return;
   2721 //..          }
   2722 //..
   2723 //..          /* 32Uto64(e) */
   2724 //..          case Iop_32Uto64: {
   2725 //..             HReg tLo = newVRegI(env);
   2726 //..             HReg tHi = newVRegI(env);
   2727 //..             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   2728 //..             addInstr(env, mk_iMOVsd_RR(src,tLo));
   2729 //..             addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
   2730 //..             *rHi = tHi;
   2731 //..             *rLo = tLo;
   2732 //..             return;
   2733 //..          }
   2734 
   2735 //..          /* could do better than this, but for now ... */
   2736 //..          case Iop_1Sto64: {
   2737 //..             HReg tLo = newVRegI(env);
   2738 //..             HReg tHi = newVRegI(env);
   2739 //..             X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
   2740 //..             addInstr(env, X86Instr_Set32(cond,tLo));
   2741 //..             addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, X86RM_Reg(tLo)));
   2742 //..             addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, X86RM_Reg(tLo)));
   2743 //..             addInstr(env, mk_iMOVsd_RR(tLo, tHi));
   2744 //..             *rHi = tHi;
   2745 //..             *rLo = tLo;
   2746 //..             return;
   2747 //..          }
   2748 //..
   2749 //..          /* Not64(e) */
   2750 //..          case Iop_Not64: {
   2751 //..             HReg tLo = newVRegI(env);
   2752 //..             HReg tHi = newVRegI(env);
   2753 //..             HReg sHi, sLo;
   2754 //..             iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg);
   2755 //..             addInstr(env, mk_iMOVsd_RR(sHi, tHi));
   2756 //..             addInstr(env, mk_iMOVsd_RR(sLo, tLo));
   2757 //..             addInstr(env, X86Instr_Unary32(Xun_NOT,X86RM_Reg(tHi)));
   2758 //..             addInstr(env, X86Instr_Unary32(Xun_NOT,X86RM_Reg(tLo)));
   2759 //..             *rHi = tHi;
   2760 //..             *rLo = tLo;
   2761 //..             return;
   2762 //..          }
   2763 //..
   2764 //..          default:
   2765 //..             break;
   2766 //..       }
   2767 //..    } /* if (e->tag == Iex_Unop) */
   2768 //..
   2769 //..
   2770 //..    /* --------- CCALL --------- */
   2771 //..    if (e->tag == Iex_CCall) {
   2772 //..       HReg tLo = newVRegI(env);
   2773 //..       HReg tHi = newVRegI(env);
   2774 //..
   2775 //..       /* Marshal args, do the call, clear stack. */
   2776 //..       doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
   2777 //..
   2778 //..       addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
   2779 //..       addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
   2780 //..       *rHi = tHi;
   2781 //..       *rLo = tLo;
   2782 //..       return;
   2783 //..    }
   2784 
   2785    ppIRExpr(e);
   2786    vpanic("iselInt128Expr");
   2787 }
   2788 
   2789 
   2790 /*---------------------------------------------------------*/
   2791 /*--- ISEL: Floating point expressions (32 bit)         ---*/
   2792 /*---------------------------------------------------------*/
   2793 
   2794 /* Nothing interesting here; really just wrappers for
   2795    64-bit stuff. */
   2796 
   2797 static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
   2798 {
   2799    HReg r = iselFltExpr_wrk( env, e );
   2800 #  if 0
   2801    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2802 #  endif
   2803    vassert(hregClass(r) == HRcVec128);
   2804    vassert(hregIsVirtual(r));
   2805    return r;
   2806 }
   2807 
   2808 /* DO NOT CALL THIS DIRECTLY */
   2809 static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
   2810 {
   2811    IRType ty = typeOfIRExpr(env->type_env,e);
   2812    vassert(ty == Ity_F32);
   2813 
   2814    if (e->tag == Iex_RdTmp) {
   2815       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2816    }
   2817 
   2818    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2819       AMD64AMode* am;
   2820       HReg res = newVRegV(env);
   2821       vassert(e->Iex.Load.ty == Ity_F32);
   2822       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2823       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
   2824       return res;
   2825    }
   2826 
   2827    if (e->tag == Iex_Binop
   2828        && e->Iex.Binop.op == Iop_F64toF32) {
   2829       /* Although the result is still held in a standard SSE register,
   2830          we need to round it to reflect the loss of accuracy/range
   2831          entailed in casting it to a 32-bit float. */
   2832       HReg dst = newVRegV(env);
   2833       HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
   2834       set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
   2835       addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
   2836       set_SSE_rounding_default( env );
   2837       return dst;
   2838    }
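
   /* A minimal C model (illustration only) of the case above: the
      narrowing itself is just a double-to-float conversion, carried
      out under the SSE rounding mode selected from arg1 immediately
      beforehand:

         Float model_F64toF32 ( Double d ) { return (Float)d; }
   */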
   2839 
   2840    if (e->tag == Iex_Get) {
   2841       AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
   2842                                        hregAMD64_RBP() );
   2843       HReg res = newVRegV(env);
   2844       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
   2845       return res;
   2846    }
   2847 
   2848    if (e->tag == Iex_Unop
   2849        && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
   2850        /* Given an I32, produce an IEEE754 float with the same bit
   2851           pattern. */
   2852        HReg        dst    = newVRegV(env);
   2853        HReg        src    = iselIntExpr_R(env, e->Iex.Unop.arg);
   2854        AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
   2855        addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
   2856        addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
   2857        return dst;
   2858    }
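
   /* Illustration only: the store/reload through -4(%rsp) above is a
      pure bit-level reinterpretation, with no numeric conversion.  A
      C model of the primop:

         Float model_ReinterpI32asF32 ( UInt bits )
         {
            union { UInt u32; Float f32; } u;
            u.u32 = bits;
            return u.f32;
         }
   */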
   2859 
   2860    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
   2861       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   2862       HReg        arg    = iselFltExpr(env, e->Iex.Binop.arg2);
   2863       HReg        dst    = newVRegV(env);
   2864 
   2865       /* arg now holds the value to be rounded.  The first thing to do
   2866          is set the FPU's rounding mode accordingly. */
   2867 
   2868       /* Set host x87 rounding mode */
   2869       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2870 
   2871       addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
   2872       addInstr(env, AMD64Instr_A87Free(1));
   2873       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
   2874       addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
   2875       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
   2876       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
   2877 
   2878       /* Restore default x87 rounding. */
   2879       set_FPU_rounding_default( env );
   2880 
   2881       return dst;
   2882    }
   2883 
   2884    ppIRExpr(e);
   2885    vpanic("iselFltExpr_wrk");
   2886 }
   2887 
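
/* Illustration only: the Iop_RoundF32toInt case above bounces the
   value through the x87 stack so that Afp_ROUND can round it to an
   integral value under the rounding mode selected by arg1.  A hedged
   C model, assuming the host rounding mode has already been set the
   same way (model_RoundF32toInt is not part of this file and is kept
   disabled): */
#if 0
#include <math.h>
static Float model_RoundF32toInt ( Float x )
{
   return rintf(x);   /* round to integral per the current mode */
}
#endif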
   2888 
   2889 /*---------------------------------------------------------*/
   2890 /*--- ISEL: Floating point expressions (64 bit)         ---*/
   2891 /*---------------------------------------------------------*/
   2892 
   2893 /* Compute a 64-bit floating point value into the lower half of an xmm
   2894    register, the identity of which is returned.  As with
   2895    iselIntExpr_R, the returned reg will be virtual, and it must not be
   2896    changed by subsequent code emitted by the caller.
   2897 */
   2898 
   2899 /* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
   2900 
   2901     Type                  S (1 bit)   E (11 bits)   F (52 bits)
   2902     ----                  ---------   -----------   -----------
   2903     signalling NaN        u           2047 (max)    .0uuuuu---u
   2904                                                     (with at least
   2905                                                      one 1 bit)
   2906     quiet NaN             u           2047 (max)    .1uuuuu---u
   2907 
   2908     negative infinity     1           2047 (max)    .000000---0
   2909 
   2910     positive infinity     0           2047 (max)    .000000---0
   2911 
   2912     negative zero         1           0             .000000---0
   2913 
   2914     positive zero         0           0             .000000---0
   2915 */
   2916 
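
/* Illustration only: the encodings tabulated above, written out as
   64-bit patterns using the same ULong/Double union trick that the
   Ico_F64 constant case below relies on.  These are standard IEEE 754
   facts; the function itself is not part of this file and is kept
   disabled. */
#if 0
static void ieee754_f64_examples ( void )
{
   union { ULong u64; Double f64; } u;
   u.f64 = -0.0;                     /* S=1, E=0, F=0                    */
   vassert(u.u64 == 0x8000000000000000ULL);
   u.u64 = 0x7FF0000000000000ULL;    /* +infinity: S=0, E=2047, F=0      */
   u.u64 = 0x7FF8000000000000ULL;    /* quiet NaN: top fraction bit set  */
   u.u64 = 0x7FF0000000000001ULL;    /* signalling NaN: top fraction bit
                                        clear, fraction nonzero          */
}
#endif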
   2917 static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
   2918 {
   2919    HReg r = iselDblExpr_wrk( env, e );
   2920 #  if 0
   2921    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2922 #  endif
   2923    vassert(hregClass(r) == HRcVec128);
   2924    vassert(hregIsVirtual(r));
   2925    return r;
   2926 }
   2927 
   2928 /* DO NOT CALL THIS DIRECTLY */
   2929 static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
   2930 {
   2931    IRType ty = typeOfIRExpr(env->type_env,e);
   2932    vassert(e);
   2933    vassert(ty == Ity_F64);
   2934 
   2935    if (e->tag == Iex_RdTmp) {
   2936       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2937    }
   2938 
   2939    if (e->tag == Iex_Const) {
   2940       union { ULong u64; Double f64; } u;
   2941       HReg res = newVRegV(env);
   2942       HReg tmp = newVRegI(env);
   2943       vassert(sizeof(u) == 8);
   2944       vassert(sizeof(u.u64) == 8);
   2945       vassert(sizeof(u.f64) == 8);
   2946 
   2947       if (e->Iex.Const.con->tag == Ico_F64) {
   2948          u.f64 = e->Iex.Const.con->Ico.F64;
   2949       }
   2950       else if (e->Iex.Const.con->tag == Ico_F64i) {
   2951          u.u64 = e->Iex.Const.con->Ico.F64i;
   2952       }
   2953       else
   2954          vpanic("iselDblExpr(amd64): const");
   2955 
   2956       addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
   2957       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
   2958       addInstr(env, AMD64Instr_SseLdSt(
   2959                        True/*load*/, 8, res,
   2960                        AMD64AMode_IR(0, hregAMD64_RSP())
   2961               ));
   2962       add_to_rsp(env, 8);
   2963       return res;
   2964    }
   2965 
   2966    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2967       AMD64AMode* am;
   2968       HReg res = newVRegV(env);
   2969       vassert(e->Iex.Load.ty == Ity_F64);
   2970       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2971       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
   2972       return res;
   2973    }
   2974 
   2975    if (e->tag == Iex_Get) {
   2976       AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
   2977                                       hregAMD64_RBP() );
   2978       HReg res = newVRegV(env);
   2979       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
   2980       return res;
   2981    }
   2982 
   2983    if (e->tag == Iex_GetI) {
   2984       AMD64AMode* am
   2985          = genGuestArrayOffset(
   2986               env, e->Iex.GetI.descr,
   2987                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   2988       HReg res = newVRegV(env);
   2989       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
   2990       return res;
   2991    }
   2992 
   2993    if (e->tag == Iex_Triop) {
   2994       AMD64SseOp op = Asse_INVALID;
   2995       switch (e->Iex.Triop.op) {
   2996          case Iop_AddF64: op = Asse_ADDF; break;
   2997          case Iop_SubF64: op = Asse_SUBF; break;
   2998          case Iop_MulF64: op = Asse_MULF; break;
   2999          case Iop_DivF64: op = Asse_DIVF; break;
   3000          default: break;
   3001       }
   3002       if (op != Asse_INVALID) {
   3003          HReg dst  = newVRegV(env);
   3004          HReg argL = iselDblExpr(env, e->Iex.Triop.arg2);
   3005          HReg argR = iselDblExpr(env, e->Iex.Triop.arg3);
   3006          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3007          /* XXXROUNDINGFIXME */
   3008          /* set roundingmode here */
   3009          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
   3010          return dst;
   3011       }
   3012    }
   3013 
   3014    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
   3015       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   3016       HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
   3017       HReg        dst    = newVRegV(env);
   3018 
   3019       /* arg now holds the value to be rounded.  The first thing to do
   3020          is set the FPU's rounding mode accordingly. */
   3021 
   3022       /* Set host x87 rounding mode */
   3023       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   3024 
   3025       addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
   3026       addInstr(env, AMD64Instr_A87Free(1));
   3027       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   3028       addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
   3029       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   3030       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   3031 
   3032       /* Restore default x87 rounding. */
   3033       set_FPU_rounding_default( env );
   3034 
   3035       return dst;
   3036    }
   3037 
   3038    if (e->tag == Iex_Triop
   3039        && (e->Iex.Triop.op == Iop_ScaleF64
   3040            || e->Iex.Triop.op == Iop_AtanF64
   3041            || e->Iex.Triop.op == Iop_Yl2xF64
   3042            || e->Iex.Triop.op == Iop_Yl2xp1F64
   3043            || e->Iex.Triop.op == Iop_PRemF64
   3044            || e->Iex.Triop.op == Iop_PRem1F64)
   3045       ) {
   3046       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   3047       HReg        arg1   = iselDblExpr(env, e->Iex.Triop.arg2);
   3048       HReg        arg2   = iselDblExpr(env, e->Iex.Triop.arg3);
   3049       HReg        dst    = newVRegV(env);
   3050       Bool     arg2first = toBool(e->Iex.Triop.op == Iop_ScaleF64
   3051                                   || e->Iex.Triop.op == Iop_PRemF64
   3052                                   || e->Iex.Triop.op == Iop_PRem1F64);
   3053       addInstr(env, AMD64Instr_A87Free(2));
   3054 
   3055       /* one arg -> top of x87 stack */
   3056       addInstr(env, AMD64Instr_SseLdSt(
   3057                        False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
   3058       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   3059 
   3060       /* other arg -> top of x87 stack */
   3061       addInstr(env, AMD64Instr_SseLdSt(
   3062                        False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
   3063       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   3064 
   3065       /* do it */
   3066       /* XXXROUNDINGFIXME */
   3067       /* set roundingmode here */
   3068       switch (e->Iex.Triop.op) {
   3069          case Iop_ScaleF64:
   3070             addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
   3071             break;
   3072          case Iop_AtanF64:
   3073             addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
   3074             break;
   3075          case Iop_Yl2xF64:
   3076             addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
   3077             break;
   3078          case Iop_Yl2xp1F64:
   3079             addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
   3080             break;
   3081          case Iop_PRemF64:
   3082             addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
   3083             break;
   3084          case Iop_PRem1F64:
   3085             addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
   3086             break;
   3087          default:
   3088             vassert(0);
   3089       }
   3090 
   3091       /* save result */
   3092       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   3093       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   3094       return dst;
   3095    }
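
   /* Note (illustration only): the operand written second is the one
      loaded last, so it sits in %st(0) when the op runs; arg2first
      merely selects which of arg1/arg2 that is.  E.g. Afp_SCALE
      (fscale) computes st(0) * 2^trunc(st(1)), i.e. roughly

         Double model_ScaleF64 ( Double arg1, Double arg2 )
         { return ldexp(arg1, (Int)trunc(arg2)); }   // needs <math.h>

      with arg1 in %st(0) and arg2 in %st(1), since arg2first is set
      for Iop_ScaleF64. */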
   3096 
   3097    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
   3098       HReg dst = newVRegV(env);
   3099       HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3100       set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
   3101       addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
   3102       set_SSE_rounding_default( env );
   3103       return dst;
   3104    }
   3105 
   3106    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
   3107       HReg dst = newVRegV(env);
   3108       HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   3109       set_SSE_rounding_default( env );
   3110       addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
   3111       return dst;
   3112    }
   3113 
   3114    if (e->tag == Iex_Unop
   3115        && (e->Iex.Unop.op == Iop_NegF64
   3116            || e->Iex.Unop.op == Iop_AbsF64)) {
   3117       /* Sigh ... very rough code.  Could do much better. */
   3118       /* Get the 128-bit literal 00---0 10---0 into a register
   3119          and xor/nand it with the value to be negated. */
   3120       HReg r1  = newVRegI(env);
   3121       HReg dst = newVRegV(env);
   3122       HReg tmp = newVRegV(env);
   3123       HReg src = iselDblExpr(env, e->Iex.Unop.arg);
   3124       AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   3125       addInstr(env, mk_vMOVsd_RR(src,tmp));
   3126       addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
   3127       addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
   3128       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
   3129       addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
   3130 
   3131       if (e->Iex.Unop.op == Iop_NegF64)
   3132          addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
   3133       else
   3134          addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
   3135 
   3136       add_to_rsp(env, 16);
   3137       return dst;
   3138    }
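
   /* Illustration only: the 128-bit literal assembled above has only
      bit 63 set, so on the F64 in the low lane the two SSE ops reduce
      to plain sign-bit arithmetic:

         NegF64:  bits ^  (1ULL << 63)    -- Asse_XOR flips the sign
         AbsF64:  bits & ~(1ULL << 63)    -- Asse_ANDN clears it
   */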
   3139 
   3140    if (e->tag == Iex_Binop) {
   3141       A87FpOp fpop = Afp_INVALID;
   3142       switch (e->Iex.Binop.op) {
   3143          case Iop_SqrtF64: fpop = Afp_SQRT; break;
   3144          case Iop_SinF64:  fpop = Afp_SIN;  break;
   3145          case Iop_CosF64:  fpop = Afp_COS;  break;
   3146          case Iop_TanF64:  fpop = Afp_TAN;  break;
   3147          case Iop_2xm1F64: fpop = Afp_2XM1; break;
   3148          default: break;
   3149       }
   3150       if (fpop != Afp_INVALID) {
   3151          AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   3152          HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
   3153          HReg        dst    = newVRegV(env);
   3154          Int     nNeeded    = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
   3155          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
   3156          addInstr(env, AMD64Instr_A87Free(nNeeded));
   3157          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   3158          /* XXXROUNDINGFIXME */
   3159          /* set roundingmode here */
   3160          addInstr(env, AMD64Instr_A87FpOp(fpop));
   3161          if (e->Iex.Binop.op==Iop_TanF64) {
   3162             /* get rid of the extra 1.0 that fptan pushes */
   3163             addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   3164          }
   3165          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   3166          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   3167          return dst;
   3168       }
   3169    }
   3170 
   3171    if (e->tag == Iex_Unop) {
   3172       switch (e->Iex.Unop.op) {
   3173 //..          case Iop_I32toF64: {
   3174 //..             HReg dst = newVRegF(env);
   3175 //..             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
   3176 //..             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
   3177 //..             set_FPU_rounding_default(env);
   3178 //..             addInstr(env, X86Instr_FpLdStI(
   3179 //..                              True/*load*/, 4, dst,
   3180 //..                              X86AMode_IR(0, hregX86_ESP())));
   3181 //..             add_to_esp(env, 4);
   3182 //..             return dst;
   3183 //..          }
   3184          case Iop_ReinterpI64asF64: {
   3185             /* Given an I64, produce an IEEE754 double with the same
   3186                bit pattern. */
   3187             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   3188             HReg        dst    = newVRegV(env);
   3189             AMD64RI*    src    = iselIntExpr_RI(env, e->Iex.Unop.arg);
   3190             /* paranoia */
   3191             set_SSE_rounding_default(env);
   3192             addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
   3193             addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   3194             return dst;
   3195          }
   3196          case Iop_F32toF64: {
   3197             HReg f32;
   3198             HReg f64 = newVRegV(env);
   3199             /* this shouldn't be necessary, but be paranoid ... */
   3200             set_SSE_rounding_default(env);
   3201             f32 = iselFltExpr(env, e->Iex.Unop.arg);
   3202             addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
   3203             return f64;
   3204          }
   3205          default:
   3206             break;
   3207       }
   3208    }
   3209 
   3210    /* --------- MULTIPLEX --------- */
   3211    if (e->tag == Iex_Mux0X) {
   3212       HReg r8, rX, r0, dst;
   3213       vassert(ty == Ity_F64);
   3214       vassert(typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8);
   3215       r8  = iselIntExpr_R(env, e->Iex.Mux0X.cond);
   3216       rX  = iselDblExpr(env, e->Iex.Mux0X.exprX);
   3217       r0  = iselDblExpr(env, e->Iex.Mux0X.expr0);
   3218       dst = newVRegV(env);
   3219       addInstr(env, mk_vMOVsd_RR(rX,dst));
   3220       addInstr(env, AMD64Instr_Test64(0xFF, r8));
   3221       addInstr(env, AMD64Instr_SseCMov(Acc_Z,r0,dst));
   3222       return dst;
   3223    }
   3224 
   3225    ppIRExpr(e);
   3226    vpanic("iselDblExpr_wrk");
   3227 }
   3228 
   3229 
   3230 /*---------------------------------------------------------*/
   3231 /*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
   3232 /*---------------------------------------------------------*/
   3233 
   3234 static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
   3235 {
   3236    HReg r = iselVecExpr_wrk( env, e );
   3237 #  if 0
   3238    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   3239 #  endif
   3240    vassert(hregClass(r) == HRcVec128);
   3241    vassert(hregIsVirtual(r));
   3242    return r;
   3243 }
   3244 
   3245 
   3246 /* DO NOT CALL THIS DIRECTLY */
   3247 static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
   3248 {
   3249    HWord      fn = 0; /* address of helper fn, if required */
   3250    Bool       arg1isEReg = False;
   3251    AMD64SseOp op = Asse_INVALID;
   3252    IRType     ty = typeOfIRExpr(env->type_env,e);
   3253    vassert(e);
   3254    vassert(ty == Ity_V128);
   3255 
   3256    if (e->tag == Iex_RdTmp) {
   3257       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   3258    }
   3259 
   3260    if (e->tag == Iex_Get) {
   3261       HReg dst = newVRegV(env);
   3262       addInstr(env, AMD64Instr_SseLdSt(
   3263                        True/*load*/,
   3264                        16,
   3265                        dst,
   3266                        AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
   3267                     )
   3268               );
   3269       return dst;
   3270    }
   3271 
   3272    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   3273       HReg        dst = newVRegV(env);
   3274       AMD64AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
   3275       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
   3276       return dst;
   3277    }
   3278 
   3279    if (e->tag == Iex_Const) {
   3280       HReg dst = newVRegV(env);
   3281       vassert(e->Iex.Const.con->tag == Ico_V128);
   3282       switch (e->Iex.Const.con->Ico.V128) {
   3283          case 0x0000:
   3284             dst = generate_zeroes_V128(env);
   3285             break;
   3286          case 0xFFFF:
   3287             dst = generate_ones_V128(env);
   3288             break;
   3289          default: {
   3290             AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   3291             /* do push_uimm64 twice, first time for the high-order half. */
   3292             push_uimm64(env, bitmask8_to_bytemask64(
   3293                                 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
   3294                        ));
   3295             push_uimm64(env, bitmask8_to_bytemask64(
   3296                                 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
   3297                        ));
   3298             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
   3299             add_to_rsp(env, 16);
   3300             break;
   3301          }
   3302       }
   3303       return dst;
   3304    }
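
   /* Illustration only: an Ico_V128 constant is a 16-bit mask with one
      bit per byte of the vector.  Assuming bitmask8_to_bytemask64 does
      what its name suggests (one mask bit -> one 0x00/0xFF byte), the
      expansion of the low 8 bytes is effectively

         w64 = 0;
         for (i = 0; i < 8; i++)
            if (w8 & (1 << i)) w64 |= 0xFFULL << (8 * i);
   */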
   3305 
   3306    if (e->tag == Iex_Unop) {
   3307    switch (e->Iex.Unop.op) {
   3308 
   3309       case Iop_NotV128: {
   3310          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3311          return do_sse_NotV128(env, arg);
   3312       }
   3313 
   3314       case Iop_CmpNEZ64x2: {
   3315          /* We can use SSE2 instructions for this. */
   3316          /* Ideally, we want to do a 64Ix2 comparison against zero of
   3317             the operand.  Problem is no such insn exists.  Solution
   3318             therefore is to do a 32Ix4 comparison instead, and bitwise-
   3319             negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
   3320             let the not'd result of this initial comparison be a:b:c:d.
   3321             What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
   3322             pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
   3323             giving the required result.
   3324 
   3325             The required selection sequence is 2,3,0,1, which
   3326             according to Intel's documentation means the pshufd
   3327             literal value is 0xB1, that is,
   3328             (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
   3329          */
   3330          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
   3331          HReg tmp  = generate_zeroes_V128(env);
   3332          HReg dst  = newVRegV(env);
   3333          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
   3334          tmp = do_sse_NotV128(env, tmp);
   3335          addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
   3336          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
   3337          return dst;
   3338       }
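
      /* Scalar model (illustration only) of the scheme above, for one
         64-bit lane whose 32-bit halves are hi:lo:

            a = (hi != 0) ? 0xFFFFFFFF : 0;   -- cmpeq32 vs 0, then NOT
            b = (lo != 0) ? 0xFFFFFFFF : 0;
            lane result = (a|b) : (a|b)       -- pshufd 0xB1, then OR
      */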
   3339 
   3340       case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
   3341       case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
   3342       case Iop_CmpNEZ8x16: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
   3343       do_CmpNEZ_vector:
   3344       {
   3345          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
   3346          HReg tmp  = newVRegV(env);
   3347          HReg zero = generate_zeroes_V128(env);
   3348          HReg dst;
   3349          addInstr(env, mk_vMOVsd_RR(arg, tmp));
   3350          addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
   3351          dst = do_sse_NotV128(env, tmp);
   3352          return dst;
   3353       }
   3354 
   3355       case Iop_Recip32Fx4: op = Asse_RCPF;   goto do_32Fx4_unary;
   3356       case Iop_RSqrt32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
   3357       case Iop_Sqrt32Fx4:  op = Asse_SQRTF;  goto do_32Fx4_unary;
   3358       do_32Fx4_unary:
   3359       {
   3360          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3361          HReg dst = newVRegV(env);
   3362          addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
   3363          return dst;
   3364       }
   3365 
   3366 //..       case Iop_Recip64Fx2: op = Xsse_RCPF;   goto do_64Fx2_unary;
   3367 //..       case Iop_RSqrt64Fx2: op = Asse_RSQRTF; goto do_64Fx2_unary;
   3368       case Iop_Sqrt64Fx2:  op = Asse_SQRTF;  goto do_64Fx2_unary;
   3369       do_64Fx2_unary:
   3370       {
   3371          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3372          HReg dst = newVRegV(env);
   3373          addInstr(env, AMD64Instr_Sse64Fx2(op, arg, dst));
   3374          return dst;
   3375       }
   3376 
   3377       case Iop_Recip32F0x4: op = Asse_RCPF;   goto do_32F0x4_unary;
   3378       case Iop_RSqrt32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
   3379       case Iop_Sqrt32F0x4:  op = Asse_SQRTF;  goto do_32F0x4_unary;
   3380       do_32F0x4_unary:
   3381       {
   3382          /* A bit subtle.  We have to copy the arg to the result
   3383             register first, because actually doing the SSE scalar insn
   3384             leaves the upper 3/4 of the destination register
   3385             unchanged.  Whereas the required semantics of these
   3386             primops is that the upper 3/4 is simply copied in from the
   3387             argument. */
   3388          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3389          HReg dst = newVRegV(env);
   3390          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3391          addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
   3392          return dst;
   3393       }
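
      /* Lane-wise model (illustration only) of the copy-then-operate
         scheme described above: the scalar insn writes only lane 0, so
         dst must already hold the argument for lanes 1..3 to be
         correct:

            dst[i] = arg[i],  i = 0..3      -- mk_vMOVsd_RR(arg, dst)
            dst[0] = op(arg[0])             -- AMD64Instr_Sse32FLo
      */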
   3394 
   3395 //..       case Iop_Recip64F0x2: op = Xsse_RCPF;   goto do_64F0x2_unary;
   3396 //..       case Iop_RSqrt64F0x2: op = Xsse_RSQRTF; goto do_64F0x2_unary;
   3397       case Iop_Sqrt64F0x2:  op = Asse_SQRTF;  goto do_64F0x2_unary;
   3398       do_64F0x2_unary:
   3399       {
   3400          /* A bit subtle.  We have to copy the arg to the result
   3401             register first, because actually doing the SSE scalar insn
   3402             leaves the upper half of the destination register
   3403             unchanged.  Whereas the required semantics of these
   3404             primops is that the upper half is simply copied in from the
   3405             argument. */
   3406          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3407          HReg dst = newVRegV(env);
   3408          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3409          addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
   3410          return dst;
   3411       }
   3412 
   3413       case Iop_32UtoV128: {
   3414          HReg        dst     = newVRegV(env);
   3415          AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
   3416          AMD64RI*    ri      = iselIntExpr_RI(env, e->Iex.Unop.arg);
   3417          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
   3418          addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
   3419          return dst;
   3420       }
   3421 
   3422       case Iop_64UtoV128: {
   3423          HReg        dst  = newVRegV(env);
   3424          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   3425          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
   3426          addInstr(env, AMD64Instr_Push(rmi));
   3427          addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
   3428          add_to_rsp(env, 8);
   3429          return dst;
   3430       }
   3431 
   3432       default:
   3433          break;
   3434    } /* switch (e->Iex.Unop.op) */
   3435    } /* if (e->tag == Iex_Unop) */
   3436 
   3437    if (e->tag == Iex_Binop) {
   3438    switch (e->Iex.Binop.op) {
   3439 
   3440       case Iop_SetV128lo64: {
   3441          HReg dst  = newVRegV(env);
   3442          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3443          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3444          AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
   3445          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
   3446          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
   3447          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
   3448          return dst;
   3449       }
   3450 
   3451       case Iop_SetV128lo32: {
   3452          HReg dst  = newVRegV(env);
   3453          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3454          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3455          AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
   3456          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
   3457          addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
   3458          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
   3459          return dst;
   3460       }
   3461 
   3462       case Iop_64HLtoV128: {
   3463          AMD64AMode* rsp = AMD64AMode_IR(0, hregAMD64_RSP());
   3464          HReg        dst = newVRegV(env);
   3465          /* do this via the stack (easy, convenient, etc) */
   3466          addInstr(env, AMD64Instr_Push(iselIntExpr_RMI(env, e->Iex.Binop.arg1)));
   3467          addInstr(env, AMD64Instr_Push(iselIntExpr_RMI(env, e->Iex.Binop.arg2)));
   3468          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp));
   3469          add_to_rsp(env, 16);
   3470          return dst;
   3471       }
   3472 
   3473       case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
   3474       case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
   3475       case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
   3476       case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
   3477       case Iop_Add32Fx4:   op = Asse_ADDF;   goto do_32Fx4;
   3478       case Iop_Div32Fx4:   op = Asse_DIVF;   goto do_32Fx4;
   3479       case Iop_Max32Fx4:   op = Asse_MAXF;   goto do_32Fx4;
   3480       case Iop_Min32Fx4:   op = Asse_MINF;   goto do_32Fx4;
   3481       case Iop_Mul32Fx4:   op = Asse_MULF;   goto do_32Fx4;
   3482       case Iop_Sub32Fx4:   op = Asse_SUBF;   goto do_32Fx4;
   3483       do_32Fx4:
   3484       {
   3485          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3486          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3487          HReg dst = newVRegV(env);
   3488          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3489          addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
   3490          return dst;
   3491       }
   3492 
   3493       case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
   3494       case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
   3495       case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
   3496       case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
   3497       case Iop_Add64Fx2:   op = Asse_ADDF;   goto do_64Fx2;
   3498       case Iop_Div64Fx2:   op = Asse_DIVF;   goto do_64Fx2;
   3499       case Iop_Max64Fx2:   op = Asse_MAXF;   goto do_64Fx2;
   3500       case Iop_Min64Fx2:   op = Asse_MINF;   goto do_64Fx2;
   3501       case Iop_Mul64Fx2:   op = Asse_MULF;   goto do_64Fx2;
   3502       case Iop_Sub64Fx2:   op = Asse_SUBF;   goto do_64Fx2;
   3503       do_64Fx2:
   3504       {
   3505          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3506          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3507          HReg dst = newVRegV(env);
   3508          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3509          addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
   3510          return dst;
   3511       }
   3512 
   3513       case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
   3514       case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
   3515       case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
   3516       case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
   3517       case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4;
   3518       case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4;
   3519       case Iop_Max32F0x4:   op = Asse_MAXF;   goto do_32F0x4;
   3520       case Iop_Min32F0x4:   op = Asse_MINF;   goto do_32F0x4;
   3521       case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4;
   3522       case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4;
   3523       do_32F0x4: {
   3524          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3525          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3526          HReg dst = newVRegV(env);
   3527          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3528          addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
   3529          return dst;
   3530       }
   3531 
   3532       case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
   3533       case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
   3534       case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
   3535       case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
   3536       case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2;
   3537       case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2;
   3538       case Iop_Max64F0x2:   op = Asse_MAXF;   goto do_64F0x2;
   3539       case Iop_Min64F0x2:   op = Asse_MINF;   goto do_64F0x2;
   3540       case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2;
   3541       case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2;
   3542       do_64F0x2: {
   3543          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3544          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3545          HReg dst = newVRegV(env);
   3546          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3547          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
   3548          return dst;
   3549       }
   3550 
   3551       case Iop_QNarrowBin32Sto16Sx8:
   3552          op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
   3553       case Iop_QNarrowBin16Sto8Sx16:
   3554          op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
   3555       case Iop_QNarrowBin16Sto8Ux16:
   3556          op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
   3557 
   3558       case Iop_InterleaveHI8x16:
   3559          op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
   3560       case Iop_InterleaveHI16x8:
   3561          op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
   3562       case Iop_InterleaveHI32x4:
   3563          op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
   3564       case Iop_InterleaveHI64x2:
   3565          op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
   3566 
   3567       case Iop_InterleaveLO8x16:
   3568          op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
   3569       case Iop_InterleaveLO16x8:
   3570          op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
   3571       case Iop_InterleaveLO32x4:
   3572          op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
   3573       case Iop_InterleaveLO64x2:
   3574          op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
   3575 
   3576       case Iop_AndV128:    op = Asse_AND;      goto do_SseReRg;
   3577       case Iop_OrV128:     op = Asse_OR;       goto do_SseReRg;
   3578       case Iop_XorV128:    op = Asse_XOR;      goto do_SseReRg;
   3579       case Iop_Add8x16:    op = Asse_ADD8;     goto do_SseReRg;
   3580       case Iop_Add16x8:    op = Asse_ADD16;    goto do_SseReRg;
   3581       case Iop_Add32x4:    op = Asse_ADD32;    goto do_SseReRg;
   3582       case Iop_Add64x2:    op = Asse_ADD64;    goto do_SseReRg;
   3583       case Iop_QAdd8Sx16:  op = Asse_QADD8S;   goto do_SseReRg;
   3584       case Iop_QAdd16Sx8:  op = Asse_QADD16S;  goto do_SseReRg;
   3585       case Iop_QAdd8Ux16:  op = Asse_QADD8U;   goto do_SseReRg;
   3586       case Iop_QAdd16Ux8:  op = Asse_QADD16U;  goto do_SseReRg;
   3587       case Iop_Avg8Ux16:   op = Asse_AVG8U;    goto do_SseReRg;
   3588       case Iop_Avg16Ux8:   op = Asse_AVG16U;   goto do_SseReRg;
   3589       case Iop_CmpEQ8x16:  op = Asse_CMPEQ8;   goto do_SseReRg;
   3590       case Iop_CmpEQ16x8:  op = Asse_CMPEQ16;  goto do_SseReRg;
   3591       case Iop_CmpEQ32x4:  op = Asse_CMPEQ32;  goto do_SseReRg;
   3592       case Iop_CmpGT8Sx16: op = Asse_CMPGT8S;  goto do_SseReRg;
   3593       case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
   3594       case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
   3595       case Iop_Max16Sx8:   op = Asse_MAX16S;   goto do_SseReRg;
   3596       case Iop_Max8Ux16:   op = Asse_MAX8U;    goto do_SseReRg;
   3597       case Iop_Min16Sx8:   op = Asse_MIN16S;   goto do_SseReRg;
   3598       case Iop_Min8Ux16:   op = Asse_MIN8U;    goto do_SseReRg;
   3599       case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
   3600       case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
   3601       case Iop_Mul16x8:    op = Asse_MUL16;    goto do_SseReRg;
   3602       case Iop_Sub8x16:    op = Asse_SUB8;     goto do_SseReRg;
   3603       case Iop_Sub16x8:    op = Asse_SUB16;    goto do_SseReRg;
   3604       case Iop_Sub32x4:    op = Asse_SUB32;    goto do_SseReRg;
   3605       case Iop_Sub64x2:    op = Asse_SUB64;    goto do_SseReRg;
   3606       case Iop_QSub8Sx16:  op = Asse_QSUB8S;   goto do_SseReRg;
   3607       case Iop_QSub16Sx8:  op = Asse_QSUB16S;  goto do_SseReRg;
   3608       case Iop_QSub8Ux16:  op = Asse_QSUB8U;   goto do_SseReRg;
   3609       case Iop_QSub16Ux8:  op = Asse_QSUB16U;  goto do_SseReRg;
   3610       do_SseReRg: {
   3611          HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
   3612          HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
   3613          HReg dst = newVRegV(env);
   3614          if (arg1isEReg) {
   3615             addInstr(env, mk_vMOVsd_RR(arg2, dst));
   3616             addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
   3617          } else {
   3618             addInstr(env, mk_vMOVsd_RR(arg1, dst));
   3619             addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
   3620          }
   3621          return dst;
   3622       }
   3623 
   3624       case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
   3625       case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
   3626       case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
   3627       case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
   3628       case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
   3629       case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
   3630       case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
   3631       case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
   3632       do_SseShift: {
   3633          HReg        greg = iselVecExpr(env, e->Iex.Binop.arg1);
   3634          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   3635          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   3636          HReg        ereg = newVRegV(env);
   3637          HReg        dst  = newVRegV(env);
   3638          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
   3639          addInstr(env, AMD64Instr_Push(rmi));
   3640          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
   3641          addInstr(env, mk_vMOVsd_RR(greg, dst));
   3642          addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
   3643          add_to_rsp(env, 16);
   3644          return dst;
   3645       }
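
      /* Illustration only: the two pushes above leave a 16-byte value
         at 0(%rsp) with the shift amount in the low 64 bits and zero
         in the high 64 bits; the SSE2 shift-by-register forms take
         their count from the low 64 bits of the E operand, so:

            bytes 0..7  of ereg = shift amount (little-endian)
            bytes 8..15 of ereg = 0
      */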
   3646 
   3647       case Iop_Mul32x4:    fn = (HWord)h_generic_calc_Mul32x4;
   3648                            goto do_SseAssistedBinary;
   3649       case Iop_Max32Sx4:   fn = (HWord)h_generic_calc_Max32Sx4;
   3650                            goto do_SseAssistedBinary;
   3651       case Iop_Min32Sx4:   fn = (HWord)h_generic_calc_Min32Sx4;
   3652                            goto do_SseAssistedBinary;
   3653       case Iop_Max32Ux4:   fn = (HWord)h_generic_calc_Max32Ux4;
   3654                            goto do_SseAssistedBinary;
   3655       case Iop_Min32Ux4:   fn = (HWord)h_generic_calc_Min32Ux4;
   3656                            goto do_SseAssistedBinary;
   3657       case Iop_Max16Ux8:   fn = (HWord)h_generic_calc_Max16Ux8;
   3658                            goto do_SseAssistedBinary;
   3659       case Iop_Min16Ux8:   fn = (HWord)h_generic_calc_Min16Ux8;
   3660                            goto do_SseAssistedBinary;
   3661       case Iop_Max8Sx16:   fn = (HWord)h_generic_calc_Max8Sx16;
   3662                            goto do_SseAssistedBinary;
   3663       case Iop_Min8Sx16:   fn = (HWord)h_generic_calc_Min8Sx16;
   3664                            goto do_SseAssistedBinary;
   3665       case Iop_CmpEQ64x2:  fn = (HWord)h_generic_calc_CmpEQ64x2;
   3666                            goto do_SseAssistedBinary;
   3667       case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
   3668                            goto do_SseAssistedBinary;
   3669       case Iop_QNarrowBin32Sto16Ux8:
   3670                            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
   3671                            goto do_SseAssistedBinary;
   3672       case Iop_NarrowBin16to8x16:
   3673                            fn = (HWord)h_generic_calc_NarrowBin16to8x16;
   3674                            goto do_SseAssistedBinary;
   3675       case Iop_NarrowBin32to16x8:
   3676                            fn = (HWord)h_generic_calc_NarrowBin32to16x8;
   3677                            goto do_SseAssistedBinary;
   3678       do_SseAssistedBinary: {
   3679          /* RRRufff!  RRRufff code is what we're generating here.  Oh
   3680             well. */
   3681          vassert(fn != 0);
   3682          HReg dst = newVRegV(env);
   3683          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3684          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3685          HReg argp = newVRegI(env);
   3686          /* subq $112, %rsp         -- make a space */
   3687          sub_from_rsp(env, 112);
   3688          /* leaq 48(%rsp), %r_argp  -- point into it */
   3689          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
   3690                                         argp));
   3691          /* andq $-16, %r_argp      -- 16-align the pointer */
   3692          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   3693                                          AMD64RMI_Imm( ~(UInt)15 ),
   3694                                          argp));
   3695          /* Prepare 3 arg regs:
   3696             leaq 0(%r_argp), %rdi
   3697             leaq 16(%r_argp), %rsi
   3698             leaq 32(%r_argp), %rdx
   3699          */
   3700          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
   3701                                         hregAMD64_RDI()));
   3702          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
   3703                                         hregAMD64_RSI()));
   3704          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
   3705                                         hregAMD64_RDX()));
   3706          /* Store the two args, at (%rsi) and (%rdx):
   3707             movupd  %argL, 0(%rsi)
   3708             movupd  %argR, 0(%rdx)
   3709          */
   3710          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
   3711                                           AMD64AMode_IR(0, hregAMD64_RSI())));
   3712          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
   3713                                           AMD64AMode_IR(0, hregAMD64_RDX())));
   3714          /* call the helper */
   3715          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
   3716          /* fetch the result from memory, using %r_argp, which the
   3717             register allocator will keep alive across the call. */
   3718          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
   3719                                           AMD64AMode_IR(0, argp)));
   3720          /* and finally, clear the space */
   3721          add_to_rsp(env, 112);
   3722          return dst;
   3723       }
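
      /* Illustration only: the 112-byte scratch area above is used as
         three 16-byte slots at a 16-aligned base,

            argp +  0 .. 15   result  (read back after the call)
            argp + 16 .. 31   argL    (stored through %rsi)
            argp + 32 .. 47   argR    (stored through %rdx)

         where argp = (rsp + 48) & ~15, which is what the leaq/andq
         pair computes. */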
   3724 
   3725       case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
   3726                          goto do_SseAssistedVectorAndScalar;
   3727       case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
   3728                          goto do_SseAssistedVectorAndScalar;
   3729       do_SseAssistedVectorAndScalar: {
   3730          /* RRRufff!  RRRufff code is what we're generating here.  Oh
   3731             well. */
   3732          vassert(fn != 0);
   3733          HReg dst = newVRegV(env);
   3734          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3735          HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3736          HReg argp = newVRegI(env);
   3737          /* subq $112, %rsp         -- make a space */
   3738          sub_from_rsp(env, 112);
   3739          /* leaq 48(%rsp), %r_argp  -- point into it */
   3740          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
   3741                                         argp));
   3742          /* andq $-16, %r_argp      -- 16-align the pointer */
   3743          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   3744                                          AMD64RMI_Imm( ~(UInt)15 ),
   3745                                          argp));
   3746          /* Prepare 2 vector arg regs:
   3747             leaq 0(%r_argp), %rdi
   3748             leaq 16(%r_argp), %rsi
   3749          */
   3750          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
   3751                                         hregAMD64_RDI()));
   3752          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
   3753                                         hregAMD64_RSI()));
   3754          /* Store the vector arg, at (%rsi):
   3755             movupd  %argL, 0(%rsi)
   3756          */
   3757          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
   3758                                           AMD64AMode_IR(0, hregAMD64_RSI())));
   3759          /* And get the scalar value into rdx */
   3760          addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
   3761 
   3762          /* call the helper */
   3763          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
   3764          /* fetch the result from memory, using %r_argp, which the
   3765             register allocator will keep alive across the call. */
   3766          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
   3767                                           AMD64AMode_IR(0, argp)));
   3768          /* and finally, clear the space */
   3769          add_to_rsp(env, 112);
   3770          return dst;
   3771       }
   3772 
   3773       default:
   3774          break;
   3775    } /* switch (e->Iex.Binop.op) */
   3776    } /* if (e->tag == Iex_Binop) */
   3777 
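            /* Vector Mux0X: copy exprX into the result register, test the
               low 8 bits of the condition, and conditionally overwrite the
               result with expr0 when they are zero.  The SseCMov used here
               is a synthetic instruction which the emitter expands to a
               short conditional jump around an SSE register-to-register
               move. */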
   3778    if (e->tag == Iex_Mux0X) {
   3779       HReg r8  = iselIntExpr_R(env, e->Iex.Mux0X.cond);
   3780       HReg rX  = iselVecExpr(env, e->Iex.Mux0X.exprX);
   3781       HReg r0  = iselVecExpr(env, e->Iex.Mux0X.expr0);
   3782       HReg dst = newVRegV(env);
   3783       addInstr(env, mk_vMOVsd_RR(rX,dst));
   3784       addInstr(env, AMD64Instr_Test64(0xFF, r8));
   3785       addInstr(env, AMD64Instr_SseCMov(Acc_Z,r0,dst));
   3786       return dst;
   3787    }
   3788 
   3789    //vec_fail:
   3790    vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
   3791               LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
   3792    ppIRExpr(e);
   3793    vpanic("iselVecExpr_wrk");
   3794 }
   3795 
   3796 
   3797 /*---------------------------------------------------------*/
   3798 /*--- ISEL: Statements                                  ---*/
   3799 /*---------------------------------------------------------*/
   3800 
   3801 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
   3802 {
   3803    if (vex_traceflags & VEX_TRACE_VCODE) {
   3804       vex_printf("\n-- ");
   3805       ppIRStmt(stmt);
   3806       vex_printf("\n");
   3807    }
   3808 
   3809    switch (stmt->tag) {
   3810 
   3811    /* --------- STORE --------- */
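            /* Only little-endian stores through 64-bit addresses are
               handled here.  The data width picks the form: Aalu_MOV to
               memory for I64 (the RHS may be a register or a 32-bit
               immediate), a truncating AMD64Instr_Store for I8/I16/I32,
               and an unaligned SSE store of 4, 8 or 16 bytes for
               F32/F64/V128. */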
   3812    case Ist_Store: {
   3813       IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
   3814       IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
   3815       IREndness end   = stmt->Ist.Store.end;
   3816 
   3817       if (tya != Ity_I64 || end != Iend_LE)
   3818          goto stmt_fail;
   3819 
   3820       if (tyd == Ity_I64) {
   3821          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3822          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
   3823          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
   3824          return;
   3825       }
   3826       if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
   3827          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3828          HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
   3829          addInstr(env, AMD64Instr_Store(
   3830                           toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
   3831                           r,am));
   3832          return;
   3833       }
   3834       if (tyd == Ity_F64) {
   3835          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3836          HReg r = iselDblExpr(env, stmt->Ist.Store.data);
   3837          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
   3838          return;
   3839       }
   3840       if (tyd == Ity_F32) {
   3841          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3842          HReg r = iselFltExpr(env, stmt->Ist.Store.data);
   3843          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
   3844          return;
   3845       }
   3846       if (tyd == Ity_V128) {
   3847          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3848          HReg r = iselVecExpr(env, stmt->Ist.Store.data);
   3849          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
   3850          return;
   3851       }
   3852       break;
   3853    }
   3854 
   3855    /* --------- PUT --------- */
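            /* The guest register state lives in memory at a fixed offset
               from %rbp, which this backend dedicates to the guest-state
               pointer, so a PUT is simply a store at offset(%rbp) -- e.g.
               roughly "movq %r_data, OFFSET(%rbp)" for a 64-bit guest
               register. */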
   3856    case Ist_Put: {
   3857       IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
   3858       if (ty == Ity_I64) {
    3859          /* We're going to write to the guest state in memory, so
    3860             compute the RHS into an AMD64RI. */
   3861          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
   3862          addInstr(env,
   3863                   AMD64Instr_Alu64M(
   3864                      Aalu_MOV,
   3865                      ri,
   3866                      AMD64AMode_IR(stmt->Ist.Put.offset,
   3867                                    hregAMD64_RBP())
   3868                  ));
   3869          return;
   3870       }
   3871       if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
   3872          HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
   3873          addInstr(env, AMD64Instr_Store(
   3874                           toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
   3875                           r,
   3876                           AMD64AMode_IR(stmt->Ist.Put.offset,
   3877                                         hregAMD64_RBP())));
   3878          return;
   3879       }
   3880       if (ty == Ity_V128) {
   3881          HReg        vec = iselVecExpr(env, stmt->Ist.Put.data);
   3882          AMD64AMode* am  = AMD64AMode_IR(stmt->Ist.Put.offset,
   3883                                          hregAMD64_RBP());
   3884          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
   3885          return;
   3886       }
   3887       if (ty == Ity_F32) {
   3888          HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
   3889          AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
   3890          set_SSE_rounding_default(env); /* paranoia */
   3891          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
   3892          return;
   3893       }
   3894       if (ty == Ity_F64) {
   3895          HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
   3896          AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
   3897                                          hregAMD64_RBP() );
   3898          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
   3899          return;
   3900       }
   3901       break;
   3902    }
   3903 
   3904    /* --------- Indexed PUT --------- */
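            /* genGuestArrayOffset (defined earlier in this file) folds the
               array descriptor, index expression and bias into a single
               %rbp-relative addressing mode, with the index wrapped to the
               array size, so the data movement below mirrors the plain PUT
               case. */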
   3905    case Ist_PutI: {
   3906       AMD64AMode* am
   3907          = genGuestArrayOffset(
   3908               env, stmt->Ist.PutI.descr,
   3909                    stmt->Ist.PutI.ix, stmt->Ist.PutI.bias );
   3910 
   3911       IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.PutI.data);
   3912       if (ty == Ity_F64) {
   3913          HReg val = iselDblExpr(env, stmt->Ist.PutI.data);
   3914          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
   3915          return;
   3916       }
   3917       if (ty == Ity_I8) {
   3918          HReg r = iselIntExpr_R(env, stmt->Ist.PutI.data);
   3919          addInstr(env, AMD64Instr_Store( 1, r, am ));
   3920          return;
   3921       }
   3922       if (ty == Ity_I64) {
   3923          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.PutI.data);
   3924          addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
   3925          return;
   3926       }
   3927       break;
   3928    }
   3929 
   3930    /* --------- TMP --------- */
   3931    case Ist_WrTmp: {
   3932       IRTemp tmp = stmt->Ist.WrTmp.tmp;
   3933       IRType ty = typeOfIRTemp(env->type_env, tmp);
   3934 
    3935       /* Optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
    3936          compute it into an AMode and then use LEA.  This usually
    3937          produces fewer instructions, often because (for
    3938          Memcheck-created IR) we get t = address-expression, with t
    3939          later used twice, and so doing this naturally turns the
    3940          address expression back into an AMD64 amode. */
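               /* For example (temps illustrative), for
                     t3 = Add64(t1, Shl64(t2, 3:I8))
                  iselIntExpr_AMode can yield the single instruction
                     leaq 0(%r_t1, %r_t2, 8), %r_t3
                  instead of a shift followed by an add. */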
   3941       if (ty == Ity_I64
   3942           && stmt->Ist.WrTmp.data->tag == Iex_Binop
   3943           && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
   3944          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
   3945          HReg dst = lookupIRTemp(env, tmp);
   3946          if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
   3947             /* Hmm, iselIntExpr_AMode wimped out and just computed the
   3948                value into a register.  Just emit a normal reg-reg move
   3949                so reg-alloc can coalesce it away in the usual way. */
   3950             HReg src = am->Aam.IR.reg;
   3951             addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
   3952          } else {
   3953             addInstr(env, AMD64Instr_Lea64(am,dst));
   3954          }
   3955          return;
   3956       }
   3957 
   3958       if (ty == Ity_I64 || ty == Ity_I32
   3959           || ty == Ity_I16 || ty == Ity_I8) {
   3960          AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
   3961          HReg dst = lookupIRTemp(env, tmp);
   3962          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
   3963          return;
   3964       }
   3965       if (ty == Ity_I128) {
   3966          HReg rHi, rLo, dstHi, dstLo;
   3967          iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
   3968          lookupIRTemp128( &dstHi, &dstLo, env, tmp);
   3969          addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
   3970          addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
   3971          return;
   3972       }
   3973       if (ty == Ity_I1) {
   3974          AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
   3975          HReg dst = lookupIRTemp(env, tmp);
   3976          addInstr(env, AMD64Instr_Set64(cond, dst));
   3977          return;
   3978       }
   3979       if (ty == Ity_F64) {
   3980          HReg dst = lookupIRTemp(env, tmp);
   3981          HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
   3982          addInstr(env, mk_vMOVsd_RR(src, dst));
   3983          return;
   3984       }
   3985       if (ty == Ity_F32) {
   3986          HReg dst = lookupIRTemp(env, tmp);
   3987          HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
   3988          addInstr(env, mk_vMOVsd_RR(src, dst));
   3989          return;
   3990       }
   3991       if (ty == Ity_V128) {
   3992          HReg dst = lookupIRTemp(env, tmp);
   3993          HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
   3994          addInstr(env, mk_vMOVsd_RR(src, dst));
   3995          return;
   3996       }
   3997       break;
   3998    }
   3999 
   4000    /* --------- Call to DIRTY helper --------- */
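            /* doHelperCall (defined earlier in this file) evaluates the
               guard, marshals the arguments into the host's integer
               argument registers -- with the guest-state pointer prepended
               when passBBP is set -- and emits the call itself.  All that
               remains here is to move any integer result out of %rax into
               the vreg bound to d->tmp. */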
   4001    case Ist_Dirty: {
   4002       IRType   retty;
   4003       IRDirty* d = stmt->Ist.Dirty.details;
   4004       Bool     passBBP = False;
   4005 
   4006       if (d->nFxState == 0)
   4007          vassert(!d->needsBBP);
   4008 
   4009       passBBP = toBool(d->nFxState > 0 && d->needsBBP);
   4010 
   4011       /* Marshal args, do the call, clear stack. */
   4012       doHelperCall( env, passBBP, d->guard, d->cee, d->args );
   4013 
   4014       /* Now figure out what to do with the returned value, if any. */
   4015       if (d->tmp == IRTemp_INVALID)
   4016          /* No return value.  Nothing to do. */
   4017          return;
   4018 
   4019       retty = typeOfIRTemp(env->type_env, d->tmp);
   4020       if (retty == Ity_I64 || retty == Ity_I32
   4021           || retty == Ity_I16 || retty == Ity_I8) {
   4022          /* The returned value is in %rax.  Park it in the register
   4023             associated with tmp. */
   4024          HReg dst = lookupIRTemp(env, d->tmp);
   4025          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
   4026          return;
   4027       }
   4028       break;
   4029    }
   4030 
   4031    /* --------- MEM FENCE --------- */
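            /* Only Imbe_Fence is handled; AMD64Instr_MFence becomes an
               mfence instruction. */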
   4032    case Ist_MBE:
   4033       switch (stmt->Ist.MBE.event) {
   4034          case Imbe_Fence:
   4035             addInstr(env, AMD64Instr_MFence());
   4036             return;
   4037          default:
   4038             break;
   4039       }
   4040       break;
   4041 
   4042    /* --------- ACAS --------- */
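            /* Both flavours are lowered around the host's locked cmpxchg.
               For the singleton case the expected value goes in %rax and
               the new value in %rbx before AMD64Instr_ACAS emits a "lock
               cmpxchg" of the appropriate width; the trailing CMov copies
               the observed old value out of %rax into rOld when the
               exchange failed (ZF clear).  Roughly, for a 64-bit CAS:

                  movq %rExpd, %rOld
                  movq %rExpd, %rax
                  movq %rData, %rbx
                  lock cmpxchgq %rbx, (am)
                  cmovnzq %rax, %rOld

               The double case uses the pairs %rdx:%rax (expected) and
               %rcx:%rbx (new), with AMD64Instr_DACAS emitting "lock
               cmpxchg8b" or "lock cmpxchg16b". */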
   4043    case Ist_CAS:
   4044       if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
   4045          /* "normal" singleton CAS */
   4046          UChar  sz;
   4047          IRCAS* cas = stmt->Ist.CAS.details;
   4048          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
   4049          /* get: cas->expd into %rax, and cas->data into %rbx */
   4050          AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
   4051          HReg rData = iselIntExpr_R(env, cas->dataLo);
   4052          HReg rExpd = iselIntExpr_R(env, cas->expdLo);
   4053          HReg rOld  = lookupIRTemp(env, cas->oldLo);
   4054          vassert(cas->expdHi == NULL);
   4055          vassert(cas->dataHi == NULL);
   4056          addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
   4057          addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
   4058          addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
   4059          switch (ty) {
   4060             case Ity_I64: sz = 8; break;
   4061             case Ity_I32: sz = 4; break;
   4062             case Ity_I16: sz = 2; break;
   4063             case Ity_I8:  sz = 1; break;
   4064             default: goto unhandled_cas;
   4065          }
   4066          addInstr(env, AMD64Instr_ACAS(am, sz));
   4067          addInstr(env, AMD64Instr_CMov64(
   4068                           Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOld));
   4069          return;
   4070       } else {
   4071          /* double CAS */
   4072          UChar  sz;
   4073          IRCAS* cas = stmt->Ist.CAS.details;
   4074          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
   4075          /* only 32-bit and 64-bit allowed in this case */
   4076          /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
   4077          /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
   4078          AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
   4079          HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
   4080          HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
   4081          HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
   4082          HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
   4083          HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
   4084          HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
   4085          switch (ty) {
   4086             case Ity_I64:
   4087                if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
   4088                   goto unhandled_cas; /* we'd have to generate
   4089                                          cmpxchg16b, but the host
   4090                                          doesn't support that */
   4091                sz = 8;
   4092                break;
   4093             case Ity_I32:
   4094                sz = 4;
   4095                break;
   4096             default:
   4097                goto unhandled_cas;
   4098          }
   4099          addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
   4100          addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
   4101          addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
   4102          addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
   4103          addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
   4104          addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
   4105          addInstr(env, AMD64Instr_DACAS(am, sz));
   4106          addInstr(env,
   4107                   AMD64Instr_CMov64(
   4108                      Acc_NZ, AMD64RM_Reg(hregAMD64_RDX()), rOldHi));
   4109          addInstr(env,
   4110                   AMD64Instr_CMov64(
   4111                      Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOldLo));
   4112          return;
   4113       }
   4114       unhandled_cas:
   4115       break;
   4116 
   4117    /* --------- INSTR MARK --------- */
   4118    /* Doesn't generate any executable code ... */
   4119    case Ist_IMark:
   4120        return;
   4121 
   4122    /* --------- ABI HINT --------- */
    4123    /* These have no meaning (no denotation in the IR) and so we
    4124       ignore them ... if any actually made it this far. */
   4125    case Ist_AbiHint:
   4126        return;
   4127 
   4128    /* --------- NO-OP --------- */
   4129    case Ist_NoOp:
   4130        return;
   4131 
   4132    /* --------- EXIT --------- */
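            /* A guarded side exit: the guard becomes a host condition code,
               the constant destination becomes an AMD64RI, and the
               conditional AMD64Instr_Goto hands control back to the
               dispatcher with that guest address when the condition
               holds. */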
   4133    case Ist_Exit: {
   4134       AMD64RI*      dst;
   4135       AMD64CondCode cc;
   4136       if (stmt->Ist.Exit.dst->tag != Ico_U64)
   4137          vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
   4138       dst = iselIntExpr_RI(env, IRExpr_Const(stmt->Ist.Exit.dst));
   4139       cc  = iselCondCode(env,stmt->Ist.Exit.guard);
   4140       addInstr(env, AMD64Instr_Goto(stmt->Ist.Exit.jk, cc, dst));
   4141       return;
   4142    }
   4143 
   4144    default: break;
   4145    }
   4146   stmt_fail:
   4147    ppIRStmt(stmt);
   4148    vpanic("iselStmt(amd64)");
   4149 }
   4150 
   4151 
   4152 /*---------------------------------------------------------*/
   4153 /*--- ISEL: Basic block terminators (Nexts)             ---*/
   4154 /*---------------------------------------------------------*/
   4155 
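         /* The block terminator is lowered to an unconditional Goto:
            compute the next-guest-address expression into a register or
            immediate and pair it with the IR jump kind for the
            dispatcher's benefit. */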
   4156 static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
   4157 {
   4158    AMD64RI* ri;
   4159    if (vex_traceflags & VEX_TRACE_VCODE) {
   4160       vex_printf("\n-- goto {");
   4161       ppIRJumpKind(jk);
   4162       vex_printf("} ");
   4163       ppIRExpr(next);
   4164       vex_printf("\n");
   4165    }
   4166    ri = iselIntExpr_RI(env, next);
   4167    addInstr(env, AMD64Instr_Goto(jk, Acc_ALWAYS,ri));
   4168 }
   4169 
   4170 
   4171 /*---------------------------------------------------------*/
   4172 /*--- Insn selector top-level                           ---*/
   4173 /*---------------------------------------------------------*/
   4174 
   4175 /* Translate an entire SB to amd64 code. */
   4176 
   4177 HInstrArray* iselSB_AMD64 ( IRSB* bb, VexArch      arch_host,
   4178                                       VexArchInfo* archinfo_host,
   4179                                       VexAbiInfo*  vbi/*UNUSED*/ )
   4180 {
   4181    Int      i, j;
   4182    HReg     hreg, hregHI;
   4183    ISelEnv* env;
   4184    UInt     hwcaps_host = archinfo_host->hwcaps;
   4185 
   4186    /* sanity ... */
   4187    vassert(arch_host == VexArchAMD64);
   4188    vassert(0 == (hwcaps_host
   4189                  & ~(VEX_HWCAPS_AMD64_SSE3
   4190                      | VEX_HWCAPS_AMD64_CX16
   4191                      | VEX_HWCAPS_AMD64_LZCNT)));
   4192 
   4193    /* Make up an initial environment to use. */
   4194    env = LibVEX_Alloc(sizeof(ISelEnv));
   4195    env->vreg_ctr = 0;
   4196 
   4197    /* Set up output code array. */
   4198    env->code = newHInstrArray();
   4199 
   4200    /* Copy BB's type env. */
   4201    env->type_env = bb->tyenv;
   4202 
   4203    /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
   4204       change as we go along. */
   4205    env->n_vregmap = bb->tyenv->types_used;
   4206    env->vregmap   = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   4207    env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   4208 
   4209    /* and finally ... */
   4210    env->hwcaps = hwcaps_host;
   4211 
   4212    /* For each IR temporary, allocate a suitably-kinded virtual
   4213       register. */
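            /* (I1/I8/I16/I32/I64 each get a single HRcInt64 vreg; I128 gets
               a pair, low half in vregmap[] and high half in vregmapHI[];
               F32, F64 and V128 each get one HRcVec128 vreg, since this
               backend keeps all floating-point values in SSE registers.) */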
   4214    j = 0;
   4215    for (i = 0; i < env->n_vregmap; i++) {
   4216       hregHI = hreg = INVALID_HREG;
   4217       switch (bb->tyenv->types[i]) {
   4218          case Ity_I1:
   4219          case Ity_I8:
   4220          case Ity_I16:
   4221          case Ity_I32:
   4222          case Ity_I64:  hreg   = mkHReg(j++, HRcInt64, True); break;
   4223          case Ity_I128: hreg   = mkHReg(j++, HRcInt64, True);
   4224                         hregHI = mkHReg(j++, HRcInt64, True); break;
   4225          case Ity_F32:
   4226          case Ity_F64:
   4227          case Ity_V128: hreg   = mkHReg(j++, HRcVec128, True); break;
   4228          default: ppIRType(bb->tyenv->types[i]);
   4229                   vpanic("iselBB(amd64): IRTemp type");
   4230       }
   4231       env->vregmap[i]   = hreg;
   4232       env->vregmapHI[i] = hregHI;
   4233    }
   4234    env->vreg_ctr = j;
   4235 
   4236    /* Ok, finally we can iterate over the statements. */
   4237    for (i = 0; i < bb->stmts_used; i++)
   4238       if (bb->stmts[i])
   4239          iselStmt(env,bb->stmts[i]);
   4240 
   4241    iselNext(env,bb->next,bb->jumpkind);
   4242 
   4243    /* record the number of vregs we used. */
   4244    env->code->n_vregs = env->vreg_ctr;
   4245    return env->code;
   4246 }
   4247 
   4248 
   4249 /*---------------------------------------------------------------*/
   4250 /*--- end                                   host_amd64_isel.c ---*/
   4251 /*---------------------------------------------------------------*/
   4252