      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                                 host_amd64_isel.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2012 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 #include "libvex_basictypes.h"
     37 #include "libvex_ir.h"
     38 #include "libvex.h"
     39 
     40 #include "ir_match.h"
     41 #include "main_util.h"
     42 #include "main_globals.h"
     43 #include "host_generic_regs.h"
     44 #include "host_generic_simd64.h"
     45 #include "host_generic_simd128.h"
     46 #include "host_amd64_defs.h"
     47 
     48 
     49 /*---------------------------------------------------------*/
     50 /*--- x87/SSE control word stuff                        ---*/
     51 /*---------------------------------------------------------*/
     52 
     53 /* Vex-generated code expects to run with the FPU set as follows: all
     54    exceptions masked, round-to-nearest, precision = 53 bits.  This
     55    corresponds to a FPU control word value of 0x027F.
     56 
     57    Similarly the SSE control word (%mxcsr) should be 0x1F80.
     58 
     59    %fpucw and %mxcsr should have these values on entry to
      60    Vex-generated code, and those values should be unchanged
      61    at exit.
     62 */
     63 
     64 #define DEFAULT_FPUCW 0x027F
     65 
     66 #define DEFAULT_MXCSR 0x1F80
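
         /* For reference, an informal bit-level reading of these defaults
            (the Intel SDM is the authoritative source for the layouts):

               FPUCW 0x027F: bits 0..5   all set -> all x87 exceptions masked
                             (bit 6 is reserved)
                             bits 8..9   = 10b   -> precision control: 53 bits
                             bits 10..11 = 00b   -> rounding: to nearest

               MXCSR 0x1F80: bits 7..12  all set -> all SSE exceptions masked
                             bits 13..14 = 00b   -> rounding: to nearest
                             bit 15      = 0     -> flush-to-zero disabled
         */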
     67 
     68 /* debugging only, do not use */
     69 /* define DEFAULT_FPUCW 0x037F */
     70 
     71 
     72 /*---------------------------------------------------------*/
     73 /*--- misc helpers                                      ---*/
     74 /*---------------------------------------------------------*/
     75 
     76 /* These are duplicated in guest-amd64/toIR.c */
     77 static IRExpr* unop ( IROp op, IRExpr* a )
     78 {
     79    return IRExpr_Unop(op, a);
     80 }
     81 
     82 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
     83 {
     84    return IRExpr_Binop(op, a1, a2);
     85 }
     86 
     87 static IRExpr* bind ( Int binder )
     88 {
     89    return IRExpr_Binder(binder);
     90 }
     91 
     92 
     93 /*---------------------------------------------------------*/
     94 /*--- ISelEnv                                           ---*/
     95 /*---------------------------------------------------------*/
     96 
     97 /* This carries around:
     98 
     99    - A mapping from IRTemp to IRType, giving the type of any IRTemp we
    100      might encounter.  This is computed before insn selection starts,
    101      and does not change.
    102 
    103    - A mapping from IRTemp to HReg.  This tells the insn selector
    104      which virtual register is associated with each IRTemp
    105      temporary.  This is computed before insn selection starts, and
    106      does not change.  We expect this mapping to map precisely the
    107      same set of IRTemps as the type mapping does.
    108 
    109         - vregmap   holds the primary register for the IRTemp.
    110         - vregmapHI is only used for 128-bit integer-typed
    111              IRTemps.  It holds the identity of a second
    112              64-bit virtual HReg, which holds the high half
    113              of the value.
    114 
    115    - The host subarchitecture we are selecting insns for.
    116      This is set at the start and does not change.
    117 
    118    - The code array, that is, the insns selected so far.
    119 
    120    - A counter, for generating new virtual registers.
    121 
    122    - A Bool for indicating whether we may generate chain-me
    123      instructions for control flow transfers, or whether we must use
    124      XAssisted.
    125 
    126    - The maximum guest address of any guest insn in this block.
    127      Actually, the address of the highest-addressed byte from any insn
     128      in this block.  It is set at the start and does not change.  This is
    129      used for detecting jumps which are definitely forward-edges from
    130      this block, and therefore can be made (chained) to the fast entry
    131      point of the destination, thereby avoiding the destination's
    132      event check.
    133 
    134    Note, this is all host-independent.  (JRS 20050201: well, kinda
    135    ... not completely.  Compare with ISelEnv for X86.)
    136 */
    137 
    138 typedef
    139    struct {
     140       /* Constant -- set at the start and do not change. */
    141       IRTypeEnv*   type_env;
    142 
    143       HReg*        vregmap;
    144       HReg*        vregmapHI;
    145       Int          n_vregmap;
    146 
    147       UInt         hwcaps;
    148 
    149       Bool         chainingAllowed;
    150       Addr64       max_ga;
    151 
    152       /* These are modified as we go along. */
    153       HInstrArray* code;
    154       Int          vreg_ctr;
    155    }
    156    ISelEnv;
    157 
    158 
    159 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
    160 {
    161    vassert(tmp >= 0);
    162    vassert(tmp < env->n_vregmap);
    163    return env->vregmap[tmp];
    164 }
    165 
    166 static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
    167                                ISelEnv* env, IRTemp tmp )
    168 {
    169    vassert(tmp >= 0);
    170    vassert(tmp < env->n_vregmap);
    171    vassert(env->vregmapHI[tmp] != INVALID_HREG);
    172    *vrLO = env->vregmap[tmp];
    173    *vrHI = env->vregmapHI[tmp];
    174 }
    175 
    176 static void addInstr ( ISelEnv* env, AMD64Instr* instr )
    177 {
    178    addHInstr(env->code, instr);
    179    if (vex_traceflags & VEX_TRACE_VCODE) {
    180       ppAMD64Instr(instr, True);
    181       vex_printf("\n");
    182    }
    183 }
    184 
    185 static HReg newVRegI ( ISelEnv* env )
    186 {
    187    HReg reg = mkHReg(env->vreg_ctr, HRcInt64, True/*virtual reg*/);
    188    env->vreg_ctr++;
    189    return reg;
    190 }
    191 
    192 static HReg newVRegV ( ISelEnv* env )
    193 {
    194    HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
    195    env->vreg_ctr++;
    196    return reg;
    197 }
    198 
    199 
    200 /*---------------------------------------------------------*/
    201 /*--- ISEL: Forward declarations                        ---*/
    202 /*---------------------------------------------------------*/
    203 
     204 /* These are organised as iselXXX and iselXXX_wrk pairs.  The
     205    iselXXX_wrk functions do the real work, but must not be called
     206    directly.  For each XXX, call iselXXX instead: it invokes its
     207    iselXXX_wrk counterpart and then checks that all returned
     208    registers are virtual.
     209 */
    210 static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
    211 static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );
    212 
    213 static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, IRExpr* e );
    214 static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, IRExpr* e );
    215 
    216 static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, IRExpr* e );
    217 static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, IRExpr* e );
    218 
    219 static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, IRExpr* e );
    220 static HReg          iselIntExpr_R       ( ISelEnv* env, IRExpr* e );
    221 
    222 static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
    223 static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );
    224 
    225 static void          iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
    226                                           ISelEnv* env, IRExpr* e );
    227 static void          iselInt128Expr     ( /*OUT*/HReg* rHi, HReg* rLo,
    228                                           ISelEnv* env, IRExpr* e );
    229 
    230 static AMD64CondCode iselCondCode_wrk    ( ISelEnv* env, IRExpr* e );
    231 static AMD64CondCode iselCondCode        ( ISelEnv* env, IRExpr* e );
    232 
    233 static HReg          iselDblExpr_wrk     ( ISelEnv* env, IRExpr* e );
    234 static HReg          iselDblExpr         ( ISelEnv* env, IRExpr* e );
    235 
    236 static HReg          iselFltExpr_wrk     ( ISelEnv* env, IRExpr* e );
    237 static HReg          iselFltExpr         ( ISelEnv* env, IRExpr* e );
    238 
    239 static HReg          iselVecExpr_wrk     ( ISelEnv* env, IRExpr* e );
    240 static HReg          iselVecExpr         ( ISelEnv* env, IRExpr* e );
    241 
    242 static void          iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
    243                                         ISelEnv* env, IRExpr* e );
    244 static void          iselDVecExpr     ( /*OUT*/HReg* rHi, HReg* rLo,
    245                                         ISelEnv* env, IRExpr* e );
    246 
    247 
    248 /*---------------------------------------------------------*/
    249 /*--- ISEL: Misc helpers                                ---*/
    250 /*---------------------------------------------------------*/
    251 
    252 static Bool sane_AMode ( AMD64AMode* am )
    253 {
    254    switch (am->tag) {
    255       case Aam_IR:
    256          return
    257             toBool( hregClass(am->Aam.IR.reg) == HRcInt64
    258                     && (hregIsVirtual(am->Aam.IR.reg)
    259                         || am->Aam.IR.reg == hregAMD64_RBP()) );
    260       case Aam_IRRS:
    261          return
    262             toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
    263                     && hregIsVirtual(am->Aam.IRRS.base)
    264                     && hregClass(am->Aam.IRRS.index) == HRcInt64
    265                     && hregIsVirtual(am->Aam.IRRS.index) );
    266       default:
    267         vpanic("sane_AMode: unknown amd64 amode tag");
    268    }
    269 }
    270 
    271 
    272 /* Can the lower 32 bits be signedly widened to produce the whole
    273    64-bit value?  In other words, are the top 33 bits either all 0 or
    274    all 1 ? */
    275 static Bool fitsIn32Bits ( ULong x )
    276 {
    277    Long y0 = (Long)x;
    278    Long y1 = y0;
    279    y1 <<= 32;
    280    y1 >>=/*s*/ 32;
    281    return toBool(x == y1);
    282 }
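
         /* Illustrative examples (not part of the code path):
               fitsIn32Bits(0x00000000FFFFFFFFULL) == False
                  (the lower 32 bits 0xFFFFFFFF sign-extend to all-ones,
                   which is not equal to x)
               fitsIn32Bits(0xFFFFFFFF80000000ULL) == True
                  (sign-extending the lower 32 bits 0x80000000 gives x back)
         */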
    283 
    284 /* Is this a 64-bit zero expression? */
    285 
    286 static Bool isZeroU64 ( IRExpr* e )
    287 {
    288    return e->tag == Iex_Const
    289           && e->Iex.Const.con->tag == Ico_U64
    290           && e->Iex.Const.con->Ico.U64 == 0ULL;
    291 }
    292 
    293 static Bool isZeroU32 ( IRExpr* e )
    294 {
    295    return e->tag == Iex_Const
    296           && e->Iex.Const.con->tag == Ico_U32
    297           && e->Iex.Const.con->Ico.U32 == 0;
    298 }
    299 
     300 /* Make an int reg-reg move. */
    301 
    302 static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
    303 {
    304    vassert(hregClass(src) == HRcInt64);
    305    vassert(hregClass(dst) == HRcInt64);
    306    return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
    307 }
    308 
    309 /* Make a vector (128 bit) reg-reg move. */
    310 
    311 static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
    312 {
    313    vassert(hregClass(src) == HRcVec128);
    314    vassert(hregClass(dst) == HRcVec128);
    315    return AMD64Instr_SseReRg(Asse_MOV, src, dst);
    316 }
    317 
    318 /* Advance/retreat %rsp by n. */
    319 
    320 static void add_to_rsp ( ISelEnv* env, Int n )
    321 {
    322    vassert(n > 0 && n < 256 && (n%8) == 0);
    323    addInstr(env,
    324             AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
    325                                         hregAMD64_RSP()));
    326 }
    327 
    328 static void sub_from_rsp ( ISelEnv* env, Int n )
    329 {
    330    vassert(n > 0 && n < 256 && (n%8) == 0);
    331    addInstr(env,
    332             AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
    333                                         hregAMD64_RSP()));
    334 }
    335 
    336 /* Push 64-bit constants on the stack. */
    337 static void push_uimm64( ISelEnv* env, ULong uimm64 )
    338 {
    339    /* If uimm64 can be expressed as the sign extension of its
    340       lower 32 bits, we can do it the easy way. */
    341    Long simm64 = (Long)uimm64;
    342    if ( simm64 == ((simm64 << 32) >> 32) ) {
    343       addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
    344    } else {
    345       HReg tmp = newVRegI(env);
    346       addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
    347       addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
    348    }
    349 }
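
         /* Roughly, and purely as an illustration, the two cases give:
               push_uimm64(env, 0xFFFFFFFFFFFFFFFFULL):
                  pushq $-1                          -- sign-extended imm32
               push_uimm64(env, 0x1122334455667788ULL):
                  movabsq $0x1122334455667788, %tmp
                  pushq %tmp
         */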
    350 
    351 
    352 /* Used only in doHelperCall.  If possible, produce a single
    353    instruction which computes 'e' into 'dst'.  If not possible, return
    354    NULL. */
    355 
    356 static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
    357                                                     HReg     dst,
    358                                                     IRExpr*  e )
    359 {
    360    vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
    361 
    362    if (e->tag == Iex_Const) {
    363       vassert(e->Iex.Const.con->tag == Ico_U64);
    364       if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
    365          return AMD64Instr_Alu64R(
    366                    Aalu_MOV,
    367                    AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
    368                    dst
    369                 );
    370       } else {
    371          return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
    372       }
    373    }
    374 
    375    if (e->tag == Iex_RdTmp) {
    376       HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
    377       return mk_iMOVsd_RR(src, dst);
    378    }
    379 
    380    if (e->tag == Iex_Get) {
    381       vassert(e->Iex.Get.ty == Ity_I64);
    382       return AMD64Instr_Alu64R(
    383                 Aalu_MOV,
    384                 AMD64RMI_Mem(
    385                    AMD64AMode_IR(e->Iex.Get.offset,
    386                                  hregAMD64_RBP())),
    387                 dst);
    388    }
    389 
    390    if (e->tag == Iex_Unop
    391        && e->Iex.Unop.op == Iop_32Uto64
    392        && e->Iex.Unop.arg->tag == Iex_RdTmp) {
    393       HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
    394       return AMD64Instr_MovxLQ(False, src, dst);
    395    }
    396 
    397    if (0) { ppIRExpr(e); vex_printf("\n"); }
    398 
    399    return NULL;
    400 }
    401 
    402 
    403 /* Do a complete function call.  guard is a Ity_Bit expression
    404    indicating whether or not the call happens.  If guard==NULL, the
    405    call is unconditional. */
    406 
    407 static
    408 void doHelperCall ( ISelEnv* env,
    409                     Bool passBBP,
    410                     IRExpr* guard, IRCallee* cee, IRExpr** args )
    411 {
    412    AMD64CondCode cc;
    413    HReg          argregs[6];
    414    HReg          tmpregs[6];
    415    AMD64Instr*   fastinstrs[6];
    416    Int           n_args, i, argreg;
    417 
    418    /* Marshal args for a call and do the call.
    419 
    420       If passBBP is True, %rbp (the baseblock pointer) is to be passed
    421       as the first arg.
    422 
    423       This function only deals with a tiny set of possibilities, which
    424       cover all helpers in practice.  The restrictions are that only
    425       arguments in registers are supported, hence only 6x64 integer
    426       bits in total can be passed.  In fact the only supported arg
    427       type is I64.
    428 
    429       Generating code which is both efficient and correct when
    430       parameters are to be passed in registers is difficult, for the
    431       reasons elaborated in detail in comments attached to
    432       doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
    433       of the method described in those comments.
    434 
    435       The problem is split into two cases: the fast scheme and the
    436       slow scheme.  In the fast scheme, arguments are computed
    437       directly into the target (real) registers.  This is only safe
    438       when we can be sure that computation of each argument will not
    439       trash any real registers set by computation of any other
    440       argument.
    441 
    442       In the slow scheme, all args are first computed into vregs, and
    443       once they are all done, they are moved to the relevant real
    444       regs.  This always gives correct code, but it also gives a bunch
    445       of vreg-to-rreg moves which are usually redundant but are hard
    446       for the register allocator to get rid of.
    447 
    448       To decide which scheme to use, all argument expressions are
    449       first examined.  If they are all so simple that it is clear they
    450       will be evaluated without use of any fixed registers, use the
    451       fast scheme, else use the slow scheme.  Note also that only
    452       unconditional calls may use the fast scheme, since having to
    453       compute a condition expression could itself trash real
    454       registers.
    455 
    456       Note this requires being able to examine an expression and
    457       determine whether or not evaluation of it might use a fixed
    458       register.  That requires knowledge of how the rest of this insn
    459       selector works.  Currently just the following 3 are regarded as
    460       safe -- hopefully they cover the majority of arguments in
     461       practice: IRExpr_Tmp, IRExpr_Const and IRExpr_Get.
    462    */
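
            /* Purely as an illustration (not tied to any real helper): for
               an unconditional call  helper(t1, Get(OFFS))  the fast scheme
               computes the args directly into the argument registers,
               emitting roughly

                  movq %vreg_of_t1, %rdi
                  movq OFFS(%rbp), %rsi
                  call helper

               whereas the slow scheme first evaluates both args into fresh
               vregs, then (after any guard condition has been computed)
               moves them into %rdi and %rsi, and only then emits the call. */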
    463 
    464    /* Note that the cee->regparms field is meaningless on AMD64 host
    465       (since there is only one calling convention) and so we always
    466       ignore it. */
    467 
    468    n_args = 0;
    469    for (i = 0; args[i]; i++)
    470       n_args++;
    471 
    472    if (6 < n_args + (passBBP ? 1 : 0))
    473       vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
    474 
    475    argregs[0] = hregAMD64_RDI();
    476    argregs[1] = hregAMD64_RSI();
    477    argregs[2] = hregAMD64_RDX();
    478    argregs[3] = hregAMD64_RCX();
    479    argregs[4] = hregAMD64_R8();
    480    argregs[5] = hregAMD64_R9();
    481 
    482    tmpregs[0] = tmpregs[1] = tmpregs[2] =
    483    tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
    484 
    485    fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
    486    fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
    487 
     488    /* First decide which scheme (slow or fast) is to be used.  Start
     489       by assuming the fast scheme, and select the slow one if any
     490       contraindications (wow) appear. */
    491 
    492    if (guard) {
    493       if (guard->tag == Iex_Const
    494           && guard->Iex.Const.con->tag == Ico_U1
    495           && guard->Iex.Const.con->Ico.U1 == True) {
    496          /* unconditional */
    497       } else {
    498          /* Not manifestly unconditional -- be conservative. */
    499          goto slowscheme;
    500       }
    501    }
    502 
    503    /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
    504       use the slow scheme.  Because this is tentative, we can't call
     505       addInstr (that is, commit to) any instructions until we've
     506       handled all the arguments.  So park the resulting instructions
     507       in a buffer and emit them only if we're successful. */
    508 
    509    /* FAST SCHEME */
    510    argreg = 0;
    511    if (passBBP) {
    512       fastinstrs[argreg] = mk_iMOVsd_RR( hregAMD64_RBP(), argregs[argreg]);
    513       argreg++;
    514    }
    515 
    516    for (i = 0; i < n_args; i++) {
    517       vassert(argreg < 6);
    518       vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
    519       fastinstrs[argreg]
    520          = iselIntExpr_single_instruction( env, argregs[argreg], args[i] );
    521       if (fastinstrs[argreg] == NULL)
    522          goto slowscheme;
    523       argreg++;
    524    }
    525 
    526    /* Looks like we're in luck.  Emit the accumulated instructions and
    527       move on to doing the call itself. */
    528    vassert(argreg <= 6);
    529    for (i = 0; i < argreg; i++)
    530       addInstr(env, fastinstrs[i]);
    531 
    532    /* Fast scheme only applies for unconditional calls.  Hence: */
    533    cc = Acc_ALWAYS;
    534 
    535    goto handle_call;
    536 
    537 
    538    /* SLOW SCHEME; move via temporaries */
    539   slowscheme:
    540 #  if 0 /* debug only */
    541    if (n_args > 0) {for (i = 0; args[i]; i++) {
    542    ppIRExpr(args[i]); vex_printf(" "); }
    543    vex_printf("\n");}
    544 #  endif
    545    argreg = 0;
    546 
    547    if (passBBP) {
    548       /* This is pretty stupid; better to move directly to rdi
    549          after the rest of the args are done. */
    550       tmpregs[argreg] = newVRegI(env);
    551       addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[argreg]));
    552       argreg++;
    553    }
    554 
    555    for (i = 0; i < n_args; i++) {
    556       vassert(argreg < 6);
    557       vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
    558       tmpregs[argreg] = iselIntExpr_R(env, args[i]);
    559       argreg++;
    560    }
    561 
    562    /* Now we can compute the condition.  We can't do it earlier
    563       because the argument computations could trash the condition
    564       codes.  Be a bit clever to handle the common case where the
    565       guard is 1:Bit. */
    566    cc = Acc_ALWAYS;
    567    if (guard) {
    568       if (guard->tag == Iex_Const
    569           && guard->Iex.Const.con->tag == Ico_U1
    570           && guard->Iex.Const.con->Ico.U1 == True) {
    571          /* unconditional -- do nothing */
    572       } else {
    573          cc = iselCondCode( env, guard );
    574       }
    575    }
    576 
    577    /* Move the args to their final destinations. */
    578    for (i = 0; i < argreg; i++) {
    579       /* None of these insns, including any spill code that might
    580          be generated, may alter the condition codes. */
    581       addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
    582    }
    583 
    584 
    585    /* Finally, the call itself. */
    586   handle_call:
    587    addInstr(env, AMD64Instr_Call(
    588                     cc,
    589                     Ptr_to_ULong(cee->addr),
    590                     n_args + (passBBP ? 1 : 0)
    591                  )
    592    );
    593 }
    594 
    595 
    596 /* Given a guest-state array descriptor, an index expression and a
    597    bias, generate an AMD64AMode holding the relevant guest state
    598    offset. */
    599 
    600 static
    601 AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
    602                                   IRExpr* off, Int bias )
    603 {
    604    HReg tmp, roff;
    605    Int  elemSz = sizeofIRType(descr->elemTy);
    606    Int  nElems = descr->nElems;
    607 
    608    /* Throw out any cases not generated by an amd64 front end.  In
    609       theory there might be a day where we need to handle them -- if
    610       we ever run non-amd64-guest on amd64 host. */
    611 
    612    if (nElems != 8 || (elemSz != 1 && elemSz != 8))
    613       vpanic("genGuestArrayOffset(amd64 host)");
    614 
    615    /* Compute off into a reg, %off.  Then return:
    616 
    617          movq %off, %tmp
    618          addq $bias, %tmp  (if bias != 0)
     619          andq $7, %tmp
    620          ... base(%rbp, %tmp, shift) ...
    621    */
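            /* Illustrative only: for an 8-element array of 8-byte guest
               registers at guest offset BASE, with bias 1, this yields code
               computing ((off + 1) & 7) into %tmp and returns the amode
               BASE(%rbp,%tmp,8). */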
    622    tmp  = newVRegI(env);
    623    roff = iselIntExpr_R(env, off);
    624    addInstr(env, mk_iMOVsd_RR(roff, tmp));
    625    if (bias != 0) {
    626       /* Make sure the bias is sane, in the sense that there are
    627          no significant bits above bit 30 in it. */
    628       vassert(-10000 < bias && bias < 10000);
    629       addInstr(env,
    630                AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
    631    }
    632    addInstr(env,
    633             AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
    634    vassert(elemSz == 1 || elemSz == 8);
    635    return
    636       AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
    637                                     elemSz==8 ? 3 : 0);
    638 }
    639 
    640 
    641 /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
    642 static
    643 void set_SSE_rounding_default ( ISelEnv* env )
    644 {
    645    /* pushq $DEFAULT_MXCSR
    646       ldmxcsr 0(%rsp)
    647       addq $8, %rsp
    648    */
    649    AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
    650    addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
    651    addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
    652    add_to_rsp(env, 8);
    653 }
    654 
    655 /* Mess with the FPU's rounding mode: set to the default rounding mode
    656    (DEFAULT_FPUCW). */
    657 static
    658 void set_FPU_rounding_default ( ISelEnv* env )
    659 {
    660    /* movq $DEFAULT_FPUCW, -8(%rsp)
     661       fldcw -8(%rsp)
    662    */
    663    AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
    664    addInstr(env, AMD64Instr_Alu64M(
    665                     Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
    666    addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
    667 }
    668 
    669 
    670 /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
    671    expression denoting a value in the range 0 .. 3, indicating a round
    672    mode encoded as per type IRRoundingMode.  Set the SSE machinery to
    673    have the same rounding.
    674 */
    675 static
    676 void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
    677 {
    678    /* Note: this sequence only makes sense because DEFAULT_MXCSR has
    679       both rounding bits == 0.  If that wasn't the case, we couldn't
    680       create a new rounding field simply by ORing the new value into
    681       place. */
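            /* The shift by 13 below relies on the fact that IRRoundingMode's
               2-bit encoding -- 00 nearest, 01 toward -inf, 10 toward +inf,
               11 toward zero -- matches the encoding of the MXCSR
               rounding-control field, which occupies bits 13:14. */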
    682 
    683    /* movq $3, %reg
    684       andq [[mode]], %reg  -- shouldn't be needed; paranoia
    685       shlq $13, %reg
    686       orq $DEFAULT_MXCSR, %reg
    687       pushq %reg
     688       ldmxcsr 0(%rsp)
    689       addq $8, %rsp
    690    */
    691    HReg        reg      = newVRegI(env);
    692    AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
    693    addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
    694    addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
    695                                    iselIntExpr_RMI(env, mode), reg));
    696    addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
    697    addInstr(env, AMD64Instr_Alu64R(
    698                     Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
    699    addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
    700    addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
    701    add_to_rsp(env, 8);
    702 }
    703 
    704 
    705 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
    706    expression denoting a value in the range 0 .. 3, indicating a round
    707    mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
    708    the same rounding.
    709 */
    710 static
    711 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
    712 {
    713    HReg rrm  = iselIntExpr_R(env, mode);
    714    HReg rrm2 = newVRegI(env);
    715    AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
    716 
    717    /* movq  %rrm, %rrm2
    718       andq  $3, %rrm2   -- shouldn't be needed; paranoia
    719       shlq  $10, %rrm2
    720       orq   $DEFAULT_FPUCW, %rrm2
    721       movq  %rrm2, -8(%rsp)
     722       fldcw -8(%rsp)
    723    */
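            /* As with the SSE case above, the shift by 10 places the 2-bit
               IRRoundingMode value into the x87 control word's
               rounding-control field (bits 10:11), which uses the same
               encoding. */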
    724    addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
    725    addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
    726    addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
    727    addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
    728                                    AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
    729    addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
    730                                    AMD64RI_Reg(rrm2), m8_rsp));
    731    addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
    732 }
    733 
    734 
    735 /* Generate all-zeroes into a new vector register.
    736 */
    737 static HReg generate_zeroes_V128 ( ISelEnv* env )
    738 {
    739    HReg dst = newVRegV(env);
    740    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
    741    return dst;
    742 }
    743 
    744 /* Generate all-ones into a new vector register.
    745 */
    746 static HReg generate_ones_V128 ( ISelEnv* env )
    747 {
    748    HReg dst = newVRegV(env);
    749    addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
    750    return dst;
    751 }
    752 
    753 
    754 /* Generate !src into a new vector register.  Amazing that there isn't
    755    a less crappy way to do this.
    756 */
    757 static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
    758 {
    759    HReg dst = generate_ones_V128(env);
    760    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
    761    return dst;
    762 }
    763 
    764 
    765 /* Expand the given byte into a 64-bit word, by cloning each bit
    766    8 times. */
    767 static ULong bitmask8_to_bytemask64 ( UShort w8 )
    768 {
    769    vassert(w8 == (w8 & 0xFF));
    770    ULong w64 = 0;
    771    Int i;
    772    for (i = 0; i < 8; i++) {
    773       if (w8 & (1<<i))
    774          w64 |= (0xFFULL << (8 * i));
    775    }
    776    return w64;
    777 }
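
         /* Example: bitmask8_to_bytemask64(0xA5) == 0xFF00FF0000FF00FFULL,
            since 0xA5 == 10100101b and each set bit expands to an 0xFF
            byte in the corresponding lane. */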
    778 
    779 
    780 /*---------------------------------------------------------*/
    781 /*--- ISEL: Integer expressions (64/32/16/8 bit)        ---*/
    782 /*---------------------------------------------------------*/
    783 
    784 /* Select insns for an integer-typed expression, and add them to the
    785    code list.  Return a reg holding the result.  This reg will be a
    786    virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
    787    want to modify it, ask for a new vreg, copy it in there, and modify
    788    the copy.  The register allocator will do its best to map both
    789    vregs to the same real register, so the copies will often disappear
    790    later in the game.
    791 
    792    This should handle expressions of 64, 32, 16 and 8-bit type.  All
    793    results are returned in a 64-bit register.  For 32-, 16- and 8-bit
    794    expressions, the upper 32/16/24 bits are arbitrary, so you should
     795    expressions, the upper 32/48/56 bits are arbitrary, so you should
    796 */
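
         /* A typical way callers respect the do-not-modify rule (a sketch,
            mirroring the pattern used by many cases below):

               HReg src = iselIntExpr_R(env, someExpr); // shared: read-only
               HReg dst = newVRegI(env);
               addInstr(env, mk_iMOVsd_RR(src, dst));   // private copy
               addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, dst)); // safe now
         */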
    797 
    798 static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
    799 {
    800    HReg r = iselIntExpr_R_wrk(env, e);
    801    /* sanity checks ... */
    802 #  if 0
    803    vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
    804 #  endif
    805    vassert(hregClass(r) == HRcInt64);
    806    vassert(hregIsVirtual(r));
    807    return r;
    808 }
    809 
    810 /* DO NOT CALL THIS DIRECTLY ! */
    811 static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
    812 {
    813    /* Used for unary/binary SIMD64 ops. */
    814    HWord fn = 0;
    815    Bool second_is_UInt;
    816 
    817    MatchInfo mi;
    818    DECLARE_PATTERN(p_1Uto8_64to1);
    819    DECLARE_PATTERN(p_LDle8_then_8Uto64);
    820    DECLARE_PATTERN(p_LDle16_then_16Uto64);
    821 
    822    IRType ty = typeOfIRExpr(env->type_env,e);
    823    switch (ty) {
    824       case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
    825       default: vassert(0);
    826    }
    827 
    828    switch (e->tag) {
    829 
    830    /* --------- TEMP --------- */
    831    case Iex_RdTmp: {
    832       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
    833    }
    834 
    835    /* --------- LOAD --------- */
    836    case Iex_Load: {
    837       HReg dst = newVRegI(env);
    838       AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
    839 
    840       /* We can't handle big-endian loads, nor load-linked. */
    841       if (e->Iex.Load.end != Iend_LE)
    842          goto irreducible;
    843 
    844       if (ty == Ity_I64) {
    845          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
    846                                          AMD64RMI_Mem(amode), dst) );
    847          return dst;
    848       }
    849       if (ty == Ity_I32) {
    850          addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
    851          return dst;
    852       }
    853       if (ty == Ity_I16) {
    854          addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
    855          return dst;
    856       }
    857       if (ty == Ity_I8) {
    858          addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
    859          return dst;
    860       }
    861       break;
    862    }
    863 
    864    /* --------- BINARY OP --------- */
    865    case Iex_Binop: {
    866       AMD64AluOp   aluOp;
    867       AMD64ShiftOp shOp;
    868 
    869       /* Pattern: Sub64(0,x) */
    870       /*     and: Sub32(0,x) */
    871       if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
    872           || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
    873          HReg dst = newVRegI(env);
    874          HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
    875          addInstr(env, mk_iMOVsd_RR(reg,dst));
    876          addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
    877          return dst;
    878       }
    879 
    880       /* Is it an addition or logical style op? */
    881       switch (e->Iex.Binop.op) {
    882          case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
    883             aluOp = Aalu_ADD; break;
    884          case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
    885             aluOp = Aalu_SUB; break;
    886          case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
    887             aluOp = Aalu_AND; break;
    888          case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64:
    889             aluOp = Aalu_OR; break;
    890          case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
    891             aluOp = Aalu_XOR; break;
    892          case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
    893             aluOp = Aalu_MUL; break;
    894          default:
    895             aluOp = Aalu_INVALID; break;
    896       }
    897       /* For commutative ops we assume any literal
    898          values are on the second operand. */
    899       if (aluOp != Aalu_INVALID) {
    900          HReg dst      = newVRegI(env);
    901          HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
    902          AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
    903          addInstr(env, mk_iMOVsd_RR(reg,dst));
    904          addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
    905          return dst;
    906       }
    907 
    908       /* Perhaps a shift op? */
    909       switch (e->Iex.Binop.op) {
    910          case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
    911             shOp = Ash_SHL; break;
    912          case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
    913             shOp = Ash_SHR; break;
    914          case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
    915             shOp = Ash_SAR; break;
    916          default:
    917             shOp = Ash_INVALID; break;
    918       }
    919       if (shOp != Ash_INVALID) {
    920          HReg dst = newVRegI(env);
    921 
    922          /* regL = the value to be shifted */
    923          HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
    924          addInstr(env, mk_iMOVsd_RR(regL,dst));
    925 
    926          /* Do any necessary widening for 32/16/8 bit operands */
    927          switch (e->Iex.Binop.op) {
    928             case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
    929                break;
    930             case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
    931                break;
    932             case Iop_Shr8:
    933                addInstr(env, AMD64Instr_Alu64R(
    934                                 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
    935                break;
    936             case Iop_Shr16:
    937                addInstr(env, AMD64Instr_Alu64R(
    938                                 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
    939                break;
    940             case Iop_Shr32:
    941                addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
    942                break;
    943             case Iop_Sar8:
    944                addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
    945                addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
    946                break;
    947             case Iop_Sar16:
    948                addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
    949                addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
    950                break;
    951             case Iop_Sar32:
    952                addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
    953                break;
    954             default:
    955                ppIROp(e->Iex.Binop.op);
    956                vassert(0);
    957          }
    958 
    959          /* Now consider the shift amount.  If it's a literal, we
    960             can do a much better job than the general case. */
    961          if (e->Iex.Binop.arg2->tag == Iex_Const) {
    962             /* assert that the IR is well-typed */
    963             Int nshift;
    964             vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
    965             nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
    966             vassert(nshift >= 0);
    967             if (nshift > 0)
    968                /* Can't allow nshift==0 since that means %cl */
    969                addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
    970          } else {
    971             /* General case; we have to force the amount into %cl. */
    972             HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
    973             addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
    974             addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
    975          }
    976          return dst;
    977       }
    978 
    979       /* Deal with 64-bit SIMD binary ops */
    980       second_is_UInt = False;
    981       switch (e->Iex.Binop.op) {
    982          case Iop_Add8x8:
    983             fn = (HWord)h_generic_calc_Add8x8; break;
    984          case Iop_Add16x4:
    985             fn = (HWord)h_generic_calc_Add16x4; break;
    986          case Iop_Add32x2:
    987             fn = (HWord)h_generic_calc_Add32x2; break;
    988 
    989          case Iop_Avg8Ux8:
    990             fn = (HWord)h_generic_calc_Avg8Ux8; break;
    991          case Iop_Avg16Ux4:
    992             fn = (HWord)h_generic_calc_Avg16Ux4; break;
    993 
    994          case Iop_CmpEQ8x8:
    995             fn = (HWord)h_generic_calc_CmpEQ8x8; break;
    996          case Iop_CmpEQ16x4:
    997             fn = (HWord)h_generic_calc_CmpEQ16x4; break;
    998          case Iop_CmpEQ32x2:
    999             fn = (HWord)h_generic_calc_CmpEQ32x2; break;
   1000 
   1001          case Iop_CmpGT8Sx8:
   1002             fn = (HWord)h_generic_calc_CmpGT8Sx8; break;
   1003          case Iop_CmpGT16Sx4:
   1004             fn = (HWord)h_generic_calc_CmpGT16Sx4; break;
   1005          case Iop_CmpGT32Sx2:
   1006             fn = (HWord)h_generic_calc_CmpGT32Sx2; break;
   1007 
   1008          case Iop_InterleaveHI8x8:
   1009             fn = (HWord)h_generic_calc_InterleaveHI8x8; break;
   1010          case Iop_InterleaveLO8x8:
   1011             fn = (HWord)h_generic_calc_InterleaveLO8x8; break;
   1012          case Iop_InterleaveHI16x4:
   1013             fn = (HWord)h_generic_calc_InterleaveHI16x4; break;
   1014          case Iop_InterleaveLO16x4:
   1015             fn = (HWord)h_generic_calc_InterleaveLO16x4; break;
   1016          case Iop_InterleaveHI32x2:
   1017             fn = (HWord)h_generic_calc_InterleaveHI32x2; break;
   1018          case Iop_InterleaveLO32x2:
   1019             fn = (HWord)h_generic_calc_InterleaveLO32x2; break;
   1020          case Iop_CatOddLanes16x4:
   1021             fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
   1022          case Iop_CatEvenLanes16x4:
   1023             fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
   1024          case Iop_Perm8x8:
   1025             fn = (HWord)h_generic_calc_Perm8x8; break;
   1026 
   1027          case Iop_Max8Ux8:
   1028             fn = (HWord)h_generic_calc_Max8Ux8; break;
   1029          case Iop_Max16Sx4:
   1030             fn = (HWord)h_generic_calc_Max16Sx4; break;
   1031          case Iop_Min8Ux8:
   1032             fn = (HWord)h_generic_calc_Min8Ux8; break;
   1033          case Iop_Min16Sx4:
   1034             fn = (HWord)h_generic_calc_Min16Sx4; break;
   1035 
   1036          case Iop_Mul16x4:
   1037             fn = (HWord)h_generic_calc_Mul16x4; break;
   1038          case Iop_Mul32x2:
   1039             fn = (HWord)h_generic_calc_Mul32x2; break;
   1040          case Iop_MulHi16Sx4:
   1041             fn = (HWord)h_generic_calc_MulHi16Sx4; break;
   1042          case Iop_MulHi16Ux4:
   1043             fn = (HWord)h_generic_calc_MulHi16Ux4; break;
   1044 
   1045          case Iop_QAdd8Sx8:
   1046             fn = (HWord)h_generic_calc_QAdd8Sx8; break;
   1047          case Iop_QAdd16Sx4:
   1048             fn = (HWord)h_generic_calc_QAdd16Sx4; break;
   1049          case Iop_QAdd8Ux8:
   1050             fn = (HWord)h_generic_calc_QAdd8Ux8; break;
   1051          case Iop_QAdd16Ux4:
   1052             fn = (HWord)h_generic_calc_QAdd16Ux4; break;
   1053 
   1054          case Iop_QNarrowBin32Sto16Sx4:
   1055             fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
   1056          case Iop_QNarrowBin16Sto8Sx8:
   1057             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
   1058          case Iop_QNarrowBin16Sto8Ux8:
   1059             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
   1060          case Iop_NarrowBin16to8x8:
   1061             fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
   1062          case Iop_NarrowBin32to16x4:
   1063             fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
   1064 
   1065          case Iop_QSub8Sx8:
   1066             fn = (HWord)h_generic_calc_QSub8Sx8; break;
   1067          case Iop_QSub16Sx4:
   1068             fn = (HWord)h_generic_calc_QSub16Sx4; break;
   1069          case Iop_QSub8Ux8:
   1070             fn = (HWord)h_generic_calc_QSub8Ux8; break;
   1071          case Iop_QSub16Ux4:
   1072             fn = (HWord)h_generic_calc_QSub16Ux4; break;
   1073 
   1074          case Iop_Sub8x8:
   1075             fn = (HWord)h_generic_calc_Sub8x8; break;
   1076          case Iop_Sub16x4:
   1077             fn = (HWord)h_generic_calc_Sub16x4; break;
   1078          case Iop_Sub32x2:
   1079             fn = (HWord)h_generic_calc_Sub32x2; break;
   1080 
   1081          case Iop_ShlN32x2:
   1082             fn = (HWord)h_generic_calc_ShlN32x2;
   1083             second_is_UInt = True;
   1084             break;
   1085          case Iop_ShlN16x4:
   1086             fn = (HWord)h_generic_calc_ShlN16x4;
   1087             second_is_UInt = True;
   1088             break;
   1089          case Iop_ShlN8x8:
   1090             fn = (HWord)h_generic_calc_ShlN8x8;
   1091             second_is_UInt = True;
   1092             break;
   1093          case Iop_ShrN32x2:
   1094             fn = (HWord)h_generic_calc_ShrN32x2;
   1095             second_is_UInt = True;
   1096             break;
   1097          case Iop_ShrN16x4:
   1098             fn = (HWord)h_generic_calc_ShrN16x4;
   1099             second_is_UInt = True;
   1100             break;
   1101          case Iop_SarN32x2:
   1102             fn = (HWord)h_generic_calc_SarN32x2;
   1103             second_is_UInt = True;
   1104             break;
   1105          case Iop_SarN16x4:
   1106             fn = (HWord)h_generic_calc_SarN16x4;
   1107             second_is_UInt = True;
   1108             break;
   1109          case Iop_SarN8x8:
   1110             fn = (HWord)h_generic_calc_SarN8x8;
   1111             second_is_UInt = True;
   1112             break;
   1113 
   1114          default:
   1115             fn = (HWord)0; break;
   1116       }
   1117       if (fn != (HWord)0) {
   1118          /* Note: the following assumes all helpers are of signature
   1119                ULong fn ( ULong, ULong ), and they are
   1120             not marked as regparm functions.
   1121          */
   1122          HReg dst  = newVRegI(env);
   1123          HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1124          HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1125          if (second_is_UInt)
   1126             addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
   1127          addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
   1128          addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
   1129          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2 ));
   1130          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
   1131          return dst;
   1132       }
   1133 
   1134       /* Handle misc other ops. */
   1135 
   1136       if (e->Iex.Binop.op == Iop_Max32U) {
   1137          HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1138          HReg dst  = newVRegI(env);
   1139          HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1140          addInstr(env, mk_iMOVsd_RR(src1, dst));
   1141          addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
   1142          addInstr(env, AMD64Instr_CMov64(Acc_B, AMD64RM_Reg(src2), dst));
   1143          return dst;
   1144       }
   1145 
   1146       if (e->Iex.Binop.op == Iop_DivModS64to32
   1147           || e->Iex.Binop.op == Iop_DivModU64to32) {
   1148          /* 64 x 32 -> (32(rem),32(div)) division */
   1149          /* Get the 64-bit operand into edx:eax, and the other into
   1150             any old R/M. */
   1151          HReg      rax     = hregAMD64_RAX();
   1152          HReg      rdx     = hregAMD64_RDX();
   1153          HReg      dst     = newVRegI(env);
   1154          Bool      syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
   1155          AMD64RM*  rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
   1156          /* Compute the left operand into a reg, and then
   1157             put the top half in edx and the bottom in eax. */
   1158          HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1159          addInstr(env, mk_iMOVsd_RR(left64, rdx));
   1160          addInstr(env, mk_iMOVsd_RR(left64, rax));
   1161          addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
   1162          addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
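                  /* At this point the 32-bit divide has left the quotient in
                     %eax and the remainder in %edx.  The remaining
                     instructions zero-extend both and pack them as
                     (rem << 32) | quot, matching the (32(rem),32(div))
                     layout described above. */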
    1163          addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
    1164          addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
   1165          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
   1166          addInstr(env, mk_iMOVsd_RR(rax, dst));
   1167          addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
   1168          return dst;
   1169       }
   1170 
   1171       if (e->Iex.Binop.op == Iop_32HLto64) {
   1172          HReg hi32  = newVRegI(env);
   1173          HReg lo32  = newVRegI(env);
   1174          HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1175          HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1176          addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
   1177          addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
   1178          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
    1179          addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
   1180          addInstr(env, AMD64Instr_Alu64R(
   1181                           Aalu_OR, AMD64RMI_Reg(lo32), hi32));
   1182          return hi32;
   1183       }
   1184 
   1185       if (e->Iex.Binop.op == Iop_16HLto32) {
   1186          HReg hi16  = newVRegI(env);
   1187          HReg lo16  = newVRegI(env);
   1188          HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1189          HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1190          addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
   1191          addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
   1192          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
   1193          addInstr(env, AMD64Instr_Alu64R(
   1194                           Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
   1195          addInstr(env, AMD64Instr_Alu64R(
   1196                           Aalu_OR, AMD64RMI_Reg(lo16), hi16));
   1197          return hi16;
   1198       }
   1199 
   1200       if (e->Iex.Binop.op == Iop_8HLto16) {
   1201          HReg hi8  = newVRegI(env);
   1202          HReg lo8  = newVRegI(env);
   1203          HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1204          HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1205          addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
   1206          addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
   1207          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
   1208          addInstr(env, AMD64Instr_Alu64R(
   1209                           Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
   1210          addInstr(env, AMD64Instr_Alu64R(
   1211                           Aalu_OR, AMD64RMI_Reg(lo8), hi8));
   1212          return hi8;
   1213       }
   1214 
   1215       if (e->Iex.Binop.op == Iop_MullS32
   1216           || e->Iex.Binop.op == Iop_MullS16
   1217           || e->Iex.Binop.op == Iop_MullS8
   1218           || e->Iex.Binop.op == Iop_MullU32
   1219           || e->Iex.Binop.op == Iop_MullU16
   1220           || e->Iex.Binop.op == Iop_MullU8) {
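                  /* Strategy, as implemented below: sign- or zero-extend both
                     narrow operands to the full 64-bit width (shift left then
                     arithmetically/logically right by 32/48/56), after which
                     a single 64-bit multiply leaves the required double-width
                     product in the low bits of the destination. */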
   1221          HReg a32   = newVRegI(env);
   1222          HReg b32   = newVRegI(env);
   1223          HReg a32s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1224          HReg b32s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1225          Int          shift  = 0;
   1226          AMD64ShiftOp shr_op = Ash_SHR;
   1227          switch (e->Iex.Binop.op) {
   1228             case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
   1229             case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
   1230             case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
   1231             case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
   1232             case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
   1233             case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
   1234             default: vassert(0);
   1235          }
   1236 
   1237          addInstr(env, mk_iMOVsd_RR(a32s, a32));
   1238          addInstr(env, mk_iMOVsd_RR(b32s, b32));
   1239          addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
   1240          addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
   1241          addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
   1242          addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
   1243          addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
   1244          return b32;
   1245       }
   1246 
   1247       if (e->Iex.Binop.op == Iop_CmpF64) {
   1248          HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
   1249          HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
   1250          HReg dst = newVRegI(env);
   1251          addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
   1252          /* Mask out irrelevant parts of the result so as to conform
   1253             to the CmpF64 definition. */
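                  /* The mask 0x45 keeps the ZF(6), PF(2) and CF(0) bits of
                     the ucomisd result.  The surviving values are 0x00 (GT),
                     0x01 (LT), 0x40 (EQ) and 0x45 (unordered), matching the
                     IRCmpF64Result encoding. */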
   1254          addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
   1255          return dst;
   1256       }
   1257 
   1258       if (e->Iex.Binop.op == Iop_F64toI32S
   1259           || e->Iex.Binop.op == Iop_F64toI64S) {
   1260          Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
   1261          HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
   1262          HReg dst = newVRegI(env);
   1263          set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
   1264          addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
   1265          set_SSE_rounding_default(env);
   1266          return dst;
   1267       }
   1268 
   1269       break;
   1270    }
   1271 
   1272    /* --------- UNARY OP --------- */
   1273    case Iex_Unop: {
   1274 
   1275       /* 1Uto8(64to1(expr64)) */
   1276       {
   1277          DEFINE_PATTERN( p_1Uto8_64to1,
   1278                          unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
   1279          if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
   1280             IRExpr* expr64 = mi.bindee[0];
   1281             HReg    dst    = newVRegI(env);
   1282             HReg    src    = iselIntExpr_R(env, expr64);
   1283             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1284             addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   1285                                             AMD64RMI_Imm(1), dst));
   1286             return dst;
   1287          }
   1288       }
   1289 
   1290       /* 8Uto64(LDle(expr64)) */
   1291       {
   1292          DEFINE_PATTERN(p_LDle8_then_8Uto64,
   1293                         unop(Iop_8Uto64,
   1294                              IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
   1295          if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
   1296             HReg dst = newVRegI(env);
   1297             AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
   1298             addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
   1299             return dst;
   1300          }
   1301       }
   1302 
   1303       /* 16Uto64(LDle(expr64)) */
   1304       {
   1305          DEFINE_PATTERN(p_LDle16_then_16Uto64,
   1306                         unop(Iop_16Uto64,
   1307                              IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
   1308          if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
   1309             HReg dst = newVRegI(env);
   1310             AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
   1311             addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
   1312             return dst;
   1313          }
   1314       }
   1315 
   1316       /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
   1317          Use 32 bit arithmetic and let the default zero-extend rule
   1318          do the 32Uto64 for free. */
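               /* (On AMD64, writing a 32-bit register zero-extends the result
                  into the full 64-bit register, so the 32-bit Alu32R op used
                  below gives the 32Uto64 as a side effect.) */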
   1319       if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
   1320          IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
   1321          IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
   1322          IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
   1323          AMD64AluOp aluOp = Aalu_INVALID;
   1324          switch (opi) {
   1325             case Iop_Add32: aluOp = Aalu_ADD; break;
   1326             case Iop_Sub32: aluOp = Aalu_SUB; break;
   1327             case Iop_And32: aluOp = Aalu_AND; break;
   1328             case Iop_Or32:  aluOp = Aalu_OR;  break;
   1329             case Iop_Xor32: aluOp = Aalu_XOR; break;
   1330             default: break;
   1331          }
   1332          if (aluOp != Aalu_INVALID) {
   1333             /* For commutative ops we assume any literal values are on
   1334                the second operand. */
   1335             HReg dst      = newVRegI(env);
   1336             HReg reg      = iselIntExpr_R(env, argL);
   1337             AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
   1338             addInstr(env, mk_iMOVsd_RR(reg,dst));
   1339             addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
   1340             return dst;
   1341          }
   1342          /* just fall through to normal handling for Iop_32Uto64 */
   1343       }
   1344 
   1345       /* Fallback cases */
   1346       switch (e->Iex.Unop.op) {
   1347          case Iop_32Uto64:
   1348          case Iop_32Sto64: {
   1349             HReg dst = newVRegI(env);
   1350             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1351             addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
   1352                                             src, dst) );
   1353             return dst;
   1354          }
   1355          case Iop_128HIto64: {
   1356             HReg rHi, rLo;
   1357             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
   1358             return rHi; /* and abandon rLo */
   1359          }
   1360          case Iop_128to64: {
   1361             HReg rHi, rLo;
   1362             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
   1363             return rLo; /* and abandon rHi */
   1364          }
   1365          case Iop_8Uto16:
   1366          case Iop_8Uto32:
   1367          case Iop_8Uto64:
   1368          case Iop_16Uto64:
   1369          case Iop_16Uto32: {
   1370             HReg dst     = newVRegI(env);
   1371             HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
   1372             Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
   1373                                    || e->Iex.Unop.op==Iop_16Uto64 );
   1374             UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
   1375             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1376             addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   1377                                             AMD64RMI_Imm(mask), dst));
   1378             return dst;
   1379          }
   1380          case Iop_8Sto16:
   1381          case Iop_8Sto64:
   1382          case Iop_8Sto32:
   1383          case Iop_16Sto32:
   1384          case Iop_16Sto64: {
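                     /* Sign-extend by shifting the narrow value up against
                        bit 63 and then arithmetic-shifting it back down by
                        the same amount. */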
   1385             HReg dst     = newVRegI(env);
   1386             HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
   1387             Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
   1388                                    || e->Iex.Unop.op==Iop_16Sto64 );
   1389             UInt amt     = srcIs16 ? 48 : 56;
   1390             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1391             addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
   1392             addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
   1393             return dst;
   1394          }
    1395          case Iop_Not8:
    1396          case Iop_Not16:
   1397          case Iop_Not32:
   1398          case Iop_Not64: {
   1399             HReg dst = newVRegI(env);
   1400             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1401             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1402             addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
   1403             return dst;
   1404          }
   1405          case Iop_16HIto8:
   1406          case Iop_32HIto16:
   1407          case Iop_64HIto32: {
   1408             HReg dst  = newVRegI(env);
   1409             HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
   1410             Int shift = 0;
   1411             switch (e->Iex.Unop.op) {
   1412                case Iop_16HIto8:  shift = 8;  break;
   1413                case Iop_32HIto16: shift = 16; break;
   1414                case Iop_64HIto32: shift = 32; break;
   1415                default: vassert(0);
   1416             }
   1417             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1418             addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
   1419             return dst;
   1420          }
   1421          case Iop_1Uto64:
   1422          case Iop_1Uto32:
   1423          case Iop_1Uto8: {
   1424             HReg dst           = newVRegI(env);
   1425             AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
   1426             addInstr(env, AMD64Instr_Set64(cond,dst));
   1427             return dst;
   1428          }
   1429          case Iop_1Sto8:
   1430          case Iop_1Sto16:
   1431          case Iop_1Sto32:
   1432          case Iop_1Sto64: {
   1433             /* could do better than this, but for now ... */
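                     /* Set64 materialises the condition as 0 or 1; the
                        shl/sar-by-63 pair then smears that bit across the
                        whole register, giving 0 or all-ones. */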
   1434             HReg dst           = newVRegI(env);
   1435             AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
   1436             addInstr(env, AMD64Instr_Set64(cond,dst));
   1437             addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
   1438             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
   1439             return dst;
   1440          }
   1441          case Iop_Ctz64: {
   1442             /* Count trailing zeroes, implemented by amd64 'bsfq' */
   1443             HReg dst = newVRegI(env);
   1444             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1445             addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
   1446             return dst;
   1447          }
   1448          case Iop_Clz64: {
   1449             /* Count leading zeroes.  Do 'bsrq' to establish the index
   1450                of the highest set bit, and subtract that value from
   1451                63. */
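                     /* That is, clz(x) = 63 - bsr(x) for nonzero x; e.g.
                        x = 1 has bsr(x) = 0 and hence clz(x) = 63. */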
   1452             HReg tmp = newVRegI(env);
   1453             HReg dst = newVRegI(env);
   1454             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1455             addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
   1456             addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
   1457                                             AMD64RMI_Imm(63), dst));
   1458             addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
   1459                                             AMD64RMI_Reg(tmp), dst));
   1460             return dst;
   1461          }
   1462 
   1463          case Iop_CmpwNEZ64: {
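                     /* Uses the fact that, for nonzero x, x | -x has its
                        sign bit set, so the arithmetic shift by 63 produces
                        all-ones; for x == 0 it produces 0. */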
   1464             HReg dst = newVRegI(env);
   1465             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1466             addInstr(env, mk_iMOVsd_RR(src,dst));
   1467             addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
   1468             addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
   1469                                             AMD64RMI_Reg(src), dst));
   1470             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
   1471             return dst;
   1472          }
   1473 
   1474          case Iop_CmpwNEZ32: {
   1475             HReg src = newVRegI(env);
   1476             HReg dst = newVRegI(env);
   1477             HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
   1478             addInstr(env, mk_iMOVsd_RR(pre,src));
   1479             addInstr(env, AMD64Instr_MovxLQ(False, src, src));
   1480             addInstr(env, mk_iMOVsd_RR(src,dst));
   1481             addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
   1482             addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
   1483                                             AMD64RMI_Reg(src), dst));
   1484             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
   1485             return dst;
   1486          }
   1487 
   1488          case Iop_Left8:
   1489          case Iop_Left16:
   1490          case Iop_Left32:
   1491          case Iop_Left64: {
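                     /* Computes x | -x (negate a copy, OR the original back
                        in), which sets every bit from the lowest set bit of
                        x upwards. */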
   1492             HReg dst = newVRegI(env);
   1493             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1494             addInstr(env, mk_iMOVsd_RR(src, dst));
   1495             addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
   1496             addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
   1497             return dst;
   1498          }
   1499 
   1500          case Iop_V128to32: {
   1501             HReg        dst     = newVRegI(env);
   1502             HReg        vec     = iselVecExpr(env, e->Iex.Unop.arg);
   1503             AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
   1504             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
   1505             addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
   1506             return dst;
   1507          }
   1508 
   1509          /* V128{HI}to64 */
   1510          case Iop_V128HIto64:
   1511          case Iop_V128to64: {
   1512             HReg dst = newVRegI(env);
   1513             Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? -8 : -16;
   1514             HReg rsp = hregAMD64_RSP();
   1515             HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
   1516             AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   1517             AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
   1518             addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
   1519                                              16, vec, m16_rsp));
   1520             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
   1521                                              AMD64RMI_Mem(off_rsp), dst ));
   1522             return dst;
   1523          }
   1524 
   1525          case Iop_V256to64_0: case Iop_V256to64_1:
   1526          case Iop_V256to64_2: case Iop_V256to64_3: {
   1527             HReg vHi, vLo, vec;
   1528             iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
    1529             /* Do the first part of the selection by deciding which
    1530                of the two 128-bit registers to look at, and the second
    1531                part using the same scheme as for V128{HI}to64 above. */
   1532             Int off = 0;
   1533             switch (e->Iex.Unop.op) {
   1534                case Iop_V256to64_0: vec = vLo; off = -16; break;
   1535                case Iop_V256to64_1: vec = vLo; off =  -8; break;
   1536                case Iop_V256to64_2: vec = vHi; off = -16; break;
   1537                case Iop_V256to64_3: vec = vHi; off =  -8; break;
   1538                default: vassert(0);
   1539             }
   1540             HReg        dst     = newVRegI(env);
   1541             HReg        rsp     = hregAMD64_RSP();
   1542             AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   1543             AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
   1544             addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
   1545                                              16, vec, m16_rsp));
   1546             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
   1547                                              AMD64RMI_Mem(off_rsp), dst ));
   1548             return dst;
   1549          }
   1550 
   1551          /* ReinterpF64asI64(e) */
   1552          /* Given an IEEE754 double, produce an I64 with the same bit
   1553             pattern. */
   1554          case Iop_ReinterpF64asI64: {
   1555             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   1556             HReg        dst    = newVRegI(env);
   1557             HReg        src    = iselDblExpr(env, e->Iex.Unop.arg);
   1558             /* paranoia */
   1559             set_SSE_rounding_default(env);
   1560             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
   1561             addInstr(env, AMD64Instr_Alu64R(
   1562                              Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
   1563             return dst;
   1564          }
   1565 
   1566          /* ReinterpF32asI32(e) */
   1567          /* Given an IEEE754 single, produce an I64 with the same bit
   1568             pattern in the lower half. */
   1569          case Iop_ReinterpF32asI32: {
   1570             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   1571             HReg        dst    = newVRegI(env);
   1572             HReg        src    = iselFltExpr(env, e->Iex.Unop.arg);
   1573             /* paranoia */
   1574             set_SSE_rounding_default(env);
   1575             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
   1576             addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
   1577             return dst;
   1578          }
   1579 
   1580          case Iop_16to8:
   1581          case Iop_32to8:
   1582          case Iop_64to8:
   1583          case Iop_32to16:
   1584          case Iop_64to16:
   1585          case Iop_64to32:
   1586             /* These are no-ops. */
   1587             return iselIntExpr_R(env, e->Iex.Unop.arg);
   1588 
   1589          default:
   1590             break;
   1591       }
   1592 
   1593       /* Deal with unary 64-bit SIMD ops. */
   1594       switch (e->Iex.Unop.op) {
   1595          case Iop_CmpNEZ32x2:
   1596             fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
   1597          case Iop_CmpNEZ16x4:
   1598             fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
   1599          case Iop_CmpNEZ8x8:
   1600             fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
   1601          default:
   1602             fn = (HWord)0; break;
   1603       }
   1604       if (fn != (HWord)0) {
    1605          /* Note: the following assumes all helpers are of
    1606             signature
    1607                ULong fn ( ULong ),
    1608             and that they are not marked as regparm functions.
    1609          */
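                  /* (Concretely, the single argument travels in %rdi and
                     the result comes back in %rax, matching the usual
                     AMD64 ELF/SysV convention; hence the two register
                     moves around the call.) */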
   1610          HReg dst = newVRegI(env);
   1611          HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
   1612          addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
   1613          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1 ));
   1614          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
   1615          return dst;
   1616       }
   1617 
   1618       break;
   1619    }
   1620 
   1621    /* --------- GET --------- */
   1622    case Iex_Get: {
   1623       if (ty == Ity_I64) {
   1624          HReg dst = newVRegI(env);
   1625          addInstr(env, AMD64Instr_Alu64R(
   1626                           Aalu_MOV,
   1627                           AMD64RMI_Mem(
   1628                              AMD64AMode_IR(e->Iex.Get.offset,
   1629                                            hregAMD64_RBP())),
   1630                           dst));
   1631          return dst;
   1632       }
   1633       if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
   1634          HReg dst = newVRegI(env);
   1635          addInstr(env, AMD64Instr_LoadEX(
   1636                           toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
   1637                           False,
   1638                           AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
   1639                           dst));
   1640          return dst;
   1641       }
   1642       break;
   1643    }
   1644 
   1645    case Iex_GetI: {
   1646       AMD64AMode* am
   1647          = genGuestArrayOffset(
   1648               env, e->Iex.GetI.descr,
   1649                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   1650       HReg dst = newVRegI(env);
   1651       if (ty == Ity_I8) {
   1652          addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
   1653          return dst;
   1654       }
   1655       if (ty == Ity_I64) {
   1656          addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
   1657          return dst;
   1658       }
   1659       break;
   1660    }
   1661 
   1662    /* --------- CCALL --------- */
   1663    case Iex_CCall: {
   1664       HReg    dst = newVRegI(env);
   1665       vassert(ty == e->Iex.CCall.retty);
   1666 
   1667       /* be very restrictive for now.  Only 64-bit ints allowed
   1668          for args, and 64 or 32 bits for return type. */
   1669       if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
   1670          goto irreducible;
   1671 
   1672       /* Marshal args, do the call. */
   1673       doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
   1674 
   1675       /* Move to dst, and zero out the top 32 bits if the result type is
   1676          Ity_I32.  Probably overkill, but still .. */
   1677       if (e->Iex.CCall.retty == Ity_I64)
   1678          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
   1679       else
   1680          addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
   1681 
   1682       return dst;
   1683    }
   1684 
   1685    /* --------- LITERAL --------- */
   1686    /* 64/32/16/8-bit literals */
   1687    case Iex_Const:
   1688       if (ty == Ity_I64) {
   1689          HReg r = newVRegI(env);
   1690          addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
   1691          return r;
   1692       } else {
   1693          AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
   1694          HReg      r   = newVRegI(env);
   1695          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
   1696          return r;
   1697       }
   1698 
   1699    /* --------- MULTIPLEX --------- */
   1700    case Iex_Mux0X: {
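               /* Strategy: start with exprX in dst, test the low byte of the
                  condition, and conditionally overwrite dst with expr0 when
                  that byte is zero; i.e. cond == 0 selects expr0, anything
                  nonzero selects exprX. */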
    1701       if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
    1702           && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
    1703          HReg     r8;
    1704          HReg     rX  = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
    1705          AMD64RM* r0  = iselIntExpr_RM(env, e->Iex.Mux0X.expr0);
    1706          HReg     dst = newVRegI(env);
    1707          addInstr(env, mk_iMOVsd_RR(rX,dst));
    1708          r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
    1709          addInstr(env, AMD64Instr_Test64(0xFF, r8));
    1710          addInstr(env, AMD64Instr_CMov64(Acc_Z,r0,dst));
    1711          return dst;
   1712       }
   1713       break;
   1714    }
   1715 
   1716    /* --------- TERNARY OP --------- */
   1717    case Iex_Triop: {
   1718       IRTriop *triop = e->Iex.Triop.details;
   1719       /* C3210 flags following FPU partial remainder (fprem), both
   1720          IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
   1721       if (triop->op == Iop_PRemC3210F64
   1722           || triop->op == Iop_PRem1C3210F64) {
   1723          AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   1724          HReg        arg1   = iselDblExpr(env, triop->arg2);
   1725          HReg        arg2   = iselDblExpr(env, triop->arg3);
   1726          HReg        dst    = newVRegI(env);
   1727          addInstr(env, AMD64Instr_A87Free(2));
   1728 
   1729          /* one arg -> top of x87 stack */
   1730          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
   1731          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   1732 
   1733          /* other arg -> top of x87 stack */
   1734          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
   1735          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   1736 
   1737          switch (triop->op) {
   1738             case Iop_PRemC3210F64:
   1739                addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
   1740                break;
   1741             case Iop_PRem1C3210F64:
   1742                addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
   1743                break;
   1744             default:
   1745                vassert(0);
   1746          }
   1747          /* Ignore the result, and instead make off with the FPU's
    1748             C3210 flags (in the status word). */
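                  /* (C0, C1, C2 and C3 live at bits 8, 9, 10 and 14 of the
                     x87 status word, so the 0x4700 mask below keeps exactly
                     those four bits.) */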
   1749          addInstr(env, AMD64Instr_A87StSW(m8_rsp));
   1750          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
   1751          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
   1752          return dst;
   1753       }
   1754       break;
   1755    }
   1756 
    1757    default:
    1758       break;
   1759    } /* switch (e->tag) */
   1760 
   1761    /* We get here if no pattern matched. */
   1762   irreducible:
   1763    ppIRExpr(e);
   1764    vpanic("iselIntExpr_R(amd64): cannot reduce tree");
   1765 }
   1766 
   1767 
   1768 /*---------------------------------------------------------*/
   1769 /*--- ISEL: Integer expression auxiliaries              ---*/
   1770 /*---------------------------------------------------------*/
   1771 
   1772 /* --------------------- AMODEs --------------------- */
   1773 
   1774 /* Return an AMode which computes the value of the specified
   1775    expression, possibly also adding insns to the code list as a
    1776    result.  The expression may only be a 64-bit one.
   1777 */
   1778 
   1779 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
   1780 {
   1781    AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
   1782    vassert(sane_AMode(am));
   1783    return am;
   1784 }
   1785 
   1786 /* DO NOT CALL THIS DIRECTLY ! */
   1787 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
   1788 {
   1789    MatchInfo mi;
   1790    DECLARE_PATTERN(p_complex);
   1791    IRType ty = typeOfIRExpr(env->type_env,e);
   1792    vassert(ty == Ity_I64);
   1793 
   1794    /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
   1795    /*              bind0        bind1  bind2   bind3   */
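            /* This maps onto the amd64 base + index*scale + disp32
               addressing form, which is why the shift amount is only
               accepted when it is 0, 1, 2 or 3 (scales 1, 2, 4, 8) and the
               displacement must fit in 32 bits. */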
   1796    DEFINE_PATTERN(p_complex,
   1797       binop( Iop_Add64,
   1798              binop( Iop_Add64,
   1799                     bind(0),
   1800                     binop(Iop_Shl64, bind(1), bind(2))
   1801                   ),
   1802              bind(3)
   1803            )
   1804    );
   1805    if (matchIRExpr(&mi, p_complex, e)) {
   1806       IRExpr* expr1  = mi.bindee[0];
   1807       IRExpr* expr2  = mi.bindee[1];
   1808       IRExpr* imm8   = mi.bindee[2];
   1809       IRExpr* simm32 = mi.bindee[3];
   1810       if (imm8->tag == Iex_Const
   1811           && imm8->Iex.Const.con->tag == Ico_U8
   1812           && imm8->Iex.Const.con->Ico.U8 < 4
   1813           /* imm8 is OK, now check simm32 */
   1814           && simm32->tag == Iex_Const
   1815           && simm32->Iex.Const.con->tag == Ico_U64
   1816           && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
   1817          UInt shift = imm8->Iex.Const.con->Ico.U8;
   1818          UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
   1819          HReg r1 = iselIntExpr_R(env, expr1);
   1820          HReg r2 = iselIntExpr_R(env, expr2);
   1821          vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
   1822          return AMD64AMode_IRRS(offset, r1, r2, shift);
   1823       }
   1824    }
   1825 
   1826    /* Add64(expr1, Shl64(expr2, imm)) */
   1827    if (e->tag == Iex_Binop
   1828        && e->Iex.Binop.op == Iop_Add64
   1829        && e->Iex.Binop.arg2->tag == Iex_Binop
   1830        && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
   1831        && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
   1832        && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
   1833       UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
   1834       if (shift == 1 || shift == 2 || shift == 3) {
   1835          HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1836          HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
   1837          return AMD64AMode_IRRS(0, r1, r2, shift);
   1838       }
   1839    }
   1840 
   1841    /* Add64(expr,i) */
   1842    if (e->tag == Iex_Binop
   1843        && e->Iex.Binop.op == Iop_Add64
   1844        && e->Iex.Binop.arg2->tag == Iex_Const
   1845        && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
   1846        && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
   1847       HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1848       return AMD64AMode_IR(
   1849                 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
   1850                 r1
   1851              );
   1852    }
   1853 
   1854    /* Doesn't match anything in particular.  Generate it into
   1855       a register and use that. */
   1856    {
   1857       HReg r1 = iselIntExpr_R(env, e);
   1858       return AMD64AMode_IR(0, r1);
   1859    }
   1860 }
   1861 
   1862 
   1863 /* --------------------- RMIs --------------------- */
   1864 
    1865 /* Similarly, calculate an expression into an AMD64RMI operand.  As with
    1866    iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits.  */
   1867 
   1868 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
   1869 {
   1870    AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
   1871    /* sanity checks ... */
   1872    switch (rmi->tag) {
   1873       case Armi_Imm:
   1874          return rmi;
   1875       case Armi_Reg:
   1876          vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
   1877          vassert(hregIsVirtual(rmi->Armi.Reg.reg));
   1878          return rmi;
   1879       case Armi_Mem:
   1880          vassert(sane_AMode(rmi->Armi.Mem.am));
   1881          return rmi;
   1882       default:
   1883          vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
   1884    }
   1885 }
   1886 
   1887 /* DO NOT CALL THIS DIRECTLY ! */
   1888 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
   1889 {
   1890    IRType ty = typeOfIRExpr(env->type_env,e);
   1891    vassert(ty == Ity_I64 || ty == Ity_I32
   1892            || ty == Ity_I16 || ty == Ity_I8);
   1893 
   1894    /* special case: immediate 64/32/16/8 */
   1895    if (e->tag == Iex_Const) {
   1896       switch (e->Iex.Const.con->tag) {
    1897          case Ico_U64:
    1898             if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
    1899                return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
    1900             }
    1901             break;
    1902          case Ico_U32:
    1903             return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32);
    1904          case Ico_U16:
    1905             return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
    1906          case Ico_U8:
    1907             return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
   1908          default:
   1909             vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
   1910       }
   1911    }
   1912 
   1913    /* special case: 64-bit GET */
   1914    if (e->tag == Iex_Get && ty == Ity_I64) {
   1915       return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
   1916                                         hregAMD64_RBP()));
   1917    }
   1918 
   1919    /* special case: 64-bit load from memory */
   1920    if (e->tag == Iex_Load && ty == Ity_I64
   1921        && e->Iex.Load.end == Iend_LE) {
   1922       AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   1923       return AMD64RMI_Mem(am);
   1924    }
   1925 
   1926    /* default case: calculate into a register and return that */
   1927    {
   1928       HReg r = iselIntExpr_R ( env, e );
   1929       return AMD64RMI_Reg(r);
   1930    }
   1931 }
   1932 
   1933 
   1934 /* --------------------- RIs --------------------- */
   1935 
   1936 /* Calculate an expression into an AMD64RI operand.  As with
   1937    iselIntExpr_R, the expression can have type 64, 32, 16 or 8
   1938    bits. */
   1939 
   1940 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
   1941 {
   1942    AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
   1943    /* sanity checks ... */
   1944    switch (ri->tag) {
   1945       case Ari_Imm:
   1946          return ri;
   1947       case Ari_Reg:
   1948          vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
   1949          vassert(hregIsVirtual(ri->Ari.Reg.reg));
   1950          return ri;
   1951       default:
   1952          vpanic("iselIntExpr_RI: unknown amd64 RI tag");
   1953    }
   1954 }
   1955 
   1956 /* DO NOT CALL THIS DIRECTLY ! */
   1957 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
   1958 {
   1959    IRType ty = typeOfIRExpr(env->type_env,e);
   1960    vassert(ty == Ity_I64 || ty == Ity_I32
   1961            || ty == Ity_I16 || ty == Ity_I8);
   1962 
   1963    /* special case: immediate */
   1964    if (e->tag == Iex_Const) {
   1965       switch (e->Iex.Const.con->tag) {
    1966          case Ico_U64:
    1967             if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
    1968                return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
    1969             }
    1970             break;
   1971          case Ico_U32:
   1972             return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
   1973          case Ico_U16:
   1974             return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
   1975          case Ico_U8:
   1976             return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
   1977          default:
    1978             vpanic("iselIntExpr_RI.Iex_Const(amd64)");
   1979       }
   1980    }
   1981 
   1982    /* default case: calculate into a register and return that */
   1983    {
   1984       HReg r = iselIntExpr_R ( env, e );
   1985       return AMD64RI_Reg(r);
   1986    }
   1987 }
   1988 
   1989 
   1990 /* --------------------- RMs --------------------- */
   1991 
   1992 /* Similarly, calculate an expression into an AMD64RM operand.  As
   1993    with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
   1994    bits.  */
   1995 
   1996 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
   1997 {
   1998    AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
   1999    /* sanity checks ... */
   2000    switch (rm->tag) {
   2001       case Arm_Reg:
   2002          vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
   2003          vassert(hregIsVirtual(rm->Arm.Reg.reg));
   2004          return rm;
   2005       case Arm_Mem:
   2006          vassert(sane_AMode(rm->Arm.Mem.am));
   2007          return rm;
   2008       default:
   2009          vpanic("iselIntExpr_RM: unknown amd64 RM tag");
   2010    }
   2011 }
   2012 
   2013 /* DO NOT CALL THIS DIRECTLY ! */
   2014 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
   2015 {
   2016    IRType ty = typeOfIRExpr(env->type_env,e);
   2017    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
   2018 
   2019    /* special case: 64-bit GET */
   2020    if (e->tag == Iex_Get && ty == Ity_I64) {
   2021       return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
   2022                                        hregAMD64_RBP()));
   2023    }
   2024 
   2025    /* special case: load from memory */
   2026 
   2027    /* default case: calculate into a register and return that */
   2028    {
   2029       HReg r = iselIntExpr_R ( env, e );
   2030       return AMD64RM_Reg(r);
   2031    }
   2032 }
   2033 
   2034 
   2035 /* --------------------- CONDCODE --------------------- */
   2036 
    2037 /* Generate code to evaluate a bit-typed expression, returning the
    2038    condition code which holds exactly when the expression would
    2039    notionally have returned 1. */
   2040 
   2041 static AMD64CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
   2042 {
   2043    /* Uh, there's nothing we can sanity check here, unfortunately. */
   2044    return iselCondCode_wrk(env,e);
   2045 }
   2046 
   2047 /* DO NOT CALL THIS DIRECTLY ! */
   2048 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
   2049 {
   2050    MatchInfo mi;
   2051 
   2052    vassert(e);
   2053    vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
   2054 
   2055    /* var */
   2056    if (e->tag == Iex_RdTmp) {
   2057       HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2058       HReg dst = newVRegI(env);
   2059       addInstr(env, mk_iMOVsd_RR(r64,dst));
   2060       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
   2061       return Acc_NZ;
   2062    }
   2063 
   2064    /* Constant 1:Bit */
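            /* The register value itself is irrelevant; what matters is
               returning a condition that always holds (for a 1 constant) or
               never holds (for 0).  xor r,r forces ZF to 1, so Acc_Z is
               always true here and Acc_NZ never is. */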
   2065    if (e->tag == Iex_Const) {
   2066       HReg r;
   2067       vassert(e->Iex.Const.con->tag == Ico_U1);
   2068       vassert(e->Iex.Const.con->Ico.U1 == True
   2069               || e->Iex.Const.con->Ico.U1 == False);
   2070       r = newVRegI(env);
   2071       addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
   2072       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
   2073       return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
   2074    }
   2075 
   2076    /* Not1(...) */
   2077    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
   2078       /* Generate code for the arg, and negate the test condition */
   2079       return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
   2080    }
   2081 
   2082    /* --- patterns rooted at: 64to1 --- */
   2083 
   2084    /* 64to1 */
   2085    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
   2086       HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
   2087       addInstr(env, AMD64Instr_Test64(1,reg));
   2088       return Acc_NZ;
   2089    }
   2090 
   2091    /* --- patterns rooted at: 32to1 --- */
   2092 
   2093    /* 32to1 */
   2094    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
   2095       HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
   2096       addInstr(env, AMD64Instr_Test64(1,reg));
   2097       return Acc_NZ;
   2098    }
   2099 
   2100    /* --- patterns rooted at: CmpNEZ8 --- */
   2101 
   2102    /* CmpNEZ8(x) */
   2103    if (e->tag == Iex_Unop
   2104        && e->Iex.Unop.op == Iop_CmpNEZ8) {
   2105       HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
   2106       addInstr(env, AMD64Instr_Test64(0xFF,r));
   2107       return Acc_NZ;
   2108    }
   2109 
   2110    /* --- patterns rooted at: CmpNEZ16 --- */
   2111 
   2112    /* CmpNEZ16(x) */
   2113    if (e->tag == Iex_Unop
   2114        && e->Iex.Unop.op == Iop_CmpNEZ16) {
   2115       HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
   2116       addInstr(env, AMD64Instr_Test64(0xFFFF,r));
   2117       return Acc_NZ;
   2118    }
   2119 
   2120    /* --- patterns rooted at: CmpNEZ32 --- */
   2121 
   2122    /* CmpNEZ32(x) */
   2123    if (e->tag == Iex_Unop
   2124        && e->Iex.Unop.op == Iop_CmpNEZ32) {
   2125       HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
   2126       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
   2127       addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
   2128       return Acc_NZ;
   2129    }
   2130 
   2131    /* --- patterns rooted at: CmpNEZ64 --- */
   2132 
   2133    /* CmpNEZ64(Or64(x,y)) */
   2134    {
   2135       DECLARE_PATTERN(p_CmpNEZ64_Or64);
   2136       DEFINE_PATTERN(p_CmpNEZ64_Or64,
   2137                      unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
   2138       if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
   2139          HReg      r0   = iselIntExpr_R(env, mi.bindee[0]);
   2140          AMD64RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
   2141          HReg      tmp  = newVRegI(env);
   2142          addInstr(env, mk_iMOVsd_RR(r0, tmp));
   2143          addInstr(env, AMD64Instr_Alu64R(Aalu_OR,rmi1,tmp));
   2144          return Acc_NZ;
   2145       }
   2146    }
   2147 
   2148    /* CmpNEZ64(x) */
   2149    if (e->tag == Iex_Unop
   2150        && e->Iex.Unop.op == Iop_CmpNEZ64) {
   2151       HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
   2152       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
   2153       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
   2154       return Acc_NZ;
   2155    }
   2156 
   2157    /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
   2158 
   2159    /* CmpEQ8 / CmpNE8 */
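            /* Strategy: xor the operands and mask to the low 8 bits; the
               low bytes are equal iff the masked result is zero, so EQ maps
               to Acc_Z and NE to Acc_NZ. */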
   2160    if (e->tag == Iex_Binop
   2161        && (e->Iex.Binop.op == Iop_CmpEQ8
   2162            || e->Iex.Binop.op == Iop_CmpNE8
   2163            || e->Iex.Binop.op == Iop_CasCmpEQ8
   2164            || e->Iex.Binop.op == Iop_CasCmpNE8)) {
   2165       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2166       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2167       HReg      r    = newVRegI(env);
   2168       addInstr(env, mk_iMOVsd_RR(r1,r));
   2169       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
   2170       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
   2171       switch (e->Iex.Binop.op) {
   2172          case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
   2173          case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
   2174          default: vpanic("iselCondCode(amd64): CmpXX8");
   2175       }
   2176    }
   2177 
   2178    /* CmpEQ16 / CmpNE16 */
   2179    if (e->tag == Iex_Binop
   2180        && (e->Iex.Binop.op == Iop_CmpEQ16
   2181            || e->Iex.Binop.op == Iop_CmpNE16
   2182            || e->Iex.Binop.op == Iop_CasCmpEQ16
   2183            || e->Iex.Binop.op == Iop_CasCmpNE16)) {
   2184       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2185       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2186       HReg      r    = newVRegI(env);
   2187       addInstr(env, mk_iMOVsd_RR(r1,r));
   2188       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
   2189       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
   2190       switch (e->Iex.Binop.op) {
   2191          case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
   2192          case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
   2193          default: vpanic("iselCondCode(amd64): CmpXX16");
   2194       }
   2195    }
   2196 
   2197    /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
   2198       Saves a "movq %rax, %tmp" compared to the default route. */
   2199    if (e->tag == Iex_Binop
   2200        && e->Iex.Binop.op == Iop_CmpNE64
   2201        && e->Iex.Binop.arg1->tag == Iex_CCall
   2202        && e->Iex.Binop.arg2->tag == Iex_Const) {
   2203       IRExpr* cal = e->Iex.Binop.arg1;
   2204       IRExpr* con = e->Iex.Binop.arg2;
   2205       HReg    tmp = newVRegI(env);
   2206       /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
   2207       vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
   2208       vassert(con->Iex.Const.con->tag == Ico_U64);
   2209       /* Marshal args, do the call. */
   2210       doHelperCall( env, False, NULL, cal->Iex.CCall.cee, cal->Iex.CCall.args );
   2211       addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
   2212       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
   2213                                       AMD64RMI_Reg(hregAMD64_RAX()), tmp));
   2214       return Acc_NZ;
   2215    }
   2216 
   2217    /* Cmp*64*(x,y) */
   2218    if (e->tag == Iex_Binop
   2219        && (e->Iex.Binop.op == Iop_CmpEQ64
   2220            || e->Iex.Binop.op == Iop_CmpNE64
   2221            || e->Iex.Binop.op == Iop_CmpLT64S
   2222            || e->Iex.Binop.op == Iop_CmpLT64U
   2223            || e->Iex.Binop.op == Iop_CmpLE64S
   2224            || e->Iex.Binop.op == Iop_CmpLE64U
   2225            || e->Iex.Binop.op == Iop_CasCmpEQ64
   2226            || e->Iex.Binop.op == Iop_CasCmpNE64)) {
   2227       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2228       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2229       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
   2230       switch (e->Iex.Binop.op) {
   2231          case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
   2232          case Iop_CmpNE64: case Iop_CasCmpNE64: return Acc_NZ;
    2233          case Iop_CmpLT64S: return Acc_L;
    2234          case Iop_CmpLT64U: return Acc_B;
    2235          case Iop_CmpLE64S: return Acc_LE;
   2236          case Iop_CmpLE64U: return Acc_BE;
   2237          default: vpanic("iselCondCode(amd64): CmpXX64");
   2238       }
   2239    }
   2240 
   2241    /* Cmp*32*(x,y) */
   2242    if (e->tag == Iex_Binop
   2243        && (e->Iex.Binop.op == Iop_CmpEQ32
   2244            || e->Iex.Binop.op == Iop_CmpNE32
   2245            || e->Iex.Binop.op == Iop_CmpLT32S
   2246            || e->Iex.Binop.op == Iop_CmpLT32U
   2247            || e->Iex.Binop.op == Iop_CmpLE32S
   2248            || e->Iex.Binop.op == Iop_CmpLE32U
   2249            || e->Iex.Binop.op == Iop_CasCmpEQ32
   2250            || e->Iex.Binop.op == Iop_CasCmpNE32)) {
   2251       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2252       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2253       addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
   2254       switch (e->Iex.Binop.op) {
   2255          case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
   2256          case Iop_CmpNE32: case Iop_CasCmpNE32: return Acc_NZ;
    2257          case Iop_CmpLT32S: return Acc_L;
    2258          case Iop_CmpLT32U: return Acc_B;
    2259          case Iop_CmpLE32S: return Acc_LE;
   2260          case Iop_CmpLE32U: return Acc_BE;
   2261          default: vpanic("iselCondCode(amd64): CmpXX32");
   2262       }
   2263    }
   2264 
   2265    ppIRExpr(e);
   2266    vpanic("iselCondCode(amd64)");
   2267 }
   2268 
   2269 
   2270 /*---------------------------------------------------------*/
   2271 /*--- ISEL: Integer expressions (128 bit)               ---*/
   2272 /*---------------------------------------------------------*/
   2273 
   2274 /* Compute a 128-bit value into a register pair, which is returned as
   2275    the first two parameters.  As with iselIntExpr_R, these may be
   2276    either real or virtual regs; in any case they must not be changed
   2277    by subsequent code emitted by the caller.  */
   2278 
   2279 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
   2280                              ISelEnv* env, IRExpr* e )
   2281 {
   2282    iselInt128Expr_wrk(rHi, rLo, env, e);
   2283 #  if 0
   2284    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2285 #  endif
   2286    vassert(hregClass(*rHi) == HRcInt64);
   2287    vassert(hregIsVirtual(*rHi));
   2288    vassert(hregClass(*rLo) == HRcInt64);
   2289    vassert(hregIsVirtual(*rLo));
   2290 }
   2291 
   2292 /* DO NOT CALL THIS DIRECTLY ! */
   2293 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
   2294                                  ISelEnv* env, IRExpr* e )
   2295 {
   2296    vassert(e);
   2297    vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
   2298 
   2299    /* read 128-bit IRTemp */
   2300    if (e->tag == Iex_RdTmp) {
   2301       lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
   2302       return;
   2303    }
   2304 
   2305    /* --------- BINARY ops --------- */
   2306    if (e->tag == Iex_Binop) {
   2307       switch (e->Iex.Binop.op) {
   2308          /* 64 x 64 -> 128 multiply */
   2309          case Iop_MullU64:
   2310          case Iop_MullS64: {
    2311             /* Get one operand into %rax, and the other into an R/M.
    2312                Need to make an educated guess about which operand is
    2313                better off in which role. */
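                     /* (The widening multiply takes one factor implicitly
                        in %rax and writes the 128-bit product to %rdx:%rax,
                        which is why rRight is forced into %rax and both
                        halves are copied back out afterwards.) */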
   2314             HReg     tLo    = newVRegI(env);
   2315             HReg     tHi    = newVRegI(env);
   2316             Bool     syned  = toBool(e->Iex.Binop.op == Iop_MullS64);
   2317             AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
   2318             HReg     rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2319             addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
   2320             addInstr(env, AMD64Instr_MulL(syned, rmLeft));
   2321             /* Result is now in RDX:RAX.  Tell the caller. */
   2322             addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
   2323             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
   2324             *rHi = tHi;
   2325             *rLo = tLo;
   2326             return;
   2327          }
   2328 
   2329          /* 128 x 64 -> (64(rem),64(div)) division */
   2330          case Iop_DivModU128to64:
   2331          case Iop_DivModS128to64: {
   2332             /* Get the 128-bit operand into rdx:rax, and the other into
   2333                any old R/M. */
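                     /* (The divide takes its 128-bit dividend implicitly in
                        %rdx:%rax and leaves quotient in %rax, remainder in
                        %rdx; the register shuffling below sets that up and
                        then unpicks it.) */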
   2334             HReg sHi, sLo;
   2335             HReg     tLo     = newVRegI(env);
   2336             HReg     tHi     = newVRegI(env);
   2337             Bool     syned   = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
   2338             AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
   2339             iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
   2340             addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
   2341             addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
   2342             addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
   2343             addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
   2344             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
   2345             *rHi = tHi;
   2346             *rLo = tLo;
   2347             return;
   2348          }
   2349 
   2350          /* 64HLto128(e1,e2) */
   2351          case Iop_64HLto128:
   2352             *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2353             *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2354             return;
   2355 
   2356          default:
   2357             break;
   2358       }
   2359    } /* if (e->tag == Iex_Binop) */
   2360 
   2361    ppIRExpr(e);
   2362    vpanic("iselInt128Expr");
   2363 }
   2364 
   2365 
   2366 /*---------------------------------------------------------*/
   2367 /*--- ISEL: Floating point expressions (32 bit)         ---*/
   2368 /*---------------------------------------------------------*/
   2369 
   2370 /* Nothing interesting here; really just wrappers for
   2371    64-bit stuff. */
   2372 
   2373 static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
   2374 {
   2375    HReg r = iselFltExpr_wrk( env, e );
   2376 #  if 0
   2377    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2378 #  endif
   2379    vassert(hregClass(r) == HRcVec128);
   2380    vassert(hregIsVirtual(r));
   2381    return r;
   2382 }
   2383 
   2384 /* DO NOT CALL THIS DIRECTLY */
   2385 static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
   2386 {
   2387    IRType ty = typeOfIRExpr(env->type_env,e);
   2388    vassert(ty == Ity_F32);
   2389 
   2390    if (e->tag == Iex_RdTmp) {
   2391       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2392    }
   2393 
   2394    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2395       AMD64AMode* am;
   2396       HReg res = newVRegV(env);
   2397       vassert(e->Iex.Load.ty == Ity_F32);
   2398       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2399       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
   2400       return res;
   2401    }
   2402 
   2403    if (e->tag == Iex_Binop
   2404        && e->Iex.Binop.op == Iop_F64toF32) {
   2405       /* Although the result is still held in a standard SSE register,
   2406          we need to round it to reflect the loss of accuracy/range
   2407          entailed in casting it to a 32-bit float. */
   2408       HReg dst = newVRegV(env);
   2409       HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
   2410       set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
   2411       addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
   2412       set_SSE_rounding_default( env );
   2413       return dst;
   2414    }
   2415 
   2416    if (e->tag == Iex_Get) {
   2417       AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
   2418                                        hregAMD64_RBP() );
   2419       HReg res = newVRegV(env);
   2420       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
   2421       return res;
   2422    }
   2423 
   2424    if (e->tag == Iex_Unop
   2425        && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
    2426       /* Given an I32, produce an IEEE754 float with the same bit
    2427          pattern. */
    2428       HReg        dst    = newVRegV(env);
    2429       HReg        src    = iselIntExpr_R(env, e->Iex.Unop.arg);
    2430       AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
    2431       addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
    2432       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
    2433       return dst;
   2434    }
   2435 
   2436    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
   2437       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   2438       HReg        arg    = iselFltExpr(env, e->Iex.Binop.arg2);
   2439       HReg        dst    = newVRegV(env);
   2440 
    2441       /* 'arg' now holds the value to be rounded.  The first thing to do
   2442          is set the FPU's rounding mode accordingly. */
   2443 
   2444       /* Set host x87 rounding mode */
   2445       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2446 
   2447       addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
   2448       addInstr(env, AMD64Instr_A87Free(1));
   2449       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
   2450       addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
   2451       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
   2452       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
   2453 
   2454       /* Restore default x87 rounding. */
   2455       set_FPU_rounding_default( env );
   2456 
   2457       return dst;
   2458    }
   2459 
   2460    ppIRExpr(e);
   2461    vpanic("iselFltExpr_wrk");
   2462 }
   2463 
   2464 
   2465 /*---------------------------------------------------------*/
   2466 /*--- ISEL: Floating point expressions (64 bit)         ---*/
   2467 /*---------------------------------------------------------*/
   2468 
   2469 /* Compute a 64-bit floating point value into the lower half of an xmm
   2470    register, the identity of which is returned.  As with
   2471    iselIntExpr_R, the returned reg will be virtual, and it must not be
   2472    changed by subsequent code emitted by the caller.
   2473 */
   2474 
   2475 /* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
   2476 
   2477     Type                  S (1 bit)   E (11 bits)   F (52 bits)
   2478     ----                  ---------   -----------   -----------
   2479     signalling NaN        u           2047 (max)    .0uuuuu---u
   2480                                                     (with at least
   2481                                                      one 1 bit)
   2482     quiet NaN             u           2047 (max)    .1uuuuu---u
   2483 
   2484     negative infinity     1           2047 (max)    .000000---0
   2485 
   2486     positive infinity     0           2047 (max)    .000000---0
   2487 
   2488     negative zero         1           0             .000000---0
   2489 
   2490     positive zero         0           0             .000000---0
   2491 */
   2492 
   2493 static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
   2494 {
   2495    HReg r = iselDblExpr_wrk( env, e );
   2496 #  if 0
   2497    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2498 #  endif
   2499    vassert(hregClass(r) == HRcVec128);
   2500    vassert(hregIsVirtual(r));
   2501    return r;
   2502 }
   2503 
   2504 /* DO NOT CALL THIS DIRECTLY */
   2505 static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
   2506 {
   2507    IRType ty = typeOfIRExpr(env->type_env,e);
   2508    vassert(e);
   2509    vassert(ty == Ity_F64);
   2510 
   2511    if (e->tag == Iex_RdTmp) {
   2512       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2513    }
   2514 
   2515    if (e->tag == Iex_Const) {
   2516       union { ULong u64; Double f64; } u;
   2517       HReg res = newVRegV(env);
   2518       HReg tmp = newVRegI(env);
   2519       vassert(sizeof(u) == 8);
   2520       vassert(sizeof(u.u64) == 8);
   2521       vassert(sizeof(u.f64) == 8);
   2522 
   2523       if (e->Iex.Const.con->tag == Ico_F64) {
   2524          u.f64 = e->Iex.Const.con->Ico.F64;
   2525       }
   2526       else if (e->Iex.Const.con->tag == Ico_F64i) {
   2527          u.u64 = e->Iex.Const.con->Ico.F64i;
   2528       }
   2529       else
   2530          vpanic("iselDblExpr(amd64): const");
   2531 
   2532       addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
   2533       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
   2534       addInstr(env, AMD64Instr_SseLdSt(
   2535                        True/*load*/, 8, res,
   2536                        AMD64AMode_IR(0, hregAMD64_RSP())
   2537               ));
   2538       add_to_rsp(env, 8);
   2539       return res;
   2540    }
   2541 
   2542    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2543       AMD64AMode* am;
   2544       HReg res = newVRegV(env);
   2545       vassert(e->Iex.Load.ty == Ity_F64);
   2546       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2547       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
   2548       return res;
   2549    }
   2550 
   2551    if (e->tag == Iex_Get) {
   2552       AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
   2553                                       hregAMD64_RBP() );
   2554       HReg res = newVRegV(env);
   2555       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
   2556       return res;
   2557    }
   2558 
   2559    if (e->tag == Iex_GetI) {
   2560       AMD64AMode* am
   2561          = genGuestArrayOffset(
   2562               env, e->Iex.GetI.descr,
   2563                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   2564       HReg res = newVRegV(env);
   2565       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
   2566       return res;
   2567    }
   2568 
   2569    if (e->tag == Iex_Triop) {
   2570       IRTriop *triop = e->Iex.Triop.details;
   2571       AMD64SseOp op = Asse_INVALID;
   2572       switch (triop->op) {
   2573          case Iop_AddF64: op = Asse_ADDF; break;
   2574          case Iop_SubF64: op = Asse_SUBF; break;
   2575          case Iop_MulF64: op = Asse_MULF; break;
   2576          case Iop_DivF64: op = Asse_DIVF; break;
   2577          default: break;
   2578       }
   2579       if (op != Asse_INVALID) {
   2580          HReg dst  = newVRegV(env);
   2581          HReg argL = iselDblExpr(env, triop->arg2);
   2582          HReg argR = iselDblExpr(env, triop->arg3);
   2583          addInstr(env, mk_vMOVsd_RR(argL, dst));
   2584          /* XXXROUNDINGFIXME */
   2585          /* set roundingmode here */
   2586          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
   2587          return dst;
   2588       }
   2589    }
   2590 
   2591    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
   2592       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   2593       HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
   2594       HReg        dst    = newVRegV(env);
   2595 
    2596       /* 'arg' now holds the value to be rounded.  The first thing to do
   2597          is set the FPU's rounding mode accordingly. */
   2598 
   2599       /* Set host x87 rounding mode */
   2600       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2601 
   2602       addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
   2603       addInstr(env, AMD64Instr_A87Free(1));
   2604       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   2605       addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
   2606       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   2607       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   2608 
   2609       /* Restore default x87 rounding. */
   2610       set_FPU_rounding_default( env );
   2611 
   2612       return dst;
   2613    }
   2614 
   2615    IRTriop *triop = e->Iex.Triop.details;
   2616    if (e->tag == Iex_Triop
   2617        && (triop->op == Iop_ScaleF64
   2618            || triop->op == Iop_AtanF64
   2619            || triop->op == Iop_Yl2xF64
   2620            || triop->op == Iop_Yl2xp1F64
   2621            || triop->op == Iop_PRemF64
   2622            || triop->op == Iop_PRem1F64)
   2623       ) {
   2624       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   2625       HReg        arg1   = iselDblExpr(env, triop->arg2);
   2626       HReg        arg2   = iselDblExpr(env, triop->arg3);
   2627       HReg        dst    = newVRegV(env);
   2628       Bool     arg2first = toBool(triop->op == Iop_ScaleF64
   2629                                   || triop->op == Iop_PRemF64
   2630                                   || triop->op == Iop_PRem1F64);
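              /* Shape of what follows (illustrative, not exact asm):
                       movsd  %<first>,  -8(%rsp) ; fldl  -8(%rsp)
                       movsd  %<second>, -8(%rsp) ; fldl  -8(%rsp)
                       f<op>              -- fscale/fpatan/fyl2x/fprem/...
                       fstpl  -8(%rsp)    ; movsd -8(%rsp), %dst
                 arg2first only decides which SSE value is pushed first,
                 since these x87 ops consume st(0)/st(1) in a fixed
                 order. */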
   2631       addInstr(env, AMD64Instr_A87Free(2));
   2632 
   2633       /* one arg -> top of x87 stack */
   2634       addInstr(env, AMD64Instr_SseLdSt(
   2635                        False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
   2636       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   2637 
   2638       /* other arg -> top of x87 stack */
   2639       addInstr(env, AMD64Instr_SseLdSt(
   2640                        False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
   2641       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   2642 
   2643       /* do it */
   2644       /* XXXROUNDINGFIXME */
   2645       /* set roundingmode here */
   2646       switch (triop->op) {
   2647          case Iop_ScaleF64:
   2648             addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
   2649             break;
   2650          case Iop_AtanF64:
   2651             addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
   2652             break;
   2653          case Iop_Yl2xF64:
   2654             addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
   2655             break;
   2656          case Iop_Yl2xp1F64:
   2657             addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
   2658             break;
   2659          case Iop_PRemF64:
   2660             addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
   2661             break;
   2662          case Iop_PRem1F64:
   2663             addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
   2664             break;
   2665          default:
   2666             vassert(0);
   2667       }
   2668 
   2669       /* save result */
   2670       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   2671       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   2672       return dst;
   2673    }
   2674 
   2675    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
   2676       HReg dst = newVRegV(env);
   2677       HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2678       set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
   2679       addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
   2680       set_SSE_rounding_default( env );
   2681       return dst;
   2682    }
   2683 
   2684    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
   2685       HReg dst = newVRegV(env);
   2686       HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   2687       set_SSE_rounding_default( env );
   2688       addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
   2689       return dst;
   2690    }
   2691 
   2692    if (e->tag == Iex_Unop
   2693        && (e->Iex.Unop.op == Iop_NegF64
   2694            || e->Iex.Unop.op == Iop_AbsF64)) {
   2695       /* Sigh ... very rough code.  Could do much better. */
   2696       /* Get the 128-bit literal 00---0 10---0 into a register
   2697          and xor/nand it with the value to be negated. */
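              /* Roughly (illustrative): the pushes below build a 16-byte
                 mask whose only set bit is bit 63 of the low quadword
                 (the F64 sign bit); it is loaded into dst and then either
                       xorpd  %tmp, %dst    -- NegF64: flip the sign bit
                 or
                       andnpd %tmp, %dst    -- AbsF64: clear the sign bit
                 leaves the result in dst. */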
   2698       HReg r1  = newVRegI(env);
   2699       HReg dst = newVRegV(env);
   2700       HReg tmp = newVRegV(env);
   2701       HReg src = iselDblExpr(env, e->Iex.Unop.arg);
   2702       AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   2703       addInstr(env, mk_vMOVsd_RR(src,tmp));
   2704       addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
   2705       addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
   2706       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
   2707       addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
   2708 
   2709       if (e->Iex.Unop.op == Iop_NegF64)
   2710          addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
   2711       else
   2712          addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
   2713 
   2714       add_to_rsp(env, 16);
   2715       return dst;
   2716    }
   2717 
   2718    if (e->tag == Iex_Binop) {
   2719       A87FpOp fpop = Afp_INVALID;
   2720       switch (e->Iex.Binop.op) {
   2721          case Iop_SqrtF64: fpop = Afp_SQRT; break;
   2722          case Iop_SinF64:  fpop = Afp_SIN;  break;
   2723          case Iop_CosF64:  fpop = Afp_COS;  break;
   2724          case Iop_TanF64:  fpop = Afp_TAN;  break;
   2725          case Iop_2xm1F64: fpop = Afp_2XM1; break;
   2726          default: break;
   2727       }
   2728       if (fpop != Afp_INVALID) {
   2729          AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   2730          HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
   2731          HReg        dst    = newVRegV(env);
   2732          Int     nNeeded    = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
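                 /* Illustrative shape of the emitted sequence, e.g. for
                    Iop_SinF64:
                          movsd  %arg, -8(%rsp)
                          fldl   -8(%rsp)
                          fsin
                          fstpl  -8(%rsp)
                          movsd  -8(%rsp), %dst
                    The TanF64 case frees two x87 slots and does one extra
                    pop, to discard the 1.0 that fptan pushes on top of
                    its result. */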
   2733          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
   2734          addInstr(env, AMD64Instr_A87Free(nNeeded));
   2735          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   2736          /* XXXROUNDINGFIXME */
   2737          /* set roundingmode here */
   2738          addInstr(env, AMD64Instr_A87FpOp(fpop));
   2739          if (e->Iex.Binop.op==Iop_TanF64) {
   2740             /* get rid of the extra 1.0 that fptan pushes */
   2741             addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   2742          }
   2743          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   2744          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   2745          return dst;
   2746       }
   2747    }
   2748 
   2749    if (e->tag == Iex_Unop) {
   2750       switch (e->Iex.Unop.op) {
   2751 //..          case Iop_I32toF64: {
   2752 //..             HReg dst = newVRegF(env);
   2753 //..             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
   2754 //..             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
   2755 //..             set_FPU_rounding_default(env);
   2756 //..             addInstr(env, X86Instr_FpLdStI(
   2757 //..                              True/*load*/, 4, dst,
   2758 //..                              X86AMode_IR(0, hregX86_ESP())));
   2759 //..             add_to_esp(env, 4);
   2760 //..             return dst;
   2761 //..          }
   2762          case Iop_ReinterpI64asF64: {
   2763             /* Given an I64, produce an IEEE754 double with the same
   2764                bit pattern. */
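                    /* Sketch (illustrative): spill the integer just below
                       %rsp and reload those 8 bytes as a scalar double --
                             movq   <src>, -8(%rsp)
                             movsd  -8(%rsp), %dst
                       -- no conversion is done; only the bit pattern
                       moves. */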
   2765             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   2766             HReg        dst    = newVRegV(env);
   2767             AMD64RI*    src    = iselIntExpr_RI(env, e->Iex.Unop.arg);
   2768             /* paranoia */
   2769             set_SSE_rounding_default(env);
   2770             addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
   2771             addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   2772             return dst;
   2773          }
   2774          case Iop_F32toF64: {
   2775             HReg f32;
   2776             HReg f64 = newVRegV(env);
   2777             /* this shouldn't be necessary, but be paranoid ... */
   2778             set_SSE_rounding_default(env);
   2779             f32 = iselFltExpr(env, e->Iex.Unop.arg);
   2780             addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
   2781             return f64;
   2782          }
   2783          default:
   2784             break;
   2785       }
   2786    }
   2787 
   2788    /* --------- MULTIPLEX --------- */
   2789    if (e->tag == Iex_Mux0X) {
   2790       HReg r8, rX, r0, dst;
   2791       vassert(ty == Ity_F64);
   2792       vassert(typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8);
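              /* This is a register-level conditional move (sketch,
                 illustrative): dst starts out as exprX, then
                       testq $0xFF, %r8
                       <if Z: dst := r0>     -- the SseCMov pseudo-insn
                 so dst = (cond & 0xFF)==0 ? expr0 : exprX, which is the
                 Mux0X semantics. */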
   2793       r8  = iselIntExpr_R(env, e->Iex.Mux0X.cond);
   2794       rX  = iselDblExpr(env, e->Iex.Mux0X.exprX);
   2795       r0  = iselDblExpr(env, e->Iex.Mux0X.expr0);
   2796       dst = newVRegV(env);
   2797       addInstr(env, mk_vMOVsd_RR(rX,dst));
   2798       addInstr(env, AMD64Instr_Test64(0xFF, r8));
   2799       addInstr(env, AMD64Instr_SseCMov(Acc_Z,r0,dst));
   2800       return dst;
   2801    }
   2802 
   2803    ppIRExpr(e);
   2804    vpanic("iselDblExpr_wrk");
   2805 }
   2806 
   2807 
   2808 /*---------------------------------------------------------*/
   2809 /*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
   2810 /*---------------------------------------------------------*/
   2811 
   2812 static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
   2813 {
   2814    HReg r = iselVecExpr_wrk( env, e );
   2815 #  if 0
   2816    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2817 #  endif
   2818    vassert(hregClass(r) == HRcVec128);
   2819    vassert(hregIsVirtual(r));
   2820    return r;
   2821 }
   2822 
   2823 
   2824 /* DO NOT CALL THIS DIRECTLY */
   2825 static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
   2826 {
   2827    HWord      fn = 0; /* address of helper fn, if required */
   2828    Bool       arg1isEReg = False;
   2829    AMD64SseOp op = Asse_INVALID;
   2830    IRType     ty = typeOfIRExpr(env->type_env,e);
   2831    vassert(e);
   2832    vassert(ty == Ity_V128);
   2833 
   2834    if (e->tag == Iex_RdTmp) {
   2835       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2836    }
   2837 
   2838    if (e->tag == Iex_Get) {
   2839       HReg dst = newVRegV(env);
   2840       addInstr(env, AMD64Instr_SseLdSt(
   2841                        True/*load*/,
   2842                        16,
   2843                        dst,
   2844                        AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
   2845                     )
   2846               );
   2847       return dst;
   2848    }
   2849 
   2850    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2851       HReg        dst = newVRegV(env);
   2852       AMD64AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2853       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
   2854       return dst;
   2855    }
   2856 
   2857    if (e->tag == Iex_Const) {
   2858       HReg dst = newVRegV(env);
   2859       vassert(e->Iex.Const.con->tag == Ico_V128);
   2860       switch (e->Iex.Const.con->Ico.V128) {
   2861          case 0x0000:
   2862             dst = generate_zeroes_V128(env);
   2863             break;
   2864          case 0xFFFF:
   2865             dst = generate_ones_V128(env);
   2866             break;
   2867          default: {
   2868             AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   2869             /* do push_uimm64 twice, first time for the high-order half. */
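                    /* Each of the 16 bits of the Ico_V128 tag selects one
                       byte of the constant (0x00 or 0xFF); e.g. a tag of
                       0x000F denotes a vector whose low 4 bytes are 0xFF
                       and whose remaining 12 bytes are 0x00 (worked
                       example, assuming the usual Ico_V128 encoding). */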
   2870             push_uimm64(env, bitmask8_to_bytemask64(
   2871                                 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
   2872                        ));
   2873             push_uimm64(env, bitmask8_to_bytemask64(
   2874                                 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
   2875                        ));
   2876             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
   2877             add_to_rsp(env, 16);
   2878             break;
   2879          }
   2880       }
   2881       return dst;
   2882    }
   2883 
   2884    if (e->tag == Iex_Unop) {
   2885    switch (e->Iex.Unop.op) {
   2886 
   2887       case Iop_NotV128: {
   2888          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   2889          return do_sse_NotV128(env, arg);
   2890       }
   2891 
   2892       case Iop_CmpNEZ64x2: {
   2893          /* We can use SSE2 instructions for this. */
   2894          /* Ideally, we want to do a 64Ix2 comparison against zero of
   2895             the operand.  Problem is no such insn exists.  Solution
   2896             therefore is to do a 32Ix4 comparison instead, and bitwise-
   2897             negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
   2898             let the not'd result of this initial comparison be a:b:c:d.
   2899             What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
   2900             pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
   2901             giving the required result.
   2902 
   2903             The required selection sequence is 2,3,0,1, which
   2904             according to Intel's documentation means the pshufd
   2905             literal value is 0xB1, that is,
   2906             (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
   2907          */
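                 /* So the sequence below is roughly (illustrative):
                          pxor     %tmp, %tmp         -- tmp = all zeroes
                          pcmpeqd  %arg, %tmp         -- 32x4 compare with 0
                          <NOT tmp>                   -- via do_sse_NotV128
                          pshufd   $0xB1, %tmp, %dst  -- dst = b:a:d:c
                          por      %tmp, %dst         -- (a|b):(a|b):(c|d):(c|d) */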
   2908          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
   2909          HReg tmp  = generate_zeroes_V128(env);
   2910          HReg dst  = newVRegV(env);
   2911          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
   2912          tmp = do_sse_NotV128(env, tmp);
   2913          addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
   2914          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
   2915          return dst;
   2916       }
   2917 
   2918       case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
   2919       case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
   2920       case Iop_CmpNEZ8x16: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
   2921       do_CmpNEZ_vector:
   2922       {
   2923          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
   2924          HReg tmp  = newVRegV(env);
   2925          HReg zero = generate_zeroes_V128(env);
   2926          HReg dst;
   2927          addInstr(env, mk_vMOVsd_RR(arg, tmp));
   2928          addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
   2929          dst = do_sse_NotV128(env, tmp);
   2930          return dst;
   2931       }
   2932 
   2933       case Iop_Recip32Fx4: op = Asse_RCPF;   goto do_32Fx4_unary;
   2934       case Iop_RSqrt32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
   2935       case Iop_Sqrt32Fx4:  op = Asse_SQRTF;  goto do_32Fx4_unary;
   2936       do_32Fx4_unary:
   2937       {
   2938          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   2939          HReg dst = newVRegV(env);
   2940          addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
   2941          return dst;
   2942       }
   2943 
   2944       case Iop_Sqrt64Fx2:  op = Asse_SQRTF;  goto do_64Fx2_unary;
   2945       do_64Fx2_unary:
   2946       {
   2947          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   2948          HReg dst = newVRegV(env);
   2949          addInstr(env, AMD64Instr_Sse64Fx2(op, arg, dst));
   2950          return dst;
   2951       }
   2952 
   2953       case Iop_Recip32F0x4: op = Asse_RCPF;   goto do_32F0x4_unary;
   2954       case Iop_RSqrt32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
   2955       case Iop_Sqrt32F0x4:  op = Asse_SQRTF;  goto do_32F0x4_unary;
   2956       do_32F0x4_unary:
   2957       {
   2958          /* A bit subtle.  We have to copy the arg to the result
   2959             register first, because the SSE scalar insn leaves the
   2960             upper 3/4 of the destination register unchanged,
   2961             whereas the required semantics of these primops are
   2962             that the upper 3/4 is simply copied in from the
   2963             argument. */
   2964          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   2965          HReg dst = newVRegV(env);
   2966          addInstr(env, mk_vMOVsd_RR(arg, dst));
   2967          addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
   2968          return dst;
   2969       }
   2970 
   2971       case Iop_Sqrt64F0x2:  op = Asse_SQRTF;  goto do_64F0x2_unary;
   2972       do_64F0x2_unary:
   2973       {
   2974          /* A bit subtle.  We have to copy the arg to the result
   2975             register first, because the SSE scalar insn leaves the
   2976             upper half of the destination register unchanged,
   2977             whereas the required semantics of these primops are
   2978             that the upper half is simply copied in from the
   2979             argument. */
   2980          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   2981          HReg dst = newVRegV(env);
   2982          addInstr(env, mk_vMOVsd_RR(arg, dst));
   2983          addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
   2984          return dst;
   2985       }
   2986 
   2987       case Iop_32UtoV128: {
   2988          HReg        dst     = newVRegV(env);
   2989          AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
   2990          AMD64RI*    ri      = iselIntExpr_RI(env, e->Iex.Unop.arg);
   2991          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
   2992          addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
   2993          return dst;
   2994       }
   2995 
   2996       case Iop_64UtoV128: {
   2997          HReg        dst  = newVRegV(env);
   2998          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   2999          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
   3000          addInstr(env, AMD64Instr_Push(rmi));
   3001          addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
   3002          add_to_rsp(env, 8);
   3003          return dst;
   3004       }
   3005 
   3006       case Iop_V256toV128_0:
   3007       case Iop_V256toV128_1: {
   3008          HReg vHi, vLo;
   3009          iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
   3010          return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
   3011       }
   3012 
   3013       default:
   3014          break;
   3015    } /* switch (e->Iex.Unop.op) */
   3016    } /* if (e->tag == Iex_Unop) */
   3017 
   3018    if (e->tag == Iex_Binop) {
   3019    switch (e->Iex.Binop.op) {
   3020 
   3021       /* FIXME: could we generate MOVQ here? */
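              /* Both the lo64 and lo32 cases below take the memory route
                 (sketch, illustrative only): spill the whole vector,
                 overwrite its low lane with the integer, and reload --
                       movdqu %srcV, -16(%rsp)
                       movq   %srcI, -16(%rsp)    (movl for the lo32 case)
                       movdqu -16(%rsp), %dst                             */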
   3022       case Iop_SetV128lo64: {
   3023          HReg dst  = newVRegV(env);
   3024          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3025          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3026          AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
   3027          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
   3028          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
   3029          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
   3030          return dst;
   3031       }
   3032 
   3033       /* FIXME: could we generate MOVD here? */
   3034       case Iop_SetV128lo32: {
   3035          HReg dst  = newVRegV(env);
   3036          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3037          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3038          AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
   3039          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
   3040          addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
   3041          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
   3042          return dst;
   3043       }
   3044 
   3045       case Iop_64HLtoV128: {
   3046          HReg        rsp     = hregAMD64_RSP();
   3047          AMD64AMode* m8_rsp  = AMD64AMode_IR(-8, rsp);
   3048          AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   3049          AMD64RI*    qHi = iselIntExpr_RI(env, e->Iex.Binop.arg1);
   3050          AMD64RI*    qLo = iselIntExpr_RI(env, e->Iex.Binop.arg2);
   3051          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qHi, m8_rsp));
   3052          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qLo, m16_rsp));
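                 /* Net effect (illustrative): qLo sits at -16(%rsp) and
                    qHi at -8(%rsp), so the single 16-byte load below picks
                    up qLo as the low lane and qHi as the high lane of
                    dst. */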
   3053          HReg        dst = newVRegV(env);
   3054          /* One store-forwarding stall coming up, oh well :-( */
   3055          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, m16_rsp));
   3056          return dst;
   3057       }
   3058 
   3059       case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
   3060       case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
   3061       case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
   3062       case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
   3063       case Iop_Add32Fx4:   op = Asse_ADDF;   goto do_32Fx4;
   3064       case Iop_Div32Fx4:   op = Asse_DIVF;   goto do_32Fx4;
   3065       case Iop_Max32Fx4:   op = Asse_MAXF;   goto do_32Fx4;
   3066       case Iop_Min32Fx4:   op = Asse_MINF;   goto do_32Fx4;
   3067       case Iop_Mul32Fx4:   op = Asse_MULF;   goto do_32Fx4;
   3068       case Iop_Sub32Fx4:   op = Asse_SUBF;   goto do_32Fx4;
   3069       do_32Fx4:
   3070       {
   3071          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3072          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3073          HReg dst = newVRegV(env);
   3074          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3075          addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
   3076          return dst;
   3077       }
   3078 
   3079       case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
   3080       case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
   3081       case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
   3082       case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
   3083       case Iop_Add64Fx2:   op = Asse_ADDF;   goto do_64Fx2;
   3084       case Iop_Div64Fx2:   op = Asse_DIVF;   goto do_64Fx2;
   3085       case Iop_Max64Fx2:   op = Asse_MAXF;   goto do_64Fx2;
   3086       case Iop_Min64Fx2:   op = Asse_MINF;   goto do_64Fx2;
   3087       case Iop_Mul64Fx2:   op = Asse_MULF;   goto do_64Fx2;
   3088       case Iop_Sub64Fx2:   op = Asse_SUBF;   goto do_64Fx2;
   3089       do_64Fx2:
   3090       {
   3091          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3092          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3093          HReg dst = newVRegV(env);
   3094          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3095          addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
   3096          return dst;
   3097       }
   3098 
   3099       case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
   3100       case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
   3101       case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
   3102       case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
   3103       case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4;
   3104       case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4;
   3105       case Iop_Max32F0x4:   op = Asse_MAXF;   goto do_32F0x4;
   3106       case Iop_Min32F0x4:   op = Asse_MINF;   goto do_32F0x4;
   3107       case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4;
   3108       case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4;
   3109       do_32F0x4: {
   3110          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3111          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3112          HReg dst = newVRegV(env);
   3113          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3114          addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
   3115          return dst;
   3116       }
   3117 
   3118       case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
   3119       case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
   3120       case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
   3121       case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
   3122       case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2;
   3123       case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2;
   3124       case Iop_Max64F0x2:   op = Asse_MAXF;   goto do_64F0x2;
   3125       case Iop_Min64F0x2:   op = Asse_MINF;   goto do_64F0x2;
   3126       case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2;
   3127       case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2;
   3128       do_64F0x2: {
   3129          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3130          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3131          HReg dst = newVRegV(env);
   3132          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3133          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
   3134          return dst;
   3135       }
   3136 
   3137       case Iop_QNarrowBin32Sto16Sx8:
   3138          op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
   3139       case Iop_QNarrowBin16Sto8Sx16:
   3140          op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
   3141       case Iop_QNarrowBin16Sto8Ux16:
   3142          op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
   3143 
   3144       case Iop_InterleaveHI8x16:
   3145          op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
   3146       case Iop_InterleaveHI16x8:
   3147          op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
   3148       case Iop_InterleaveHI32x4:
   3149          op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
   3150       case Iop_InterleaveHI64x2:
   3151          op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
   3152 
   3153       case Iop_InterleaveLO8x16:
   3154          op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
   3155       case Iop_InterleaveLO16x8:
   3156          op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
   3157       case Iop_InterleaveLO32x4:
   3158          op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
   3159       case Iop_InterleaveLO64x2:
   3160          op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
   3161 
   3162       case Iop_AndV128:    op = Asse_AND;      goto do_SseReRg;
   3163       case Iop_OrV128:     op = Asse_OR;       goto do_SseReRg;
   3164       case Iop_XorV128:    op = Asse_XOR;      goto do_SseReRg;
   3165       case Iop_Add8x16:    op = Asse_ADD8;     goto do_SseReRg;
   3166       case Iop_Add16x8:    op = Asse_ADD16;    goto do_SseReRg;
   3167       case Iop_Add32x4:    op = Asse_ADD32;    goto do_SseReRg;
   3168       case Iop_Add64x2:    op = Asse_ADD64;    goto do_SseReRg;
   3169       case Iop_QAdd8Sx16:  op = Asse_QADD8S;   goto do_SseReRg;
   3170       case Iop_QAdd16Sx8:  op = Asse_QADD16S;  goto do_SseReRg;
   3171       case Iop_QAdd8Ux16:  op = Asse_QADD8U;   goto do_SseReRg;
   3172       case Iop_QAdd16Ux8:  op = Asse_QADD16U;  goto do_SseReRg;
   3173       case Iop_Avg8Ux16:   op = Asse_AVG8U;    goto do_SseReRg;
   3174       case Iop_Avg16Ux8:   op = Asse_AVG16U;   goto do_SseReRg;
   3175       case Iop_CmpEQ8x16:  op = Asse_CMPEQ8;   goto do_SseReRg;
   3176       case Iop_CmpEQ16x8:  op = Asse_CMPEQ16;  goto do_SseReRg;
   3177       case Iop_CmpEQ32x4:  op = Asse_CMPEQ32;  goto do_SseReRg;
   3178       case Iop_CmpGT8Sx16: op = Asse_CMPGT8S;  goto do_SseReRg;
   3179       case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
   3180       case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
   3181       case Iop_Max16Sx8:   op = Asse_MAX16S;   goto do_SseReRg;
   3182       case Iop_Max8Ux16:   op = Asse_MAX8U;    goto do_SseReRg;
   3183       case Iop_Min16Sx8:   op = Asse_MIN16S;   goto do_SseReRg;
   3184       case Iop_Min8Ux16:   op = Asse_MIN8U;    goto do_SseReRg;
   3185       case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
   3186       case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
   3187       case Iop_Mul16x8:    op = Asse_MUL16;    goto do_SseReRg;
   3188       case Iop_Sub8x16:    op = Asse_SUB8;     goto do_SseReRg;
   3189       case Iop_Sub16x8:    op = Asse_SUB16;    goto do_SseReRg;
   3190       case Iop_Sub32x4:    op = Asse_SUB32;    goto do_SseReRg;
   3191       case Iop_Sub64x2:    op = Asse_SUB64;    goto do_SseReRg;
   3192       case Iop_QSub8Sx16:  op = Asse_QSUB8S;   goto do_SseReRg;
   3193       case Iop_QSub16Sx8:  op = Asse_QSUB16S;  goto do_SseReRg;
   3194       case Iop_QSub8Ux16:  op = Asse_QSUB8U;   goto do_SseReRg;
   3195       case Iop_QSub16Ux8:  op = Asse_QSUB16U;  goto do_SseReRg;
   3196       do_SseReRg: {
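                 /* arg1isEReg is set for the pack/unpack/interleave cases
                    above: there arg2 is copied into dst and arg1 is
                    supplied as the E (second) operand, presumably so that
                    the IR operand order comes out right for those
                    non-commutative insns; for the other ops dst starts as
                    a copy of arg1 and the insn is effectively
                    dst := dst `op` arg2. */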
   3197          HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
   3198          HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
   3199          HReg dst = newVRegV(env);
   3200          if (arg1isEReg) {
   3201             addInstr(env, mk_vMOVsd_RR(arg2, dst));
   3202             addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
   3203          } else {
   3204             addInstr(env, mk_vMOVsd_RR(arg1, dst));
   3205             addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
   3206          }
   3207          return dst;
   3208       }
   3209 
   3210       case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
   3211       case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
   3212       case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
   3213       case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
   3214       case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
   3215       case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
   3216       case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
   3217       case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
   3218       do_SseShift: {
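                 /* The SSE2 shift-by-register forms take the count from
                    the low 64 bits of an XMM register, so the count is
                    materialised on the stack, zero-padded to 16 bytes, and
                    loaded into ereg first.  Sketch (illustrative):
                          pushq  $0
                          pushq  <count>
                          movdqu (%rsp), %ereg
                          movdqa %greg, %dst
                          psllw/psrad/...  %ereg, %dst */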
   3219          HReg        greg = iselVecExpr(env, e->Iex.Binop.arg1);
   3220          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   3221          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   3222          HReg        ereg = newVRegV(env);
   3223          HReg        dst  = newVRegV(env);
   3224          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
   3225          addInstr(env, AMD64Instr_Push(rmi));
   3226          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
   3227          addInstr(env, mk_vMOVsd_RR(greg, dst));
   3228          addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
   3229          add_to_rsp(env, 16);
   3230          return dst;
   3231       }
   3232 
   3233       case Iop_Mul32x4:    fn = (HWord)h_generic_calc_Mul32x4;
   3234                            goto do_SseAssistedBinary;
   3235       case Iop_Max32Sx4:   fn = (HWord)h_generic_calc_Max32Sx4;
   3236                            goto do_SseAssistedBinary;
   3237       case Iop_Min32Sx4:   fn = (HWord)h_generic_calc_Min32Sx4;
   3238                            goto do_SseAssistedBinary;
   3239       case Iop_Max32Ux4:   fn = (HWord)h_generic_calc_Max32Ux4;
   3240                            goto do_SseAssistedBinary;
   3241       case Iop_Min32Ux4:   fn = (HWord)h_generic_calc_Min32Ux4;
   3242                            goto do_SseAssistedBinary;
   3243       case Iop_Max16Ux8:   fn = (HWord)h_generic_calc_Max16Ux8;
   3244                            goto do_SseAssistedBinary;
   3245       case Iop_Min16Ux8:   fn = (HWord)h_generic_calc_Min16Ux8;
   3246                            goto do_SseAssistedBinary;
   3247       case Iop_Max8Sx16:   fn = (HWord)h_generic_calc_Max8Sx16;
   3248                            goto do_SseAssistedBinary;
   3249       case Iop_Min8Sx16:   fn = (HWord)h_generic_calc_Min8Sx16;
   3250                            goto do_SseAssistedBinary;
   3251       case Iop_CmpEQ64x2:  fn = (HWord)h_generic_calc_CmpEQ64x2;
   3252                            goto do_SseAssistedBinary;
   3253       case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
   3254                            goto do_SseAssistedBinary;
   3255       case Iop_Perm32x4:   fn = (HWord)h_generic_calc_Perm32x4;
   3256                            goto do_SseAssistedBinary;
   3257       case Iop_QNarrowBin32Sto16Ux8:
   3258                            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
   3259                            goto do_SseAssistedBinary;
   3260       case Iop_NarrowBin16to8x16:
   3261                            fn = (HWord)h_generic_calc_NarrowBin16to8x16;
   3262                            goto do_SseAssistedBinary;
   3263       case Iop_NarrowBin32to16x8:
   3264                            fn = (HWord)h_generic_calc_NarrowBin32to16x8;
   3265                            goto do_SseAssistedBinary;
   3266       do_SseAssistedBinary: {
   3267          /* RRRufff!  RRRufff code is what we're generating here.  Oh
   3268             well. */
   3269          vassert(fn != 0);
   3270          HReg dst = newVRegV(env);
   3271          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3272          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3273          HReg argp = newVRegI(env);
   3274          /* subq $112, %rsp         -- make a space */
   3275          sub_from_rsp(env, 112);
   3276          /* leaq 48(%rsp), %r_argp  -- point into it */
   3277          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
   3278                                         argp));
   3279          /* andq $-16, %r_argp      -- 16-align the pointer */
   3280          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   3281                                          AMD64RMI_Imm( ~(UInt)15 ),
   3282                                          argp));
   3283          /* Prepare 3 arg regs:
   3284             leaq 0(%r_argp), %rdi
   3285             leaq 16(%r_argp), %rsi
   3286             leaq 32(%r_argp), %rdx
   3287          */
   3288          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
   3289                                         hregAMD64_RDI()));
   3290          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
   3291                                         hregAMD64_RSI()));
   3292          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
   3293                                         hregAMD64_RDX()));
   3294          /* Store the two args, at (%rsi) and (%rdx):
   3295             movupd  %argL, 0(%rsi)
   3296             movupd  %argR, 0(%rdx)
   3297          */
   3298          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
   3299                                           AMD64AMode_IR(0, hregAMD64_RSI())));
   3300          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
   3301                                           AMD64AMode_IR(0, hregAMD64_RDX())));
   3302          /* call the helper */
   3303          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
   3304          /* fetch the result from memory, using %r_argp, which the
   3305             register allocator will keep alive across the call. */
   3306          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
   3307                                           AMD64AMode_IR(0, argp)));
   3308          /* and finally, clear the space */
   3309          add_to_rsp(env, 112);
   3310          return dst;
   3311       }
   3312 
   3313       case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
   3314                          goto do_SseAssistedVectorAndScalar;
   3315       case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
   3316                          goto do_SseAssistedVectorAndScalar;
   3317       do_SseAssistedVectorAndScalar: {
   3318          /* RRRufff!  RRRufff code is what we're generating here.  Oh
   3319             well. */
   3320          vassert(fn != 0);
   3321          HReg dst = newVRegV(env);
   3322          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3323          HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3324          HReg argp = newVRegI(env);
   3325          /* subq $112, %rsp         -- make a space */
   3326          sub_from_rsp(env, 112);
   3327          /* leaq 48(%rsp), %r_argp  -- point into it */
   3328          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
   3329                                         argp));
   3330          /* andq $-16, %r_argp      -- 16-align the pointer */
   3331          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   3332                                          AMD64RMI_Imm( ~(UInt)15 ),
   3333                                          argp));
   3334          /* Prepare 2 vector arg regs:
   3335             leaq 0(%r_argp), %rdi
   3336             leaq 16(%r_argp), %rsi
   3337          */
   3338          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
   3339                                         hregAMD64_RDI()));
   3340          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
   3341                                         hregAMD64_RSI()));
   3342          /* Store the vector arg, at (%rsi):
   3343             movupd  %argL, 0(%rsi)
   3344          */
   3345          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
   3346                                           AMD64AMode_IR(0, hregAMD64_RSI())));
   3347          /* And get the scalar value into rdx */
   3348          addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
   3349 
   3350          /* call the helper */
   3351          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
   3352          /* fetch the result from memory, using %r_argp, which the
   3353             register allocator will keep alive across the call. */
   3354          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
   3355                                           AMD64AMode_IR(0, argp)));
   3356          /* and finally, clear the space */
   3357          add_to_rsp(env, 112);
   3358          return dst;
   3359       }
   3360 
   3361       default:
   3362          break;
   3363    } /* switch (e->Iex.Binop.op) */
   3364    } /* if (e->tag == Iex_Binop) */
   3365 
   3366    if (e->tag == Iex_Mux0X) {
   3367       HReg r8  = iselIntExpr_R(env, e->Iex.Mux0X.cond);
   3368       HReg rX  = iselVecExpr(env, e->Iex.Mux0X.exprX);
   3369       HReg r0  = iselVecExpr(env, e->Iex.Mux0X.expr0);
   3370       HReg dst = newVRegV(env);
   3371       addInstr(env, mk_vMOVsd_RR(rX,dst));
   3372       addInstr(env, AMD64Instr_Test64(0xFF, r8));
   3373       addInstr(env, AMD64Instr_SseCMov(Acc_Z,r0,dst));
   3374       return dst;
   3375    }
   3376 
   3377    //vec_fail:
   3378    vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
   3379               LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
   3380    ppIRExpr(e);
   3381    vpanic("iselVecExpr_wrk");
   3382 }
   3383 
   3384 
   3385 /*---------------------------------------------------------*/
   3386 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs.    --*/
   3387 /*---------------------------------------------------------*/
   3388 
   3389 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
   3390                            ISelEnv* env, IRExpr* e )
   3391 {
   3392    iselDVecExpr_wrk( rHi, rLo, env, e );
   3393 #  if 0
   3394    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   3395 #  endif
   3396    vassert(hregClass(*rHi) == HRcVec128);
   3397    vassert(hregClass(*rLo) == HRcVec128);
   3398    vassert(hregIsVirtual(*rHi));
   3399    vassert(hregIsVirtual(*rLo));
   3400 }
   3401 
   3402 
   3403 /* DO NOT CALL THIS DIRECTLY */
   3404 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
   3405                                ISelEnv* env, IRExpr* e )
   3406 {
   3407    vassert(e);
   3408    IRType ty = typeOfIRExpr(env->type_env,e);
   3409    vassert(ty == Ity_V256);
   3410 
   3411    AMD64SseOp op = Asse_INVALID;
   3412 
   3413    /* read 256-bit IRTemp */
   3414    if (e->tag == Iex_RdTmp) {
   3415       lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
   3416       return;
   3417    }
   3418 
   3419    if (e->tag == Iex_Get) {
   3420       HReg        vHi  = newVRegV(env);
   3421       HReg        vLo  = newVRegV(env);
   3422       HReg        rbp  = hregAMD64_RBP();
   3423       AMD64AMode* am0  = AMD64AMode_IR(e->Iex.Get.offset + 0,  rbp);
   3424       AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
   3425       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
   3426       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
   3427       *rHi = vHi;
   3428       *rLo = vLo;
   3429       return;
   3430    }
   3431 
   3432    if (e->tag == Iex_Load) {
   3433       HReg        vHi  = newVRegV(env);
   3434       HReg        vLo  = newVRegV(env);
   3435       HReg        rA   = iselIntExpr_R(env, e->Iex.Load.addr);
   3436       AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
   3437       AMD64AMode* am16 = AMD64AMode_IR(16, rA);
   3438       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
   3439       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
   3440       *rHi = vHi;
   3441       *rLo = vLo;
   3442       return;
   3443    }
   3444 
   3445    if (e->tag == Iex_Const) {
   3446       vassert(e->Iex.Const.con->tag == Ico_V256);
   3447       switch (e->Iex.Const.con->Ico.V256) {
   3448          case 0x00000000: {
   3449             HReg vHi = generate_zeroes_V128(env);
   3450             HReg vLo = newVRegV(env);
   3451             addInstr(env, mk_vMOVsd_RR(vHi, vLo));
   3452             *rHi = vHi;
   3453             *rLo = vLo;
   3454             return;
   3455          }
   3456          default:
   3457             break; /* give up; handle other constants if and when needed. */
   3458       }
   3459    }
   3460 
   3461    if (e->tag == Iex_Unop) {
   3462    switch (e->Iex.Unop.op) {
   3463 
   3464       case Iop_NotV256: {
   3465          HReg argHi, argLo;
   3466          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3467          *rHi = do_sse_NotV128(env, argHi);
   3468          *rLo = do_sse_NotV128(env, argLo);
   3469          return;
   3470       }
   3471 
   3472       case Iop_Recip32Fx8: op = Asse_RCPF;   goto do_32Fx8_unary;
   3473       case Iop_Sqrt32Fx8:  op = Asse_SQRTF;  goto do_32Fx8_unary;
   3474       case Iop_RSqrt32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
   3475       do_32Fx8_unary:
   3476       {
   3477          HReg argHi, argLo;
   3478          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3479          HReg dstHi = newVRegV(env);
   3480          HReg dstLo = newVRegV(env);
   3481          addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
   3482          addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
   3483          *rHi = dstHi;
   3484          *rLo = dstLo;
   3485          return;
   3486       }
   3487 
   3488       case Iop_Sqrt64Fx4:  op = Asse_SQRTF;  goto do_64Fx4_unary;
   3489       do_64Fx4_unary:
   3490       {
   3491          HReg argHi, argLo;
   3492          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3493          HReg dstHi = newVRegV(env);
   3494          HReg dstLo = newVRegV(env);
   3495          addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
   3496          addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
   3497          *rHi = dstHi;
   3498          *rLo = dstLo;
   3499          return;
   3500       }
   3501 
   3502       case Iop_CmpNEZ64x4: {
   3503          /* We can use SSE2 instructions for this. */
   3504          /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
   3505             (obviously).  See comment on Iop_CmpNEZ64x2 for
   3506             explanation of what's going on here. */
   3507          HReg argHi, argLo;
   3508          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3509          HReg tmpHi  = generate_zeroes_V128(env);
   3510          HReg tmpLo  = newVRegV(env);
   3511          addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
   3512          HReg dstHi  = newVRegV(env);
   3513          HReg dstLo  = newVRegV(env);
   3514          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
   3515          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
   3516          tmpHi = do_sse_NotV128(env, tmpHi);
   3517          tmpLo = do_sse_NotV128(env, tmpLo);
   3518          addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
   3519          addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
   3520          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
   3521          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
   3522          *rHi = dstHi;
   3523          *rLo = dstLo;
   3524          return;
   3525       }
   3526 
   3527       case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
   3528       do_CmpNEZ_vector:
   3529       {
   3530          HReg argHi, argLo;
   3531          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3532          HReg tmpHi = newVRegV(env);
   3533          HReg tmpLo = newVRegV(env);
   3534          HReg zero  = generate_zeroes_V128(env);
   3535          HReg dstHi, dstLo;
   3536          addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
   3537          addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
   3538          addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
   3539          addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
   3540          dstHi = do_sse_NotV128(env, tmpHi);
   3541          dstLo = do_sse_NotV128(env, tmpLo);
   3542          *rHi = dstHi;
   3543          *rLo = dstLo;
   3544          return;
   3545       }
   3546 
   3547       default:
   3548          break;
   3549    } /* switch (e->Iex.Unop.op) */
   3550    } /* if (e->tag == Iex_Unop) */
   3551 
   3552    if (e->tag == Iex_Binop) {
   3553    switch (e->Iex.Binop.op) {
   3554 
   3555       case Iop_Add64Fx4:   op = Asse_ADDF;   goto do_64Fx4;
   3556       case Iop_Sub64Fx4:   op = Asse_SUBF;   goto do_64Fx4;
   3557       case Iop_Mul64Fx4:   op = Asse_MULF;   goto do_64Fx4;
   3558       case Iop_Div64Fx4:   op = Asse_DIVF;   goto do_64Fx4;
   3559       case Iop_Max64Fx4:   op = Asse_MAXF;   goto do_64Fx4;
   3560       case Iop_Min64Fx4:   op = Asse_MINF;   goto do_64Fx4;
   3561       do_64Fx4:
   3562       {
   3563          HReg argLhi, argLlo, argRhi, argRlo;
   3564          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   3565          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   3566          HReg dstHi = newVRegV(env);
   3567          HReg dstLo = newVRegV(env);
   3568          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   3569          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   3570          addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
   3571          addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
   3572          *rHi = dstHi;
   3573          *rLo = dstLo;
   3574          return;
   3575       }
   3576 
   3577       case Iop_Add32Fx8:   op = Asse_ADDF;   goto do_32Fx8;
   3578       case Iop_Sub32Fx8:   op = Asse_SUBF;   goto do_32Fx8;
   3579       case Iop_Mul32Fx8:   op = Asse_MULF;   goto do_32Fx8;
   3580       case Iop_Div32Fx8:   op = Asse_DIVF;   goto do_32Fx8;
   3581       case Iop_Max32Fx8:   op = Asse_MAXF;   goto do_32Fx8;
   3582       case Iop_Min32Fx8:   op = Asse_MINF;   goto do_32Fx8;
   3583       do_32Fx8:
   3584       {
   3585          HReg argLhi, argLlo, argRhi, argRlo;
   3586          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   3587          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   3588          HReg dstHi = newVRegV(env);
   3589          HReg dstLo = newVRegV(env);
   3590          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   3591          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   3592          addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
   3593          addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
   3594          *rHi = dstHi;
   3595          *rLo = dstLo;
   3596          return;
   3597       }
   3598 
   3599       case Iop_AndV256:    op = Asse_AND;      goto do_SseReRg;
   3600       case Iop_OrV256:     op = Asse_OR;       goto do_SseReRg;
   3601       case Iop_XorV256:    op = Asse_XOR;      goto do_SseReRg;
   3602       do_SseReRg:
   3603       {
   3604          HReg argLhi, argLlo, argRhi, argRlo;
   3605          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   3606          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   3607          HReg dstHi = newVRegV(env);
   3608          HReg dstLo = newVRegV(env);
   3609          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   3610          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   3611          addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
   3612          addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
   3613          *rHi = dstHi;
   3614          *rLo = dstLo;
   3615          return;
   3616       }
   3617 
   3618       case Iop_V128HLtoV256: {
   3619          *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
   3620          *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
   3621          return;
   3622       }
   3623 
   3624       default:
   3625          break;
   3626    } /* switch (e->Iex.Binop.op) */
   3627    } /* if (e->tag == Iex_Binop) */
   3628 
   3629    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
   3630       HReg        rsp     = hregAMD64_RSP();
   3631       HReg        vHi     = newVRegV(env);
   3632       HReg        vLo     = newVRegV(env);
   3633       AMD64AMode* m8_rsp  = AMD64AMode_IR(-8, rsp);
   3634       AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   3635       /* arg1 is the most significant (Q3), arg4 the least (Q0) */
   3636       /* Get all the args into regs, before messing with the stack. */
   3637       AMD64RI* q3  = iselIntExpr_RI(env, e->Iex.Qop.details->arg1);
   3638       AMD64RI* q2  = iselIntExpr_RI(env, e->Iex.Qop.details->arg2);
   3639       AMD64RI* q1  = iselIntExpr_RI(env, e->Iex.Qop.details->arg3);
   3640       AMD64RI* q0  = iselIntExpr_RI(env, e->Iex.Qop.details->arg4);
   3641       /* less significant lane (Q2) at the lower address (-16(rsp)) */
   3642       addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q3, m8_rsp));
   3643       addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q2, m16_rsp));
   3644       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, m16_rsp));
   3645       /* and then the lower half .. */
   3646       addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q1, m8_rsp));
   3647       addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q0, m16_rsp));
   3648       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, m16_rsp));
   3649       *rHi = vHi;
   3650       *rLo = vLo;
   3651       return;
   3652    }
   3653 
   3654    //avx_fail:
   3655    vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
   3656               LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
   3657    ppIRExpr(e);
   3658    vpanic("iselDVecExpr_wrk");
   3659 }
   3660 
   3661 
   3662 /*---------------------------------------------------------*/
   3663 /*--- ISEL: Statements                                  ---*/
   3664 /*---------------------------------------------------------*/
   3665 
   3666 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
   3667 {
   3668    if (vex_traceflags & VEX_TRACE_VCODE) {
   3669       vex_printf("\n-- ");
   3670       ppIRStmt(stmt);
   3671       vex_printf("\n");
   3672    }
   3673 
   3674    switch (stmt->tag) {
   3675 
   3676    /* --------- STORE --------- */
   3677    case Ist_Store: {
   3678       IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
   3679       IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
   3680       IREndness end   = stmt->Ist.Store.end;
   3681 
   3682       if (tya != Ity_I64 || end != Iend_LE)
   3683          goto stmt_fail;
   3684 
   3685       if (tyd == Ity_I64) {
   3686          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3687          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
   3688          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
   3689          return;
   3690       }
   3691       if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
   3692          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3693          HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
   3694          addInstr(env, AMD64Instr_Store(
   3695                           toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
   3696                           r,am));
   3697          return;
   3698       }
   3699       if (tyd == Ity_F64) {
   3700          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3701          HReg r = iselDblExpr(env, stmt->Ist.Store.data);
   3702          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
   3703          return;
   3704       }
   3705       if (tyd == Ity_F32) {
   3706          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3707          HReg r = iselFltExpr(env, stmt->Ist.Store.data);
   3708          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
   3709          return;
   3710       }
   3711       if (tyd == Ity_V128) {
   3712          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   3713          HReg r = iselVecExpr(env, stmt->Ist.Store.data);
   3714          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
   3715          return;
   3716       }
   3717       if (tyd == Ity_V256) {
   3718          HReg        rA   = iselIntExpr_R(env, stmt->Ist.Store.addr);
   3719          AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
   3720          AMD64AMode* am16 = AMD64AMode_IR(16, rA);
   3721          HReg vHi, vLo;
   3722          iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
   3723          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
   3724          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
   3725          return;
   3726       }
   3727       break;
   3728    }
   3729 
   3730    /* --------- PUT --------- */
   3731    case Ist_Put: {
   3732       IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
   3733       if (ty == Ity_I64) {
   3734          /* We're going to write to memory, so compute the RHS into an
   3735             AMD64RI. */
   3736          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
   3737          addInstr(env,
   3738                   AMD64Instr_Alu64M(
   3739                      Aalu_MOV,
   3740                      ri,
   3741                      AMD64AMode_IR(stmt->Ist.Put.offset,
   3742                                    hregAMD64_RBP())
   3743                  ));
   3744          return;
   3745       }
   3746       if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
   3747          HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
   3748          addInstr(env, AMD64Instr_Store(
   3749                           toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
   3750                           r,
   3751                           AMD64AMode_IR(stmt->Ist.Put.offset,
   3752                                         hregAMD64_RBP())));
   3753          return;
   3754       }
   3755       if (ty == Ity_F32) {
   3756          HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
   3757          AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
   3758          set_SSE_rounding_default(env); /* paranoia */
   3759          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
   3760          return;
   3761       }
   3762       if (ty == Ity_F64) {
   3763          HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
   3764          AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
   3765                                          hregAMD64_RBP() );
   3766          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
   3767          return;
   3768       }
   3769       if (ty == Ity_V128) {
   3770          HReg        vec = iselVecExpr(env, stmt->Ist.Put.data);
   3771          AMD64AMode* am  = AMD64AMode_IR(stmt->Ist.Put.offset,
   3772                                          hregAMD64_RBP());
   3773          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
   3774          return;
   3775       }
   3776       if (ty == Ity_V256) {
   3777          HReg vHi, vLo;
   3778          iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
   3779          HReg        rbp  = hregAMD64_RBP();
   3780          AMD64AMode* am0  = AMD64AMode_IR(stmt->Ist.Put.offset + 0,  rbp);
   3781          AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
   3782          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
   3783          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
   3784          return;
   3785       }
   3786       break;
   3787    }
   3788 
   3789    /* --------- Indexed PUT --------- */
   3790    case Ist_PutI: {
   3791       IRPutI *puti = stmt->Ist.PutI.details;
   3792 
   3793       AMD64AMode* am
   3794          = genGuestArrayOffset(
   3795               env, puti->descr,
   3796                    puti->ix, puti->bias );
   3797 
   3798       IRType ty = typeOfIRExpr(env->type_env, puti->data);
   3799       if (ty == Ity_F64) {
   3800          HReg val = iselDblExpr(env, puti->data);
   3801          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
   3802          return;
   3803       }
   3804       if (ty == Ity_I8) {
   3805          HReg r = iselIntExpr_R(env, puti->data);
   3806          addInstr(env, AMD64Instr_Store( 1, r, am ));
   3807          return;
   3808       }
   3809       if (ty == Ity_I64) {
   3810          AMD64RI* ri = iselIntExpr_RI(env, puti->data);
   3811          addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
   3812          return;
   3813       }
   3814       break;
   3815    }
   3816 
   3817    /* --------- TMP --------- */
   3818    case Ist_WrTmp: {
   3819       IRTemp tmp = stmt->Ist.WrTmp.tmp;
   3820       IRType ty = typeOfIRTemp(env->type_env, tmp);
   3821 
   3822       /* Optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
   3823          compute it into an AMode and then use LEA.  This usually
   3824          produces fewer instructions, often because (for
   3825          memcheck-created IR) we get t = address-expression, with t
   3826          later used twice, and so doing this naturally turns the
   3827          address expression back into an AMD64 amode. */
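              /* For example (a sketch, assuming the amode matcher accepts
                 this shape):  t = Add64(Add64(r1, Shl64(r2, 3)), 0x10)
                 can be selected as a single  leaq 0x10(%r1,%r2,8), %t
                 rather than an add/shift/add sequence. */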
   3828       if (ty == Ity_I64
   3829           && stmt->Ist.WrTmp.data->tag == Iex_Binop
   3830           && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
   3831          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
   3832          HReg dst = lookupIRTemp(env, tmp);
   3833          if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
   3834             /* Hmm, iselIntExpr_AMode wimped out and just computed the
   3835                value into a register.  Just emit a normal reg-reg move
   3836                so reg-alloc can coalesce it away in the usual way. */
   3837             HReg src = am->Aam.IR.reg;
   3838             addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
   3839          } else {
   3840             addInstr(env, AMD64Instr_Lea64(am,dst));
   3841          }
   3842          return;
   3843       }
   3844 
   3845       if (ty == Ity_I64 || ty == Ity_I32
   3846           || ty == Ity_I16 || ty == Ity_I8) {
   3847          AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
   3848          HReg dst = lookupIRTemp(env, tmp);
   3849          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
   3850          return;
   3851       }
   3852       if (ty == Ity_I128) {
   3853          HReg rHi, rLo, dstHi, dstLo;
   3854          iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
   3855          lookupIRTempPair( &dstHi, &dstLo, env, tmp);
   3856          addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
   3857          addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
   3858          return;
   3859       }
   3860       if (ty == Ity_I1) {
   3861          AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
   3862          HReg dst = lookupIRTemp(env, tmp);
   3863          addInstr(env, AMD64Instr_Set64(cond, dst));
   3864          return;
   3865       }
   3866       if (ty == Ity_F64) {
   3867          HReg dst = lookupIRTemp(env, tmp);
   3868          HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
   3869          addInstr(env, mk_vMOVsd_RR(src, dst));
   3870          return;
   3871       }
   3872       if (ty == Ity_F32) {
   3873          HReg dst = lookupIRTemp(env, tmp);
   3874          HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
   3875          addInstr(env, mk_vMOVsd_RR(src, dst));
   3876          return;
   3877       }
   3878       if (ty == Ity_V128) {
   3879          HReg dst = lookupIRTemp(env, tmp);
   3880          HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
   3881          addInstr(env, mk_vMOVsd_RR(src, dst));
   3882          return;
   3883       }
   3884       if (ty == Ity_V256) {
   3885          HReg rHi, rLo, dstHi, dstLo;
   3886          iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
   3887          lookupIRTempPair( &dstHi, &dstLo, env, tmp);
   3888          addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
   3889          addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
   3890          return;
   3891       }
   3892       break;
   3893    }
   3894 
   3895    /* --------- Call to DIRTY helper --------- */
   3896    case Ist_Dirty: {
   3897       IRType   retty;
   3898       IRDirty* d = stmt->Ist.Dirty.details;
   3899       Bool     passBBP = False;
   3900 
   3901       if (d->nFxState == 0)
   3902          vassert(!d->needsBBP);
   3903 
   3904       passBBP = toBool(d->nFxState > 0 && d->needsBBP);
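              /* If the helper declares guest-state effects and asks for the
                 baseblock pointer (needsBBP), doHelperCall passes the guest
                 state pointer as an extra leading argument so the helper can
                 reach the guest state directly. */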
   3905 
   3906       /* Marshal args, do the call, clear stack. */
   3907       doHelperCall( env, passBBP, d->guard, d->cee, d->args );
   3908 
   3909       /* Now figure out what to do with the returned value, if any. */
   3910       if (d->tmp == IRTemp_INVALID)
   3911          /* No return value.  Nothing to do. */
   3912          return;
   3913 
   3914       retty = typeOfIRTemp(env->type_env, d->tmp);
   3915       if (retty == Ity_I64 || retty == Ity_I32
   3916           || retty == Ity_I16 || retty == Ity_I8) {
   3917          /* The returned value is in %rax.  Park it in the register
   3918             associated with tmp. */
   3919          HReg dst = lookupIRTemp(env, d->tmp);
   3920          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
   3921          return;
   3922       }
   3923       break;
   3924    }
   3925 
   3926    /* --------- MEM FENCE --------- */
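           /* Imbe_Fence is the only memory-bus event handled here; it lowers
              to a single mfence, which orders all earlier loads and stores
              before all later ones. */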
   3927    case Ist_MBE:
   3928       switch (stmt->Ist.MBE.event) {
   3929          case Imbe_Fence:
   3930             addInstr(env, AMD64Instr_MFence());
   3931             return;
   3932          default:
   3933             break;
   3934       }
   3935       break;
   3936 
   3937    /* --------- ACAS --------- */
   3938    case Ist_CAS:
   3939       if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
   3940          /* "normal" singleton CAS */
   3941          UChar  sz;
   3942          IRCAS* cas = stmt->Ist.CAS.details;
   3943          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
   3944          /* get: cas->expd into %rax, and cas->data into %rbx */
   3945          AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
   3946          HReg rData = iselIntExpr_R(env, cas->dataLo);
   3947          HReg rExpd = iselIntExpr_R(env, cas->expdLo);
   3948          HReg rOld  = lookupIRTemp(env, cas->oldLo);
   3949          vassert(cas->expdHi == NULL);
   3950          vassert(cas->dataHi == NULL);
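                 /* Sketch of the generated sequence: rOld starts as a copy
                    of the expected value; the expected value goes to %rax
                    and the new value to %rbx, which is the convention
                    AMD64Instr_ACAS presumably relies on for its lock
                    cmpxchg.  If the compare fails, cmpxchg leaves the value
                    actually seen in memory in %rax, and the CMov on NZ below
                    copies it into rOld; either way rOld ends up holding the
                    old memory contents, as the IR's oldLo requires. */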
   3951          addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
   3952          addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
   3953          addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
   3954          switch (ty) {
   3955             case Ity_I64: sz = 8; break;
   3956             case Ity_I32: sz = 4; break;
   3957             case Ity_I16: sz = 2; break;
   3958             case Ity_I8:  sz = 1; break;
   3959             default: goto unhandled_cas;
   3960          }
   3961          addInstr(env, AMD64Instr_ACAS(am, sz));
   3962          addInstr(env, AMD64Instr_CMov64(
   3963                           Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOld));
   3964          return;
   3965       } else {
   3966          /* double CAS */
   3967          UChar  sz;
   3968          IRCAS* cas = stmt->Ist.CAS.details;
   3969          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
   3970          /* only 32-bit and 64-bit allowed in this case */
   3971          /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
   3972          /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
   3973          AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
   3974          HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
   3975          HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
   3976          HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
   3977          HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
   3978          HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
   3979          HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
   3980          switch (ty) {
   3981             case Ity_I64:
   3982                if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
   3983                   goto unhandled_cas; /* we'd have to generate
   3984                                          cmpxchg16b, but the host
   3985                                          doesn't support that */
   3986                sz = 8;
   3987                break;
   3988             case Ity_I32:
   3989                sz = 4;
   3990                break;
   3991             default:
   3992                goto unhandled_cas;
   3993          }
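                 /* Sketch: cmpxchg8b/16b compares %rdx:%rax against the
                    addressed pair and, on a match, stores %rcx:%rbx; on
                    failure it loads the observed pair into %rdx:%rax.  The
                    moves below set up exactly that convention, and the two
                    CMovs on NZ afterwards copy the observed pair into
                    rOldHi:rOldLo when the compare failed. */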
   3994          addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
   3995          addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
   3996          addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
   3997          addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
   3998          addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
   3999          addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
   4000          addInstr(env, AMD64Instr_DACAS(am, sz));
   4001          addInstr(env,
   4002                   AMD64Instr_CMov64(
   4003                      Acc_NZ, AMD64RM_Reg(hregAMD64_RDX()), rOldHi));
   4004          addInstr(env,
   4005                   AMD64Instr_CMov64(
   4006                      Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOldLo));
   4007          return;
   4008       }
   4009       unhandled_cas:
   4010       break;
   4011 
   4012    /* --------- INSTR MARK --------- */
   4013    /* Doesn't generate any executable code ... */
   4014    case Ist_IMark:
   4015        return;
   4016 
   4017    /* --------- ABI HINT --------- */
   4018    /* These have no meaning (denotation) in the IR, so we ignore
   4019       them ... if any actually made it this far. */
   4020    case Ist_AbiHint:
   4021        return;
   4022 
   4023    /* --------- NO-OP --------- */
   4024    case Ist_NoOp:
   4025        return;
   4026 
   4027    /* --------- EXIT --------- */
   4028    case Ist_Exit: {
   4029       if (stmt->Ist.Exit.dst->tag != Ico_U64)
   4030          vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
   4031 
   4032       AMD64CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
   4033       AMD64AMode*   amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
   4034                                           hregAMD64_RBP());
   4035 
   4036       /* Case: boring transfer to known address */
   4037       if (stmt->Ist.Exit.jk == Ijk_Boring) {
   4038          if (env->chainingAllowed) {
   4039             /* .. almost always true .. */
   4040             /* Skip the event check at the dst if this is a forwards
   4041                edge. */
   4042             Bool toFastEP
   4043                = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
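                    /* The idea: only backward edges (dst <= max_ga, i.e.
                       within or before this block) need to run the event
                       check at the destination, since that is what bounds
                       the work done between checks in a loop; a forward
                       edge may therefore jump to the "fast" entry point
                       that skips the check. */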
   4044             if (0) vex_printf("%s", toFastEP ? "Y" : ",");
   4045             addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
   4046                                              amRIP, cc, toFastEP));
   4047          } else {
   4048             /* .. very occasionally .. */
   4049             /* We can't use chaining, so ask for an assisted transfer,
   4050                as that's the only alternative that is allowable. */
   4051             HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
   4052             addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
   4053          }
   4054          return;
   4055       }
   4056 
   4057       /* Case: assisted transfer to arbitrary address */
   4058       switch (stmt->Ist.Exit.jk) {
   4059          /* Keep this list in sync with that in iselNext below */
   4060          case Ijk_ClientReq:
   4061          case Ijk_EmWarn:
   4062          case Ijk_NoDecode:
   4063          case Ijk_NoRedir:
   4064          case Ijk_SigSEGV:
   4065          case Ijk_SigTRAP:
   4066          case Ijk_Sys_syscall:
   4067          case Ijk_TInval:
   4068          case Ijk_Yield:
   4069          {
   4070             HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
   4071             addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
   4072             return;
   4073          }
   4074          default:
   4075             break;
   4076       }
   4077 
   4078       /* Do we ever expect to see any other kind? */
   4079       goto stmt_fail;
   4080    }
   4081 
   4082    default: break;
   4083    }
   4084   stmt_fail:
   4085    ppIRStmt(stmt);
   4086    vpanic("iselStmt(amd64)");
   4087 }
   4088 
   4089 
   4090 /*---------------------------------------------------------*/
   4091 /*--- ISEL: Basic block terminators (Nexts)             ---*/
   4092 /*---------------------------------------------------------*/
   4093 
   4094 static void iselNext ( ISelEnv* env,
   4095                        IRExpr* next, IRJumpKind jk, Int offsIP )
   4096 {
   4097    if (vex_traceflags & VEX_TRACE_VCODE) {
   4098       vex_printf( "\n-- PUT(%d) = ", offsIP);
   4099       ppIRExpr( next );
   4100       vex_printf( "; exit-");
   4101       ppIRJumpKind(jk);
   4102       vex_printf( "\n");
   4103    }
   4104 
   4105    /* Case: boring transfer to known address */
   4106    if (next->tag == Iex_Const) {
   4107       IRConst* cdst = next->Iex.Const.con;
   4108       vassert(cdst->tag == Ico_U64);
   4109       if (jk == Ijk_Boring || jk == Ijk_Call) {
   4110          /* Boring transfer to known address */
   4111          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
   4112          if (env->chainingAllowed) {
   4113             /* .. almost always true .. */
   4114             /* Skip the event check at the dst if this is a forwards
   4115                edge. */
   4116             Bool toFastEP
   4117                = ((Addr64)cdst->Ico.U64) > env->max_ga;
   4118             if (0) vex_printf("%s", toFastEP ? "X" : ".");
   4119             addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
   4120                                              amRIP, Acc_ALWAYS,
   4121                                              toFastEP));
   4122          } else {
   4123             /* .. very occasionally .. */
   4124             /* We can't use chaining, so ask for an indirect transfer,
   4125                as that's the cheapest alternative that is
   4126                allowable. */
   4127             HReg r = iselIntExpr_R(env, next);
   4128             addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
   4129                                                Ijk_Boring));
   4130          }
   4131          return;
   4132       }
   4133    }
   4134 
   4135    /* Case: call/return (==boring) transfer to any address */
   4136    switch (jk) {
   4137       case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
   4138          HReg        r     = iselIntExpr_R(env, next);
   4139          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
   4140          if (env->chainingAllowed) {
   4141             addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
   4142          } else {
   4143             addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
   4144                                                Ijk_Boring));
   4145          }
   4146          return;
   4147       }
   4148       default:
   4149          break;
   4150    }
   4151 
   4152    /* Case: assisted transfer to arbitrary address */
   4153    switch (jk) {
   4154       /* Keep this list in sync with that for Ist_Exit above */
   4155       case Ijk_ClientReq:
   4156       case Ijk_EmWarn:
   4157       case Ijk_NoDecode:
   4158       case Ijk_NoRedir:
   4159       case Ijk_SigSEGV:
   4160       case Ijk_SigTRAP:
   4161       case Ijk_Sys_syscall:
   4162       case Ijk_TInval:
   4163       case Ijk_Yield: {
   4164          HReg        r     = iselIntExpr_R(env, next);
   4165          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
   4166          addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
   4167          return;
   4168       }
   4169       default:
   4170          break;
   4171    }
   4172 
   4173    vex_printf( "\n-- PUT(%d) = ", offsIP);
   4174    ppIRExpr( next );
   4175    vex_printf( "; exit-");
   4176    ppIRJumpKind(jk);
   4177    vex_printf( "\n");
   4178    vassert(0); // are we expecting any other kind?
   4179 }
   4180 
   4181 
   4182 /*---------------------------------------------------------*/
   4183 /*--- Insn selector top-level                           ---*/
   4184 /*---------------------------------------------------------*/
   4185 
   4186 /* Translate an entire SB to amd64 code. */
   4187 
   4188 HInstrArray* iselSB_AMD64 ( IRSB* bb,
   4189                             VexArch      arch_host,
   4190                             VexArchInfo* archinfo_host,
   4191                             VexAbiInfo*  vbi/*UNUSED*/,
   4192                             Int offs_Host_EvC_Counter,
   4193                             Int offs_Host_EvC_FailAddr,
   4194                             Bool chainingAllowed,
   4195                             Bool addProfInc,
   4196                             Addr64 max_ga )
   4197 {
   4198    Int        i, j;
   4199    HReg       hreg, hregHI;
   4200    ISelEnv*   env;
   4201    UInt       hwcaps_host = archinfo_host->hwcaps;
   4202    AMD64AMode *amCounter, *amFailAddr;
   4203 
   4204    /* sanity ... */
   4205    vassert(arch_host == VexArchAMD64);
   4206    vassert(0 == (hwcaps_host
   4207                  & ~(VEX_HWCAPS_AMD64_SSE3
   4208                      | VEX_HWCAPS_AMD64_CX16
   4209                      | VEX_HWCAPS_AMD64_LZCNT
   4210                      | VEX_HWCAPS_AMD64_AVX)));
   4211 
   4212    /* Make up an initial environment to use. */
   4213    env = LibVEX_Alloc(sizeof(ISelEnv));
   4214    env->vreg_ctr = 0;
   4215 
   4216    /* Set up output code array. */
   4217    env->code = newHInstrArray();
   4218 
   4219    /* Copy BB's type env. */
   4220    env->type_env = bb->tyenv;
   4221 
   4222    /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
   4223       change as we go along. */
   4224    env->n_vregmap = bb->tyenv->types_used;
   4225    env->vregmap   = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   4226    env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   4227 
   4228    /* and finally ... */
   4229    env->chainingAllowed = chainingAllowed;
   4230    env->hwcaps          = hwcaps_host;
   4231    env->max_ga          = max_ga;
   4232 
   4233    /* For each IR temporary, allocate a suitably-kinded virtual
   4234       register. */
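           /* Summarising the switch below: I1..I64 get one 64-bit int vreg;
              I128 gets a pair of int vregs (the HI half in vregmapHI);
              F32, F64 and V128 each get one 128-bit vector vreg; V256 gets
              a pair of vector vregs. */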
   4235    j = 0;
   4236    for (i = 0; i < env->n_vregmap; i++) {
   4237       hregHI = hreg = INVALID_HREG;
   4238       switch (bb->tyenv->types[i]) {
   4239          case Ity_I1:
   4240          case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
   4241             hreg = mkHReg(j++, HRcInt64, True);
   4242             break;
   4243          case Ity_I128:
   4244             hreg   = mkHReg(j++, HRcInt64, True);
   4245             hregHI = mkHReg(j++, HRcInt64, True);
   4246             break;
   4247          case Ity_F32:
   4248          case Ity_F64:
   4249          case Ity_V128:
   4250             hreg = mkHReg(j++, HRcVec128, True);
   4251             break;
   4252          case Ity_V256:
   4253             hreg   = mkHReg(j++, HRcVec128, True);
   4254             hregHI = mkHReg(j++, HRcVec128, True);
   4255             break;
   4256          default:
   4257             ppIRType(bb->tyenv->types[i]);
   4258             vpanic("iselBB(amd64): IRTemp type");
   4259       }
   4260       env->vregmap[i]   = hreg;
   4261       env->vregmapHI[i] = hregHI;
   4262    }
   4263    env->vreg_ctr = j;
   4264 
   4265    /* The very first instruction must be an event check. */
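           /* Roughly: EvCheck decrements the event counter in the guest
              state (amCounter) and, when it goes negative, transfers to the
              address stored at amFailAddr so the scheduler can regain
              control; both fields are addressed relative to %rbp. */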
   4266    amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
   4267    amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
   4268    addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
   4269 
   4270    /* Possibly a block counter increment (for profiling).  At this
   4271       point we don't know the address of the counter, so just pretend
   4272       it is zero.  It will have to be patched later, but before this
   4273       translation is used, by a call to LibVEX_patchProfCtr. */
   4274    if (addProfInc) {
   4275       addInstr(env, AMD64Instr_ProfInc());
   4276    }
   4277 
   4278    /* Ok, finally we can iterate over the statements. */
   4279    for (i = 0; i < bb->stmts_used; i++)
   4280       if (bb->stmts[i])
   4281          iselStmt(env, bb->stmts[i]);
   4282 
   4283    iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
   4284 
   4285    /* record the number of vregs we used. */
   4286    env->code->n_vregs = env->vreg_ctr;
   4287    return env->code;
   4288 }
   4289 
   4290 
   4291 /*---------------------------------------------------------------*/
   4292 /*--- end                                   host_amd64_isel.c ---*/
   4293 /*---------------------------------------------------------------*/
   4294