      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                                 host_amd64_isel.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2013 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 #include "libvex_basictypes.h"
     37 #include "libvex_ir.h"
     38 #include "libvex.h"
     39 
     40 #include "ir_match.h"
     41 #include "main_util.h"
     42 #include "main_globals.h"
     43 #include "host_generic_regs.h"
     44 #include "host_generic_simd64.h"
     45 #include "host_generic_simd128.h"
     46 #include "host_generic_simd256.h"
     47 #include "host_generic_maddf.h"
     48 #include "host_amd64_defs.h"
     49 
     50 
     51 /*---------------------------------------------------------*/
     52 /*--- x87/SSE control word stuff                        ---*/
     53 /*---------------------------------------------------------*/
     54 
     55 /* Vex-generated code expects to run with the FPU set as follows: all
     56    exceptions masked, round-to-nearest, precision = 53 bits.  This
     57    corresponds to a FPU control word value of 0x027F.
     58 
     59    Similarly the SSE control word (%mxcsr) should be 0x1F80.
     60 
      61    %fpucw and %mxcsr should have these values on entry to
      62    Vex-generated code, and those values should be unchanged
      63    at exit.
     64 */
     65 
     66 #define DEFAULT_FPUCW 0x027F
     67 
     68 #define DEFAULT_MXCSR 0x1F80
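
/* For reference, a field-by-field decode of the two defaults, following
   the standard x87/SSE control register layouts:

      0x027F  =  0x3F        -- IM..PM: all six x87 exceptions masked
              |  0x40        -- bit 6: reserved, always reads as 1
              | (2 << 8)     -- PC = 10: 53-bit (double) precision
              | (0 << 10)    -- RC = 00: round to nearest

      0x1F80  = (0x3F << 7)  -- IM..PM: all six SSE exceptions masked
              | (0 << 13)    -- RC = 00: round to nearest; FZ and DAZ clear
*/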
     69 
     70 /* debugging only, do not use */
     71 /* define DEFAULT_FPUCW 0x037F */
     72 
     73 
     74 /*---------------------------------------------------------*/
     75 /*--- misc helpers                                      ---*/
     76 /*---------------------------------------------------------*/
     77 
     78 /* These are duplicated in guest-amd64/toIR.c */
     79 static IRExpr* unop ( IROp op, IRExpr* a )
     80 {
     81    return IRExpr_Unop(op, a);
     82 }
     83 
     84 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
     85 {
     86    return IRExpr_Binop(op, a1, a2);
     87 }
     88 
     89 static IRExpr* bind ( Int binder )
     90 {
     91    return IRExpr_Binder(binder);
     92 }
     93 
     94 static Bool isZeroU8 ( IRExpr* e )
     95 {
     96    return e->tag == Iex_Const
     97           && e->Iex.Const.con->tag == Ico_U8
     98           && e->Iex.Const.con->Ico.U8 == 0;
     99 }
    100 
    101 
    102 /*---------------------------------------------------------*/
    103 /*--- ISelEnv                                           ---*/
    104 /*---------------------------------------------------------*/
    105 
    106 /* This carries around:
    107 
    108    - A mapping from IRTemp to IRType, giving the type of any IRTemp we
    109      might encounter.  This is computed before insn selection starts,
    110      and does not change.
    111 
    112    - A mapping from IRTemp to HReg.  This tells the insn selector
    113      which virtual register is associated with each IRTemp
    114      temporary.  This is computed before insn selection starts, and
    115      does not change.  We expect this mapping to map precisely the
    116      same set of IRTemps as the type mapping does.
    117 
    118         - vregmap   holds the primary register for the IRTemp.
    119         - vregmapHI is only used for 128-bit integer-typed
    120              IRTemps.  It holds the identity of a second
    121              64-bit virtual HReg, which holds the high half
    122              of the value.
    123 
    124    - The host subarchitecture we are selecting insns for.
    125      This is set at the start and does not change.
    126 
    127    - The code array, that is, the insns selected so far.
    128 
    129    - A counter, for generating new virtual registers.
    130 
    131    - A Bool for indicating whether we may generate chain-me
    132      instructions for control flow transfers, or whether we must use
    133      XAssisted.
    134 
    135    - The maximum guest address of any guest insn in this block.
    136      Actually, the address of the highest-addressed byte from any insn
     137      in this block.  This is set at the start and does not change.  It is
    138      used for detecting jumps which are definitely forward-edges from
    139      this block, and therefore can be made (chained) to the fast entry
    140      point of the destination, thereby avoiding the destination's
    141      event check.
    142 
    143    Note, this is all host-independent.  (JRS 20050201: well, kinda
    144    ... not completely.  Compare with ISelEnv for X86.)
    145 */
    146 
    147 typedef
    148    struct {
     149       /* Constant fields -- set at the start and do not change. */
    150       IRTypeEnv*   type_env;
    151 
    152       HReg*        vregmap;
    153       HReg*        vregmapHI;
    154       Int          n_vregmap;
    155 
    156       UInt         hwcaps;
    157 
    158       Bool         chainingAllowed;
    159       Addr64       max_ga;
    160 
    161       /* These are modified as we go along. */
    162       HInstrArray* code;
    163       Int          vreg_ctr;
    164    }
    165    ISelEnv;
    166 
    167 
    168 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
    169 {
    170    vassert(tmp >= 0);
    171    vassert(tmp < env->n_vregmap);
    172    return env->vregmap[tmp];
    173 }
    174 
    175 static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
    176                                ISelEnv* env, IRTemp tmp )
    177 {
    178    vassert(tmp >= 0);
    179    vassert(tmp < env->n_vregmap);
    180    vassert(! hregIsInvalid(env->vregmapHI[tmp]));
    181    *vrLO = env->vregmap[tmp];
    182    *vrHI = env->vregmapHI[tmp];
    183 }
    184 
    185 static void addInstr ( ISelEnv* env, AMD64Instr* instr )
    186 {
    187    addHInstr(env->code, instr);
    188    if (vex_traceflags & VEX_TRACE_VCODE) {
    189       ppAMD64Instr(instr, True);
    190       vex_printf("\n");
    191    }
    192 }
    193 
    194 static HReg newVRegI ( ISelEnv* env )
    195 {
    196    HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr);
    197    env->vreg_ctr++;
    198    return reg;
    199 }
    200 
    201 static HReg newVRegV ( ISelEnv* env )
    202 {
    203    HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr);
    204    env->vreg_ctr++;
    205    return reg;
    206 }
    207 
    208 
    209 /*---------------------------------------------------------*/
    210 /*--- ISEL: Forward declarations                        ---*/
    211 /*---------------------------------------------------------*/
    212 
     213 /* These are organised as iselXXX and iselXXX_wrk pairs.  The
     214    iselXXX_wrk versions do the real work, but are not to be called
     215    directly.  For each XXX, iselXXX calls its iselXXX_wrk
     216    counterpart, then checks that all returned registers are
     217    virtual.  Never call the _wrk versions directly.
     218 */
    219 static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
    220 static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );
    221 
    222 static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, IRExpr* e );
    223 static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, IRExpr* e );
    224 
    225 static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, IRExpr* e );
    226 static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, IRExpr* e );
    227 
    228 static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, IRExpr* e );
    229 static HReg          iselIntExpr_R       ( ISelEnv* env, IRExpr* e );
    230 
    231 static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
    232 static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );
    233 
    234 static void          iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
    235                                           ISelEnv* env, IRExpr* e );
    236 static void          iselInt128Expr     ( /*OUT*/HReg* rHi, HReg* rLo,
    237                                           ISelEnv* env, IRExpr* e );
    238 
    239 static AMD64CondCode iselCondCode_wrk    ( ISelEnv* env, IRExpr* e );
    240 static AMD64CondCode iselCondCode        ( ISelEnv* env, IRExpr* e );
    241 
    242 static HReg          iselDblExpr_wrk     ( ISelEnv* env, IRExpr* e );
    243 static HReg          iselDblExpr         ( ISelEnv* env, IRExpr* e );
    244 
    245 static HReg          iselFltExpr_wrk     ( ISelEnv* env, IRExpr* e );
    246 static HReg          iselFltExpr         ( ISelEnv* env, IRExpr* e );
    247 
    248 static HReg          iselVecExpr_wrk     ( ISelEnv* env, IRExpr* e );
    249 static HReg          iselVecExpr         ( ISelEnv* env, IRExpr* e );
    250 
    251 static void          iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
    252                                         ISelEnv* env, IRExpr* e );
    253 static void          iselDVecExpr     ( /*OUT*/HReg* rHi, HReg* rLo,
    254                                         ISelEnv* env, IRExpr* e );
    255 
    256 
    257 /*---------------------------------------------------------*/
    258 /*--- ISEL: Misc helpers                                ---*/
    259 /*---------------------------------------------------------*/
    260 
    261 static Bool sane_AMode ( AMD64AMode* am )
    262 {
    263    switch (am->tag) {
    264       case Aam_IR:
    265          return
    266             toBool( hregClass(am->Aam.IR.reg) == HRcInt64
    267                     && (hregIsVirtual(am->Aam.IR.reg)
    268                         || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
    269       case Aam_IRRS:
    270          return
    271             toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
    272                     && hregIsVirtual(am->Aam.IRRS.base)
    273                     && hregClass(am->Aam.IRRS.index) == HRcInt64
    274                     && hregIsVirtual(am->Aam.IRRS.index) );
    275       default:
    276         vpanic("sane_AMode: unknown amd64 amode tag");
    277    }
    278 }
    279 
    280 
    281 /* Can the lower 32 bits be signedly widened to produce the whole
    282    64-bit value?  In other words, are the top 33 bits either all 0 or
    283    all 1 ? */
    284 static Bool fitsIn32Bits ( ULong x )
    285 {
    286    Long y1;
    287    y1 = x << 32;
    288    y1 >>=/*s*/ 32;
    289    return toBool(x == y1);
    290 }
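
/* A few illustrative cases:

      fitsIn32Bits(0x000000007FFFFFFFULL)  -->  True   (top 33 bits all 0)
      fitsIn32Bits(0xFFFFFFFF80000000ULL)  -->  True   (top 33 bits all 1)
      fitsIn32Bits(0x0000000080000000ULL)  -->  False  (sign-extending the
            low 32 bits would give 0xFFFFFFFF80000000, not the original)
*/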
    291 
    292 /* Is this a 64-bit zero expression? */
    293 
    294 static Bool isZeroU64 ( IRExpr* e )
    295 {
    296    return e->tag == Iex_Const
    297           && e->Iex.Const.con->tag == Ico_U64
    298           && e->Iex.Const.con->Ico.U64 == 0ULL;
    299 }
    300 
    301 static Bool isZeroU32 ( IRExpr* e )
    302 {
    303    return e->tag == Iex_Const
    304           && e->Iex.Const.con->tag == Ico_U32
    305           && e->Iex.Const.con->Ico.U32 == 0;
    306 }
    307 
     308 /* Make an int reg-reg move. */
    309 
    310 static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
    311 {
    312    vassert(hregClass(src) == HRcInt64);
    313    vassert(hregClass(dst) == HRcInt64);
    314    return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
    315 }
    316 
    317 /* Make a vector (128 bit) reg-reg move. */
    318 
    319 static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
    320 {
    321    vassert(hregClass(src) == HRcVec128);
    322    vassert(hregClass(dst) == HRcVec128);
    323    return AMD64Instr_SseReRg(Asse_MOV, src, dst);
    324 }
    325 
    326 /* Advance/retreat %rsp by n. */
    327 
    328 static void add_to_rsp ( ISelEnv* env, Int n )
    329 {
    330    vassert(n > 0 && n < 256 && (n%8) == 0);
    331    addInstr(env,
    332             AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
    333                                         hregAMD64_RSP()));
    334 }
    335 
    336 static void sub_from_rsp ( ISelEnv* env, Int n )
    337 {
    338    vassert(n > 0 && n < 256 && (n%8) == 0);
    339    addInstr(env,
    340             AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
    341                                         hregAMD64_RSP()));
    342 }
    343 
     344 /* Push a 64-bit constant on the stack. */
    345 static void push_uimm64( ISelEnv* env, ULong uimm64 )
    346 {
    347    /* If uimm64 can be expressed as the sign extension of its
    348       lower 32 bits, we can do it the easy way. */
    349    Long simm64 = (Long)uimm64;
    350    if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) {
    351       addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
    352    } else {
    353       HReg tmp = newVRegI(env);
    354       addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
    355       addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
    356    }
    357 }
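
/* As an illustration of the two paths (schematic only):

      push_uimm64(env, 0xFFFFFFFF80000000ULL)
         -->  pushq $0x80000000       -- imm32 sign-extends back to the value

      push_uimm64(env, 0x0000000180000000ULL)
         -->  movabsq $0x180000000, %tmp
              pushq %tmp              -- not sign-extendable from 32 bits
*/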
    358 
    359 
    360 /* Used only in doHelperCall.  If possible, produce a single
    361    instruction which computes 'e' into 'dst'.  If not possible, return
    362    NULL. */
    363 
    364 static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
    365                                                     HReg     dst,
    366                                                     IRExpr*  e )
    367 {
    368    /* Per comments in doHelperCall below, appearance of
    369       Iex_VECRET implies ill-formed IR. */
    370    vassert(e->tag != Iex_VECRET);
    371 
    372    /* In this case we give out a copy of the BaseBlock pointer. */
    373    if (UNLIKELY(e->tag == Iex_BBPTR)) {
    374       return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
    375    }
    376 
    377    vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
    378 
    379    if (e->tag == Iex_Const) {
    380       vassert(e->Iex.Const.con->tag == Ico_U64);
    381       if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
    382          return AMD64Instr_Alu64R(
    383                    Aalu_MOV,
    384                    AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
    385                    dst
    386                 );
    387       } else {
    388          return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
    389       }
    390    }
    391 
    392    if (e->tag == Iex_RdTmp) {
    393       HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
    394       return mk_iMOVsd_RR(src, dst);
    395    }
    396 
    397    if (e->tag == Iex_Get) {
    398       vassert(e->Iex.Get.ty == Ity_I64);
    399       return AMD64Instr_Alu64R(
    400                 Aalu_MOV,
    401                 AMD64RMI_Mem(
    402                    AMD64AMode_IR(e->Iex.Get.offset,
    403                                  hregAMD64_RBP())),
    404                 dst);
    405    }
    406 
    407    if (e->tag == Iex_Unop
    408        && e->Iex.Unop.op == Iop_32Uto64
    409        && e->Iex.Unop.arg->tag == Iex_RdTmp) {
    410       HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
    411       return AMD64Instr_MovxLQ(False, src, dst);
    412    }
    413 
    414    if (0) { ppIRExpr(e); vex_printf("\n"); }
    415 
    416    return NULL;
    417 }
    418 
    419 
    420 /* Do a complete function call.  |guard| is a Ity_Bit expression
    421    indicating whether or not the call happens.  If guard==NULL, the
    422    call is unconditional.  |retloc| is set to indicate where the
    423    return value is after the call.  The caller (of this fn) must
    424    generate code to add |stackAdjustAfterCall| to the stack pointer
    425    after the call is done. */
    426 
    427 static
    428 void doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
    429                     /*OUT*/RetLoc* retloc,
    430                     ISelEnv* env,
    431                     IRExpr* guard,
    432                     IRCallee* cee, IRType retTy, IRExpr** args )
    433 {
    434    AMD64CondCode cc;
    435    HReg          argregs[6];
    436    HReg          tmpregs[6];
    437    AMD64Instr*   fastinstrs[6];
    438    UInt          n_args, i;
    439 
    440    /* Set default returns.  We'll update them later if needed. */
    441    *stackAdjustAfterCall = 0;
    442    *retloc               = mk_RetLoc_INVALID();
    443 
    444    /* These are used for cross-checking that IR-level constraints on
    445       the use of IRExpr_VECRET() and IRExpr_BBPTR() are observed. */
    446    UInt nVECRETs = 0;
    447    UInt nBBPTRs  = 0;
    448 
    449    /* Marshal args for a call and do the call.
    450 
    451       This function only deals with a tiny set of possibilities, which
    452       cover all helpers in practice.  The restrictions are that only
    453       arguments in registers are supported, hence only 6x64 integer
    454       bits in total can be passed.  In fact the only supported arg
    455       type is I64.
    456 
    457       The return type can be I{64,32,16,8} or V{128,256}.  In the
    458       latter two cases, it is expected that |args| will contain the
    459       special node IRExpr_VECRET(), in which case this routine
    460       generates code to allocate space on the stack for the vector
    461       return value.  Since we are not passing any scalars on the
    462       stack, it is enough to preallocate the return space before
    463       marshalling any arguments, in this case.
    464 
    465       |args| may also contain IRExpr_BBPTR(), in which case the
    466       value in %rbp is passed as the corresponding argument.
    467 
    468       Generating code which is both efficient and correct when
    469       parameters are to be passed in registers is difficult, for the
    470       reasons elaborated in detail in comments attached to
    471       doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
    472       of the method described in those comments.
    473 
    474       The problem is split into two cases: the fast scheme and the
    475       slow scheme.  In the fast scheme, arguments are computed
    476       directly into the target (real) registers.  This is only safe
    477       when we can be sure that computation of each argument will not
    478       trash any real registers set by computation of any other
    479       argument.
    480 
    481       In the slow scheme, all args are first computed into vregs, and
    482       once they are all done, they are moved to the relevant real
    483       regs.  This always gives correct code, but it also gives a bunch
    484       of vreg-to-rreg moves which are usually redundant but are hard
    485       for the register allocator to get rid of.
    486 
    487       To decide which scheme to use, all argument expressions are
    488       first examined.  If they are all so simple that it is clear they
    489       will be evaluated without use of any fixed registers, use the
    490       fast scheme, else use the slow scheme.  Note also that only
    491       unconditional calls may use the fast scheme, since having to
    492       compute a condition expression could itself trash real
    493       registers.  Note that for simplicity, in the case where
    494       IRExpr_VECRET() is present, we use the slow scheme.  This is
    495       motivated by the desire to avoid any possible complexity
    496       w.r.t. nested calls.
    497 
    498       Note this requires being able to examine an expression and
    499       determine whether or not evaluation of it might use a fixed
    500       register.  That requires knowledge of how the rest of this insn
    501       selector works.  Currently just the following 3 are regarded as
    502       safe -- hopefully they cover the majority of arguments in
     503       practice: IRExpr_RdTmp, IRExpr_Const and IRExpr_Get.
    504    */
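
   /* To make that concrete: a call whose arguments are all of the forms
      just listed -- e.g. helper(t3, GET:I64(16), 0x42:I64) -- is a
      candidate for the fast scheme, whereas helper(Add64(t3,t4)) is not,
      since an Add64 cannot be computed into an argument register with a
      single instruction; such calls fall back to the slow scheme. */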
    505 
    506    /* Note that the cee->regparms field is meaningless on AMD64 host
    507       (since there is only one calling convention) and so we always
    508       ignore it. */
    509    n_args = 0;
    510    for (i = 0; args[i]; i++)
    511       n_args++;
    512 
    513    if (n_args > 6)
    514       vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
    515 
    516    argregs[0] = hregAMD64_RDI();
    517    argregs[1] = hregAMD64_RSI();
    518    argregs[2] = hregAMD64_RDX();
    519    argregs[3] = hregAMD64_RCX();
    520    argregs[4] = hregAMD64_R8();
    521    argregs[5] = hregAMD64_R9();
    522 
    523    tmpregs[0] = tmpregs[1] = tmpregs[2] =
    524    tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
    525 
    526    fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
    527    fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
    528 
     529    /* First, decide which scheme (slow or fast) is to be used.  Start by
     530       assuming the fast scheme, and select the slow one if any
     531       contraindications (wow) appear. */
    532 
    533    /* We'll need space on the stack for the return value.  Avoid
    534       possible complications with nested calls by using the slow
    535       scheme. */
    536    if (retTy == Ity_V128 || retTy == Ity_V256)
    537       goto slowscheme;
    538 
    539    if (guard) {
    540       if (guard->tag == Iex_Const
    541           && guard->Iex.Const.con->tag == Ico_U1
    542           && guard->Iex.Const.con->Ico.U1 == True) {
    543          /* unconditional */
    544       } else {
    545          /* Not manifestly unconditional -- be conservative. */
    546          goto slowscheme;
    547       }
    548    }
    549 
    550    /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
    551       use the slow scheme.  Because this is tentative, we can't call
     552       addInstr (that is, commit to) any instructions until we've
    553       handled all the arguments.  So park the resulting instructions
    554       in a buffer and emit that if we're successful. */
    555 
    556    /* FAST SCHEME */
    557    /* In this loop, we process args that can be computed into the
    558       destination (real) register with a single instruction, without
    559       using any fixed regs.  That also includes IRExpr_BBPTR(), but
    560       not IRExpr_VECRET().  Indeed, if the IR is well-formed, we can
    561       never see IRExpr_VECRET() at this point, since the return-type
    562       check above should ensure all those cases use the slow scheme
    563       instead. */
    564    vassert(n_args >= 0 && n_args <= 6);
    565    for (i = 0; i < n_args; i++) {
    566       IRExpr* arg = args[i];
    567       if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg))) {
    568          vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
    569       }
    570       fastinstrs[i]
    571          = iselIntExpr_single_instruction( env, argregs[i], args[i] );
    572       if (fastinstrs[i] == NULL)
    573          goto slowscheme;
    574    }
    575 
    576    /* Looks like we're in luck.  Emit the accumulated instructions and
    577       move on to doing the call itself. */
    578    for (i = 0; i < n_args; i++)
    579       addInstr(env, fastinstrs[i]);
    580 
    581    /* Fast scheme only applies for unconditional calls.  Hence: */
    582    cc = Acc_ALWAYS;
    583 
    584    goto handle_call;
    585 
    586 
    587    /* SLOW SCHEME; move via temporaries */
    588   slowscheme:
    589    {}
    590 #  if 0 /* debug only */
    591    if (n_args > 0) {for (i = 0; args[i]; i++) {
    592    ppIRExpr(args[i]); vex_printf(" "); }
    593    vex_printf("\n");}
    594 #  endif
    595 
    596    /* If we have a vector return type, allocate a place for it on the
    597       stack and record its address. */
    598    HReg r_vecRetAddr = INVALID_HREG;
    599    if (retTy == Ity_V128) {
    600       r_vecRetAddr = newVRegI(env);
    601       sub_from_rsp(env, 16);
    602       addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
    603    }
    604    else if (retTy == Ity_V256) {
    605       r_vecRetAddr = newVRegI(env);
    606       sub_from_rsp(env, 32);
    607       addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
    608    }
    609 
    610    vassert(n_args >= 0 && n_args <= 6);
    611    for (i = 0; i < n_args; i++) {
    612       IRExpr* arg = args[i];
    613       if (UNLIKELY(arg->tag == Iex_BBPTR)) {
    614          tmpregs[i] = newVRegI(env);
    615          addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
    616          nBBPTRs++;
    617       }
    618       else if (UNLIKELY(arg->tag == Iex_VECRET)) {
    619          /* We stashed the address of the return slot earlier, so just
    620             retrieve it now. */
    621          vassert(!hregIsInvalid(r_vecRetAddr));
    622          tmpregs[i] = r_vecRetAddr;
    623          nVECRETs++;
    624       }
    625       else {
    626          vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
    627          tmpregs[i] = iselIntExpr_R(env, args[i]);
    628       }
    629    }
    630 
    631    /* Now we can compute the condition.  We can't do it earlier
    632       because the argument computations could trash the condition
    633       codes.  Be a bit clever to handle the common case where the
    634       guard is 1:Bit. */
    635    cc = Acc_ALWAYS;
    636    if (guard) {
    637       if (guard->tag == Iex_Const
    638           && guard->Iex.Const.con->tag == Ico_U1
    639           && guard->Iex.Const.con->Ico.U1 == True) {
    640          /* unconditional -- do nothing */
    641       } else {
    642          cc = iselCondCode( env, guard );
    643       }
    644    }
    645 
    646    /* Move the args to their final destinations. */
    647    for (i = 0; i < n_args; i++) {
    648       /* None of these insns, including any spill code that might
    649          be generated, may alter the condition codes. */
    650       addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
    651    }
    652 
    653 
    654    /* Do final checks, set the return values, and generate the call
    655       instruction proper. */
    656   handle_call:
    657 
    658    if (retTy == Ity_V128 || retTy == Ity_V256) {
    659       vassert(nVECRETs == 1);
    660    } else {
    661       vassert(nVECRETs == 0);
    662    }
    663 
    664    vassert(nBBPTRs == 0 || nBBPTRs == 1);
    665 
    666    vassert(*stackAdjustAfterCall == 0);
    667    vassert(is_RetLoc_INVALID(*retloc));
    668    switch (retTy) {
    669          case Ity_INVALID:
    670             /* Function doesn't return a value. */
    671             *retloc = mk_RetLoc_simple(RLPri_None);
    672             break;
    673          case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
    674             *retloc = mk_RetLoc_simple(RLPri_Int);
    675             break;
    676          case Ity_V128:
    677             *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
    678             *stackAdjustAfterCall = 16;
    679             break;
    680          case Ity_V256:
    681             *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
    682             *stackAdjustAfterCall = 32;
    683             break;
    684          default:
    685             /* IR can denote other possible return types, but we don't
    686                handle those here. */
    687            vassert(0);
    688    }
    689 
    690    /* Finally, generate the call itself.  This needs the *retloc value
    691       set in the switch above, which is why it's at the end. */
    692    addInstr(env,
    693             AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc));
    694 }
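
/* A sketch of the caller-side protocol for the above, for a helper
   returning a V128 -- schematic only; the real callers appear later in
   this file, and |d| here stands for an IRDirty with suitable |cee| and
   |args| fields:

      UInt   addToSp = 0;
      RetLoc rloc    = mk_RetLoc_INVALID();
      doHelperCall( &addToSp, &rloc, env,
                    NULL,            -- no guard: unconditional call
                    d->cee, Ity_V128, d->args );
      -- the result now sits at 0(%rsp); copy out 16 bytes, then:
      add_to_rsp(env, addToSp);      -- addToSp == 16 for the V128 case
*/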
    695 
    696 
    697 /* Given a guest-state array descriptor, an index expression and a
    698    bias, generate an AMD64AMode holding the relevant guest state
    699    offset. */
    700 
    701 static
    702 AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
    703                                   IRExpr* off, Int bias )
    704 {
    705    HReg tmp, roff;
    706    Int  elemSz = sizeofIRType(descr->elemTy);
    707    Int  nElems = descr->nElems;
    708 
    709    /* Throw out any cases not generated by an amd64 front end.  In
    710       theory there might be a day where we need to handle them -- if
    711       we ever run non-amd64-guest on amd64 host. */
    712 
    713    if (nElems != 8 || (elemSz != 1 && elemSz != 8))
    714       vpanic("genGuestArrayOffset(amd64 host)");
    715 
    716    /* Compute off into a reg, %off.  Then return:
    717 
    718          movq %off, %tmp
    719          addq $bias, %tmp  (if bias != 0)
     720          andq $7, %tmp
    721          ... base(%rbp, %tmp, shift) ...
    722    */
    723    tmp  = newVRegI(env);
    724    roff = iselIntExpr_R(env, off);
    725    addInstr(env, mk_iMOVsd_RR(roff, tmp));
    726    if (bias != 0) {
    727       /* Make sure the bias is sane, in the sense that there are
    728          no significant bits above bit 30 in it. */
    729       vassert(-10000 < bias && bias < 10000);
    730       addInstr(env,
    731                AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
    732    }
    733    addInstr(env,
    734             AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
    735    vassert(elemSz == 1 || elemSz == 8);
    736    return
    737       AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
    738                                     elemSz==8 ? 3 : 0);
    739 }
    740 
    741 
    742 /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
    743 static
    744 void set_SSE_rounding_default ( ISelEnv* env )
    745 {
    746    /* pushq $DEFAULT_MXCSR
    747       ldmxcsr 0(%rsp)
    748       addq $8, %rsp
    749    */
    750    AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
    751    addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
    752    addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
    753    add_to_rsp(env, 8);
    754 }
    755 
    756 /* Mess with the FPU's rounding mode: set to the default rounding mode
    757    (DEFAULT_FPUCW). */
    758 static
    759 void set_FPU_rounding_default ( ISelEnv* env )
    760 {
    761    /* movq $DEFAULT_FPUCW, -8(%rsp)
     762       fldcw -8(%rsp)
    763    */
    764    AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
    765    addInstr(env, AMD64Instr_Alu64M(
    766                     Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
    767    addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
    768 }
    769 
    770 
    771 /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
    772    expression denoting a value in the range 0 .. 3, indicating a round
    773    mode encoded as per type IRRoundingMode.  Set the SSE machinery to
    774    have the same rounding.
    775 */
    776 static
    777 void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
    778 {
    779    /* Note: this sequence only makes sense because DEFAULT_MXCSR has
    780       both rounding bits == 0.  If that wasn't the case, we couldn't
    781       create a new rounding field simply by ORing the new value into
    782       place. */
    783 
    784    /* movq $3, %reg
    785       andq [[mode]], %reg  -- shouldn't be needed; paranoia
    786       shlq $13, %reg
    787       orq $DEFAULT_MXCSR, %reg
    788       pushq %reg
     789       ldmxcsr 0(%rsp)
    790       addq $8, %rsp
    791    */
    792    HReg        reg      = newVRegI(env);
    793    AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
    794    addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
    795    addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
    796                                    iselIntExpr_RMI(env, mode), reg));
    797    addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
    798    addInstr(env, AMD64Instr_Alu64R(
    799                     Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
    800    addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
    801    addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
    802    add_to_rsp(env, 8);
    803 }
    804 
    805 
    806 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
    807    expression denoting a value in the range 0 .. 3, indicating a round
    808    mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
    809    the same rounding.
    810 */
    811 static
    812 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
    813 {
    814    HReg rrm  = iselIntExpr_R(env, mode);
    815    HReg rrm2 = newVRegI(env);
    816    AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
    817 
    818    /* movq  %rrm, %rrm2
    819       andq  $3, %rrm2   -- shouldn't be needed; paranoia
    820       shlq  $10, %rrm2
    821       orq   $DEFAULT_FPUCW, %rrm2
    822       movq  %rrm2, -8(%rsp)
     823       fldcw -8(%rsp)
    824    */
    825    addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
    826    addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
    827    addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
    828    addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
    829                                    AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
    830    addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
    831                                    AMD64RI_Reg(rrm2), m8_rsp));
    832    addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
    833 }
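
/* Both of the above rely on IRRoundingMode using the same 2-bit encoding
   as the hardware rounding-control fields:

      Irrm_NEAREST = 0 (00)     Irrm_NegINF = 1 (01)
      Irrm_PosINF  = 2 (10)     Irrm_ZERO   = 3 (11)

   so the IR value can simply be shifted into position (bit 13 of %mxcsr,
   bit 10 of the x87 control word) and ORed into a default word whose
   RC field is zero. */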
    834 
    835 
    836 /* Generate all-zeroes into a new vector register.
    837 */
    838 static HReg generate_zeroes_V128 ( ISelEnv* env )
    839 {
    840    HReg dst = newVRegV(env);
    841    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
    842    return dst;
    843 }
    844 
     845 /* Generate all-ones into a new vector register.  Comparing a register
     846    with itself for equality sets every lane to all 1s. */
    847 static HReg generate_ones_V128 ( ISelEnv* env )
    848 {
    849    HReg dst = newVRegV(env);
    850    addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
    851    return dst;
    852 }
    853 
    854 
    855 /* Generate !src into a new vector register.  Amazing that there isn't
    856    a less crappy way to do this.
    857 */
    858 static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
    859 {
    860    HReg dst = generate_ones_V128(env);
    861    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
    862    return dst;
    863 }
    864 
    865 
    866 /* Expand the given byte into a 64-bit word, by cloning each bit
    867    8 times. */
    868 static ULong bitmask8_to_bytemask64 ( UShort w8 )
    869 {
    870    vassert(w8 == (w8 & 0xFF));
    871    ULong w64 = 0;
    872    Int i;
    873    for (i = 0; i < 8; i++) {
    874       if (w8 & (1<<i))
    875          w64 |= (0xFFULL << (8 * i));
    876    }
    877    return w64;
    878 }
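
/* e.g. bitmask8_to_bytemask64(0xA5) == 0xFF00FF0000FF00FFULL, since
   0xA5 == 10100101b and hence byte lanes 7, 5, 2 and 0 become 0xFF. */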
    879 
    880 
    881 /*---------------------------------------------------------*/
    882 /*--- ISEL: Integer expressions (64/32/16/8 bit)        ---*/
    883 /*---------------------------------------------------------*/
    884 
    885 /* Select insns for an integer-typed expression, and add them to the
    886    code list.  Return a reg holding the result.  This reg will be a
    887    virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
    888    want to modify it, ask for a new vreg, copy it in there, and modify
    889    the copy.  The register allocator will do its best to map both
    890    vregs to the same real register, so the copies will often disappear
    891    later in the game.
    892 
    893    This should handle expressions of 64, 32, 16 and 8-bit type.  All
    894    results are returned in a 64-bit register.  For 32-, 16- and 8-bit
    895    expressions, the upper 32/48/56 bits are arbitrary, so you should
    896    mask or sign extend partial values if necessary.
    897 */
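
/* For example, the Iop_Shr8 case below first ANDs the (nominally 8-bit)
   value with 0xFF before shifting, precisely because the upper 56 bits
   of the register returned here are not guaranteed to be zero. */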
    898 
    899 static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
    900 {
    901    HReg r = iselIntExpr_R_wrk(env, e);
    902    /* sanity checks ... */
    903 #  if 0
    904    vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
    905 #  endif
    906    vassert(hregClass(r) == HRcInt64);
    907    vassert(hregIsVirtual(r));
    908    return r;
    909 }
    910 
    911 /* DO NOT CALL THIS DIRECTLY ! */
    912 static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
    913 {
    914    /* Used for unary/binary SIMD64 ops. */
    915    HWord fn = 0;
    916    Bool second_is_UInt;
    917 
    918    MatchInfo mi;
    919    DECLARE_PATTERN(p_1Uto8_64to1);
    920    DECLARE_PATTERN(p_LDle8_then_8Uto64);
    921    DECLARE_PATTERN(p_LDle16_then_16Uto64);
    922 
    923    IRType ty = typeOfIRExpr(env->type_env,e);
    924    switch (ty) {
    925       case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
    926       default: vassert(0);
    927    }
    928 
    929    switch (e->tag) {
    930 
    931    /* --------- TEMP --------- */
    932    case Iex_RdTmp: {
    933       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
    934    }
    935 
    936    /* --------- LOAD --------- */
    937    case Iex_Load: {
    938       HReg dst = newVRegI(env);
    939       AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
    940 
    941       /* We can't handle big-endian loads, nor load-linked. */
    942       if (e->Iex.Load.end != Iend_LE)
    943          goto irreducible;
    944 
    945       if (ty == Ity_I64) {
    946          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
    947                                          AMD64RMI_Mem(amode), dst) );
    948          return dst;
    949       }
    950       if (ty == Ity_I32) {
    951          addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
    952          return dst;
    953       }
    954       if (ty == Ity_I16) {
    955          addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
    956          return dst;
    957       }
    958       if (ty == Ity_I8) {
    959          addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
    960          return dst;
    961       }
    962       break;
    963    }
    964 
    965    /* --------- BINARY OP --------- */
    966    case Iex_Binop: {
    967       AMD64AluOp   aluOp;
    968       AMD64ShiftOp shOp;
    969 
    970       /* Pattern: Sub64(0,x) */
    971       /*     and: Sub32(0,x) */
    972       if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
    973           || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
    974          HReg dst = newVRegI(env);
    975          HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
    976          addInstr(env, mk_iMOVsd_RR(reg,dst));
    977          addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
    978          return dst;
    979       }
    980 
    981       /* Is it an addition or logical style op? */
    982       switch (e->Iex.Binop.op) {
    983          case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
    984             aluOp = Aalu_ADD; break;
    985          case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
    986             aluOp = Aalu_SUB; break;
    987          case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
    988             aluOp = Aalu_AND; break;
    989          case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64:
    990             aluOp = Aalu_OR; break;
    991          case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
    992             aluOp = Aalu_XOR; break;
    993          case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
    994             aluOp = Aalu_MUL; break;
    995          default:
    996             aluOp = Aalu_INVALID; break;
    997       }
    998       /* For commutative ops we assume any literal
    999          values are on the second operand. */
   1000       if (aluOp != Aalu_INVALID) {
   1001          HReg dst      = newVRegI(env);
   1002          HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1003          AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   1004          addInstr(env, mk_iMOVsd_RR(reg,dst));
   1005          addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
   1006          return dst;
   1007       }
   1008 
   1009       /* Perhaps a shift op? */
   1010       switch (e->Iex.Binop.op) {
   1011          case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
   1012             shOp = Ash_SHL; break;
   1013          case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
   1014             shOp = Ash_SHR; break;
   1015          case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
   1016             shOp = Ash_SAR; break;
   1017          default:
   1018             shOp = Ash_INVALID; break;
   1019       }
   1020       if (shOp != Ash_INVALID) {
   1021          HReg dst = newVRegI(env);
   1022 
   1023          /* regL = the value to be shifted */
   1024          HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1025          addInstr(env, mk_iMOVsd_RR(regL,dst));
   1026 
   1027          /* Do any necessary widening for 32/16/8 bit operands */
   1028          switch (e->Iex.Binop.op) {
   1029             case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
   1030                break;
   1031             case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
   1032                break;
   1033             case Iop_Shr8:
   1034                addInstr(env, AMD64Instr_Alu64R(
   1035                                 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
   1036                break;
   1037             case Iop_Shr16:
   1038                addInstr(env, AMD64Instr_Alu64R(
   1039                                 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
   1040                break;
   1041             case Iop_Shr32:
   1042                addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
   1043                break;
   1044             case Iop_Sar8:
   1045                addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
   1046                addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
   1047                break;
   1048             case Iop_Sar16:
   1049                addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
   1050                addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
   1051                break;
   1052             case Iop_Sar32:
   1053                addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
   1054                break;
   1055             default:
   1056                ppIROp(e->Iex.Binop.op);
   1057                vassert(0);
   1058          }
   1059 
   1060          /* Now consider the shift amount.  If it's a literal, we
   1061             can do a much better job than the general case. */
   1062          if (e->Iex.Binop.arg2->tag == Iex_Const) {
   1063             /* assert that the IR is well-typed */
   1064             Int nshift;
   1065             vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
   1066             nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
   1067             vassert(nshift >= 0);
   1068             if (nshift > 0)
   1069                /* Can't allow nshift==0 since that means %cl */
   1070                addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
   1071          } else {
   1072             /* General case; we have to force the amount into %cl. */
   1073             HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1074             addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
   1075             addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
   1076          }
   1077          return dst;
   1078       }
   1079 
   1080       /* Deal with 64-bit SIMD binary ops */
   1081       second_is_UInt = False;
   1082       switch (e->Iex.Binop.op) {
   1083          case Iop_Add8x8:
   1084             fn = (HWord)h_generic_calc_Add8x8; break;
   1085          case Iop_Add16x4:
   1086             fn = (HWord)h_generic_calc_Add16x4; break;
   1087          case Iop_Add32x2:
   1088             fn = (HWord)h_generic_calc_Add32x2; break;
   1089 
   1090          case Iop_Avg8Ux8:
   1091             fn = (HWord)h_generic_calc_Avg8Ux8; break;
   1092          case Iop_Avg16Ux4:
   1093             fn = (HWord)h_generic_calc_Avg16Ux4; break;
   1094 
   1095          case Iop_CmpEQ8x8:
   1096             fn = (HWord)h_generic_calc_CmpEQ8x8; break;
   1097          case Iop_CmpEQ16x4:
   1098             fn = (HWord)h_generic_calc_CmpEQ16x4; break;
   1099          case Iop_CmpEQ32x2:
   1100             fn = (HWord)h_generic_calc_CmpEQ32x2; break;
   1101 
   1102          case Iop_CmpGT8Sx8:
   1103             fn = (HWord)h_generic_calc_CmpGT8Sx8; break;
   1104          case Iop_CmpGT16Sx4:
   1105             fn = (HWord)h_generic_calc_CmpGT16Sx4; break;
   1106          case Iop_CmpGT32Sx2:
   1107             fn = (HWord)h_generic_calc_CmpGT32Sx2; break;
   1108 
   1109          case Iop_InterleaveHI8x8:
   1110             fn = (HWord)h_generic_calc_InterleaveHI8x8; break;
   1111          case Iop_InterleaveLO8x8:
   1112             fn = (HWord)h_generic_calc_InterleaveLO8x8; break;
   1113          case Iop_InterleaveHI16x4:
   1114             fn = (HWord)h_generic_calc_InterleaveHI16x4; break;
   1115          case Iop_InterleaveLO16x4:
   1116             fn = (HWord)h_generic_calc_InterleaveLO16x4; break;
   1117          case Iop_InterleaveHI32x2:
   1118             fn = (HWord)h_generic_calc_InterleaveHI32x2; break;
   1119          case Iop_InterleaveLO32x2:
   1120             fn = (HWord)h_generic_calc_InterleaveLO32x2; break;
   1121          case Iop_CatOddLanes16x4:
   1122             fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
   1123          case Iop_CatEvenLanes16x4:
   1124             fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
   1125          case Iop_Perm8x8:
   1126             fn = (HWord)h_generic_calc_Perm8x8; break;
   1127 
   1128          case Iop_Max8Ux8:
   1129             fn = (HWord)h_generic_calc_Max8Ux8; break;
   1130          case Iop_Max16Sx4:
   1131             fn = (HWord)h_generic_calc_Max16Sx4; break;
   1132          case Iop_Min8Ux8:
   1133             fn = (HWord)h_generic_calc_Min8Ux8; break;
   1134          case Iop_Min16Sx4:
   1135             fn = (HWord)h_generic_calc_Min16Sx4; break;
   1136 
   1137          case Iop_Mul16x4:
   1138             fn = (HWord)h_generic_calc_Mul16x4; break;
   1139          case Iop_Mul32x2:
   1140             fn = (HWord)h_generic_calc_Mul32x2; break;
   1141          case Iop_MulHi16Sx4:
   1142             fn = (HWord)h_generic_calc_MulHi16Sx4; break;
   1143          case Iop_MulHi16Ux4:
   1144             fn = (HWord)h_generic_calc_MulHi16Ux4; break;
   1145 
   1146          case Iop_QAdd8Sx8:
   1147             fn = (HWord)h_generic_calc_QAdd8Sx8; break;
   1148          case Iop_QAdd16Sx4:
   1149             fn = (HWord)h_generic_calc_QAdd16Sx4; break;
   1150          case Iop_QAdd8Ux8:
   1151             fn = (HWord)h_generic_calc_QAdd8Ux8; break;
   1152          case Iop_QAdd16Ux4:
   1153             fn = (HWord)h_generic_calc_QAdd16Ux4; break;
   1154 
   1155          case Iop_QNarrowBin32Sto16Sx4:
   1156             fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
   1157          case Iop_QNarrowBin16Sto8Sx8:
   1158             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
   1159          case Iop_QNarrowBin16Sto8Ux8:
   1160             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
   1161          case Iop_NarrowBin16to8x8:
   1162             fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
   1163          case Iop_NarrowBin32to16x4:
   1164             fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
   1165 
   1166          case Iop_QSub8Sx8:
   1167             fn = (HWord)h_generic_calc_QSub8Sx8; break;
   1168          case Iop_QSub16Sx4:
   1169             fn = (HWord)h_generic_calc_QSub16Sx4; break;
   1170          case Iop_QSub8Ux8:
   1171             fn = (HWord)h_generic_calc_QSub8Ux8; break;
   1172          case Iop_QSub16Ux4:
   1173             fn = (HWord)h_generic_calc_QSub16Ux4; break;
   1174 
   1175          case Iop_Sub8x8:
   1176             fn = (HWord)h_generic_calc_Sub8x8; break;
   1177          case Iop_Sub16x4:
   1178             fn = (HWord)h_generic_calc_Sub16x4; break;
   1179          case Iop_Sub32x2:
   1180             fn = (HWord)h_generic_calc_Sub32x2; break;
   1181 
   1182          case Iop_ShlN32x2:
   1183             fn = (HWord)h_generic_calc_ShlN32x2;
   1184             second_is_UInt = True;
   1185             break;
   1186          case Iop_ShlN16x4:
   1187             fn = (HWord)h_generic_calc_ShlN16x4;
   1188             second_is_UInt = True;
   1189             break;
   1190          case Iop_ShlN8x8:
   1191             fn = (HWord)h_generic_calc_ShlN8x8;
   1192             second_is_UInt = True;
   1193             break;
   1194          case Iop_ShrN32x2:
   1195             fn = (HWord)h_generic_calc_ShrN32x2;
   1196             second_is_UInt = True;
   1197             break;
   1198          case Iop_ShrN16x4:
   1199             fn = (HWord)h_generic_calc_ShrN16x4;
   1200             second_is_UInt = True;
   1201             break;
   1202          case Iop_SarN32x2:
   1203             fn = (HWord)h_generic_calc_SarN32x2;
   1204             second_is_UInt = True;
   1205             break;
   1206          case Iop_SarN16x4:
   1207             fn = (HWord)h_generic_calc_SarN16x4;
   1208             second_is_UInt = True;
   1209             break;
   1210          case Iop_SarN8x8:
   1211             fn = (HWord)h_generic_calc_SarN8x8;
   1212             second_is_UInt = True;
   1213             break;
   1214 
   1215          default:
   1216             fn = (HWord)0; break;
   1217       }
   1218       if (fn != (HWord)0) {
   1219          /* Note: the following assumes all helpers are of signature
   1220                ULong fn ( ULong, ULong ), and they are
   1221             not marked as regparm functions.
   1222          */
   1223          HReg dst  = newVRegI(env);
   1224          HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1225          HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1226          if (second_is_UInt)
   1227             addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
   1228          addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
   1229          addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
   1230          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
   1231                                         mk_RetLoc_simple(RLPri_Int) ));
   1232          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
   1233          return dst;
   1234       }
   1235 
   1236       /* Handle misc other ops. */
   1237 
   1238       if (e->Iex.Binop.op == Iop_Max32U) {
   1239          HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1240          HReg dst  = newVRegI(env);
   1241          HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1242          addInstr(env, mk_iMOVsd_RR(src1, dst));
   1243          addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
   1244          addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst));
   1245          return dst;
   1246       }
   1247 
   1248       if (e->Iex.Binop.op == Iop_DivModS64to32
   1249           || e->Iex.Binop.op == Iop_DivModU64to32) {
   1250          /* 64 x 32 -> (32(rem),32(div)) division */
   1251          /* Get the 64-bit operand into edx:eax, and the other into
   1252             any old R/M. */
   1253          HReg      rax     = hregAMD64_RAX();
   1254          HReg      rdx     = hregAMD64_RDX();
   1255          HReg      dst     = newVRegI(env);
   1256          Bool      syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
   1257          AMD64RM*  rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
   1258          /* Compute the left operand into a reg, and then
   1259             put the top half in edx and the bottom in eax. */
   1260          HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1261          addInstr(env, mk_iMOVsd_RR(left64, rdx));
   1262          addInstr(env, mk_iMOVsd_RR(left64, rax));
   1263          addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
   1264          addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
    1265          addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
    1266          addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
   1267          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
   1268          addInstr(env, mk_iMOVsd_RR(rax, dst));
   1269          addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
   1270          return dst;
   1271       }
   1272 
   1273       if (e->Iex.Binop.op == Iop_32HLto64) {
   1274          HReg hi32  = newVRegI(env);
   1275          HReg lo32  = newVRegI(env);
   1276          HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1277          HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1278          addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
   1279          addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
   1280          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
    1281          addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
   1282          addInstr(env, AMD64Instr_Alu64R(
   1283                           Aalu_OR, AMD64RMI_Reg(lo32), hi32));
   1284          return hi32;
   1285       }
   1286 
   1287       if (e->Iex.Binop.op == Iop_16HLto32) {
   1288          HReg hi16  = newVRegI(env);
   1289          HReg lo16  = newVRegI(env);
   1290          HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1291          HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1292          addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
   1293          addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
   1294          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
   1295          addInstr(env, AMD64Instr_Alu64R(
   1296                           Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
   1297          addInstr(env, AMD64Instr_Alu64R(
   1298                           Aalu_OR, AMD64RMI_Reg(lo16), hi16));
   1299          return hi16;
   1300       }
   1301 
   1302       if (e->Iex.Binop.op == Iop_8HLto16) {
   1303          HReg hi8  = newVRegI(env);
   1304          HReg lo8  = newVRegI(env);
   1305          HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1306          HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1307          addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
   1308          addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
   1309          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
   1310          addInstr(env, AMD64Instr_Alu64R(
   1311                           Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
   1312          addInstr(env, AMD64Instr_Alu64R(
   1313                           Aalu_OR, AMD64RMI_Reg(lo8), hi8));
   1314          return hi8;
   1315       }
   1316 
   1317       if (e->Iex.Binop.op == Iop_MullS32
   1318           || e->Iex.Binop.op == Iop_MullS16
   1319           || e->Iex.Binop.op == Iop_MullS8
   1320           || e->Iex.Binop.op == Iop_MullU32
   1321           || e->Iex.Binop.op == Iop_MullU16
   1322           || e->Iex.Binop.op == Iop_MullU8) {
   1323          HReg a32   = newVRegI(env);
   1324          HReg b32   = newVRegI(env);
   1325          HReg a32s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1326          HReg b32s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1327          Int          shift  = 0;
   1328          AMD64ShiftOp shr_op = Ash_SHR;
   1329          switch (e->Iex.Binop.op) {
   1330             case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
   1331             case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
   1332             case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
   1333             case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
   1334             case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
   1335             case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
   1336             default: vassert(0);
   1337          }
   1338 
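                  /* A worked case: for Iop_MullS16 the SHL/SAR-by-48 pairs
                     sign-extend each 16-bit operand to 64 bits, so a single
                     64-bit IMUL leaves the correct product in the low bits of
                     b32; the unsigned variants use SHR to zero-extend instead. */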
   1339          addInstr(env, mk_iMOVsd_RR(a32s, a32));
   1340          addInstr(env, mk_iMOVsd_RR(b32s, b32));
   1341          addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
   1342          addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
   1343          addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
   1344          addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
   1345          addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
   1346          return b32;
   1347       }
   1348 
   1349       if (e->Iex.Binop.op == Iop_CmpF64) {
   1350          HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
   1351          HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
   1352          HReg dst = newVRegI(env);
   1353          addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
   1354          /* Mask out irrelevant parts of the result so as to conform
   1355             to the CmpF64 definition. */
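                  /* 0x45 keeps only ZF (bit 6), PF (bit 2) and CF (bit 0),
                     i.e. the flags that ucomisd actually defines. */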
   1356          addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
   1357          return dst;
   1358       }
   1359 
   1360       if (e->Iex.Binop.op == Iop_F64toI32S
   1361           || e->Iex.Binop.op == Iop_F64toI64S) {
   1362          Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
   1363          HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
   1364          HReg dst = newVRegI(env);
   1365          set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
   1366          addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
   1367          set_SSE_rounding_default(env);
   1368          return dst;
   1369       }
   1370 
   1371       break;
   1372    }
   1373 
   1374    /* --------- UNARY OP --------- */
   1375    case Iex_Unop: {
   1376 
   1377       /* 1Uto8(64to1(expr64)) */
   1378       {
   1379          DEFINE_PATTERN( p_1Uto8_64to1,
   1380                          unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
   1381          if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
   1382             IRExpr* expr64 = mi.bindee[0];
   1383             HReg    dst    = newVRegI(env);
   1384             HReg    src    = iselIntExpr_R(env, expr64);
   1385             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1386             addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   1387                                             AMD64RMI_Imm(1), dst));
   1388             return dst;
   1389          }
   1390       }
   1391 
   1392       /* 8Uto64(LDle(expr64)) */
   1393       {
   1394          DEFINE_PATTERN(p_LDle8_then_8Uto64,
   1395                         unop(Iop_8Uto64,
   1396                              IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
   1397          if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
   1398             HReg dst = newVRegI(env);
   1399             AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
   1400             addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
   1401             return dst;
   1402          }
   1403       }
   1404 
   1405       /* 16Uto64(LDle(expr64)) */
   1406       {
   1407          DEFINE_PATTERN(p_LDle16_then_16Uto64,
   1408                         unop(Iop_16Uto64,
   1409                              IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
   1410          if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
   1411             HReg dst = newVRegI(env);
   1412             AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
   1413             addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
   1414             return dst;
   1415          }
   1416       }
   1417 
   1418       /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
   1419          Use 32 bit arithmetic and let the default zero-extend rule
   1420          do the 32Uto64 for free. */
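               /* (On amd64, writing the 32-bit form of a register zeroes
                  bits 63:32 of the full 64-bit register; that is the
                  zero-extend rule being relied on here.) */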
   1421       if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
   1422          IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
   1423          IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
   1424          IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
   1425          AMD64AluOp aluOp = Aalu_INVALID;
   1426          switch (opi) {
   1427             case Iop_Add32: aluOp = Aalu_ADD; break;
   1428             case Iop_Sub32: aluOp = Aalu_SUB; break;
   1429             case Iop_And32: aluOp = Aalu_AND; break;
   1430             case Iop_Or32:  aluOp = Aalu_OR;  break;
   1431             case Iop_Xor32: aluOp = Aalu_XOR; break;
   1432             default: break;
   1433          }
   1434          if (aluOp != Aalu_INVALID) {
   1435             /* For commutative ops we assume any literal values are on
   1436                the second operand. */
   1437             HReg dst      = newVRegI(env);
   1438             HReg reg      = iselIntExpr_R(env, argL);
   1439             AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
   1440             addInstr(env, mk_iMOVsd_RR(reg,dst));
   1441             addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
   1442             return dst;
   1443          }
   1444          /* just fall through to normal handling for Iop_32Uto64 */
   1445       }
   1446 
   1447       /* Fallback cases */
   1448       switch (e->Iex.Unop.op) {
   1449          case Iop_32Uto64:
   1450          case Iop_32Sto64: {
   1451             HReg dst = newVRegI(env);
   1452             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1453             addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
   1454                                             src, dst) );
   1455             return dst;
   1456          }
   1457          case Iop_128HIto64: {
   1458             HReg rHi, rLo;
   1459             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
   1460             return rHi; /* and abandon rLo */
   1461          }
   1462          case Iop_128to64: {
   1463             HReg rHi, rLo;
   1464             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
   1465             return rLo; /* and abandon rHi */
   1466          }
   1467          case Iop_8Uto16:
   1468          case Iop_8Uto32:
   1469          case Iop_8Uto64:
   1470          case Iop_16Uto64:
   1471          case Iop_16Uto32: {
   1472             HReg dst     = newVRegI(env);
   1473             HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
   1474             Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
   1475                                    || e->Iex.Unop.op==Iop_16Uto64 );
   1476             UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
   1477             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1478             addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   1479                                             AMD64RMI_Imm(mask), dst));
   1480             return dst;
   1481          }
   1482          case Iop_8Sto16:
   1483          case Iop_8Sto64:
   1484          case Iop_8Sto32:
   1485          case Iop_16Sto32:
   1486          case Iop_16Sto64: {
   1487             HReg dst     = newVRegI(env);
   1488             HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
   1489             Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
   1490                                    || e->Iex.Unop.op==Iop_16Sto64 );
   1491             UInt amt     = srcIs16 ? 48 : 56;
   1492             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1493             addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
   1494             addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
   1495             return dst;
   1496          }
    1497          case Iop_Not8:
    1498          case Iop_Not16:
   1499          case Iop_Not32:
   1500          case Iop_Not64: {
   1501             HReg dst = newVRegI(env);
   1502             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1503             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1504             addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
   1505             return dst;
   1506          }
   1507          case Iop_16HIto8:
   1508          case Iop_32HIto16:
   1509          case Iop_64HIto32: {
   1510             HReg dst  = newVRegI(env);
   1511             HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
   1512             Int shift = 0;
   1513             switch (e->Iex.Unop.op) {
   1514                case Iop_16HIto8:  shift = 8;  break;
   1515                case Iop_32HIto16: shift = 16; break;
   1516                case Iop_64HIto32: shift = 32; break;
   1517                default: vassert(0);
   1518             }
   1519             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1520             addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
   1521             return dst;
   1522          }
   1523          case Iop_1Uto64:
   1524          case Iop_1Uto32:
   1525          case Iop_1Uto8: {
   1526             HReg dst           = newVRegI(env);
   1527             AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
   1528             addInstr(env, AMD64Instr_Set64(cond,dst));
   1529             return dst;
   1530          }
   1531          case Iop_1Sto8:
   1532          case Iop_1Sto16:
   1533          case Iop_1Sto32:
   1534          case Iop_1Sto64: {
   1535             /* could do better than this, but for now ... */
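                     /* Set64 leaves 0 or 1 in dst; shifting that bit up to
                        position 63 and arithmetically shifting it back down
                        smears it across the register, giving 0 or all-ones. */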
   1536             HReg dst           = newVRegI(env);
   1537             AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
   1538             addInstr(env, AMD64Instr_Set64(cond,dst));
   1539             addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
   1540             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
   1541             return dst;
   1542          }
   1543          case Iop_Ctz64: {
   1544             /* Count trailing zeroes, implemented by amd64 'bsfq' */
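                     /* Note that bsfq leaves its destination undefined when
                        the source is zero; no special zero case is handled
                        here. */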
   1545             HReg dst = newVRegI(env);
   1546             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1547             addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
   1548             return dst;
   1549          }
   1550          case Iop_Clz64: {
   1551             /* Count leading zeroes.  Do 'bsrq' to establish the index
   1552                of the highest set bit, and subtract that value from
   1553                63. */
   1554             HReg tmp = newVRegI(env);
   1555             HReg dst = newVRegI(env);
   1556             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1557             addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
   1558             addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
   1559                                             AMD64RMI_Imm(63), dst));
   1560             addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
   1561                                             AMD64RMI_Reg(tmp), dst));
   1562             return dst;
   1563          }
   1564 
   1565          case Iop_CmpwNEZ64: {
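                     /* (src | -src) has its sign bit set iff src is nonzero,
                        so the arithmetic right shift by 63 yields 0 for a
                        zero source and all-ones otherwise. */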
   1566             HReg dst = newVRegI(env);
   1567             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1568             addInstr(env, mk_iMOVsd_RR(src,dst));
   1569             addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
   1570             addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
   1571                                             AMD64RMI_Reg(src), dst));
   1572             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
   1573             return dst;
   1574          }
   1575 
   1576          case Iop_CmpwNEZ32: {
   1577             HReg src = newVRegI(env);
   1578             HReg dst = newVRegI(env);
   1579             HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
   1580             addInstr(env, mk_iMOVsd_RR(pre,src));
   1581             addInstr(env, AMD64Instr_MovxLQ(False, src, src));
   1582             addInstr(env, mk_iMOVsd_RR(src,dst));
   1583             addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
   1584             addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
   1585                                             AMD64RMI_Reg(src), dst));
   1586             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
   1587             return dst;
   1588          }
   1589 
   1590          case Iop_Left8:
   1591          case Iop_Left16:
   1592          case Iop_Left32:
   1593          case Iop_Left64: {
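                     /* Computes src | -src, which sets every bit at and above
                        the lowest set bit of src (zero stays zero). */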
   1594             HReg dst = newVRegI(env);
   1595             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1596             addInstr(env, mk_iMOVsd_RR(src, dst));
   1597             addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
   1598             addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
   1599             return dst;
   1600          }
   1601 
   1602          case Iop_V128to32: {
   1603             HReg        dst     = newVRegI(env);
   1604             HReg        vec     = iselVecExpr(env, e->Iex.Unop.arg);
   1605             AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
   1606             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
   1607             addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
   1608             return dst;
   1609          }
   1610 
   1611          /* V128{HI}to64 */
   1612          case Iop_V128HIto64:
   1613          case Iop_V128to64: {
   1614             HReg dst = newVRegI(env);
   1615             Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? -8 : -16;
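                     /* The vector is spilled to [rsp-16]; its low 64 bits sit
                        at rsp-16 and its high 64 bits at rsp-8, so 'off'
                        selects the requested half. */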
   1616             HReg rsp = hregAMD64_RSP();
   1617             HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
   1618             AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   1619             AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
   1620             addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
   1621                                              16, vec, m16_rsp));
   1622             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
   1623                                              AMD64RMI_Mem(off_rsp), dst ));
   1624             return dst;
   1625          }
   1626 
   1627          case Iop_V256to64_0: case Iop_V256to64_1:
   1628          case Iop_V256to64_2: case Iop_V256to64_3: {
   1629             HReg vHi, vLo, vec;
   1630             iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
   1631             /* Do the first part of the selection by deciding which of
    1632                the 128-bit registers to look at, and the second part using
   1633                the same scheme as for V128{HI}to64 above. */
   1634             Int off = 0;
   1635             switch (e->Iex.Unop.op) {
   1636                case Iop_V256to64_0: vec = vLo; off = -16; break;
   1637                case Iop_V256to64_1: vec = vLo; off =  -8; break;
   1638                case Iop_V256to64_2: vec = vHi; off = -16; break;
   1639                case Iop_V256to64_3: vec = vHi; off =  -8; break;
   1640                default: vassert(0);
   1641             }
   1642             HReg        dst     = newVRegI(env);
   1643             HReg        rsp     = hregAMD64_RSP();
   1644             AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   1645             AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
   1646             addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
   1647                                              16, vec, m16_rsp));
   1648             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
   1649                                              AMD64RMI_Mem(off_rsp), dst ));
   1650             return dst;
   1651          }
   1652 
   1653          /* ReinterpF64asI64(e) */
   1654          /* Given an IEEE754 double, produce an I64 with the same bit
   1655             pattern. */
   1656          case Iop_ReinterpF64asI64: {
   1657             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   1658             HReg        dst    = newVRegI(env);
   1659             HReg        src    = iselDblExpr(env, e->Iex.Unop.arg);
   1660             /* paranoia */
   1661             set_SSE_rounding_default(env);
   1662             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
   1663             addInstr(env, AMD64Instr_Alu64R(
   1664                              Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
   1665             return dst;
   1666          }
   1667 
   1668          /* ReinterpF32asI32(e) */
   1669          /* Given an IEEE754 single, produce an I64 with the same bit
   1670             pattern in the lower half. */
   1671          case Iop_ReinterpF32asI32: {
   1672             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   1673             HReg        dst    = newVRegI(env);
   1674             HReg        src    = iselFltExpr(env, e->Iex.Unop.arg);
   1675             /* paranoia */
   1676             set_SSE_rounding_default(env);
   1677             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
   1678             addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
   1679             return dst;
   1680          }
   1681 
   1682          case Iop_16to8:
   1683          case Iop_32to8:
   1684          case Iop_64to8:
   1685          case Iop_32to16:
   1686          case Iop_64to16:
   1687          case Iop_64to32:
   1688             /* These are no-ops. */
   1689             return iselIntExpr_R(env, e->Iex.Unop.arg);
   1690 
   1691          case Iop_GetMSBs8x8: {
   1692             /* Note: the following assumes the helper is of
   1693                signature
   1694                   UInt fn ( ULong ), and is not a regparm fn.
   1695             */
   1696             HReg dst = newVRegI(env);
   1697             HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
   1698             fn = (HWord)h_generic_calc_GetMSBs8x8;
   1699             addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
   1700             addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
   1701                                            1, mk_RetLoc_simple(RLPri_Int) ));
   1702             /* MovxLQ is not exactly the right thing here.  We just
   1703                need to get the bottom 8 bits of RAX into dst, and zero
   1704                out everything else.  Assuming that the helper returns
   1705                a UInt with the top 24 bits zeroed out, it'll do,
   1706                though. */
   1707             addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
   1708             return dst;
   1709          }
   1710 
   1711          case Iop_GetMSBs8x16: {
   1712             /* Note: the following assumes the helper is of signature
   1713                   UInt fn ( ULong w64hi, ULong w64Lo ),
   1714                and is not a regparm fn. */
   1715             HReg dst = newVRegI(env);
   1716             HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
   1717             HReg rsp = hregAMD64_RSP();
   1718             fn = (HWord)h_generic_calc_GetMSBs8x16;
   1719             AMD64AMode* m8_rsp  = AMD64AMode_IR( -8, rsp);
   1720             AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   1721             addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
   1722                                              16, vec, m16_rsp));
   1723             /* hi 64 bits into RDI -- the first arg */
   1724             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
   1725                                              AMD64RMI_Mem(m8_rsp),
   1726                                              hregAMD64_RDI() )); /* 1st arg */
   1727             /* lo 64 bits into RSI -- the 2nd arg */
   1728             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
   1729                                              AMD64RMI_Mem(m16_rsp),
   1730                                              hregAMD64_RSI() )); /* 2nd arg */
   1731             addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
   1732                                            2, mk_RetLoc_simple(RLPri_Int) ));
   1733             /* MovxLQ is not exactly the right thing here.  We just
   1734                need to get the bottom 16 bits of RAX into dst, and zero
   1735                out everything else.  Assuming that the helper returns
   1736                a UInt with the top 16 bits zeroed out, it'll do,
   1737                though. */
   1738             addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
   1739             return dst;
   1740          }
   1741 
   1742          default:
   1743             break;
   1744       }
   1745 
   1746       /* Deal with unary 64-bit SIMD ops. */
   1747       switch (e->Iex.Unop.op) {
   1748          case Iop_CmpNEZ32x2:
   1749             fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
   1750          case Iop_CmpNEZ16x4:
   1751             fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
   1752          case Iop_CmpNEZ8x8:
   1753             fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
   1754          default:
   1755             fn = (HWord)0; break;
   1756       }
   1757       if (fn != (HWord)0) {
   1758          /* Note: the following assumes all helpers are of
   1759             signature
   1760                ULong fn ( ULong ), and they are
   1761             not marked as regparm functions.
   1762          */
   1763          HReg dst = newVRegI(env);
   1764          HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
   1765          addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
   1766          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
   1767                                         mk_RetLoc_simple(RLPri_Int) ));
   1768          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
   1769          return dst;
   1770       }
   1771 
   1772       break;
   1773    }
   1774 
   1775    /* --------- GET --------- */
   1776    case Iex_Get: {
   1777       if (ty == Ity_I64) {
   1778          HReg dst = newVRegI(env);
   1779          addInstr(env, AMD64Instr_Alu64R(
   1780                           Aalu_MOV,
   1781                           AMD64RMI_Mem(
   1782                              AMD64AMode_IR(e->Iex.Get.offset,
   1783                                            hregAMD64_RBP())),
   1784                           dst));
   1785          return dst;
   1786       }
   1787       if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
   1788          HReg dst = newVRegI(env);
   1789          addInstr(env, AMD64Instr_LoadEX(
   1790                           toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
   1791                           False,
   1792                           AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
   1793                           dst));
   1794          return dst;
   1795       }
   1796       break;
   1797    }
   1798 
   1799    case Iex_GetI: {
   1800       AMD64AMode* am
   1801          = genGuestArrayOffset(
   1802               env, e->Iex.GetI.descr,
   1803                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   1804       HReg dst = newVRegI(env);
   1805       if (ty == Ity_I8) {
   1806          addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
   1807          return dst;
   1808       }
   1809       if (ty == Ity_I64) {
   1810          addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
   1811          return dst;
   1812       }
   1813       break;
   1814    }
   1815 
   1816    /* --------- CCALL --------- */
   1817    case Iex_CCall: {
   1818       HReg    dst = newVRegI(env);
   1819       vassert(ty == e->Iex.CCall.retty);
   1820 
   1821       /* be very restrictive for now.  Only 64-bit ints allowed for
   1822          args, and 64 or 32 bits for return type. */
   1823       if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
   1824          goto irreducible;
   1825 
   1826       /* Marshal args, do the call. */
   1827       UInt   addToSp = 0;
   1828       RetLoc rloc    = mk_RetLoc_INVALID();
   1829       doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
   1830                     e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
   1831       vassert(is_sane_RetLoc(rloc));
   1832       vassert(rloc.pri == RLPri_Int);
   1833       vassert(addToSp == 0);
   1834 
   1835       /* Move to dst, and zero out the top 32 bits if the result type is
   1836          Ity_I32.  Probably overkill, but still .. */
   1837       if (e->Iex.CCall.retty == Ity_I64)
   1838          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
   1839       else
   1840          addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
   1841 
   1842       return dst;
   1843    }
   1844 
   1845    /* --------- LITERAL --------- */
   1846    /* 64/32/16/8-bit literals */
   1847    case Iex_Const:
   1848       if (ty == Ity_I64) {
   1849          HReg r = newVRegI(env);
   1850          addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
   1851          return r;
   1852       } else {
   1853          AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
   1854          HReg      r   = newVRegI(env);
   1855          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
   1856          return r;
   1857       }
   1858 
   1859    /* --------- MULTIPLEX --------- */
   1860    case Iex_ITE: { // VFD
   1861       if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
   1862           && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
   1863          HReg     r1  = iselIntExpr_R(env, e->Iex.ITE.iftrue);
   1864          HReg     r0  = iselIntExpr_R(env, e->Iex.ITE.iffalse);
   1865          HReg     dst = newVRegI(env);
   1866          addInstr(env, mk_iMOVsd_RR(r1,dst));
   1867          AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
   1868          addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
   1869          return dst;
   1870       }
   1871       break;
   1872    }
   1873 
   1874    /* --------- TERNARY OP --------- */
   1875    case Iex_Triop: {
   1876       IRTriop *triop = e->Iex.Triop.details;
   1877       /* C3210 flags following FPU partial remainder (fprem), both
   1878          IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
   1879       if (triop->op == Iop_PRemC3210F64
   1880           || triop->op == Iop_PRem1C3210F64) {
   1881          AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   1882          HReg        arg1   = iselDblExpr(env, triop->arg2);
   1883          HReg        arg2   = iselDblExpr(env, triop->arg3);
   1884          HReg        dst    = newVRegI(env);
   1885          addInstr(env, AMD64Instr_A87Free(2));
   1886 
   1887          /* one arg -> top of x87 stack */
   1888          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
   1889          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   1890 
   1891          /* other arg -> top of x87 stack */
   1892          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
   1893          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   1894 
   1895          switch (triop->op) {
   1896             case Iop_PRemC3210F64:
   1897                addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
   1898                break;
   1899             case Iop_PRem1C3210F64:
   1900                addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
   1901                break;
   1902             default:
   1903                vassert(0);
   1904          }
   1905          /* Ignore the result, and instead make off with the FPU's
    1906             C3210 flags (in the status word). */
   1907          addInstr(env, AMD64Instr_A87StSW(m8_rsp));
   1908          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
   1909          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
   1910          return dst;
   1911       }
   1912       break;
   1913    }
   1914 
   1915    default:
    1916       break;
   1917    } /* switch (e->tag) */
   1918 
   1919    /* We get here if no pattern matched. */
   1920   irreducible:
   1921    ppIRExpr(e);
   1922    vpanic("iselIntExpr_R(amd64): cannot reduce tree");
   1923 }
   1924 
   1925 
   1926 /*---------------------------------------------------------*/
   1927 /*--- ISEL: Integer expression auxiliaries              ---*/
   1928 /*---------------------------------------------------------*/
   1929 
   1930 /* --------------------- AMODEs --------------------- */
   1931 
   1932 /* Return an AMode which computes the value of the specified
   1933    expression, possibly also adding insns to the code list as a
    1934    result.  The expression may only be a 64-bit one.
   1935 */
   1936 
   1937 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
   1938 {
   1939    AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
   1940    vassert(sane_AMode(am));
   1941    return am;
   1942 }
   1943 
   1944 /* DO NOT CALL THIS DIRECTLY ! */
   1945 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
   1946 {
   1947    MatchInfo mi;
   1948    DECLARE_PATTERN(p_complex);
   1949    IRType ty = typeOfIRExpr(env->type_env,e);
   1950    vassert(ty == Ity_I64);
   1951 
   1952    /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
   1953    /*              bind0        bind1  bind2   bind3   */
   1954    DEFINE_PATTERN(p_complex,
   1955       binop( Iop_Add64,
   1956              binop( Iop_Add64,
   1957                     bind(0),
   1958                     binop(Iop_Shl64, bind(1), bind(2))
   1959                   ),
   1960              bind(3)
   1961            )
   1962    );
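            /* For instance (illustrative names), an expression of the form
               Add64(Add64(rbase, Shl64(rindex, 3)), 40) matches here and
               becomes a single amode with offset 40, base rbase, index rindex
               and shift 3 (scale 8), provided the displacement fits in
               32 bits. */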
   1963    if (matchIRExpr(&mi, p_complex, e)) {
   1964       IRExpr* expr1  = mi.bindee[0];
   1965       IRExpr* expr2  = mi.bindee[1];
   1966       IRExpr* imm8   = mi.bindee[2];
   1967       IRExpr* simm32 = mi.bindee[3];
   1968       if (imm8->tag == Iex_Const
   1969           && imm8->Iex.Const.con->tag == Ico_U8
   1970           && imm8->Iex.Const.con->Ico.U8 < 4
   1971           /* imm8 is OK, now check simm32 */
   1972           && simm32->tag == Iex_Const
   1973           && simm32->Iex.Const.con->tag == Ico_U64
   1974           && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
   1975          UInt shift = imm8->Iex.Const.con->Ico.U8;
   1976          UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
   1977          HReg r1 = iselIntExpr_R(env, expr1);
   1978          HReg r2 = iselIntExpr_R(env, expr2);
   1979          vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
   1980          return AMD64AMode_IRRS(offset, r1, r2, shift);
   1981       }
   1982    }
   1983 
   1984    /* Add64(expr1, Shl64(expr2, imm)) */
   1985    if (e->tag == Iex_Binop
   1986        && e->Iex.Binop.op == Iop_Add64
   1987        && e->Iex.Binop.arg2->tag == Iex_Binop
   1988        && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
   1989        && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
   1990        && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
   1991       UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
   1992       if (shift == 1 || shift == 2 || shift == 3) {
   1993          HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1994          HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
   1995          return AMD64AMode_IRRS(0, r1, r2, shift);
   1996       }
   1997    }
   1998 
   1999    /* Add64(expr,i) */
   2000    if (e->tag == Iex_Binop
   2001        && e->Iex.Binop.op == Iop_Add64
   2002        && e->Iex.Binop.arg2->tag == Iex_Const
   2003        && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
   2004        && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
   2005       HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2006       return AMD64AMode_IR(
   2007                 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
   2008                 r1
   2009              );
   2010    }
   2011 
   2012    /* Doesn't match anything in particular.  Generate it into
   2013       a register and use that. */
   2014    {
   2015       HReg r1 = iselIntExpr_R(env, e);
   2016       return AMD64AMode_IR(0, r1);
   2017    }
   2018 }
   2019 
   2020 
   2021 /* --------------------- RMIs --------------------- */
   2022 
    2023 /* Similarly, calculate an expression into an AMD64RMI operand.  As with
    2024    iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits.  */
   2025 
   2026 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
   2027 {
   2028    AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
   2029    /* sanity checks ... */
   2030    switch (rmi->tag) {
   2031       case Armi_Imm:
   2032          return rmi;
   2033       case Armi_Reg:
   2034          vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
   2035          vassert(hregIsVirtual(rmi->Armi.Reg.reg));
   2036          return rmi;
   2037       case Armi_Mem:
   2038          vassert(sane_AMode(rmi->Armi.Mem.am));
   2039          return rmi;
   2040       default:
   2041          vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
   2042    }
   2043 }
   2044 
   2045 /* DO NOT CALL THIS DIRECTLY ! */
   2046 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
   2047 {
   2048    IRType ty = typeOfIRExpr(env->type_env,e);
   2049    vassert(ty == Ity_I64 || ty == Ity_I32
   2050            || ty == Ity_I16 || ty == Ity_I8);
   2051 
   2052    /* special case: immediate 64/32/16/8 */
   2053    if (e->tag == Iex_Const) {
   2054       switch (e->Iex.Const.con->tag) {
    2055          case Ico_U64:
    2056             if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
    2057                return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
    2058             }
    2059             break;
   2060          case Ico_U32:
    2061             return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32);
   2062          case Ico_U16:
    2063             return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
   2064          case Ico_U8:
    2065             return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
   2066          default:
   2067             vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
   2068       }
   2069    }
   2070 
   2071    /* special case: 64-bit GET */
   2072    if (e->tag == Iex_Get && ty == Ity_I64) {
   2073       return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
   2074                                         hregAMD64_RBP()));
   2075    }
   2076 
   2077    /* special case: 64-bit load from memory */
   2078    if (e->tag == Iex_Load && ty == Ity_I64
   2079        && e->Iex.Load.end == Iend_LE) {
   2080       AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2081       return AMD64RMI_Mem(am);
   2082    }
   2083 
   2084    /* default case: calculate into a register and return that */
   2085    {
   2086       HReg r = iselIntExpr_R ( env, e );
   2087       return AMD64RMI_Reg(r);
   2088    }
   2089 }
   2090 
   2091 
   2092 /* --------------------- RIs --------------------- */
   2093 
   2094 /* Calculate an expression into an AMD64RI operand.  As with
   2095    iselIntExpr_R, the expression can have type 64, 32, 16 or 8
   2096    bits. */
   2097 
   2098 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
   2099 {
   2100    AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
   2101    /* sanity checks ... */
   2102    switch (ri->tag) {
   2103       case Ari_Imm:
   2104          return ri;
   2105       case Ari_Reg:
   2106          vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
   2107          vassert(hregIsVirtual(ri->Ari.Reg.reg));
   2108          return ri;
   2109       default:
   2110          vpanic("iselIntExpr_RI: unknown amd64 RI tag");
   2111    }
   2112 }
   2113 
   2114 /* DO NOT CALL THIS DIRECTLY ! */
   2115 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
   2116 {
   2117    IRType ty = typeOfIRExpr(env->type_env,e);
   2118    vassert(ty == Ity_I64 || ty == Ity_I32
   2119            || ty == Ity_I16 || ty == Ity_I8);
   2120 
   2121    /* special case: immediate */
   2122    if (e->tag == Iex_Const) {
   2123       switch (e->Iex.Const.con->tag) {
    2124          case Ico_U64:
    2125             if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
    2126                return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
    2127             }
    2128             break;
   2129          case Ico_U32:
   2130             return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
   2131          case Ico_U16:
   2132             return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
   2133          case Ico_U8:
   2134             return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
   2135          default:
    2136             vpanic("iselIntExpr_RI.Iex_Const(amd64)");
   2137       }
   2138    }
   2139 
   2140    /* default case: calculate into a register and return that */
   2141    {
   2142       HReg r = iselIntExpr_R ( env, e );
   2143       return AMD64RI_Reg(r);
   2144    }
   2145 }
   2146 
   2147 
   2148 /* --------------------- RMs --------------------- */
   2149 
   2150 /* Similarly, calculate an expression into an AMD64RM operand.  As
   2151    with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
   2152    bits.  */
   2153 
   2154 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
   2155 {
   2156    AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
   2157    /* sanity checks ... */
   2158    switch (rm->tag) {
   2159       case Arm_Reg:
   2160          vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
   2161          vassert(hregIsVirtual(rm->Arm.Reg.reg));
   2162          return rm;
   2163       case Arm_Mem:
   2164          vassert(sane_AMode(rm->Arm.Mem.am));
   2165          return rm;
   2166       default:
   2167          vpanic("iselIntExpr_RM: unknown amd64 RM tag");
   2168    }
   2169 }
   2170 
   2171 /* DO NOT CALL THIS DIRECTLY ! */
   2172 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
   2173 {
   2174    IRType ty = typeOfIRExpr(env->type_env,e);
   2175    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
   2176 
   2177    /* special case: 64-bit GET */
   2178    if (e->tag == Iex_Get && ty == Ity_I64) {
   2179       return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
   2180                                        hregAMD64_RBP()));
   2181    }
   2182 
   2183    /* special case: load from memory */
   2184 
   2185    /* default case: calculate into a register and return that */
   2186    {
   2187       HReg r = iselIntExpr_R ( env, e );
   2188       return AMD64RM_Reg(r);
   2189    }
   2190 }
   2191 
   2192 
   2193 /* --------------------- CONDCODE --------------------- */
   2194 
    2195 /* Generate code to evaluate a bit-typed expression, returning the
    2196    condition code that would be set if the expression had notionally
    2197    returned 1. */
   2198 
   2199 static AMD64CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
   2200 {
   2201    /* Uh, there's nothing we can sanity check here, unfortunately. */
   2202    return iselCondCode_wrk(env,e);
   2203 }
   2204 
   2205 /* DO NOT CALL THIS DIRECTLY ! */
   2206 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
   2207 {
   2208    MatchInfo mi;
   2209 
   2210    vassert(e);
   2211    vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
   2212 
   2213    /* var */
   2214    if (e->tag == Iex_RdTmp) {
   2215       HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2216       HReg dst = newVRegI(env);
   2217       addInstr(env, mk_iMOVsd_RR(r64,dst));
   2218       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
   2219       return Acc_NZ;
   2220    }
   2221 
   2222    /* Constant 1:Bit */
   2223    if (e->tag == Iex_Const) {
   2224       HReg r;
   2225       vassert(e->Iex.Const.con->tag == Ico_U1);
   2226       vassert(e->Iex.Const.con->Ico.U1 == True
   2227               || e->Iex.Const.con->Ico.U1 == False);
   2228       r = newVRegI(env);
   2229       addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
   2230       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
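               /* XORing r with itself guarantees ZF is set, so Acc_Z acts as
                  an always-true condition and Acc_NZ as an always-false one. */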
   2231       return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
   2232    }
   2233 
   2234    /* Not1(...) */
   2235    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
   2236       /* Generate code for the arg, and negate the test condition */
   2237       return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
   2238    }
   2239 
   2240    /* --- patterns rooted at: 64to1 --- */
   2241 
   2242    /* 64to1 */
   2243    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
   2244       HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
   2245       addInstr(env, AMD64Instr_Test64(1,reg));
   2246       return Acc_NZ;
   2247    }
   2248 
   2249    /* --- patterns rooted at: 32to1 --- */
   2250 
   2251    /* 32to1 */
   2252    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
   2253       HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
   2254       addInstr(env, AMD64Instr_Test64(1,reg));
   2255       return Acc_NZ;
   2256    }
   2257 
   2258    /* --- patterns rooted at: CmpNEZ8 --- */
   2259 
   2260    /* CmpNEZ8(x) */
   2261    if (e->tag == Iex_Unop
   2262        && e->Iex.Unop.op == Iop_CmpNEZ8) {
   2263       HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
   2264       addInstr(env, AMD64Instr_Test64(0xFF,r));
   2265       return Acc_NZ;
   2266    }
   2267 
   2268    /* --- patterns rooted at: CmpNEZ16 --- */
   2269 
   2270    /* CmpNEZ16(x) */
   2271    if (e->tag == Iex_Unop
   2272        && e->Iex.Unop.op == Iop_CmpNEZ16) {
   2273       HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
   2274       addInstr(env, AMD64Instr_Test64(0xFFFF,r));
   2275       return Acc_NZ;
   2276    }
   2277 
   2278    /* --- patterns rooted at: CmpNEZ32 --- */
   2279 
   2280    /* CmpNEZ32(x) */
   2281    if (e->tag == Iex_Unop
   2282        && e->Iex.Unop.op == Iop_CmpNEZ32) {
   2283       HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
   2284       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
   2285       addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
   2286       return Acc_NZ;
   2287    }
   2288 
   2289    /* --- patterns rooted at: CmpNEZ64 --- */
   2290 
   2291    /* CmpNEZ64(Or64(x,y)) */
   2292    {
   2293       DECLARE_PATTERN(p_CmpNEZ64_Or64);
   2294       DEFINE_PATTERN(p_CmpNEZ64_Or64,
   2295                      unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
   2296       if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
   2297          HReg      r0   = iselIntExpr_R(env, mi.bindee[0]);
   2298          AMD64RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
   2299          HReg      tmp  = newVRegI(env);
   2300          addInstr(env, mk_iMOVsd_RR(r0, tmp));
   2301          addInstr(env, AMD64Instr_Alu64R(Aalu_OR,rmi1,tmp));
   2302          return Acc_NZ;
   2303       }
   2304    }
   2305 
   2306    /* CmpNEZ64(x) */
   2307    if (e->tag == Iex_Unop
   2308        && e->Iex.Unop.op == Iop_CmpNEZ64) {
   2309       HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
   2310       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
   2311       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
   2312       return Acc_NZ;
   2313    }
   2314 
   2315    /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
   2316 
   2317    /* CmpEQ8 / CmpNE8 */
   2318    if (e->tag == Iex_Binop
   2319        && (e->Iex.Binop.op == Iop_CmpEQ8
   2320            || e->Iex.Binop.op == Iop_CmpNE8
   2321            || e->Iex.Binop.op == Iop_CasCmpEQ8
   2322            || e->Iex.Binop.op == Iop_CasCmpNE8)) {
   2323       if (isZeroU8(e->Iex.Binop.arg2)) {
   2324          HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2325          addInstr(env, AMD64Instr_Test64(0xFF,r1));
   2326          switch (e->Iex.Binop.op) {
   2327             case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
   2328             case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
   2329             default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
   2330          }
   2331       } else {
   2332          HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2333          AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2334          HReg      r    = newVRegI(env);
   2335          addInstr(env, mk_iMOVsd_RR(r1,r));
   2336          addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
   2337          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
   2338          switch (e->Iex.Binop.op) {
   2339             case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
   2340             case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
   2341             default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
   2342          }
   2343       }
   2344    }
   2345 
   2346    /* CmpEQ16 / CmpNE16 */
   2347    if (e->tag == Iex_Binop
   2348        && (e->Iex.Binop.op == Iop_CmpEQ16
   2349            || e->Iex.Binop.op == Iop_CmpNE16
   2350            || e->Iex.Binop.op == Iop_CasCmpEQ16
   2351            || e->Iex.Binop.op == Iop_CasCmpNE16)) {
   2352       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2353       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2354       HReg      r    = newVRegI(env);
   2355       addInstr(env, mk_iMOVsd_RR(r1,r));
   2356       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
   2357       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
   2358       switch (e->Iex.Binop.op) {
   2359          case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
   2360          case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
   2361          default: vpanic("iselCondCode(amd64): CmpXX16");
   2362       }
   2363    }
   2364 
   2365    /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
   2366       Saves a "movq %rax, %tmp" compared to the default route. */
   2367    if (e->tag == Iex_Binop
   2368        && e->Iex.Binop.op == Iop_CmpNE64
   2369        && e->Iex.Binop.arg1->tag == Iex_CCall
   2370        && e->Iex.Binop.arg2->tag == Iex_Const) {
   2371       IRExpr* cal = e->Iex.Binop.arg1;
   2372       IRExpr* con = e->Iex.Binop.arg2;
   2373       HReg    tmp = newVRegI(env);
   2374       /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
   2375       vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
   2376       vassert(con->Iex.Const.con->tag == Ico_U64);
   2377       /* Marshal args, do the call. */
   2378       UInt   addToSp = 0;
   2379       RetLoc rloc    = mk_RetLoc_INVALID();
   2380       doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
   2381                     cal->Iex.CCall.cee,
   2382                     cal->Iex.CCall.retty, cal->Iex.CCall.args );
   2383       vassert(is_sane_RetLoc(rloc));
   2384       vassert(rloc.pri == RLPri_Int);
   2385       vassert(addToSp == 0);
   2386       /* */
   2387       addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
   2388       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
   2389                                       AMD64RMI_Reg(hregAMD64_RAX()), tmp));
   2390       return Acc_NZ;
   2391    }
   2392 
   2393    /* Cmp*64*(x,y) */
   2394    if (e->tag == Iex_Binop
   2395        && (e->Iex.Binop.op == Iop_CmpEQ64
   2396            || e->Iex.Binop.op == Iop_CmpNE64
   2397            || e->Iex.Binop.op == Iop_CmpLT64S
   2398            || e->Iex.Binop.op == Iop_CmpLT64U
   2399            || e->Iex.Binop.op == Iop_CmpLE64S
   2400            || e->Iex.Binop.op == Iop_CmpLE64U
   2401            || e->Iex.Binop.op == Iop_CasCmpEQ64
   2402            || e->Iex.Binop.op == Iop_CasCmpNE64
   2403            || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
   2404       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2405       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2406       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
   2407       switch (e->Iex.Binop.op) {
   2408          case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
   2409          case Iop_CmpNE64:
   2410          case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
    2411          case Iop_CmpLT64S: return Acc_L;
    2412          case Iop_CmpLT64U: return Acc_B;
    2413          case Iop_CmpLE64S: return Acc_LE;
   2414          case Iop_CmpLE64U: return Acc_BE;
   2415          default: vpanic("iselCondCode(amd64): CmpXX64");
   2416       }
   2417    }
   2418 
   2419    /* Cmp*32*(x,y) */
   2420    if (e->tag == Iex_Binop
   2421        && (e->Iex.Binop.op == Iop_CmpEQ32
   2422            || e->Iex.Binop.op == Iop_CmpNE32
   2423            || e->Iex.Binop.op == Iop_CmpLT32S
   2424            || e->Iex.Binop.op == Iop_CmpLT32U
   2425            || e->Iex.Binop.op == Iop_CmpLE32S
   2426            || e->Iex.Binop.op == Iop_CmpLE32U
   2427            || e->Iex.Binop.op == Iop_CasCmpEQ32
   2428            || e->Iex.Binop.op == Iop_CasCmpNE32
   2429            || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
   2430       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2431       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2432       addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
   2433       switch (e->Iex.Binop.op) {
   2434          case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
   2435          case Iop_CmpNE32:
   2436          case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
    2437          case Iop_CmpLT32S: return Acc_L;
    2438          case Iop_CmpLT32U: return Acc_B;
    2439          case Iop_CmpLE32S: return Acc_LE;
   2440          case Iop_CmpLE32U: return Acc_BE;
   2441          default: vpanic("iselCondCode(amd64): CmpXX32");
   2442       }
   2443    }
   2444 
   2445    ppIRExpr(e);
   2446    vpanic("iselCondCode(amd64)");
   2447 }
   2448 
   2449 
   2450 /*---------------------------------------------------------*/
   2451 /*--- ISEL: Integer expressions (128 bit)               ---*/
   2452 /*---------------------------------------------------------*/
   2453 
   2454 /* Compute a 128-bit value into a register pair, which is returned as
   2455    the first two parameters.  As with iselIntExpr_R, these may be
   2456    either real or virtual regs; in any case they must not be changed
   2457    by subsequent code emitted by the caller.  */
   2458 
   2459 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
   2460                              ISelEnv* env, IRExpr* e )
   2461 {
   2462    iselInt128Expr_wrk(rHi, rLo, env, e);
   2463 #  if 0
   2464    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2465 #  endif
   2466    vassert(hregClass(*rHi) == HRcInt64);
   2467    vassert(hregIsVirtual(*rHi));
   2468    vassert(hregClass(*rLo) == HRcInt64);
   2469    vassert(hregIsVirtual(*rLo));
   2470 }
   2471 
   2472 /* DO NOT CALL THIS DIRECTLY ! */
   2473 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
   2474                                  ISelEnv* env, IRExpr* e )
   2475 {
   2476    vassert(e);
   2477    vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
   2478 
   2479    /* read 128-bit IRTemp */
   2480    if (e->tag == Iex_RdTmp) {
   2481       lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
   2482       return;
   2483    }
   2484 
   2485    /* --------- BINARY ops --------- */
   2486    if (e->tag == Iex_Binop) {
   2487       switch (e->Iex.Binop.op) {
   2488          /* 64 x 64 -> 128 multiply */
   2489          case Iop_MullU64:
   2490          case Iop_MullS64: {
    2491             /* Get one operand into %rax and the other into an R/M;
    2492                we have to make an educated guess about which operand
    2493                is better placed where. */
   2494             HReg     tLo    = newVRegI(env);
   2495             HReg     tHi    = newVRegI(env);
   2496             Bool     syned  = toBool(e->Iex.Binop.op == Iop_MullS64);
   2497             AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
   2498             HReg     rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2499             addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
   2500             addInstr(env, AMD64Instr_MulL(syned, rmLeft));
   2501             /* Result is now in RDX:RAX.  Tell the caller. */
   2502             addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
   2503             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
   2504             *rHi = tHi;
   2505             *rLo = tLo;
   2506             return;
   2507          }
   2508 
   2509          /* 128 x 64 -> (64(rem),64(div)) division */
   2510          case Iop_DivModU128to64:
   2511          case Iop_DivModS128to64: {
   2512             /* Get the 128-bit operand into rdx:rax, and the other into
   2513                any old R/M. */
   2514             HReg sHi, sLo;
   2515             HReg     tLo     = newVRegI(env);
   2516             HReg     tHi     = newVRegI(env);
   2517             Bool     syned   = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
   2518             AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
   2519             iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
   2520             addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
   2521             addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
   2522             addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
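                     /* The 8-byte DIV/IDIV divides %rdx:%rax by the R/M
                        operand, leaving the quotient in %rax and the
                        remainder in %rdx; these become the returned
                        (hi,lo) = (remainder,quotient) pair. */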
   2523             addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
   2524             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
   2525             *rHi = tHi;
   2526             *rLo = tLo;
   2527             return;
   2528          }
   2529 
   2530          /* 64HLto128(e1,e2) */
   2531          case Iop_64HLto128:
   2532             *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2533             *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2534             return;
   2535 
   2536          default:
   2537             break;
   2538       }
   2539    } /* if (e->tag == Iex_Binop) */
   2540 
   2541    ppIRExpr(e);
   2542    vpanic("iselInt128Expr");
   2543 }
   2544 
   2545 
   2546 /*---------------------------------------------------------*/
   2547 /*--- ISEL: Floating point expressions (32 bit)         ---*/
   2548 /*---------------------------------------------------------*/
   2549 
   2550 /* Nothing interesting here; really just wrappers for
   2551    64-bit stuff. */
   2552 
   2553 static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
   2554 {
   2555    HReg r = iselFltExpr_wrk( env, e );
   2556 #  if 0
   2557    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2558 #  endif
   2559    vassert(hregClass(r) == HRcVec128);
   2560    vassert(hregIsVirtual(r));
   2561    return r;
   2562 }
   2563 
   2564 /* DO NOT CALL THIS DIRECTLY */
   2565 static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
   2566 {
   2567    IRType ty = typeOfIRExpr(env->type_env,e);
   2568    vassert(ty == Ity_F32);
   2569 
   2570    if (e->tag == Iex_RdTmp) {
   2571       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2572    }
   2573 
   2574    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2575       AMD64AMode* am;
   2576       HReg res = newVRegV(env);
   2577       vassert(e->Iex.Load.ty == Ity_F32);
   2578       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2579       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
   2580       return res;
   2581    }
   2582 
   2583    if (e->tag == Iex_Binop
   2584        && e->Iex.Binop.op == Iop_F64toF32) {
   2585       /* Although the result is still held in a standard SSE register,
   2586          we need to round it to reflect the loss of accuracy/range
   2587          entailed in casting it to a 32-bit float. */
   2588       HReg dst = newVRegV(env);
   2589       HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
   2590       set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
   2591       addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
   2592       set_SSE_rounding_default( env );
   2593       return dst;
   2594    }
   2595 
   2596    if (e->tag == Iex_Get) {
   2597       AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
   2598                                        hregAMD64_RBP() );
   2599       HReg res = newVRegV(env);
   2600       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
   2601       return res;
   2602    }
   2603 
   2604    if (e->tag == Iex_Unop
   2605        && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
   2606        /* Given an I32, produce an IEEE754 float with the same bit
   2607           pattern. */
   2608        HReg        dst    = newVRegV(env);
   2609        HReg        src    = iselIntExpr_R(env, e->Iex.Unop.arg);
   2610        AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
   2611        addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
   2612        addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
   2613        return dst;
   2614    }
   2615 
   2616    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
   2617       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   2618       HReg        arg    = iselFltExpr(env, e->Iex.Binop.arg2);
   2619       HReg        dst    = newVRegV(env);
   2620 
    2621       /* arg now holds the value to be rounded.  The first thing to do
    2622          is set the FPU's rounding mode accordingly. */
   2623 
   2624       /* Set host x87 rounding mode */
   2625       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2626 
   2627       addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
   2628       addInstr(env, AMD64Instr_A87Free(1));
   2629       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
   2630       addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
   2631       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
   2632       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
   2633 
   2634       /* Restore default x87 rounding. */
   2635       set_FPU_rounding_default( env );
   2636 
   2637       return dst;
   2638    }
   2639 
   2640    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
   2641       /* Sigh ... very rough code.  Could do much better. */
   2642       /* Get the 128-bit literal 00---0 10---0 into a register
   2643          and xor it with the value to be negated. */
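               /* What the two pushes build at 0(%rsp), viewed as four
                  32-bit lanes (low lane first, little-endian):
                     { 0x80000000, 0, 0, 0 }
                  i.e. only the sign bit of the low F32 lane is set, so the
                  XOR below flips just that bit and passes the upper lanes
                  of tmp through unchanged. */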
   2644       HReg r1  = newVRegI(env);
   2645       HReg dst = newVRegV(env);
   2646       HReg tmp = newVRegV(env);
   2647       HReg src = iselFltExpr(env, e->Iex.Unop.arg);
   2648       AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   2649       addInstr(env, mk_vMOVsd_RR(src,tmp));
   2650       addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
   2651       addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
   2652       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
   2653       addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
   2654       addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
   2655       add_to_rsp(env, 16);
   2656       return dst;
   2657    }
   2658 
   2659    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
   2660       IRQop *qop = e->Iex.Qop.details;
   2661       HReg dst  = newVRegV(env);
   2662       HReg argX = iselFltExpr(env, qop->arg2);
   2663       HReg argY = iselFltExpr(env, qop->arg3);
   2664       HReg argZ = iselFltExpr(env, qop->arg4);
   2665       /* XXXROUNDINGFIXME */
   2666       /* set roundingmode here */
    2667       /* subq $16, %rsp         -- make a space */
   2668       sub_from_rsp(env, 16);
   2669       /* Prepare 4 arg regs:
   2670          leaq 0(%rsp), %rdi
   2671          leaq 4(%rsp), %rsi
   2672          leaq 8(%rsp), %rdx
   2673          leaq 12(%rsp), %rcx
   2674       */
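               /* The four pointers follow the SysV integer-argument order
                  (rdi, rsi, rdx, rcx).  Presumably h_generic_calc_MAddF32
                  takes (result*, x*, y*, z*) in that order and writes
                  through the first pointer, which is why the reload below
                  comes from 0(%rsp), the slot %rdi points at. */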
   2675       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
   2676                                      hregAMD64_RDI()));
   2677       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
   2678                                      hregAMD64_RSI()));
   2679       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
   2680                                      hregAMD64_RDX()));
   2681       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
   2682                                      hregAMD64_RCX()));
   2683       /* Store the three args, at (%rsi), (%rdx) and (%rcx):
   2684          movss  %argX, 0(%rsi)
   2685          movss  %argY, 0(%rdx)
   2686          movss  %argZ, 0(%rcx)
   2687          */
   2688       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
   2689                                        AMD64AMode_IR(0, hregAMD64_RSI())));
   2690       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
   2691                                        AMD64AMode_IR(0, hregAMD64_RDX())));
   2692       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
   2693                                        AMD64AMode_IR(0, hregAMD64_RCX())));
   2694       /* call the helper */
   2695       addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
   2696                                      (ULong)(HWord)h_generic_calc_MAddF32,
   2697                                      4, mk_RetLoc_simple(RLPri_None) ));
    2698       /* fetch the result from 0(%rsp); %rsp is unchanged across
    2699          the call, so it still points at the result slot here. */
   2700       addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
   2701                                        AMD64AMode_IR(0, hregAMD64_RSP())));
   2702       /* and finally, clear the space */
   2703       add_to_rsp(env, 16);
   2704       return dst;
   2705    }
   2706 
   2707    ppIRExpr(e);
   2708    vpanic("iselFltExpr_wrk");
   2709 }
   2710 
   2711 
   2712 /*---------------------------------------------------------*/
   2713 /*--- ISEL: Floating point expressions (64 bit)         ---*/
   2714 /*---------------------------------------------------------*/
   2715 
   2716 /* Compute a 64-bit floating point value into the lower half of an xmm
   2717    register, the identity of which is returned.  As with
   2718    iselIntExpr_R, the returned reg will be virtual, and it must not be
   2719    changed by subsequent code emitted by the caller.
   2720 */
   2721 
   2722 /* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
   2723 
   2724     Type                  S (1 bit)   E (11 bits)   F (52 bits)
   2725     ----                  ---------   -----------   -----------
   2726     signalling NaN        u           2047 (max)    .0uuuuu---u
   2727                                                     (with at least
   2728                                                      one 1 bit)
   2729     quiet NaN             u           2047 (max)    .1uuuuu---u
   2730 
   2731     negative infinity     1           2047 (max)    .000000---0
   2732 
   2733     positive infinity     0           2047 (max)    .000000---0
   2734 
   2735     negative zero         1           0             .000000---0
   2736 
   2737     positive zero         0           0             .000000---0
   2738 */
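         /* In particular the sign is the most significant bit, so an F64
            can be negated by flipping bit 63 and its absolute value taken
            by clearing bit 63 -- which is what the NegF64/AbsF64 cases
            below do with a 1ULL<<63 mask. */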
   2739 
   2740 static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
   2741 {
   2742    HReg r = iselDblExpr_wrk( env, e );
   2743 #  if 0
   2744    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2745 #  endif
   2746    vassert(hregClass(r) == HRcVec128);
   2747    vassert(hregIsVirtual(r));
   2748    return r;
   2749 }
   2750 
   2751 /* DO NOT CALL THIS DIRECTLY */
   2752 static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
   2753 {
   2754    IRType ty = typeOfIRExpr(env->type_env,e);
   2755    vassert(e);
   2756    vassert(ty == Ity_F64);
   2757 
   2758    if (e->tag == Iex_RdTmp) {
   2759       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2760    }
   2761 
   2762    if (e->tag == Iex_Const) {
   2763       union { ULong u64; Double f64; } u;
   2764       HReg res = newVRegV(env);
   2765       HReg tmp = newVRegI(env);
   2766       vassert(sizeof(u) == 8);
   2767       vassert(sizeof(u.u64) == 8);
   2768       vassert(sizeof(u.f64) == 8);
   2769 
   2770       if (e->Iex.Const.con->tag == Ico_F64) {
   2771          u.f64 = e->Iex.Const.con->Ico.F64;
   2772       }
   2773       else if (e->Iex.Const.con->tag == Ico_F64i) {
   2774          u.u64 = e->Iex.Const.con->Ico.F64i;
   2775       }
   2776       else
   2777          vpanic("iselDblExpr(amd64): const");
   2778 
   2779       addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
   2780       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
   2781       addInstr(env, AMD64Instr_SseLdSt(
   2782                        True/*load*/, 8, res,
   2783                        AMD64AMode_IR(0, hregAMD64_RSP())
   2784               ));
   2785       add_to_rsp(env, 8);
   2786       return res;
   2787    }
   2788 
   2789    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2790       AMD64AMode* am;
   2791       HReg res = newVRegV(env);
   2792       vassert(e->Iex.Load.ty == Ity_F64);
   2793       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2794       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
   2795       return res;
   2796    }
   2797 
   2798    if (e->tag == Iex_Get) {
   2799       AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
   2800                                       hregAMD64_RBP() );
   2801       HReg res = newVRegV(env);
   2802       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
   2803       return res;
   2804    }
   2805 
   2806    if (e->tag == Iex_GetI) {
   2807       AMD64AMode* am
   2808          = genGuestArrayOffset(
   2809               env, e->Iex.GetI.descr,
   2810                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   2811       HReg res = newVRegV(env);
   2812       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
   2813       return res;
   2814    }
   2815 
   2816    if (e->tag == Iex_Triop) {
   2817       IRTriop *triop = e->Iex.Triop.details;
   2818       AMD64SseOp op = Asse_INVALID;
   2819       switch (triop->op) {
   2820          case Iop_AddF64: op = Asse_ADDF; break;
   2821          case Iop_SubF64: op = Asse_SUBF; break;
   2822          case Iop_MulF64: op = Asse_MULF; break;
   2823          case Iop_DivF64: op = Asse_DIVF; break;
   2824          default: break;
   2825       }
   2826       if (op != Asse_INVALID) {
   2827          HReg dst  = newVRegV(env);
   2828          HReg argL = iselDblExpr(env, triop->arg2);
   2829          HReg argR = iselDblExpr(env, triop->arg3);
   2830          addInstr(env, mk_vMOVsd_RR(argL, dst));
   2831          /* XXXROUNDINGFIXME */
   2832          /* set roundingmode here */
   2833          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
   2834          return dst;
   2835       }
   2836    }
   2837 
   2838    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
   2839       IRQop *qop = e->Iex.Qop.details;
   2840       HReg dst  = newVRegV(env);
   2841       HReg argX = iselDblExpr(env, qop->arg2);
   2842       HReg argY = iselDblExpr(env, qop->arg3);
   2843       HReg argZ = iselDblExpr(env, qop->arg4);
   2844       /* XXXROUNDINGFIXME */
   2845       /* set roundingmode here */
    2846       /* subq $32, %rsp         -- make a space */
   2847       sub_from_rsp(env, 32);
   2848       /* Prepare 4 arg regs:
   2849          leaq 0(%rsp), %rdi
   2850          leaq 8(%rsp), %rsi
   2851          leaq 16(%rsp), %rdx
   2852          leaq 24(%rsp), %rcx
   2853       */
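               /* Same pointer-passing convention as the MAddF32 case
                  above: presumably (result*, x*, y*, z*) in rdi..rcx. */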
   2854       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
   2855                                      hregAMD64_RDI()));
   2856       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
   2857                                      hregAMD64_RSI()));
   2858       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
   2859                                      hregAMD64_RDX()));
   2860       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
   2861                                      hregAMD64_RCX()));
   2862       /* Store the three args, at (%rsi), (%rdx) and (%rcx):
   2863          movsd  %argX, 0(%rsi)
   2864          movsd  %argY, 0(%rdx)
   2865          movsd  %argZ, 0(%rcx)
   2866          */
   2867       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
   2868                                        AMD64AMode_IR(0, hregAMD64_RSI())));
   2869       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
   2870                                        AMD64AMode_IR(0, hregAMD64_RDX())));
   2871       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
   2872                                        AMD64AMode_IR(0, hregAMD64_RCX())));
   2873       /* call the helper */
   2874       addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
   2875                                      (ULong)(HWord)h_generic_calc_MAddF64,
   2876                                      4, mk_RetLoc_simple(RLPri_None) ));
    2877       /* fetch the result from 0(%rsp); %rsp is unchanged across
    2878          the call, so it still points at the result slot here. */
   2879       addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
   2880                                        AMD64AMode_IR(0, hregAMD64_RSP())));
   2881       /* and finally, clear the space */
   2882       add_to_rsp(env, 32);
   2883       return dst;
   2884    }
   2885 
   2886    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
   2887       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   2888       HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
   2889       HReg        dst    = newVRegV(env);
   2890 
    2891       /* arg now holds the value to be rounded.  The first thing to do
    2892          is set the FPU's rounding mode accordingly. */
   2893 
   2894       /* Set host x87 rounding mode */
   2895       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2896 
   2897       addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
   2898       addInstr(env, AMD64Instr_A87Free(1));
   2899       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   2900       addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
   2901       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   2902       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   2903 
   2904       /* Restore default x87 rounding. */
   2905       set_FPU_rounding_default( env );
   2906 
   2907       return dst;
   2908    }
   2909 
   2910    IRTriop *triop = e->Iex.Triop.details;
   2911    if (e->tag == Iex_Triop
   2912        && (triop->op == Iop_ScaleF64
   2913            || triop->op == Iop_AtanF64
   2914            || triop->op == Iop_Yl2xF64
   2915            || triop->op == Iop_Yl2xp1F64
   2916            || triop->op == Iop_PRemF64
   2917            || triop->op == Iop_PRem1F64)
   2918       ) {
   2919       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   2920       HReg        arg1   = iselDblExpr(env, triop->arg2);
   2921       HReg        arg2   = iselDblExpr(env, triop->arg3);
   2922       HReg        dst    = newVRegV(env);
   2923       Bool     arg2first = toBool(triop->op == Iop_ScaleF64
   2924                                   || triop->op == Iop_PRemF64
   2925                                   || triop->op == Iop_PRem1F64);
   2926       addInstr(env, AMD64Instr_A87Free(2));
   2927 
   2928       /* one arg -> top of x87 stack */
   2929       addInstr(env, AMD64Instr_SseLdSt(
   2930                        False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
   2931       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   2932 
   2933       /* other arg -> top of x87 stack */
   2934       addInstr(env, AMD64Instr_SseLdSt(
   2935                        False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
   2936       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   2937 
   2938       /* do it */
   2939       /* XXXROUNDINGFIXME */
   2940       /* set roundingmode here */
   2941       switch (triop->op) {
   2942          case Iop_ScaleF64:
   2943             addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
   2944             break;
   2945          case Iop_AtanF64:
   2946             addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
   2947             break;
   2948          case Iop_Yl2xF64:
   2949             addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
   2950             break;
   2951          case Iop_Yl2xp1F64:
   2952             addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
   2953             break;
   2954          case Iop_PRemF64:
   2955             addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
   2956             break;
   2957          case Iop_PRem1F64:
   2958             addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
   2959             break;
   2960          default:
   2961             vassert(0);
   2962       }
   2963 
   2964       /* save result */
   2965       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   2966       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   2967       return dst;
   2968    }
   2969 
   2970    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
   2971       HReg dst = newVRegV(env);
   2972       HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2973       set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
   2974       addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
   2975       set_SSE_rounding_default( env );
   2976       return dst;
   2977    }
   2978 
   2979    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
   2980       HReg dst = newVRegV(env);
   2981       HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   2982       set_SSE_rounding_default( env );
   2983       addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
   2984       return dst;
   2985    }
   2986 
   2987    if (e->tag == Iex_Unop
   2988        && (e->Iex.Unop.op == Iop_NegF64
   2989            || e->Iex.Unop.op == Iop_AbsF64)) {
   2990       /* Sigh ... very rough code.  Could do much better. */
    2991       /* Get the 128-bit literal 00---0 10---0 into a register
    2992          and xor (for NegF64) or andn (for AbsF64) it with the value. */
   2993       HReg r1  = newVRegI(env);
   2994       HReg dst = newVRegV(env);
   2995       HReg tmp = newVRegV(env);
   2996       HReg src = iselDblExpr(env, e->Iex.Unop.arg);
   2997       AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   2998       addInstr(env, mk_vMOVsd_RR(src,tmp));
   2999       addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
   3000       addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
   3001       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
   3002       addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
   3003 
   3004       if (e->Iex.Unop.op == Iop_NegF64)
   3005          addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
   3006       else
   3007          addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
   3008 
   3009       add_to_rsp(env, 16);
   3010       return dst;
   3011    }
   3012 
   3013    if (e->tag == Iex_Binop) {
   3014       A87FpOp fpop = Afp_INVALID;
   3015       switch (e->Iex.Binop.op) {
   3016          case Iop_SqrtF64: fpop = Afp_SQRT; break;
   3017          case Iop_SinF64:  fpop = Afp_SIN;  break;
   3018          case Iop_CosF64:  fpop = Afp_COS;  break;
   3019          case Iop_TanF64:  fpop = Afp_TAN;  break;
   3020          case Iop_2xm1F64: fpop = Afp_2XM1; break;
   3021          default: break;
   3022       }
   3023       if (fpop != Afp_INVALID) {
   3024          AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   3025          HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
   3026          HReg        dst    = newVRegV(env);
   3027          Int     nNeeded    = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
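                  /* (Two free x87 slots for TanF64, presumably because
                     FPTAN pushes an extra 1.0 on top of its result.) */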
   3028          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
   3029          addInstr(env, AMD64Instr_A87Free(nNeeded));
   3030          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   3031          /* XXXROUNDINGFIXME */
   3032          /* set roundingmode here */
   3033          /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
   3034             codes.  I don't think that matters, since this insn
   3035             selector never generates such an instruction intervening
    3036             between a flag-setting instruction and a flag-using
   3037             instruction. */
   3038          addInstr(env, AMD64Instr_A87FpOp(fpop));
   3039          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   3040          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   3041          return dst;
   3042       }
   3043    }
   3044 
   3045    if (e->tag == Iex_Unop) {
   3046       switch (e->Iex.Unop.op) {
   3047 //..          case Iop_I32toF64: {
   3048 //..             HReg dst = newVRegF(env);
   3049 //..             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
   3050 //..             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
   3051 //..             set_FPU_rounding_default(env);
   3052 //..             addInstr(env, X86Instr_FpLdStI(
   3053 //..                              True/*load*/, 4, dst,
   3054 //..                              X86AMode_IR(0, hregX86_ESP())));
   3055 //..             add_to_esp(env, 4);
   3056 //..             return dst;
   3057 //..          }
   3058          case Iop_ReinterpI64asF64: {
   3059             /* Given an I64, produce an IEEE754 double with the same
   3060                bit pattern. */
   3061             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   3062             HReg        dst    = newVRegV(env);
   3063             AMD64RI*    src    = iselIntExpr_RI(env, e->Iex.Unop.arg);
   3064             /* paranoia */
   3065             set_SSE_rounding_default(env);
   3066             addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
   3067             addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   3068             return dst;
   3069          }
   3070          case Iop_F32toF64: {
   3071             HReg f32;
   3072             HReg f64 = newVRegV(env);
   3073             /* this shouldn't be necessary, but be paranoid ... */
   3074             set_SSE_rounding_default(env);
   3075             f32 = iselFltExpr(env, e->Iex.Unop.arg);
   3076             addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
   3077             return f64;
   3078          }
   3079          default:
   3080             break;
   3081       }
   3082    }
   3083 
   3084    /* --------- MULTIPLEX --------- */
   3085    if (e->tag == Iex_ITE) { // VFD
   3086       HReg r1, r0, dst;
   3087       vassert(ty == Ity_F64);
   3088       vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
   3089       r1  = iselDblExpr(env, e->Iex.ITE.iftrue);
   3090       r0  = iselDblExpr(env, e->Iex.ITE.iffalse);
   3091       dst = newVRegV(env);
   3092       addInstr(env, mk_vMOVsd_RR(r1,dst));
   3093       AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
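               /* VEX's AMD64 condition codes mirror the hardware encoding,
                  where cc ^ 1 is the complementary condition; so the cmov
                  below overwrites dst with the iffalse value exactly when
                  the guard is false. */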
   3094       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
   3095       return dst;
   3096    }
   3097 
   3098    ppIRExpr(e);
   3099    vpanic("iselDblExpr_wrk");
   3100 }
   3101 
   3102 
   3103 /*---------------------------------------------------------*/
   3104 /*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
   3105 /*---------------------------------------------------------*/
   3106 
   3107 static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
   3108 {
   3109    HReg r = iselVecExpr_wrk( env, e );
   3110 #  if 0
   3111    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   3112 #  endif
   3113    vassert(hregClass(r) == HRcVec128);
   3114    vassert(hregIsVirtual(r));
   3115    return r;
   3116 }
   3117 
   3118 
   3119 /* DO NOT CALL THIS DIRECTLY */
   3120 static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
   3121 {
   3122    HWord      fn = 0; /* address of helper fn, if required */
   3123    Bool       arg1isEReg = False;
   3124    AMD64SseOp op = Asse_INVALID;
   3125    IRType     ty = typeOfIRExpr(env->type_env,e);
   3126    vassert(e);
   3127    vassert(ty == Ity_V128);
   3128 
   3129    if (e->tag == Iex_RdTmp) {
   3130       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   3131    }
   3132 
   3133    if (e->tag == Iex_Get) {
   3134       HReg dst = newVRegV(env);
   3135       addInstr(env, AMD64Instr_SseLdSt(
   3136                        True/*load*/,
   3137                        16,
   3138                        dst,
   3139                        AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
   3140                     )
   3141               );
   3142       return dst;
   3143    }
   3144 
   3145    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   3146       HReg        dst = newVRegV(env);
   3147       AMD64AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
   3148       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
   3149       return dst;
   3150    }
   3151 
   3152    if (e->tag == Iex_Const) {
   3153       HReg dst = newVRegV(env);
   3154       vassert(e->Iex.Const.con->tag == Ico_V128);
   3155       switch (e->Iex.Const.con->Ico.V128) {
   3156          case 0x0000:
   3157             dst = generate_zeroes_V128(env);
   3158             break;
   3159          case 0xFFFF:
   3160             dst = generate_ones_V128(env);
   3161             break;
   3162          default: {
   3163             AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   3164             /* do push_uimm64 twice, first time for the high-order half. */
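                     /* bitmask8_to_bytemask64 (as its name suggests) expands
                        bit i of its argument into byte i of the result, e.g.
                           bitmask8_to_bytemask64(0xA5) == 0xFF00FF0000FF00FFULL
                        so the two pushes recreate the 16-byte value whose
                        byte j is 0xFF exactly when bit j of the Ico_V128
                        constant is set. */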
   3165             push_uimm64(env, bitmask8_to_bytemask64(
   3166                                 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
   3167                        ));
   3168             push_uimm64(env, bitmask8_to_bytemask64(
   3169                                 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
   3170                        ));
   3171             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
   3172             add_to_rsp(env, 16);
   3173             break;
   3174          }
   3175       }
   3176       return dst;
   3177    }
   3178 
   3179    if (e->tag == Iex_Unop) {
   3180    switch (e->Iex.Unop.op) {
   3181 
   3182       case Iop_NotV128: {
   3183          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3184          return do_sse_NotV128(env, arg);
   3185       }
   3186 
   3187       case Iop_CmpNEZ64x2: {
   3188          /* We can use SSE2 instructions for this. */
   3189          /* Ideally, we want to do a 64Ix2 comparison against zero of
   3190             the operand.  Problem is no such insn exists.  Solution
   3191             therefore is to do a 32Ix4 comparison instead, and bitwise-
   3192             negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
   3193             let the not'd result of this initial comparison be a:b:c:d.
   3194             What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
   3195             pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
   3196             giving the required result.
   3197 
   3198             The required selection sequence is 2,3,0,1, which
   3199             according to Intel's documentation means the pshufd
   3200             literal value is 0xB1, that is,
   3201             (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
   3202          */
   3203          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
   3204          HReg tmp  = generate_zeroes_V128(env);
   3205          HReg dst  = newVRegV(env);
   3206          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
   3207          tmp = do_sse_NotV128(env, tmp);
   3208          addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
   3209          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
   3210          return dst;
   3211       }
   3212 
   3213       case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
   3214       case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
   3215       case Iop_CmpNEZ8x16: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
   3216       do_CmpNEZ_vector:
   3217       {
   3218          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
   3219          HReg tmp  = newVRegV(env);
   3220          HReg zero = generate_zeroes_V128(env);
   3221          HReg dst;
   3222          addInstr(env, mk_vMOVsd_RR(arg, tmp));
   3223          addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
   3224          dst = do_sse_NotV128(env, tmp);
   3225          return dst;
   3226       }
   3227 
   3228       case Iop_RecipEst32Fx4: op = Asse_RCPF;   goto do_32Fx4_unary;
   3229       case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
   3230       do_32Fx4_unary:
   3231       {
   3232          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3233          HReg dst = newVRegV(env);
   3234          addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
   3235          return dst;
   3236       }
   3237 
   3238       case Iop_RecipEst32F0x4: op = Asse_RCPF;   goto do_32F0x4_unary;
   3239       case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
   3240       case Iop_Sqrt32F0x4:     op = Asse_SQRTF;  goto do_32F0x4_unary;
   3241       do_32F0x4_unary:
   3242       {
   3243          /* A bit subtle.  We have to copy the arg to the result
   3244             register first, because actually doing the SSE scalar insn
   3245             leaves the upper 3/4 of the destination register
   3246             unchanged.  Whereas the required semantics of these
   3247             primops is that the upper 3/4 is simply copied in from the
   3248             argument. */
   3249          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3250          HReg dst = newVRegV(env);
   3251          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3252          addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
   3253          return dst;
   3254       }
   3255 
   3256       case Iop_Sqrt64F0x2:  op = Asse_SQRTF;  goto do_64F0x2_unary;
   3257       do_64F0x2_unary:
   3258       {
   3259          /* A bit subtle.  We have to copy the arg to the result
   3260             register first, because actually doing the SSE scalar insn
   3261             leaves the upper half of the destination register
   3262             unchanged.  Whereas the required semantics of these
   3263             primops is that the upper half is simply copied in from the
   3264             argument. */
   3265          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3266          HReg dst = newVRegV(env);
   3267          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3268          addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
   3269          return dst;
   3270       }
   3271 
   3272       case Iop_32UtoV128: {
   3273          HReg        dst     = newVRegV(env);
   3274          AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
   3275          AMD64RI*    ri      = iselIntExpr_RI(env, e->Iex.Unop.arg);
   3276          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
   3277          addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
   3278          return dst;
   3279       }
   3280 
   3281       case Iop_64UtoV128: {
   3282          HReg        dst  = newVRegV(env);
   3283          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   3284          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
   3285          addInstr(env, AMD64Instr_Push(rmi));
   3286          addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
   3287          add_to_rsp(env, 8);
   3288          return dst;
   3289       }
   3290 
   3291       case Iop_V256toV128_0:
   3292       case Iop_V256toV128_1: {
   3293          HReg vHi, vLo;
   3294          iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
   3295          return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
   3296       }
   3297 
   3298       default:
   3299          break;
   3300    } /* switch (e->Iex.Unop.op) */
   3301    } /* if (e->tag == Iex_Unop) */
   3302 
   3303    if (e->tag == Iex_Binop) {
   3304    switch (e->Iex.Binop.op) {
   3305 
   3306       case Iop_Sqrt64Fx2:
   3307       case Iop_Sqrt32Fx4: {
   3308          /* :: (rmode, vec) -> vec */
   3309          HReg arg = iselVecExpr(env, e->Iex.Binop.arg2);
   3310          HReg dst = newVRegV(env);
   3311          /* XXXROUNDINGFIXME */
   3312          /* set roundingmode here */
   3313          addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2
   3314                            ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4)
   3315                        (Asse_SQRTF, arg, dst));
   3316          return dst;
   3317       }
   3318 
   3319       /* FIXME: could we generate MOVQ here? */
   3320       case Iop_SetV128lo64: {
   3321          HReg dst  = newVRegV(env);
   3322          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3323          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3324          AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
   3325          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
   3326          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
   3327          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
   3328          return dst;
   3329       }
   3330 
   3331       /* FIXME: could we generate MOVD here? */
   3332       case Iop_SetV128lo32: {
   3333          HReg dst  = newVRegV(env);
   3334          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3335          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3336          AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
   3337          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
   3338          addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
   3339          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
   3340          return dst;
   3341       }
   3342 
   3343       case Iop_64HLtoV128: {
   3344          HReg        rsp     = hregAMD64_RSP();
   3345          AMD64AMode* m8_rsp  = AMD64AMode_IR(-8, rsp);
   3346          AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   3347          AMD64RI*    qHi = iselIntExpr_RI(env, e->Iex.Binop.arg1);
   3348          AMD64RI*    qLo = iselIntExpr_RI(env, e->Iex.Binop.arg2);
   3349          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qHi, m8_rsp));
   3350          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qLo, m16_rsp));
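                  /* The 16 bytes at -16(%rsp) now hold qLo in bytes 0..7
                     and qHi in bytes 8..15, i.e. qLo in bits 63:0 and qHi
                     in bits 127:64 of the vector loaded below. */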
   3351          HReg        dst = newVRegV(env);
   3352          /* One store-forwarding stall coming up, oh well :-( */
   3353          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, m16_rsp));
   3354          return dst;
   3355       }
   3356 
   3357       case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
   3358       case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
   3359       case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
   3360       case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
   3361       case Iop_Max32Fx4:   op = Asse_MAXF;   goto do_32Fx4;
   3362       case Iop_Min32Fx4:   op = Asse_MINF;   goto do_32Fx4;
   3363       do_32Fx4:
   3364       {
   3365          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3366          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3367          HReg dst = newVRegV(env);
   3368          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3369          addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
   3370          return dst;
   3371       }
   3372 
   3373       case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
   3374       case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
   3375       case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
   3376       case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
   3377       case Iop_Max64Fx2:   op = Asse_MAXF;   goto do_64Fx2;
   3378       case Iop_Min64Fx2:   op = Asse_MINF;   goto do_64Fx2;
   3379       do_64Fx2:
   3380       {
   3381          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3382          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3383          HReg dst = newVRegV(env);
   3384          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3385          addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
   3386          return dst;
   3387       }
   3388 
   3389       case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
   3390       case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
   3391       case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
   3392       case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
   3393       case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4;
   3394       case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4;
   3395       case Iop_Max32F0x4:   op = Asse_MAXF;   goto do_32F0x4;
   3396       case Iop_Min32F0x4:   op = Asse_MINF;   goto do_32F0x4;
   3397       case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4;
   3398       case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4;
   3399       do_32F0x4: {
   3400          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3401          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3402          HReg dst = newVRegV(env);
   3403          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3404          addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
   3405          return dst;
   3406       }
   3407 
   3408       case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
   3409       case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
   3410       case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
   3411       case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
   3412       case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2;
   3413       case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2;
   3414       case Iop_Max64F0x2:   op = Asse_MAXF;   goto do_64F0x2;
   3415       case Iop_Min64F0x2:   op = Asse_MINF;   goto do_64F0x2;
   3416       case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2;
   3417       case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2;
   3418       do_64F0x2: {
   3419          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3420          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3421          HReg dst = newVRegV(env);
   3422          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3423          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
   3424          return dst;
   3425       }
   3426 
   3427       case Iop_QNarrowBin32Sto16Sx8:
   3428          op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
   3429       case Iop_QNarrowBin16Sto8Sx16:
   3430          op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
   3431       case Iop_QNarrowBin16Sto8Ux16:
   3432          op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
   3433 
   3434       case Iop_InterleaveHI8x16:
   3435          op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
   3436       case Iop_InterleaveHI16x8:
   3437          op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
   3438       case Iop_InterleaveHI32x4:
   3439          op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
   3440       case Iop_InterleaveHI64x2:
   3441          op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
   3442 
   3443       case Iop_InterleaveLO8x16:
   3444          op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
   3445       case Iop_InterleaveLO16x8:
   3446          op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
   3447       case Iop_InterleaveLO32x4:
   3448          op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
   3449       case Iop_InterleaveLO64x2:
   3450          op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
   3451 
   3452       case Iop_AndV128:    op = Asse_AND;      goto do_SseReRg;
   3453       case Iop_OrV128:     op = Asse_OR;       goto do_SseReRg;
   3454       case Iop_XorV128:    op = Asse_XOR;      goto do_SseReRg;
   3455       case Iop_Add8x16:    op = Asse_ADD8;     goto do_SseReRg;
   3456       case Iop_Add16x8:    op = Asse_ADD16;    goto do_SseReRg;
   3457       case Iop_Add32x4:    op = Asse_ADD32;    goto do_SseReRg;
   3458       case Iop_Add64x2:    op = Asse_ADD64;    goto do_SseReRg;
   3459       case Iop_QAdd8Sx16:  op = Asse_QADD8S;   goto do_SseReRg;
   3460       case Iop_QAdd16Sx8:  op = Asse_QADD16S;  goto do_SseReRg;
   3461       case Iop_QAdd8Ux16:  op = Asse_QADD8U;   goto do_SseReRg;
   3462       case Iop_QAdd16Ux8:  op = Asse_QADD16U;  goto do_SseReRg;
   3463       case Iop_Avg8Ux16:   op = Asse_AVG8U;    goto do_SseReRg;
   3464       case Iop_Avg16Ux8:   op = Asse_AVG16U;   goto do_SseReRg;
   3465       case Iop_CmpEQ8x16:  op = Asse_CMPEQ8;   goto do_SseReRg;
   3466       case Iop_CmpEQ16x8:  op = Asse_CMPEQ16;  goto do_SseReRg;
   3467       case Iop_CmpEQ32x4:  op = Asse_CMPEQ32;  goto do_SseReRg;
   3468       case Iop_CmpGT8Sx16: op = Asse_CMPGT8S;  goto do_SseReRg;
   3469       case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
   3470       case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
   3471       case Iop_Max16Sx8:   op = Asse_MAX16S;   goto do_SseReRg;
   3472       case Iop_Max8Ux16:   op = Asse_MAX8U;    goto do_SseReRg;
   3473       case Iop_Min16Sx8:   op = Asse_MIN16S;   goto do_SseReRg;
   3474       case Iop_Min8Ux16:   op = Asse_MIN8U;    goto do_SseReRg;
   3475       case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
   3476       case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
   3477       case Iop_Mul16x8:    op = Asse_MUL16;    goto do_SseReRg;
   3478       case Iop_Sub8x16:    op = Asse_SUB8;     goto do_SseReRg;
   3479       case Iop_Sub16x8:    op = Asse_SUB16;    goto do_SseReRg;
   3480       case Iop_Sub32x4:    op = Asse_SUB32;    goto do_SseReRg;
   3481       case Iop_Sub64x2:    op = Asse_SUB64;    goto do_SseReRg;
   3482       case Iop_QSub8Sx16:  op = Asse_QSUB8S;   goto do_SseReRg;
   3483       case Iop_QSub16Sx8:  op = Asse_QSUB16S;  goto do_SseReRg;
   3484       case Iop_QSub8Ux16:  op = Asse_QSUB8U;   goto do_SseReRg;
   3485       case Iop_QSub16Ux8:  op = Asse_QSUB16U;  goto do_SseReRg;
   3486       do_SseReRg: {
   3487          HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
   3488          HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
   3489          HReg dst = newVRegV(env);
   3490          if (arg1isEReg) {
   3491             addInstr(env, mk_vMOVsd_RR(arg2, dst));
   3492             addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
   3493          } else {
   3494             addInstr(env, mk_vMOVsd_RR(arg1, dst));
   3495             addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
   3496          }
   3497          return dst;
   3498       }
   3499 
   3500       case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
   3501       case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
   3502       case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
   3503       case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
   3504       case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
   3505       case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
   3506       case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
   3507       case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
   3508       do_SseShift: {
   3509          HReg        greg = iselVecExpr(env, e->Iex.Binop.arg1);
   3510          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   3511          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   3512          HReg        ereg = newVRegV(env);
   3513          HReg        dst  = newVRegV(env);
   3514          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
   3515          addInstr(env, AMD64Instr_Push(rmi));
   3516          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
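                  /* The shift amount was pushed last, so it lands in the
                     low quadword of ereg with zeroes above it; the SSE
                     shift-by-register forms take their count from just
                     that low 64-bit half. */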
   3517          addInstr(env, mk_vMOVsd_RR(greg, dst));
   3518          addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
   3519          add_to_rsp(env, 16);
   3520          return dst;
   3521       }
   3522 
   3523       case Iop_Mul32x4:    fn = (HWord)h_generic_calc_Mul32x4;
   3524                            goto do_SseAssistedBinary;
   3525       case Iop_Max32Sx4:   fn = (HWord)h_generic_calc_Max32Sx4;
   3526                            goto do_SseAssistedBinary;
   3527       case Iop_Min32Sx4:   fn = (HWord)h_generic_calc_Min32Sx4;
   3528                            goto do_SseAssistedBinary;
   3529       case Iop_Max32Ux4:   fn = (HWord)h_generic_calc_Max32Ux4;
   3530                            goto do_SseAssistedBinary;
   3531       case Iop_Min32Ux4:   fn = (HWord)h_generic_calc_Min32Ux4;
   3532                            goto do_SseAssistedBinary;
   3533       case Iop_Max16Ux8:   fn = (HWord)h_generic_calc_Max16Ux8;
   3534                            goto do_SseAssistedBinary;
   3535       case Iop_Min16Ux8:   fn = (HWord)h_generic_calc_Min16Ux8;
   3536                            goto do_SseAssistedBinary;
   3537       case Iop_Max8Sx16:   fn = (HWord)h_generic_calc_Max8Sx16;
   3538                            goto do_SseAssistedBinary;
   3539       case Iop_Min8Sx16:   fn = (HWord)h_generic_calc_Min8Sx16;
   3540                            goto do_SseAssistedBinary;
   3541       case Iop_CmpEQ64x2:  fn = (HWord)h_generic_calc_CmpEQ64x2;
   3542                            goto do_SseAssistedBinary;
   3543       case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
   3544                            goto do_SseAssistedBinary;
   3545       case Iop_Perm32x4:   fn = (HWord)h_generic_calc_Perm32x4;
   3546                            goto do_SseAssistedBinary;
   3547       case Iop_QNarrowBin32Sto16Ux8:
   3548                            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
   3549                            goto do_SseAssistedBinary;
   3550       case Iop_NarrowBin16to8x16:
   3551                            fn = (HWord)h_generic_calc_NarrowBin16to8x16;
   3552                            goto do_SseAssistedBinary;
   3553       case Iop_NarrowBin32to16x8:
   3554                            fn = (HWord)h_generic_calc_NarrowBin32to16x8;
   3555                            goto do_SseAssistedBinary;
   3556       do_SseAssistedBinary: {
   3557          /* RRRufff!  RRRufff code is what we're generating here.  Oh
   3558             well. */
   3559          vassert(fn != 0);
   3560          HReg dst = newVRegV(env);
   3561          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3562          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3563          HReg argp = newVRegI(env);
    3564          /* subq $112, %rsp         -- make a space */
   3565          sub_from_rsp(env, 112);
   3566          /* leaq 48(%rsp), %r_argp  -- point into it */
   3567          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
   3568                                         argp));
   3569          /* andq $-16, %r_argp      -- 16-align the pointer */
   3570          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   3571                                          AMD64RMI_Imm( ~(UInt)15 ),
   3572                                          argp));
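                  /* (~(UInt)15 is 0xFFFFFFF0; the 64-bit AND sign-extends
                     the 32-bit immediate to -16, clearing the low four
                     bits of argp.) */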
   3573          /* Prepare 3 arg regs:
   3574             leaq 0(%r_argp), %rdi
   3575             leaq 16(%r_argp), %rsi
   3576             leaq 32(%r_argp), %rdx
   3577          */
   3578          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
   3579                                         hregAMD64_RDI()));
   3580          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
   3581                                         hregAMD64_RSI()));
   3582          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
   3583                                         hregAMD64_RDX()));
   3584          /* Store the two args, at (%rsi) and (%rdx):
   3585             movupd  %argL, 0(%rsi)
   3586             movupd  %argR, 0(%rdx)
   3587          */
   3588          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
   3589                                           AMD64AMode_IR(0, hregAMD64_RSI())));
   3590          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
   3591                                           AMD64AMode_IR(0, hregAMD64_RDX())));
   3592          /* call the helper */
   3593          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
   3594                                         3, mk_RetLoc_simple(RLPri_None) ));
   3595          /* fetch the result from memory, using %r_argp, which the
   3596             register allocator will keep alive across the call. */
   3597          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
   3598                                           AMD64AMode_IR(0, argp)));
   3599          /* and finally, clear the space */
   3600          add_to_rsp(env, 112);
   3601          return dst;
   3602       }
   3603 
   3604       case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
   3605                          goto do_SseAssistedVectorAndScalar;
   3606       case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
   3607                          goto do_SseAssistedVectorAndScalar;
   3608       do_SseAssistedVectorAndScalar: {
   3609          /* RRRufff!  RRRufff code is what we're generating here.  Oh
   3610             well. */
   3611          vassert(fn != 0);
   3612          HReg dst = newVRegV(env);
   3613          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3614          HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3615          HReg argp = newVRegI(env);
    3616          /* subq $112, %rsp         -- make a space */
   3617          sub_from_rsp(env, 112);
   3618          /* leaq 48(%rsp), %r_argp  -- point into it */
   3619          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
   3620                                         argp));
   3621          /* andq $-16, %r_argp      -- 16-align the pointer */
   3622          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   3623                                          AMD64RMI_Imm( ~(UInt)15 ),
   3624                                          argp));
   3625          /* Prepare 2 vector arg regs:
   3626             leaq 0(%r_argp), %rdi
   3627             leaq 16(%r_argp), %rsi
   3628          */
   3629          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
   3630                                         hregAMD64_RDI()));
   3631          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
   3632                                         hregAMD64_RSI()));
   3633          /* Store the vector arg, at (%rsi):
   3634             movupd  %argL, 0(%rsi)
   3635          */
   3636          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
   3637                                           AMD64AMode_IR(0, hregAMD64_RSI())));
   3638          /* And get the scalar value into rdx */
   3639          addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
   3640 
   3641          /* call the helper */
   3642          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
   3643                                         3, mk_RetLoc_simple(RLPri_None) ));
   3644          /* fetch the result from memory, using %r_argp, which the
   3645             register allocator will keep alive across the call. */
   3646          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
   3647                                           AMD64AMode_IR(0, argp)));
   3648          /* and finally, clear the space */
   3649          add_to_rsp(env, 112);
   3650          return dst;
   3651       }
   3652 
   3653       default:
   3654          break;
   3655    } /* switch (e->Iex.Binop.op) */
   3656    } /* if (e->tag == Iex_Binop) */
   3657 
   3658    if (e->tag == Iex_Triop) {
   3659    IRTriop *triop = e->Iex.Triop.details;
   3660    switch (triop->op) {
   3661 
   3662       case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
   3663       case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
   3664       case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
   3665       case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
   3666       do_64Fx2_w_rm:
   3667       {
   3668          HReg argL = iselVecExpr(env, triop->arg2);
   3669          HReg argR = iselVecExpr(env, triop->arg3);
   3670          HReg dst = newVRegV(env);
   3671          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3672          /* XXXROUNDINGFIXME */
   3673          /* set roundingmode here */
   3674          addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
   3675          return dst;
   3676       }
   3677 
   3678       case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
   3679       case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
   3680       case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
   3681       case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
   3682       do_32Fx4_w_rm:
   3683       {
   3684          HReg argL = iselVecExpr(env, triop->arg2);
   3685          HReg argR = iselVecExpr(env, triop->arg3);
   3686          HReg dst = newVRegV(env);
   3687          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3688          /* XXXROUNDINGFIXME */
   3689          /* set roundingmode here */
   3690          addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
   3691          return dst;
   3692       }
   3693 
   3694       default:
   3695          break;
   3696    } /* switch (triop->op) */
   3697    } /* if (e->tag == Iex_Triop) */
   3698 
   3699    if (e->tag == Iex_ITE) { // VFD
   3700       HReg r1  = iselVecExpr(env, e->Iex.ITE.iftrue);
   3701       HReg r0  = iselVecExpr(env, e->Iex.ITE.iffalse);
   3702       HReg dst = newVRegV(env);
   3703       addInstr(env, mk_vMOVsd_RR(r1,dst));
   3704       AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
   3705       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
   3706       return dst;
   3707    }
   3708 
   3709    //vec_fail:
   3710    vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
   3711               LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
   3712    ppIRExpr(e);
   3713    vpanic("iselVecExpr_wrk");
   3714 }
   3715 
   3716 
   3717 /*---------------------------------------------------------*/
   3718 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs.    --*/
   3719 /*---------------------------------------------------------*/
   3720 
   3721 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
   3722                            ISelEnv* env, IRExpr* e )
   3723 {
   3724    iselDVecExpr_wrk( rHi, rLo, env, e );
   3725 #  if 0
   3726    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   3727 #  endif
   3728    vassert(hregClass(*rHi) == HRcVec128);
   3729    vassert(hregClass(*rLo) == HRcVec128);
   3730    vassert(hregIsVirtual(*rHi));
   3731    vassert(hregIsVirtual(*rLo));
   3732 }
   3733 
   3734 
   3735 /* DO NOT CALL THIS DIRECTLY */
   3736 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
   3737                                ISelEnv* env, IRExpr* e )
   3738 {
   3739    HWord fn = 0; /* address of helper fn, if required */
   3740    vassert(e);
   3741    IRType ty = typeOfIRExpr(env->type_env,e);
   3742    vassert(ty == Ity_V256);
   3743 
   3744    AMD64SseOp op = Asse_INVALID;
   3745 
   3746    /* read 256-bit IRTemp */
   3747    if (e->tag == Iex_RdTmp) {
   3748       lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
   3749       return;
   3750    }
   3751 
   3752    if (e->tag == Iex_Get) {
   3753       HReg        vHi  = newVRegV(env);
   3754       HReg        vLo  = newVRegV(env);
   3755       HReg        rbp  = hregAMD64_RBP();
   3756       AMD64AMode* am0  = AMD64AMode_IR(e->Iex.Get.offset + 0,  rbp);
   3757       AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
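              /* A V256 in the guest state occupies two adjacent 128-bit
                 slots, with the less significant half at the lower offset;
                 Iex_Load and Ist_Put/Ist_Store below use the same layout. */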
   3758       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
   3759       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
   3760       *rHi = vHi;
   3761       *rLo = vLo;
   3762       return;
   3763    }
   3764 
   3765    if (e->tag == Iex_Load) {
   3766       HReg        vHi  = newVRegV(env);
   3767       HReg        vLo  = newVRegV(env);
   3768       HReg        rA   = iselIntExpr_R(env, e->Iex.Load.addr);
   3769       AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
   3770       AMD64AMode* am16 = AMD64AMode_IR(16, rA);
   3771       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
   3772       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
   3773       *rHi = vHi;
   3774       *rLo = vLo;
   3775       return;
   3776    }
   3777 
   3778    if (e->tag == Iex_Const) {
   3779       vassert(e->Iex.Const.con->tag == Ico_V256);
   3780       switch (e->Iex.Const.con->Ico.V256) {
   3781          case 0x00000000: {
   3782             HReg vHi = generate_zeroes_V128(env);
   3783             HReg vLo = newVRegV(env);
   3784             addInstr(env, mk_vMOVsd_RR(vHi, vLo));
   3785             *rHi = vHi;
   3786             *rLo = vLo;
   3787             return;
   3788          }
   3789          default:
   3790             break; /* give up, until such time as it becomes necessary. */
   3791       }
   3792    }
   3793 
   3794    if (e->tag == Iex_Unop) {
   3795    switch (e->Iex.Unop.op) {
   3796 
   3797       case Iop_NotV256: {
   3798          HReg argHi, argLo;
   3799          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3800          *rHi = do_sse_NotV128(env, argHi);
   3801          *rLo = do_sse_NotV128(env, argLo);
   3802          return;
   3803       }
   3804 
   3805       case Iop_RecipEst32Fx8: op = Asse_RCPF;   goto do_32Fx8_unary;
   3806       case Iop_Sqrt32Fx8:     op = Asse_SQRTF;  goto do_32Fx8_unary;
   3807       case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
   3808       do_32Fx8_unary:
   3809       {
   3810          HReg argHi, argLo;
   3811          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3812          HReg dstHi = newVRegV(env);
   3813          HReg dstLo = newVRegV(env);
   3814          addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
   3815          addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
   3816          *rHi = dstHi;
   3817          *rLo = dstLo;
   3818          return;
   3819       }
   3820 
   3821       case Iop_Sqrt64Fx4:  op = Asse_SQRTF;  goto do_64Fx4_unary;
   3822       do_64Fx4_unary:
   3823       {
   3824          HReg argHi, argLo;
   3825          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3826          HReg dstHi = newVRegV(env);
   3827          HReg dstLo = newVRegV(env);
   3828          addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
   3829          addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
   3830          *rHi = dstHi;
   3831          *rLo = dstLo;
   3832          return;
   3833       }
   3834 
   3835       case Iop_CmpNEZ64x4: {
   3836          /* We can use SSE2 instructions for this. */
   3837          /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
   3838             (obviously).  See comment on Iop_CmpNEZ64x2 for
   3839             explanation of what's going on here. */
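                 /* In brief: CMPEQ32 against zero and an invert give a
                    per-32-bit-lane "nonzero" mask; the 0xB1 shuffle swaps
                    the two 32-bit halves of each 64-bit lane, and the OR
                    then makes every 64-bit lane all-ones iff either of its
                    halves was nonzero. */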
   3840          HReg argHi, argLo;
   3841          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3842          HReg tmpHi  = generate_zeroes_V128(env);
   3843          HReg tmpLo  = newVRegV(env);
   3844          addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
   3845          HReg dstHi  = newVRegV(env);
   3846          HReg dstLo  = newVRegV(env);
   3847          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
   3848          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
   3849          tmpHi = do_sse_NotV128(env, tmpHi);
   3850          tmpLo = do_sse_NotV128(env, tmpLo);
   3851          addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
   3852          addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
   3853          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
   3854          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
   3855          *rHi = dstHi;
   3856          *rLo = dstLo;
   3857          return;
   3858       }
   3859 
   3860       case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
   3861       case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
   3862       case Iop_CmpNEZ8x32: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
   3863       do_CmpNEZ_vector:
   3864       {
   3865          HReg argHi, argLo;
   3866          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3867          HReg tmpHi = newVRegV(env);
   3868          HReg tmpLo = newVRegV(env);
   3869          HReg zero  = generate_zeroes_V128(env);
   3870          HReg dstHi, dstLo;
   3871          addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
   3872          addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
   3873          addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
   3874          addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
   3875          dstHi = do_sse_NotV128(env, tmpHi);
   3876          dstLo = do_sse_NotV128(env, tmpLo);
   3877          *rHi = dstHi;
   3878          *rLo = dstLo;
   3879          return;
   3880       }
   3881 
   3882       default:
   3883          break;
   3884    } /* switch (e->Iex.Unop.op) */
   3885    } /* if (e->tag == Iex_Unop) */
   3886 
   3887    if (e->tag == Iex_Binop) {
   3888    switch (e->Iex.Binop.op) {
   3889 
   3890       case Iop_Max64Fx4:   op = Asse_MAXF;   goto do_64Fx4;
   3891       case Iop_Min64Fx4:   op = Asse_MINF;   goto do_64Fx4;
   3892       do_64Fx4:
   3893       {
   3894          HReg argLhi, argLlo, argRhi, argRlo;
   3895          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   3896          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   3897          HReg dstHi = newVRegV(env);
   3898          HReg dstLo = newVRegV(env);
   3899          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   3900          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   3901          addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
   3902          addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
   3903          *rHi = dstHi;
   3904          *rLo = dstLo;
   3905          return;
   3906       }
   3907 
   3908       case Iop_Max32Fx8:   op = Asse_MAXF;   goto do_32Fx8;
   3909       case Iop_Min32Fx8:   op = Asse_MINF;   goto do_32Fx8;
   3910       do_32Fx8:
   3911       {
   3912          HReg argLhi, argLlo, argRhi, argRlo;
   3913          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   3914          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   3915          HReg dstHi = newVRegV(env);
   3916          HReg dstLo = newVRegV(env);
   3917          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   3918          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   3919          addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
   3920          addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
   3921          *rHi = dstHi;
   3922          *rLo = dstLo;
   3923          return;
   3924       }
   3925 
   3926       case Iop_AndV256:    op = Asse_AND;      goto do_SseReRg;
   3927       case Iop_OrV256:     op = Asse_OR;       goto do_SseReRg;
   3928       case Iop_XorV256:    op = Asse_XOR;      goto do_SseReRg;
   3929       case Iop_Add8x32:    op = Asse_ADD8;     goto do_SseReRg;
   3930       case Iop_Add16x16:   op = Asse_ADD16;    goto do_SseReRg;
   3931       case Iop_Add32x8:    op = Asse_ADD32;    goto do_SseReRg;
   3932       case Iop_Add64x4:    op = Asse_ADD64;    goto do_SseReRg;
   3933       case Iop_QAdd8Sx32:  op = Asse_QADD8S;   goto do_SseReRg;
   3934       case Iop_QAdd16Sx16: op = Asse_QADD16S;  goto do_SseReRg;
   3935       case Iop_QAdd8Ux32:  op = Asse_QADD8U;   goto do_SseReRg;
   3936       case Iop_QAdd16Ux16: op = Asse_QADD16U;  goto do_SseReRg;
   3937       case Iop_Avg8Ux32:   op = Asse_AVG8U;    goto do_SseReRg;
   3938       case Iop_Avg16Ux16:  op = Asse_AVG16U;   goto do_SseReRg;
   3939       case Iop_CmpEQ8x32:  op = Asse_CMPEQ8;   goto do_SseReRg;
   3940       case Iop_CmpEQ16x16: op = Asse_CMPEQ16;  goto do_SseReRg;
   3941       case Iop_CmpEQ32x8:  op = Asse_CMPEQ32;  goto do_SseReRg;
   3942       case Iop_CmpGT8Sx32: op = Asse_CMPGT8S;  goto do_SseReRg;
   3943       case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
   3944       case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
   3945       case Iop_Max16Sx16:  op = Asse_MAX16S;   goto do_SseReRg;
   3946       case Iop_Max8Ux32:   op = Asse_MAX8U;    goto do_SseReRg;
   3947       case Iop_Min16Sx16:  op = Asse_MIN16S;   goto do_SseReRg;
   3948       case Iop_Min8Ux32:   op = Asse_MIN8U;    goto do_SseReRg;
   3949       case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
   3950       case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
   3951       case Iop_Mul16x16:   op = Asse_MUL16;    goto do_SseReRg;
   3952       case Iop_Sub8x32:    op = Asse_SUB8;     goto do_SseReRg;
   3953       case Iop_Sub16x16:   op = Asse_SUB16;    goto do_SseReRg;
   3954       case Iop_Sub32x8:    op = Asse_SUB32;    goto do_SseReRg;
   3955       case Iop_Sub64x4:    op = Asse_SUB64;    goto do_SseReRg;
   3956       case Iop_QSub8Sx32:  op = Asse_QSUB8S;   goto do_SseReRg;
   3957       case Iop_QSub16Sx16: op = Asse_QSUB16S;  goto do_SseReRg;
   3958       case Iop_QSub8Ux32:  op = Asse_QSUB8U;   goto do_SseReRg;
   3959       case Iop_QSub16Ux16: op = Asse_QSUB16U;  goto do_SseReRg;
   3960       do_SseReRg:
   3961       {
   3962          HReg argLhi, argLlo, argRhi, argRlo;
   3963          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   3964          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   3965          HReg dstHi = newVRegV(env);
   3966          HReg dstLo = newVRegV(env);
   3967          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   3968          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   3969          addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
   3970          addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
   3971          *rHi = dstHi;
   3972          *rLo = dstLo;
   3973          return;
   3974       }
   3975 
   3976       case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift;
   3977       case Iop_ShlN32x8:  op = Asse_SHL32; goto do_SseShift;
   3978       case Iop_ShlN64x4:  op = Asse_SHL64; goto do_SseShift;
   3979       case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift;
   3980       case Iop_SarN32x8:  op = Asse_SAR32; goto do_SseShift;
   3981       case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift;
   3982       case Iop_ShrN32x8:  op = Asse_SHR32; goto do_SseShift;
   3983       case Iop_ShrN64x4:  op = Asse_SHR64; goto do_SseShift;
   3984       do_SseShift: {
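                 /* SSE2 shift-by-register ops take the count from the low
                    64 bits of an xmm register, so materialise it: push a
                    zero qword, then the count, load the resulting 16-byte
                    slot at 0(%rsp) into ereg, and drop the 16 bytes again
                    afterwards. */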
   3985          HReg gregHi, gregLo;
   3986          iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
   3987          AMD64RMI*   rmi   = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   3988          AMD64AMode* rsp0  = AMD64AMode_IR(0, hregAMD64_RSP());
   3989          HReg        ereg  = newVRegV(env);
   3990          HReg        dstHi = newVRegV(env);
   3991          HReg        dstLo = newVRegV(env);
   3992          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
   3993          addInstr(env, AMD64Instr_Push(rmi));
   3994          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
   3995          addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
   3996          addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
   3997          addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
   3998          addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
   3999          add_to_rsp(env, 16);
   4000          *rHi = dstHi;
   4001          *rLo = dstLo;
   4002          return;
   4003       }
   4004 
   4005       case Iop_V128HLtoV256: {
   4006          *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
   4007          *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
   4008          return;
   4009       }
   4010 
   4011       case Iop_Mul32x8:    fn = (HWord)h_generic_calc_Mul32x4;
   4012                            goto do_SseAssistedBinary;
   4013       case Iop_Max32Sx8:   fn = (HWord)h_generic_calc_Max32Sx4;
   4014                            goto do_SseAssistedBinary;
   4015       case Iop_Min32Sx8:   fn = (HWord)h_generic_calc_Min32Sx4;
   4016                            goto do_SseAssistedBinary;
   4017       case Iop_Max32Ux8:   fn = (HWord)h_generic_calc_Max32Ux4;
   4018                            goto do_SseAssistedBinary;
   4019       case Iop_Min32Ux8:   fn = (HWord)h_generic_calc_Min32Ux4;
   4020                            goto do_SseAssistedBinary;
   4021       case Iop_Max16Ux16:  fn = (HWord)h_generic_calc_Max16Ux8;
   4022                            goto do_SseAssistedBinary;
   4023       case Iop_Min16Ux16:  fn = (HWord)h_generic_calc_Min16Ux8;
   4024                            goto do_SseAssistedBinary;
   4025       case Iop_Max8Sx32:   fn = (HWord)h_generic_calc_Max8Sx16;
   4026                            goto do_SseAssistedBinary;
   4027       case Iop_Min8Sx32:   fn = (HWord)h_generic_calc_Min8Sx16;
   4028                            goto do_SseAssistedBinary;
   4029       case Iop_CmpEQ64x4:  fn = (HWord)h_generic_calc_CmpEQ64x2;
   4030                            goto do_SseAssistedBinary;
   4031       case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
   4032                            goto do_SseAssistedBinary;
   4033       do_SseAssistedBinary: {
   4034          /* RRRufff!  RRRufff code is what we're generating here.  Oh
   4035             well. */
   4036          vassert(fn != 0);
   4037          HReg dstHi = newVRegV(env);
   4038          HReg dstLo = newVRegV(env);
   4039          HReg argLhi, argLlo, argRhi, argRlo;
   4040          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   4041          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   4042          HReg argp = newVRegI(env);
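                 /* Scratch-area layout relative to the 16-aligned argp, as
                    set up by the code below: +0 hi-half result, +16/+32 hi
                    halves of argL/argR, +48 lo-half result, +64/+80 lo
                    halves of argL/argR.  The helper is called once per
                    128-bit half. */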
   4043          /* subq $160, %rsp         -- make a space */
   4044          sub_from_rsp(env, 160);
   4045          /* leaq 48(%rsp), %r_argp  -- point into it */
   4046          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
   4047                                         argp));
   4048          /* andq $-16, %r_argp      -- 16-align the pointer */
   4049          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   4050                                          AMD64RMI_Imm( ~(UInt)15 ),
   4051                                          argp));
   4052          /* Prepare 3 arg regs:
   4053             leaq 0(%r_argp), %rdi
   4054             leaq 16(%r_argp), %rsi
   4055             leaq 32(%r_argp), %rdx
   4056          */
   4057          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
   4058                                         hregAMD64_RDI()));
   4059          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
   4060                                         hregAMD64_RSI()));
   4061          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
   4062                                         hregAMD64_RDX()));
   4063          /* Store the two high args, at (%rsi) and (%rdx):
   4064             movupd  %argLhi, 0(%rsi)
   4065             movupd  %argRhi, 0(%rdx)
   4066          */
   4067          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
   4068                                           AMD64AMode_IR(0, hregAMD64_RSI())));
   4069          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
   4070                                           AMD64AMode_IR(0, hregAMD64_RDX())));
   4071          /* Store the two low args, at 48(%rsi) and 48(%rdx):
   4072             movupd  %argLlo, 48(%rsi)
   4073             movupd  %argRlo, 48(%rdx)
   4074          */
   4075          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
   4076                                           AMD64AMode_IR(48, hregAMD64_RSI())));
   4077          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
   4078                                           AMD64AMode_IR(48, hregAMD64_RDX())));
   4079          /* call the helper */
   4080          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
   4081                                         mk_RetLoc_simple(RLPri_None) ));
   4082          /* Prepare 3 arg regs:
   4083             leaq 48(%r_argp), %rdi
   4084             leaq 64(%r_argp), %rsi
   4085             leaq 80(%r_argp), %rdx
   4086          */
   4087          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
   4088                                         hregAMD64_RDI()));
   4089          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
   4090                                         hregAMD64_RSI()));
   4091          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
   4092                                         hregAMD64_RDX()));
   4093          /* call the helper */
   4094          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
   4095                                         mk_RetLoc_simple(RLPri_None) ));
   4096          /* fetch the result from memory, using %r_argp, which the
   4097             register allocator will keep alive across the call. */
   4098          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
   4099                                           AMD64AMode_IR(0, argp)));
   4100          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
   4101                                           AMD64AMode_IR(48, argp)));
   4102          /* and finally, clear the space */
   4103          add_to_rsp(env, 160);
   4104          *rHi = dstHi;
   4105          *rLo = dstLo;
   4106          return;
   4107       }
   4108 
   4109       case Iop_Perm32x8:   fn = (HWord)h_generic_calc_Perm32x8;
   4110                            goto do_SseAssistedBinary256;
   4111       do_SseAssistedBinary256: {
   4112          /* RRRufff!  RRRufff code is what we're generating here.  Oh
   4113             well. */
   4114          vassert(fn != 0);
   4115          HReg dstHi = newVRegV(env);
   4116          HReg dstLo = newVRegV(env);
   4117          HReg argLhi, argLlo, argRhi, argRlo;
   4118          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   4119          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   4120          HReg argp = newVRegI(env);
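                 /* Here the helper works on whole 256-bit values, so the
                    layout relative to argp is: +0..31 result, +32..63 argL,
                    +64..95 argR, each with the less significant 128-bit
                    half at the lower address. */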
   4121          /* subq $160, %rsp         -- make a space */
   4122          sub_from_rsp(env, 160);
   4123          /* leaq 48(%rsp), %r_argp  -- point into it */
   4124          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
   4125                                         argp));
   4126          /* andq $-16, %r_argp      -- 16-align the pointer */
   4127          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   4128                                          AMD64RMI_Imm( ~(UInt)15 ),
   4129                                          argp));
   4130          /* Prepare 3 arg regs:
   4131             leaq 0(%r_argp), %rdi
   4132             leaq 32(%r_argp), %rsi
   4133             leaq 64(%r_argp), %rdx
   4134          */
   4135          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
   4136                                         hregAMD64_RDI()));
   4137          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
   4138                                         hregAMD64_RSI()));
   4139          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
   4140                                         hregAMD64_RDX()));
   4141          /* Store the two args, at (%rsi) and (%rdx):
   4142             movupd  %argLlo, 0(%rsi)
   4143             movupd  %argLhi, 16(%rsi)
   4144             movupd  %argRlo, 0(%rdx)
   4145             movupd  %argRhi, 16(%rdx)
   4146          */
   4147          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
   4148                                           AMD64AMode_IR(0, hregAMD64_RSI())));
   4149          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
   4150                                           AMD64AMode_IR(16, hregAMD64_RSI())));
   4151          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
   4152                                           AMD64AMode_IR(0, hregAMD64_RDX())));
   4153          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
   4154                                           AMD64AMode_IR(16, hregAMD64_RDX())));
   4155          /* call the helper */
   4156          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
   4157                                         mk_RetLoc_simple(RLPri_None) ));
   4158          /* fetch the result from memory, using %r_argp, which the
   4159             register allocator will keep alive across the call. */
   4160          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
   4161                                           AMD64AMode_IR(0, argp)));
   4162          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
   4163                                           AMD64AMode_IR(16, argp)));
   4164          /* and finally, clear the space */
   4165          add_to_rsp(env, 160);
   4166          *rHi = dstHi;
   4167          *rLo = dstLo;
   4168          return;
   4169       }
   4170 
   4171       default:
   4172          break;
   4173    } /* switch (e->Iex.Binop.op) */
   4174    } /* if (e->tag == Iex_Binop) */
   4175 
   4176    if (e->tag == Iex_Triop) {
   4177    IRTriop *triop = e->Iex.Triop.details;
   4178    switch (triop->op) {
   4179 
   4180       case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
   4181       case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
   4182       case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
   4183       case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
   4184       do_64Fx4_w_rm:
   4185       {
   4186          HReg argLhi, argLlo, argRhi, argRlo;
   4187          iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
   4188          iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
   4189          HReg dstHi = newVRegV(env);
   4190          HReg dstLo = newVRegV(env);
   4191          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   4192          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   4193          /* XXXROUNDINGFIXME */
   4194          /* set roundingmode here */
   4195          addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
   4196          addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
   4197          *rHi = dstHi;
   4198          *rLo = dstLo;
   4199          return;
   4200       }
   4201 
   4202       case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
   4203       case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
   4204       case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
   4205       case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
   4206       do_32Fx8_w_rm:
   4207       {
   4208          HReg argLhi, argLlo, argRhi, argRlo;
   4209          iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
   4210          iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
   4211          HReg dstHi = newVRegV(env);
   4212          HReg dstLo = newVRegV(env);
   4213          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   4214          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   4215          /* XXXROUNDINGFIXME */
   4216          /* set roundingmode here */
   4217          addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
   4218          addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
   4219          *rHi = dstHi;
   4220          *rLo = dstLo;
   4221          return;
   4222       }
   4223 
   4224       default:
   4225          break;
   4226    } /* switch (triop->op) */
   4227    } /* if (e->tag == Iex_Triop) */
   4228 
   4229 
   4230    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
   4231       HReg        rsp     = hregAMD64_RSP();
   4232       HReg        vHi     = newVRegV(env);
   4233       HReg        vLo     = newVRegV(env);
   4234       AMD64AMode* m8_rsp  = AMD64AMode_IR(-8, rsp);
   4235       AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   4236       /* arg1 is the most significant (Q3), arg4 the least (Q0) */
   4237       /* Get all the args into regs, before messing with the stack. */
   4238       AMD64RI* q3  = iselIntExpr_RI(env, e->Iex.Qop.details->arg1);
   4239       AMD64RI* q2  = iselIntExpr_RI(env, e->Iex.Qop.details->arg2);
   4240       AMD64RI* q1  = iselIntExpr_RI(env, e->Iex.Qop.details->arg3);
   4241       AMD64RI* q0  = iselIntExpr_RI(env, e->Iex.Qop.details->arg4);
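              /* Each 128-bit half is assembled with two 64-bit stores just
                 below %rsp followed by one 16-byte load from -16(%rsp):
                 that load picks up the qword at -16 as bits 63:0 and the
                 qword at -8 as bits 127:64. */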
   4242       /* less significant lane (Q2) at the lower address (-16(rsp)) */
   4243       addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q3, m8_rsp));
   4244       addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q2, m16_rsp));
   4245       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, m16_rsp));
   4246       /* and then the lower half .. */
   4247       addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q1, m8_rsp));
   4248       addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q0, m16_rsp));
   4249       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, m16_rsp));
   4250       *rHi = vHi;
   4251       *rLo = vLo;
   4252       return;
   4253    }
   4254 
   4255    if (e->tag == Iex_ITE) {
   4256       HReg r1Hi, r1Lo, r0Hi, r0Lo;
   4257       iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
   4258       iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
   4259       HReg dstHi = newVRegV(env);
   4260       HReg dstLo = newVRegV(env);
   4261       addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
   4262       addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
   4263       AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
   4264       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
   4265       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
   4266       *rHi = dstHi;
   4267       *rLo = dstLo;
   4268       return;
   4269    }
   4270 
   4271    //avx_fail:
   4272    vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
   4273               LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
   4274    ppIRExpr(e);
   4275    vpanic("iselDVecExpr_wrk");
   4276 }
   4277 
   4278 
   4279 /*---------------------------------------------------------*/
   4280 /*--- ISEL: Statements                                  ---*/
   4281 /*---------------------------------------------------------*/
   4282 
   4283 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
   4284 {
   4285    if (vex_traceflags & VEX_TRACE_VCODE) {
   4286       vex_printf("\n-- ");
   4287       ppIRStmt(stmt);
   4288       vex_printf("\n");
   4289    }
   4290 
   4291    switch (stmt->tag) {
   4292 
   4293    /* --------- LOADG (guarded load) --------- */
   4294    case Ist_LoadG: {
   4295       IRLoadG* lg = stmt->Ist.LoadG.details;
   4296       if (lg->end != Iend_LE)
   4297          goto stmt_fail;
   4298 
   4299       UChar szB = 0; /* invalid */
   4300       switch (lg->cvt) {
   4301          case ILGop_Ident32: szB = 4; break;
   4302          case ILGop_Ident64: szB = 8; break;
   4303          default: break;
   4304       }
   4305       if (szB == 0)
   4306          goto stmt_fail;
   4307 
   4308       AMD64AMode* amAddr = iselIntExpr_AMode(env, lg->addr);
   4309       HReg rAlt  = iselIntExpr_R(env, lg->alt);
   4310       HReg rDst  = lookupIRTemp(env, lg->dst);
   4311       /* Get the alt value into the dst.  We'll do a conditional load
   4312          which overwrites it -- or not -- with loaded data. */
   4313       addInstr(env, mk_iMOVsd_RR(rAlt, rDst));
   4314       AMD64CondCode cc = iselCondCode(env, lg->guard);
   4315       addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst));
   4316       return;
   4317    }
   4318 
   4319    /* --------- STOREG (guarded store) --------- */
   4320    case Ist_StoreG: {
   4321       IRStoreG* sg = stmt->Ist.StoreG.details;
   4322       if (sg->end != Iend_LE)
   4323          goto stmt_fail;
   4324 
   4325       UChar szB = 0; /* invalid */
   4326       switch (typeOfIRExpr(env->type_env, sg->data)) {
   4327          case Ity_I32: szB = 4; break;
   4328          case Ity_I64: szB = 8; break;
   4329          default: break;
   4330       }
   4331       if (szB == 0)
   4332          goto stmt_fail;
   4333 
   4334       AMD64AMode*   amAddr = iselIntExpr_AMode(env, sg->addr);
   4335       HReg          rSrc   = iselIntExpr_R(env, sg->data);
   4336       AMD64CondCode cc     = iselCondCode(env, sg->guard);
   4337       addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr));
   4338       return;
   4339    }
   4340 
   4341    /* --------- STORE --------- */
   4342    case Ist_Store: {
   4343       IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
   4344       IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
   4345       IREndness end   = stmt->Ist.Store.end;
   4346 
   4347       if (tya != Ity_I64 || end != Iend_LE)
   4348          goto stmt_fail;
   4349 
   4350       if (tyd == Ity_I64) {
   4351          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   4352          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
   4353          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
   4354          return;
   4355       }
   4356       if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
   4357          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   4358          HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
   4359          addInstr(env, AMD64Instr_Store(
   4360                           toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
   4361                           r,am));
   4362          return;
   4363       }
   4364       if (tyd == Ity_F64) {
   4365          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   4366          HReg r = iselDblExpr(env, stmt->Ist.Store.data);
   4367          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
   4368          return;
   4369       }
   4370       if (tyd == Ity_F32) {
   4371          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   4372          HReg r = iselFltExpr(env, stmt->Ist.Store.data);
   4373          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
   4374          return;
   4375       }
   4376       if (tyd == Ity_V128) {
   4377          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   4378          HReg r = iselVecExpr(env, stmt->Ist.Store.data);
   4379          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
   4380          return;
   4381       }
   4382       if (tyd == Ity_V256) {
   4383          HReg        rA   = iselIntExpr_R(env, stmt->Ist.Store.addr);
   4384          AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
   4385          AMD64AMode* am16 = AMD64AMode_IR(16, rA);
   4386          HReg vHi, vLo;
   4387          iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
   4388          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
   4389          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
   4390          return;
   4391       }
   4392       break;
   4393    }
   4394 
   4395    /* --------- PUT --------- */
   4396    case Ist_Put: {
   4397       IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
   4398       if (ty == Ity_I64) {
   4399          /* We're going to write to memory, so compute the RHS into an
   4400             AMD64RI. */
   4401          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
   4402          addInstr(env,
   4403                   AMD64Instr_Alu64M(
   4404                      Aalu_MOV,
   4405                      ri,
   4406                      AMD64AMode_IR(stmt->Ist.Put.offset,
   4407                                    hregAMD64_RBP())
   4408                  ));
   4409          return;
   4410       }
   4411       if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
   4412          HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
   4413          addInstr(env, AMD64Instr_Store(
   4414                           toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
   4415                           r,
   4416                           AMD64AMode_IR(stmt->Ist.Put.offset,
   4417                                         hregAMD64_RBP())));
   4418          return;
   4419       }
   4420       if (ty == Ity_F32) {
   4421          HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
   4422          AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
   4423          set_SSE_rounding_default(env); /* paranoia */
   4424          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
   4425          return;
   4426       }
   4427       if (ty == Ity_F64) {
   4428          HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
   4429          AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
   4430                                          hregAMD64_RBP() );
   4431          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
   4432          return;
   4433       }
   4434       if (ty == Ity_V128) {
   4435          HReg        vec = iselVecExpr(env, stmt->Ist.Put.data);
   4436          AMD64AMode* am  = AMD64AMode_IR(stmt->Ist.Put.offset,
   4437                                          hregAMD64_RBP());
   4438          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
   4439          return;
   4440       }
   4441       if (ty == Ity_V256) {
   4442          HReg vHi, vLo;
   4443          iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
   4444          HReg        rbp  = hregAMD64_RBP();
   4445          AMD64AMode* am0  = AMD64AMode_IR(stmt->Ist.Put.offset + 0,  rbp);
   4446          AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
   4447          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
   4448          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
   4449          return;
   4450       }
   4451       break;
   4452    }
   4453 
   4454    /* --------- Indexed PUT --------- */
   4455    case Ist_PutI: {
   4456       IRPutI *puti = stmt->Ist.PutI.details;
   4457 
   4458       AMD64AMode* am
   4459          = genGuestArrayOffset(
   4460               env, puti->descr,
   4461                    puti->ix, puti->bias );
   4462 
   4463       IRType ty = typeOfIRExpr(env->type_env, puti->data);
   4464       if (ty == Ity_F64) {
   4465          HReg val = iselDblExpr(env, puti->data);
   4466          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
   4467          return;
   4468       }
   4469       if (ty == Ity_I8) {
   4470          HReg r = iselIntExpr_R(env, puti->data);
   4471          addInstr(env, AMD64Instr_Store( 1, r, am ));
   4472          return;
   4473       }
   4474       if (ty == Ity_I64) {
   4475          AMD64RI* ri = iselIntExpr_RI(env, puti->data);
   4476          addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
   4477          return;
   4478       }
   4479       break;
   4480    }
   4481 
   4482    /* --------- TMP --------- */
   4483    case Ist_WrTmp: {
   4484       IRTemp tmp = stmt->Ist.WrTmp.tmp;
   4485       IRType ty = typeOfIRTemp(env->type_env, tmp);
   4486 
   4487       /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
   4488          compute it into an AMode and then use LEA.  This usually
   4489          produces fewer instructions, often because (for memcheck
   4490          created IR) we get t = address-expression, (t is later used
   4491          twice) and so doing this naturally turns address-expression
   4492          back into an AMD64 amode. */
   4493       if (ty == Ity_I64
   4494           && stmt->Ist.WrTmp.data->tag == Iex_Binop
   4495           && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
   4496          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
   4497          HReg dst = lookupIRTemp(env, tmp);
   4498          if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
   4499             /* Hmm, iselIntExpr_AMode wimped out and just computed the
   4500                value into a register.  Just emit a normal reg-reg move
   4501                so reg-alloc can coalesce it away in the usual way. */
   4502             HReg src = am->Aam.IR.reg;
   4503             addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
   4504          } else {
   4505             addInstr(env, AMD64Instr_Lea64(am,dst));
   4506          }
   4507          return;
   4508       }
   4509 
   4510       if (ty == Ity_I64 || ty == Ity_I32
   4511           || ty == Ity_I16 || ty == Ity_I8) {
   4512          AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
   4513          HReg dst = lookupIRTemp(env, tmp);
   4514          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
   4515          return;
   4516       }
   4517       if (ty == Ity_I128) {
   4518          HReg rHi, rLo, dstHi, dstLo;
   4519          iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
   4520          lookupIRTempPair( &dstHi, &dstLo, env, tmp);
   4521          addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
   4522          addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
   4523          return;
   4524       }
   4525       if (ty == Ity_I1) {
   4526          AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
   4527          HReg dst = lookupIRTemp(env, tmp);
   4528          addInstr(env, AMD64Instr_Set64(cond, dst));
   4529          return;
   4530       }
   4531       if (ty == Ity_F64) {
   4532          HReg dst = lookupIRTemp(env, tmp);
   4533          HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
   4534          addInstr(env, mk_vMOVsd_RR(src, dst));
   4535          return;
   4536       }
   4537       if (ty == Ity_F32) {
   4538          HReg dst = lookupIRTemp(env, tmp);
   4539          HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
   4540          addInstr(env, mk_vMOVsd_RR(src, dst));
   4541          return;
   4542       }
   4543       if (ty == Ity_V128) {
   4544          HReg dst = lookupIRTemp(env, tmp);
   4545          HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
   4546          addInstr(env, mk_vMOVsd_RR(src, dst));
   4547          return;
   4548       }
   4549       if (ty == Ity_V256) {
   4550          HReg rHi, rLo, dstHi, dstLo;
   4551          iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
   4552          lookupIRTempPair( &dstHi, &dstLo, env, tmp);
   4553          addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
   4554          addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
   4555          return;
   4556       }
   4557       break;
   4558    }
   4559 
   4560    /* --------- Call to DIRTY helper --------- */
   4561    case Ist_Dirty: {
   4562       IRDirty* d = stmt->Ist.Dirty.details;
   4563 
   4564       /* Figure out the return type, if any. */
   4565       IRType retty = Ity_INVALID;
   4566       if (d->tmp != IRTemp_INVALID)
   4567          retty = typeOfIRTemp(env->type_env, d->tmp);
   4568 
   4569       /* Throw out any return types we don't know about. */
   4570       Bool retty_ok = False;
   4571       switch (retty) {
   4572          case Ity_INVALID: /* function doesn't return anything */
   4573          case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
   4574          case Ity_V128: case Ity_V256:
   4575             retty_ok = True; break;
   4576          default:
   4577             break;
   4578       }
   4579       if (!retty_ok)
   4580          break; /* will go to stmt_fail: */
   4581 
   4582       /* Marshal args, do the call, and set the return value to
   4583          0x555..555 if this is a conditional call that returns a value
   4584          and the call is skipped. */
   4585       UInt   addToSp = 0;
   4586       RetLoc rloc    = mk_RetLoc_INVALID();
   4587       doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
   4588       vassert(is_sane_RetLoc(rloc));
   4589 
   4590       /* Now figure out what to do with the returned value, if any. */
   4591       switch (retty) {
   4592          case Ity_INVALID: {
   4593             /* No return value.  Nothing to do. */
   4594             vassert(d->tmp == IRTemp_INVALID);
   4595             vassert(rloc.pri == RLPri_None);
   4596             vassert(addToSp == 0);
   4597             return;
   4598          }
   4599          case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
   4600             /* The returned value is in %rax.  Park it in the register
   4601                associated with tmp. */
   4602             vassert(rloc.pri == RLPri_Int);
   4603             vassert(addToSp == 0);
   4604             HReg dst = lookupIRTemp(env, d->tmp);
   4605             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
   4606             return;
   4607          }
   4608          case Ity_V128: {
   4609             /* The returned value is on the stack, and rloc.spOff
   4610                tells us where.  Fish it off the stack and then move
   4611                the stack pointer upwards to clear it, as directed by
   4612                doHelperCall. */
   4613             vassert(rloc.pri == RLPri_V128SpRel);
   4614             vassert(addToSp >= 16);
   4615             HReg        dst = lookupIRTemp(env, d->tmp);
   4616             AMD64AMode* am  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
   4617             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
   4618             add_to_rsp(env, addToSp);
   4619             return;
   4620          }
   4621          case Ity_V256: {
   4622             /* See comments for Ity_V128. */
   4623             vassert(rloc.pri == RLPri_V256SpRel);
   4624             vassert(addToSp >= 32);
   4625             HReg        dstLo, dstHi;
   4626             lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
   4627             AMD64AMode* amLo  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
   4628             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
   4629             AMD64AMode* amHi  = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
   4630             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
   4631             add_to_rsp(env, addToSp);
   4632             return;
   4633          }
   4634          default:
   4635             /*NOTREACHED*/
   4636             vassert(0);
   4637       }
   4638       break;
   4639    }
   4640 
   4641    /* --------- MEM FENCE --------- */
   4642    case Ist_MBE:
   4643       switch (stmt->Ist.MBE.event) {
   4644          case Imbe_Fence:
   4645             addInstr(env, AMD64Instr_MFence());
   4646             return;
   4647          default:
   4648             break;
   4649       }
   4650       break;
   4651 
   4652    /* --------- ACAS --------- */
   4653    case Ist_CAS:
   4654       if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
   4655          /* "normal" singleton CAS */
   4656          UChar  sz;
   4657          IRCAS* cas = stmt->Ist.CAS.details;
   4658          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
   4659          /* get: cas->expd into %rax, and cas->data into %rbx */
   4660          AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
   4661          HReg rData = iselIntExpr_R(env, cas->dataLo);
   4662          HReg rExpd = iselIntExpr_R(env, cas->expdLo);
   4663          HReg rOld  = lookupIRTemp(env, cas->oldLo);
   4664          vassert(cas->expdHi == NULL);
   4665          vassert(cas->dataHi == NULL);
   4666          addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
   4667          addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
   4668          addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
   4669          switch (ty) {
   4670             case Ity_I64: sz = 8; break;
   4671             case Ity_I32: sz = 4; break;
   4672             case Ity_I16: sz = 2; break;
   4673             case Ity_I8:  sz = 1; break;
   4674             default: goto unhandled_cas;
   4675          }
   4676          addInstr(env, AMD64Instr_ACAS(am, sz));
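                 /* cmpxchg sets Z on success.  On failure %rax holds the
                    value actually seen in memory, so copy that into rOld;
                    on success rOld already holds rExpd, which equals the
                    old value. */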
   4677          addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld));
   4678          return;
   4679       } else {
   4680          /* double CAS */
   4681          UChar  sz;
   4682          IRCAS* cas = stmt->Ist.CAS.details;
   4683          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
   4684          /* only 32-bit and 64-bit allowed in this case */
   4685          /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
   4686          /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
   4687          AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
   4688          HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
   4689          HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
   4690          HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
   4691          HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
   4692          HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
   4693          HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
   4694          switch (ty) {
   4695             case Ity_I64:
   4696                if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
   4697                   goto unhandled_cas; /* we'd have to generate
   4698                                          cmpxchg16b, but the host
   4699                                          doesn't support that */
   4700                sz = 8;
   4701                break;
   4702             case Ity_I32:
   4703                sz = 4;
   4704                break;
   4705             default:
   4706                goto unhandled_cas;
   4707          }
   4708          addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
   4709          addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
   4710          addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
   4711          addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
   4712          addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
   4713          addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
   4714          addInstr(env, AMD64Instr_DACAS(am, sz));
   4715          addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi));
   4716          addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo));
   4717          return;
   4718       }
   4719       unhandled_cas:
   4720       break;
   4721 
   4722    /* --------- INSTR MARK --------- */
   4723    /* Doesn't generate any executable code ... */
   4724    case Ist_IMark:
   4725        return;
   4726 
   4727    /* --------- ABI HINT --------- */
   4728    /* These have no meaning (denotation in the IR) and so we ignore
   4729       them ... if any actually made it this far. */
   4730    case Ist_AbiHint:
   4731        return;
   4732 
   4733    /* --------- NO-OP --------- */
   4734    case Ist_NoOp:
   4735        return;
   4736 
   4737    /* --------- EXIT --------- */
   4738    case Ist_Exit: {
   4739       if (stmt->Ist.Exit.dst->tag != Ico_U64)
   4740          vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
   4741 
   4742       AMD64CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
   4743       AMD64AMode*   amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
   4744                                           hregAMD64_RBP());
   4745 
   4746       /* Case: boring transfer to known address */
   4747       if (stmt->Ist.Exit.jk == Ijk_Boring) {
   4748          if (env->chainingAllowed) {
   4749             /* .. almost always true .. */
   4750             /* Skip the event check at the dst if this is a forwards
   4751                edge. */
   4752             Bool toFastEP
   4753                = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
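                    /* Rationale (sketch): a target beyond this block's
                       highest guest address cannot be a jump back into it,
                       so it is treated as a forwards edge and the
                       destination's event check may be skipped, letting
                       XDirect chain to the fast entry point. */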
   4754             if (0) vex_printf("%s", toFastEP ? "Y" : ",");
   4755             addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
   4756                                              amRIP, cc, toFastEP));
   4757          } else {
   4758             /* .. very occasionally .. */
   4759             /* We can't use chaining, so ask for an assisted transfer,
   4760                as that's the only alternative that is allowable. */
   4761             HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
   4762             addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
   4763          }
   4764          return;
   4765       }
   4766 
   4767       /* Case: assisted transfer to arbitrary address */
   4768       switch (stmt->Ist.Exit.jk) {
   4769          /* Keep this list in sync with that in iselNext below */
   4770          case Ijk_ClientReq:
   4771          case Ijk_EmWarn:
   4772          case Ijk_NoDecode:
   4773          case Ijk_NoRedir:
   4774          case Ijk_SigSEGV:
   4775          case Ijk_SigTRAP:
   4776          case Ijk_Sys_syscall:
   4777          case Ijk_InvalICache:
   4778          case Ijk_Yield:
   4779          {
   4780             HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
   4781             addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
   4782             return;
   4783          }
   4784          default:
   4785             break;
   4786       }
   4787 
   4788       /* Do we ever expect to see any other kind? */
   4789       goto stmt_fail;
   4790    }
   4791 
   4792    default: break;
   4793    }
   4794   stmt_fail:
   4795    ppIRStmt(stmt);
   4796    vpanic("iselStmt(amd64)");
   4797 }
   4798 
   4799 
   4800 /*---------------------------------------------------------*/
   4801 /*--- ISEL: Basic block terminators (Nexts)             ---*/
   4802 /*---------------------------------------------------------*/
   4803 
   4804 static void iselNext ( ISelEnv* env,
   4805                        IRExpr* next, IRJumpKind jk, Int offsIP )
   4806 {
   4807    if (vex_traceflags & VEX_TRACE_VCODE) {
   4808       vex_printf( "\n-- PUT(%d) = ", offsIP);
   4809       ppIRExpr( next );
   4810       vex_printf( "; exit-");
   4811       ppIRJumpKind(jk);
   4812       vex_printf( "\n");
   4813    }
   4814 
   4815    /* Case: boring transfer to known address */
   4816    if (next->tag == Iex_Const) {
   4817       IRConst* cdst = next->Iex.Const.con;
   4818       vassert(cdst->tag == Ico_U64);
   4819       if (jk == Ijk_Boring || jk == Ijk_Call) {
   4820          /* Boring transfer to known address */
   4821          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
   4822          if (env->chainingAllowed) {
   4823             /* .. almost always true .. */
   4824             /* Skip the event check at the dst if this is a forwards
   4825                edge. */
   4826             Bool toFastEP
   4827                = ((Addr64)cdst->Ico.U64) > env->max_ga;
   4828             if (0) vex_printf("%s", toFastEP ? "X" : ".");
   4829             addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
   4830                                              amRIP, Acc_ALWAYS,
   4831                                              toFastEP));
   4832          } else {
   4833             /* .. very occasionally .. */
   4834             /* We can't use chaining, so ask for an assisted transfer,
   4835                as that's the only alternative that is
   4836                allowable. */
   4837             HReg r = iselIntExpr_R(env, next);
   4838             addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
   4839                                                Ijk_Boring));
   4840          }
   4841          return;
   4842       }
   4843    }
   4844 
   4845    /* Case: call/return (==boring) transfer to any address */
   4846    switch (jk) {
   4847       case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
   4848          HReg        r     = iselIntExpr_R(env, next);
   4849          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
   4850          if (env->chainingAllowed) {
   4851             addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
   4852          } else {
   4853             addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
   4854                                                Ijk_Boring));
   4855          }
   4856          return;
   4857       }
   4858       default:
   4859          break;
   4860    }
   4861 
   4862    /* Case: assisted transfer to arbitrary address */
   4863    switch (jk) {
   4864       /* Keep this list in sync with that for Ist_Exit above */
   4865       case Ijk_ClientReq:
   4866       case Ijk_EmWarn:
   4867       case Ijk_NoDecode:
   4868       case Ijk_NoRedir:
   4869       case Ijk_SigSEGV:
   4870       case Ijk_SigTRAP:
   4871       case Ijk_Sys_syscall:
   4872       case Ijk_InvalICache:
   4873       case Ijk_Yield: {
   4874          HReg        r     = iselIntExpr_R(env, next);
   4875          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
   4876          addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
   4877          return;
   4878       }
   4879       default:
   4880          break;
   4881    }
   4882 
   4883    vex_printf( "\n-- PUT(%d) = ", offsIP);
   4884    ppIRExpr( next );
   4885    vex_printf( "; exit-");
   4886    ppIRJumpKind(jk);
   4887    vex_printf( "\n");
   4888    vassert(0); // are we expecting any other kind?
   4889 }
   4890 
   4891 
   4892 /*---------------------------------------------------------*/
   4893 /*--- Insn selector top-level                           ---*/
   4894 /*---------------------------------------------------------*/
   4895 
   4896 /* Translate an entire SB to amd64 code. */
   4897 
   4898 HInstrArray* iselSB_AMD64 ( const IRSB* bb,
   4899                             VexArch      arch_host,
   4900                             const VexArchInfo* archinfo_host,
   4901                             const VexAbiInfo*  vbi/*UNUSED*/,
   4902                             Int offs_Host_EvC_Counter,
   4903                             Int offs_Host_EvC_FailAddr,
   4904                             Bool chainingAllowed,
   4905                             Bool addProfInc,
   4906                             Addr max_ga )
   4907 {
   4908    Int        i, j;
   4909    HReg       hreg, hregHI;
   4910    ISelEnv*   env;
   4911    UInt       hwcaps_host = archinfo_host->hwcaps;
   4912    AMD64AMode *amCounter, *amFailAddr;
   4913 
   4914    /* sanity ... */
   4915    vassert(arch_host == VexArchAMD64);
   4916    vassert(0 == (hwcaps_host
   4917                  & ~(VEX_HWCAPS_AMD64_SSE3
   4918                      | VEX_HWCAPS_AMD64_CX16
   4919                      | VEX_HWCAPS_AMD64_LZCNT
   4920                      | VEX_HWCAPS_AMD64_AVX
   4921                      | VEX_HWCAPS_AMD64_RDTSCP
   4922                      | VEX_HWCAPS_AMD64_BMI
   4923                      | VEX_HWCAPS_AMD64_AVX2)));
   4924 
   4925    /* Check that the host's endianness is as expected. */
   4926    vassert(archinfo_host->endness == VexEndnessLE);
   4927 
   4928    /* Make up an initial environment to use. */
   4929    env = LibVEX_Alloc_inline(sizeof(ISelEnv));
   4930    env->vreg_ctr = 0;
   4931 
   4932    /* Set up output code array. */
   4933    env->code = newHInstrArray();
   4934 
   4935    /* Copy BB's type env. */
   4936    env->type_env = bb->tyenv;
   4937 
   4938    /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
   4939       change as we go along. */
   4940    env->n_vregmap = bb->tyenv->types_used;
   4941    env->vregmap   = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
   4942    env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg));
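           /* vregmapHI is used only for types that need a register pair:
              Ity_I128 (two Int64 vregs) and Ity_V256 (two Vec128 vregs);
              for everything else it stays INVALID_HREG. */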
   4943 
   4944    /* and finally ... */
   4945    env->chainingAllowed = chainingAllowed;
   4946    env->hwcaps          = hwcaps_host;
   4947    env->max_ga          = max_ga;
   4948 
   4949    /* For each IR temporary, allocate a suitably-kinded virtual
   4950       register. */
   4951    j = 0;
   4952    for (i = 0; i < env->n_vregmap; i++) {
   4953       hregHI = hreg = INVALID_HREG;
   4954       switch (bb->tyenv->types[i]) {
   4955          case Ity_I1:
   4956          case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
   4957             hreg = mkHReg(True, HRcInt64, 0, j++);
   4958             break;
   4959          case Ity_I128:
   4960             hreg   = mkHReg(True, HRcInt64, 0, j++);
   4961             hregHI = mkHReg(True, HRcInt64, 0, j++);
   4962             break;
   4963          case Ity_F32:
   4964          case Ity_F64:
   4965          case Ity_V128:
   4966             hreg = mkHReg(True, HRcVec128, 0, j++);
   4967             break;
   4968          case Ity_V256:
   4969             hreg   = mkHReg(True, HRcVec128, 0, j++);
   4970             hregHI = mkHReg(True, HRcVec128, 0, j++);
   4971             break;
   4972          default:
   4973             ppIRType(bb->tyenv->types[i]);
   4974             vpanic("iselBB(amd64): IRTemp type");
   4975       }
   4976       env->vregmap[i]   = hreg;
   4977       env->vregmapHI[i] = hregHI;
   4978    }
   4979    env->vreg_ctr = j;
   4980 
   4981    /* The very first instruction must be an event check. */
   4982    amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
   4983    amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
   4984    addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
   4985 
   4986    /* Possibly a block counter increment (for profiling).  At this
   4987       point we don't know the address of the counter, so just pretend
   4988       it is zero.  It will have to be patched later, but before this
   4989       translation is used, by a call to LibVEX_PatchProfInc. */
   4990    if (addProfInc) {
   4991       addInstr(env, AMD64Instr_ProfInc());
   4992    }
   4993 
   4994    /* Ok, finally we can iterate over the statements. */
   4995    for (i = 0; i < bb->stmts_used; i++)
   4996       if (bb->stmts[i])
   4997          iselStmt(env, bb->stmts[i]);
   4998 
   4999    iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
   5000 
   5001    /* record the number of vregs we used. */
   5002    env->code->n_vregs = env->vreg_ctr;
   5003    return env->code;
   5004 }
   5005 
   5006 
   5007 /*---------------------------------------------------------------*/
   5008 /*--- end                                   host_amd64_isel.c ---*/
   5009 /*---------------------------------------------------------------*/
   5010