      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                                 host_amd64_isel.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2013 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 #include "libvex_basictypes.h"
     37 #include "libvex_ir.h"
     38 #include "libvex.h"
     39 
     40 #include "ir_match.h"
     41 #include "main_util.h"
     42 #include "main_globals.h"
     43 #include "host_generic_regs.h"
     44 #include "host_generic_simd64.h"
     45 #include "host_generic_simd128.h"
     46 #include "host_generic_simd256.h"
     47 #include "host_generic_maddf.h"
     48 #include "host_amd64_defs.h"
     49 
     50 
     51 /*---------------------------------------------------------*/
     52 /*--- x87/SSE control word stuff                        ---*/
     53 /*---------------------------------------------------------*/
     54 
     55 /* Vex-generated code expects to run with the FPU set as follows: all
     56    exceptions masked, round-to-nearest, precision = 53 bits.  This
     57    corresponds to a FPU control word value of 0x027F.
     58 
     59    Similarly the SSE control word (%mxcsr) should be 0x1F80.
     60 
      61    %fpucw and %mxcsr should have these values on entry to
      62    Vex-generated code, and those values should be unchanged
      63    at exit.
     64 */
     65 
     66 #define DEFAULT_FPUCW 0x027F
     67 
     68 #define DEFAULT_MXCSR 0x1F80
     69 
     70 /* debugging only, do not use */
     71 /* define DEFAULT_FPUCW 0x037F */
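
/* For reference (a reading aid, not used by the code): in the x87
   control word 0x027F, the low bits mask all six FP exceptions,
   bits 8..9 = 10b select 53-bit (double) precision, and bits
   10..11 = 00b select round-to-nearest.  In %mxcsr 0x1F80, bits
   7..12 mask all six SSE exceptions and bits 13..14 = 00b select
   round-to-nearest. */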
     72 
     73 
     74 /*---------------------------------------------------------*/
     75 /*--- misc helpers                                      ---*/
     76 /*---------------------------------------------------------*/
     77 
     78 /* These are duplicated in guest-amd64/toIR.c */
     79 static IRExpr* unop ( IROp op, IRExpr* a )
     80 {
     81    return IRExpr_Unop(op, a);
     82 }
     83 
     84 static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
     85 {
     86    return IRExpr_Binop(op, a1, a2);
     87 }
     88 
     89 static IRExpr* bind ( Int binder )
     90 {
     91    return IRExpr_Binder(binder);
     92 }
     93 
     94 static Bool isZeroU8 ( IRExpr* e )
     95 {
     96    return e->tag == Iex_Const
     97           && e->Iex.Const.con->tag == Ico_U8
     98           && e->Iex.Const.con->Ico.U8 == 0;
     99 }
    100 
    101 
    102 /*---------------------------------------------------------*/
    103 /*--- ISelEnv                                           ---*/
    104 /*---------------------------------------------------------*/
    105 
    106 /* This carries around:
    107 
    108    - A mapping from IRTemp to IRType, giving the type of any IRTemp we
    109      might encounter.  This is computed before insn selection starts,
    110      and does not change.
    111 
    112    - A mapping from IRTemp to HReg.  This tells the insn selector
    113      which virtual register is associated with each IRTemp
    114      temporary.  This is computed before insn selection starts, and
    115      does not change.  We expect this mapping to map precisely the
    116      same set of IRTemps as the type mapping does.
    117 
    118         - vregmap   holds the primary register for the IRTemp.
    119         - vregmapHI is only used for 128-bit integer-typed
    120              IRTemps.  It holds the identity of a second
    121              64-bit virtual HReg, which holds the high half
    122              of the value.
    123 
    124    - The host subarchitecture we are selecting insns for.
    125      This is set at the start and does not change.
    126 
    127    - The code array, that is, the insns selected so far.
    128 
    129    - A counter, for generating new virtual registers.
    130 
    131    - A Bool for indicating whether we may generate chain-me
    132      instructions for control flow transfers, or whether we must use
    133      XAssisted.
    134 
    135    - The maximum guest address of any guest insn in this block.
    136      Actually, the address of the highest-addressed byte from any insn
    137      in this block.  Is set at the start and does not change.  This is
    138      used for detecting jumps which are definitely forward-edges from
    139      this block, and therefore can be made (chained) to the fast entry
    140      point of the destination, thereby avoiding the destination's
    141      event check.
    142 
    143    Note, this is all host-independent.  (JRS 20050201: well, kinda
    144    ... not completely.  Compare with ISelEnv for X86.)
    145 */
    146 
    147 typedef
    148    struct {
     149       /* Constants -- set at the start and do not change. */
    150       IRTypeEnv*   type_env;
    151 
    152       HReg*        vregmap;
    153       HReg*        vregmapHI;
    154       Int          n_vregmap;
    155 
    156       UInt         hwcaps;
    157 
    158       Bool         chainingAllowed;
    159       Addr64       max_ga;
    160 
    161       /* These are modified as we go along. */
    162       HInstrArray* code;
    163       Int          vreg_ctr;
    164    }
    165    ISelEnv;
    166 
    167 
    168 static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
    169 {
    170    vassert(tmp >= 0);
    171    vassert(tmp < env->n_vregmap);
    172    return env->vregmap[tmp];
    173 }
    174 
    175 static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
    176                                ISelEnv* env, IRTemp tmp )
    177 {
    178    vassert(tmp >= 0);
    179    vassert(tmp < env->n_vregmap);
    180    vassert(! hregIsInvalid(env->vregmapHI[tmp]));
    181    *vrLO = env->vregmap[tmp];
    182    *vrHI = env->vregmapHI[tmp];
    183 }
    184 
    185 static void addInstr ( ISelEnv* env, AMD64Instr* instr )
    186 {
    187    addHInstr(env->code, instr);
    188    if (vex_traceflags & VEX_TRACE_VCODE) {
    189       ppAMD64Instr(instr, True);
    190       vex_printf("\n");
    191    }
    192 }
    193 
    194 static HReg newVRegI ( ISelEnv* env )
    195 {
    196    HReg reg = mkHReg(env->vreg_ctr, HRcInt64, True/*virtual reg*/);
    197    env->vreg_ctr++;
    198    return reg;
    199 }
    200 
    201 static HReg newVRegV ( ISelEnv* env )
    202 {
    203    HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
    204    env->vreg_ctr++;
    205    return reg;
    206 }
    207 
    208 
    209 /*---------------------------------------------------------*/
    210 /*--- ISEL: Forward declarations                        ---*/
    211 /*---------------------------------------------------------*/
    212 
    213 /* These are organised as iselXXX and iselXXX_wrk pairs.  The
    214    iselXXX_wrk do the real work, but are not to be called directly.
    215    For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
    216    checks that all returned registers are virtual.  You should not
    217    call the _wrk version directly.
    218 */
    219 static AMD64RMI*     iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
    220 static AMD64RMI*     iselIntExpr_RMI     ( ISelEnv* env, IRExpr* e );
    221 
    222 static AMD64RI*      iselIntExpr_RI_wrk  ( ISelEnv* env, IRExpr* e );
    223 static AMD64RI*      iselIntExpr_RI      ( ISelEnv* env, IRExpr* e );
    224 
    225 static AMD64RM*      iselIntExpr_RM_wrk  ( ISelEnv* env, IRExpr* e );
    226 static AMD64RM*      iselIntExpr_RM      ( ISelEnv* env, IRExpr* e );
    227 
    228 static HReg          iselIntExpr_R_wrk   ( ISelEnv* env, IRExpr* e );
    229 static HReg          iselIntExpr_R       ( ISelEnv* env, IRExpr* e );
    230 
    231 static AMD64AMode*   iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
    232 static AMD64AMode*   iselIntExpr_AMode     ( ISelEnv* env, IRExpr* e );
    233 
    234 static void          iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
    235                                           ISelEnv* env, IRExpr* e );
    236 static void          iselInt128Expr     ( /*OUT*/HReg* rHi, HReg* rLo,
    237                                           ISelEnv* env, IRExpr* e );
    238 
    239 static AMD64CondCode iselCondCode_wrk    ( ISelEnv* env, IRExpr* e );
    240 static AMD64CondCode iselCondCode        ( ISelEnv* env, IRExpr* e );
    241 
    242 static HReg          iselDblExpr_wrk     ( ISelEnv* env, IRExpr* e );
    243 static HReg          iselDblExpr         ( ISelEnv* env, IRExpr* e );
    244 
    245 static HReg          iselFltExpr_wrk     ( ISelEnv* env, IRExpr* e );
    246 static HReg          iselFltExpr         ( ISelEnv* env, IRExpr* e );
    247 
    248 static HReg          iselVecExpr_wrk     ( ISelEnv* env, IRExpr* e );
    249 static HReg          iselVecExpr         ( ISelEnv* env, IRExpr* e );
    250 
    251 static void          iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo,
    252                                         ISelEnv* env, IRExpr* e );
    253 static void          iselDVecExpr     ( /*OUT*/HReg* rHi, HReg* rLo,
    254                                         ISelEnv* env, IRExpr* e );
    255 
    256 
    257 /*---------------------------------------------------------*/
    258 /*--- ISEL: Misc helpers                                ---*/
    259 /*---------------------------------------------------------*/
    260 
    261 static Bool sane_AMode ( AMD64AMode* am )
    262 {
    263    switch (am->tag) {
    264       case Aam_IR:
    265          return
    266             toBool( hregClass(am->Aam.IR.reg) == HRcInt64
    267                     && (hregIsVirtual(am->Aam.IR.reg)
    268                         || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) );
    269       case Aam_IRRS:
    270          return
    271             toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
    272                     && hregIsVirtual(am->Aam.IRRS.base)
    273                     && hregClass(am->Aam.IRRS.index) == HRcInt64
    274                     && hregIsVirtual(am->Aam.IRRS.index) );
    275       default:
    276         vpanic("sane_AMode: unknown amd64 amode tag");
    277    }
    278 }
    279 
    280 
    281 /* Can the lower 32 bits be signedly widened to produce the whole
    282    64-bit value?  In other words, are the top 33 bits either all 0 or
    283    all 1 ? */
    284 static Bool fitsIn32Bits ( ULong x )
    285 {
    286    Long y0 = (Long)x;
    287    Long y1 = y0;
    288    y1 <<= 32;
    289    y1 >>=/*s*/ 32;
    290    return toBool(x == y1);
    291 }
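
/* Worked examples (a sanity sketch, not used by the code):
   fitsIn32Bits(0x000000007FFFFFFFULL) and
   fitsIn32Bits(0xFFFFFFFF80000000ULL) are True, since sign-extending
   the low 32 bits reproduces the value;
   fitsIn32Bits(0x0000000080000000ULL) is False, since sign extension
   would give 0xFFFFFFFF80000000. */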
    292 
    293 /* Is this a 64-bit zero expression? */
    294 
    295 static Bool isZeroU64 ( IRExpr* e )
    296 {
    297    return e->tag == Iex_Const
    298           && e->Iex.Const.con->tag == Ico_U64
    299           && e->Iex.Const.con->Ico.U64 == 0ULL;
    300 }
    301 
    302 static Bool isZeroU32 ( IRExpr* e )
    303 {
    304    return e->tag == Iex_Const
    305           && e->Iex.Const.con->tag == Ico_U32
    306           && e->Iex.Const.con->Ico.U32 == 0;
    307 }
    308 
     309 /* Make an int reg-reg move. */
    310 
    311 static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
    312 {
    313    vassert(hregClass(src) == HRcInt64);
    314    vassert(hregClass(dst) == HRcInt64);
    315    return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
    316 }
    317 
    318 /* Make a vector (128 bit) reg-reg move. */
    319 
    320 static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
    321 {
    322    vassert(hregClass(src) == HRcVec128);
    323    vassert(hregClass(dst) == HRcVec128);
    324    return AMD64Instr_SseReRg(Asse_MOV, src, dst);
    325 }
    326 
    327 /* Advance/retreat %rsp by n. */
    328 
    329 static void add_to_rsp ( ISelEnv* env, Int n )
    330 {
    331    vassert(n > 0 && n < 256 && (n%8) == 0);
    332    addInstr(env,
    333             AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
    334                                         hregAMD64_RSP()));
    335 }
    336 
    337 static void sub_from_rsp ( ISelEnv* env, Int n )
    338 {
    339    vassert(n > 0 && n < 256 && (n%8) == 0);
    340    addInstr(env,
    341             AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
    342                                         hregAMD64_RSP()));
    343 }
    344 
    345 /* Push 64-bit constants on the stack. */
    346 static void push_uimm64( ISelEnv* env, ULong uimm64 )
    347 {
    348    /* If uimm64 can be expressed as the sign extension of its
    349       lower 32 bits, we can do it the easy way. */
    350    Long simm64 = (Long)uimm64;
    351    if ( simm64 == ((simm64 << 32) >> 32) ) {
    352       addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
    353    } else {
    354       HReg tmp = newVRegI(env);
    355       addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
    356       addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
    357    }
    358 }
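
/* Illustration: push_uimm64(env, 0xFFFFFFFF80000000ULL) takes the easy
   path and emits a single pushq of a sign-extended 32-bit immediate,
   whereas push_uimm64(env, 0x123456789ULL) needs an Imm64 into a fresh
   vreg followed by a pushq of that reg. */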
    359 
    360 
    361 /* Used only in doHelperCall.  If possible, produce a single
    362    instruction which computes 'e' into 'dst'.  If not possible, return
    363    NULL. */
    364 
    365 static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
    366                                                     HReg     dst,
    367                                                     IRExpr*  e )
    368 {
    369    /* Per comments in doHelperCall below, appearance of
    370       Iex_VECRET implies ill-formed IR. */
    371    vassert(e->tag != Iex_VECRET);
    372 
    373    /* In this case we give out a copy of the BaseBlock pointer. */
    374    if (UNLIKELY(e->tag == Iex_BBPTR)) {
    375       return mk_iMOVsd_RR( hregAMD64_RBP(), dst );
    376    }
    377 
    378    vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
    379 
    380    if (e->tag == Iex_Const) {
    381       vassert(e->Iex.Const.con->tag == Ico_U64);
    382       if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
    383          return AMD64Instr_Alu64R(
    384                    Aalu_MOV,
    385                    AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
    386                    dst
    387                 );
    388       } else {
    389          return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
    390       }
    391    }
    392 
    393    if (e->tag == Iex_RdTmp) {
    394       HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
    395       return mk_iMOVsd_RR(src, dst);
    396    }
    397 
    398    if (e->tag == Iex_Get) {
    399       vassert(e->Iex.Get.ty == Ity_I64);
    400       return AMD64Instr_Alu64R(
    401                 Aalu_MOV,
    402                 AMD64RMI_Mem(
    403                    AMD64AMode_IR(e->Iex.Get.offset,
    404                                  hregAMD64_RBP())),
    405                 dst);
    406    }
    407 
    408    if (e->tag == Iex_Unop
    409        && e->Iex.Unop.op == Iop_32Uto64
    410        && e->Iex.Unop.arg->tag == Iex_RdTmp) {
    411       HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
    412       return AMD64Instr_MovxLQ(False, src, dst);
    413    }
    414 
    415    if (0) { ppIRExpr(e); vex_printf("\n"); }
    416 
    417    return NULL;
    418 }
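
/* In summary, the single-instruction cases handled above are: BBPTR
   (a copy of %rbp), I64 constants (whether or not they fit in 32 bits),
   RdTmp, Get:I64, and 32Uto64(RdTmp).  Anything else returns NULL and
   makes the caller (doHelperCall below) fall back to its slow scheme. */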
    419 
    420 
    421 /* Do a complete function call.  |guard| is a Ity_Bit expression
    422    indicating whether or not the call happens.  If guard==NULL, the
    423    call is unconditional.  |retloc| is set to indicate where the
    424    return value is after the call.  The caller (of this fn) must
    425    generate code to add |stackAdjustAfterCall| to the stack pointer
    426    after the call is done. */
    427 
    428 static
    429 void doHelperCall ( /*OUT*/UInt*   stackAdjustAfterCall,
    430                     /*OUT*/RetLoc* retloc,
    431                     ISelEnv* env,
    432                     IRExpr* guard,
    433                     IRCallee* cee, IRType retTy, IRExpr** args )
    434 {
    435    AMD64CondCode cc;
    436    HReg          argregs[6];
    437    HReg          tmpregs[6];
    438    AMD64Instr*   fastinstrs[6];
    439    UInt          n_args, i;
    440 
    441    /* Set default returns.  We'll update them later if needed. */
    442    *stackAdjustAfterCall = 0;
    443    *retloc               = mk_RetLoc_INVALID();
    444 
    445    /* These are used for cross-checking that IR-level constraints on
    446       the use of IRExpr_VECRET() and IRExpr_BBPTR() are observed. */
    447    UInt nVECRETs = 0;
    448    UInt nBBPTRs  = 0;
    449 
    450    /* Marshal args for a call and do the call.
    451 
    452       This function only deals with a tiny set of possibilities, which
    453       cover all helpers in practice.  The restrictions are that only
    454       arguments in registers are supported, hence only 6x64 integer
    455       bits in total can be passed.  In fact the only supported arg
    456       type is I64.
    457 
    458       The return type can be I{64,32,16,8} or V{128,256}.  In the
    459       latter two cases, it is expected that |args| will contain the
    460       special node IRExpr_VECRET(), in which case this routine
    461       generates code to allocate space on the stack for the vector
    462       return value.  Since we are not passing any scalars on the
    463       stack, it is enough to preallocate the return space before
    464       marshalling any arguments, in this case.
    465 
    466       |args| may also contain IRExpr_BBPTR(), in which case the
    467       value in %rbp is passed as the corresponding argument.
    468 
    469       Generating code which is both efficient and correct when
    470       parameters are to be passed in registers is difficult, for the
    471       reasons elaborated in detail in comments attached to
    472       doHelperCall() in priv/host-x86/isel.c.  Here, we use a variant
    473       of the method described in those comments.
    474 
    475       The problem is split into two cases: the fast scheme and the
    476       slow scheme.  In the fast scheme, arguments are computed
    477       directly into the target (real) registers.  This is only safe
    478       when we can be sure that computation of each argument will not
    479       trash any real registers set by computation of any other
    480       argument.
    481 
    482       In the slow scheme, all args are first computed into vregs, and
    483       once they are all done, they are moved to the relevant real
    484       regs.  This always gives correct code, but it also gives a bunch
    485       of vreg-to-rreg moves which are usually redundant but are hard
    486       for the register allocator to get rid of.
    487 
    488       To decide which scheme to use, all argument expressions are
    489       first examined.  If they are all so simple that it is clear they
    490       will be evaluated without use of any fixed registers, use the
    491       fast scheme, else use the slow scheme.  Note also that only
    492       unconditional calls may use the fast scheme, since having to
    493       compute a condition expression could itself trash real
    494       registers.  Note that for simplicity, in the case where
    495       IRExpr_VECRET() is present, we use the slow scheme.  This is
    496       motivated by the desire to avoid any possible complexity
    497       w.r.t. nested calls.
    498 
    499       Note this requires being able to examine an expression and
    500       determine whether or not evaluation of it might use a fixed
    501       register.  That requires knowledge of how the rest of this insn
    502       selector works.  Currently just the following 3 are regarded as
    503       safe -- hopefully they cover the majority of arguments in
     504    practice: IRExpr_Tmp, IRExpr_Const and IRExpr_Get.
    505    */
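   /* Illustration (hypothetical helper call, not taken from real IR):
      for foo(t3, GET:I64(16), 0x42:I64) every argument is computable
      into its target register by one instruction, so the fast scheme
      applies; foo(Add64(t3,t4), ...) is not, and falls back to the
      slow scheme. */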
    506 
    507    /* Note that the cee->regparms field is meaningless on AMD64 host
    508       (since there is only one calling convention) and so we always
    509       ignore it. */
    510    n_args = 0;
    511    for (i = 0; args[i]; i++)
    512       n_args++;
    513 
    514    if (n_args > 6)
    515       vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
    516 
    517    argregs[0] = hregAMD64_RDI();
    518    argregs[1] = hregAMD64_RSI();
    519    argregs[2] = hregAMD64_RDX();
    520    argregs[3] = hregAMD64_RCX();
    521    argregs[4] = hregAMD64_R8();
    522    argregs[5] = hregAMD64_R9();
    523 
    524    tmpregs[0] = tmpregs[1] = tmpregs[2] =
    525    tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
    526 
    527    fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
    528    fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
    529 
    530    /* First decide which scheme (slow or fast) is to be used.  First
    531       assume the fast scheme, and select slow if any contraindications
    532       (wow) appear. */
    533 
    534    /* We'll need space on the stack for the return value.  Avoid
    535       possible complications with nested calls by using the slow
    536       scheme. */
    537    if (retTy == Ity_V128 || retTy == Ity_V256)
    538       goto slowscheme;
    539 
    540    if (guard) {
    541       if (guard->tag == Iex_Const
    542           && guard->Iex.Const.con->tag == Ico_U1
    543           && guard->Iex.Const.con->Ico.U1 == True) {
    544          /* unconditional */
    545       } else {
    546          /* Not manifestly unconditional -- be conservative. */
    547          goto slowscheme;
    548       }
    549    }
    550 
    551    /* Ok, let's try for the fast scheme.  If it doesn't pan out, we'll
    552       use the slow scheme.  Because this is tentative, we can't call
     553       addInstr (that is, commit to) any instructions until we've
     554       handled all the arguments.  So park the resulting instructions
     555       in a buffer and emit them if we're successful. */
    556 
    557    /* FAST SCHEME */
    558    /* In this loop, we process args that can be computed into the
    559       destination (real) register with a single instruction, without
    560       using any fixed regs.  That also includes IRExpr_BBPTR(), but
    561       not IRExpr_VECRET().  Indeed, if the IR is well-formed, we can
    562       never see IRExpr_VECRET() at this point, since the return-type
    563       check above should ensure all those cases use the slow scheme
    564       instead. */
    565    vassert(n_args >= 0 && n_args <= 6);
    566    for (i = 0; i < n_args; i++) {
    567       IRExpr* arg = args[i];
    568       if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg))) {
    569          vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
    570       }
    571       fastinstrs[i]
    572          = iselIntExpr_single_instruction( env, argregs[i], args[i] );
    573       if (fastinstrs[i] == NULL)
    574          goto slowscheme;
    575    }
    576 
    577    /* Looks like we're in luck.  Emit the accumulated instructions and
    578       move on to doing the call itself. */
    579    for (i = 0; i < n_args; i++)
    580       addInstr(env, fastinstrs[i]);
    581 
    582    /* Fast scheme only applies for unconditional calls.  Hence: */
    583    cc = Acc_ALWAYS;
    584 
    585    goto handle_call;
    586 
    587 
    588    /* SLOW SCHEME; move via temporaries */
    589   slowscheme:
    590    {}
    591 #  if 0 /* debug only */
    592    if (n_args > 0) {for (i = 0; args[i]; i++) {
    593    ppIRExpr(args[i]); vex_printf(" "); }
    594    vex_printf("\n");}
    595 #  endif
    596 
    597    /* If we have a vector return type, allocate a place for it on the
    598       stack and record its address. */
    599    HReg r_vecRetAddr = INVALID_HREG;
    600    if (retTy == Ity_V128) {
    601       r_vecRetAddr = newVRegI(env);
    602       sub_from_rsp(env, 16);
    603       addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
    604    }
    605    else if (retTy == Ity_V256) {
    606       r_vecRetAddr = newVRegI(env);
    607       sub_from_rsp(env, 32);
    608       addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr ));
    609    }
    610 
    611    vassert(n_args >= 0 && n_args <= 6);
    612    for (i = 0; i < n_args; i++) {
    613       IRExpr* arg = args[i];
    614       if (UNLIKELY(arg->tag == Iex_BBPTR)) {
    615          tmpregs[i] = newVRegI(env);
    616          addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i]));
    617          nBBPTRs++;
    618       }
    619       else if (UNLIKELY(arg->tag == Iex_VECRET)) {
    620          /* We stashed the address of the return slot earlier, so just
    621             retrieve it now. */
    622          vassert(!hregIsInvalid(r_vecRetAddr));
    623          tmpregs[i] = r_vecRetAddr;
    624          nVECRETs++;
    625       }
    626       else {
    627          vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
    628          tmpregs[i] = iselIntExpr_R(env, args[i]);
    629       }
    630    }
    631 
    632    /* Now we can compute the condition.  We can't do it earlier
    633       because the argument computations could trash the condition
    634       codes.  Be a bit clever to handle the common case where the
    635       guard is 1:Bit. */
    636    cc = Acc_ALWAYS;
    637    if (guard) {
    638       if (guard->tag == Iex_Const
    639           && guard->Iex.Const.con->tag == Ico_U1
    640           && guard->Iex.Const.con->Ico.U1 == True) {
    641          /* unconditional -- do nothing */
    642       } else {
    643          cc = iselCondCode( env, guard );
    644       }
    645    }
    646 
    647    /* Move the args to their final destinations. */
    648    for (i = 0; i < n_args; i++) {
    649       /* None of these insns, including any spill code that might
    650          be generated, may alter the condition codes. */
    651       addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
    652    }
    653 
    654 
    655    /* Do final checks, set the return values, and generate the call
    656       instruction proper. */
    657   handle_call:
    658 
    659    if (retTy == Ity_V128 || retTy == Ity_V256) {
    660       vassert(nVECRETs == 1);
    661    } else {
    662       vassert(nVECRETs == 0);
    663    }
    664 
    665    vassert(nBBPTRs == 0 || nBBPTRs == 1);
    666 
    667    vassert(*stackAdjustAfterCall == 0);
    668    vassert(is_RetLoc_INVALID(*retloc));
    669    switch (retTy) {
    670          case Ity_INVALID:
    671             /* Function doesn't return a value. */
    672             *retloc = mk_RetLoc_simple(RLPri_None);
    673             break;
    674          case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
    675             *retloc = mk_RetLoc_simple(RLPri_Int);
    676             break;
    677          case Ity_V128:
    678             *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0);
    679             *stackAdjustAfterCall = 16;
    680             break;
    681          case Ity_V256:
    682             *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0);
    683             *stackAdjustAfterCall = 32;
    684             break;
    685          default:
    686             /* IR can denote other possible return types, but we don't
    687                handle those here. */
    688            vassert(0);
    689    }
    690 
    691    /* Finally, generate the call itself.  This needs the *retloc value
    692       set in the switch above, which is why it's at the end. */
    693    addInstr(env,
    694             AMD64Instr_Call(cc, Ptr_to_ULong(cee->addr), n_args, *retloc));
    695 }
    696 
    697 
    698 /* Given a guest-state array descriptor, an index expression and a
    699    bias, generate an AMD64AMode holding the relevant guest state
    700    offset. */
    701 
    702 static
    703 AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
    704                                   IRExpr* off, Int bias )
    705 {
    706    HReg tmp, roff;
    707    Int  elemSz = sizeofIRType(descr->elemTy);
    708    Int  nElems = descr->nElems;
    709 
    710    /* Throw out any cases not generated by an amd64 front end.  In
    711       theory there might be a day where we need to handle them -- if
    712       we ever run non-amd64-guest on amd64 host. */
    713 
    714    if (nElems != 8 || (elemSz != 1 && elemSz != 8))
    715       vpanic("genGuestArrayOffset(amd64 host)");
    716 
    717    /* Compute off into a reg, %off.  Then return:
    718 
    719          movq %off, %tmp
    720          addq $bias, %tmp  (if bias != 0)
     721          andq $7, %tmp
    722          ... base(%rbp, %tmp, shift) ...
    723    */
    724    tmp  = newVRegI(env);
    725    roff = iselIntExpr_R(env, off);
    726    addInstr(env, mk_iMOVsd_RR(roff, tmp));
    727    if (bias != 0) {
    728       /* Make sure the bias is sane, in the sense that there are
    729          no significant bits above bit 30 in it. */
    730       vassert(-10000 < bias && bias < 10000);
    731       addInstr(env,
    732                AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
    733    }
    734    addInstr(env,
    735             AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
    736    vassert(elemSz == 1 || elemSz == 8);
    737    return
    738       AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
    739                                     elemSz==8 ? 3 : 0);
    740 }
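
/* For instance, with elemSz == 8, bias == 0 and 'off' evaluating into
   vreg %t, the amode produced is  descr->base(%rbp, %t, 8)  with %t
   already masked to 0..7, keeping the access inside the 8-element
   guest-state array. */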
    741 
    742 
    743 /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
    744 static
    745 void set_SSE_rounding_default ( ISelEnv* env )
    746 {
    747    /* pushq $DEFAULT_MXCSR
    748       ldmxcsr 0(%rsp)
    749       addq $8, %rsp
    750    */
    751    AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
    752    addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
    753    addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
    754    add_to_rsp(env, 8);
    755 }
    756 
    757 /* Mess with the FPU's rounding mode: set to the default rounding mode
    758    (DEFAULT_FPUCW). */
    759 static
    760 void set_FPU_rounding_default ( ISelEnv* env )
    761 {
    762    /* movq $DEFAULT_FPUCW, -8(%rsp)
     763       fldcw -8(%rsp)
    764    */
    765    AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
    766    addInstr(env, AMD64Instr_Alu64M(
    767                     Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
    768    addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
    769 }
    770 
    771 
    772 /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
    773    expression denoting a value in the range 0 .. 3, indicating a round
    774    mode encoded as per type IRRoundingMode.  Set the SSE machinery to
    775    have the same rounding.
    776 */
    777 static
    778 void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
    779 {
    780    /* Note: this sequence only makes sense because DEFAULT_MXCSR has
    781       both rounding bits == 0.  If that wasn't the case, we couldn't
    782       create a new rounding field simply by ORing the new value into
    783       place. */
    784 
    785    /* movq $3, %reg
    786       andq [[mode]], %reg  -- shouldn't be needed; paranoia
    787       shlq $13, %reg
    788       orq $DEFAULT_MXCSR, %reg
    789       pushq %reg
     790       ldmxcsr 0(%rsp)
    791       addq $8, %rsp
    792    */
    793    HReg        reg      = newVRegI(env);
    794    AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
    795    addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
    796    addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
    797                                    iselIntExpr_RMI(env, mode), reg));
    798    addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
    799    addInstr(env, AMD64Instr_Alu64R(
    800                     Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
    801    addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
    802    addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
    803    add_to_rsp(env, 8);
    804 }
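
/* The shift-into-place trick above relies on the IRRoundingMode
   encoding (0 = nearest, 1 = -inf, 2 = +inf, 3 = zero) matching the
   hardware RC encoding, so the mode value only needs moving to the
   right position: bits 13..14 of %mxcsr here, and bits 10..11 of the
   x87 control word in set_FPU_rounding_mode below. */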
    805 
    806 
    807 /* Mess with the FPU's rounding mode: 'mode' is an I32-typed
    808    expression denoting a value in the range 0 .. 3, indicating a round
    809    mode encoded as per type IRRoundingMode.  Set the x87 FPU to have
    810    the same rounding.
    811 */
    812 static
    813 void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
    814 {
    815    HReg rrm  = iselIntExpr_R(env, mode);
    816    HReg rrm2 = newVRegI(env);
    817    AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
    818 
    819    /* movq  %rrm, %rrm2
    820       andq  $3, %rrm2   -- shouldn't be needed; paranoia
    821       shlq  $10, %rrm2
    822       orq   $DEFAULT_FPUCW, %rrm2
    823       movq  %rrm2, -8(%rsp)
     824       fldcw -8(%rsp)
    825    */
    826    addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
    827    addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
    828    addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
    829    addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
    830                                    AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
    831    addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
    832                                    AMD64RI_Reg(rrm2), m8_rsp));
    833    addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
    834 }
    835 
    836 
    837 /* Generate all-zeroes into a new vector register.
    838 */
    839 static HReg generate_zeroes_V128 ( ISelEnv* env )
    840 {
    841    HReg dst = newVRegV(env);
    842    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
    843    return dst;
    844 }
    845 
    846 /* Generate all-ones into a new vector register.
    847 */
    848 static HReg generate_ones_V128 ( ISelEnv* env )
    849 {
    850    HReg dst = newVRegV(env);
    851    addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
    852    return dst;
    853 }
    854 
    855 
    856 /* Generate !src into a new vector register.  Amazing that there isn't
    857    a less crappy way to do this.
    858 */
    859 static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
    860 {
    861    HReg dst = generate_ones_V128(env);
    862    addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
    863    return dst;
    864 }
    865 
    866 
    867 /* Expand the given byte into a 64-bit word, by cloning each bit
    868    8 times. */
    869 static ULong bitmask8_to_bytemask64 ( UShort w8 )
    870 {
    871    vassert(w8 == (w8 & 0xFF));
    872    ULong w64 = 0;
    873    Int i;
    874    for (i = 0; i < 8; i++) {
    875       if (w8 & (1<<i))
    876          w64 |= (0xFFULL << (8 * i));
    877    }
    878    return w64;
    879 }
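
/* Example: bitmask8_to_bytemask64(0xA5) == 0xFF00FF0000FF00FFULL,
   since bits 0, 2, 5 and 7 of 0xA5 are set. */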
    880 
    881 
    882 /*---------------------------------------------------------*/
    883 /*--- ISEL: Integer expressions (64/32/16/8 bit)        ---*/
    884 /*---------------------------------------------------------*/
    885 
    886 /* Select insns for an integer-typed expression, and add them to the
    887    code list.  Return a reg holding the result.  This reg will be a
    888    virtual register.  THE RETURNED REG MUST NOT BE MODIFIED.  If you
    889    want to modify it, ask for a new vreg, copy it in there, and modify
    890    the copy.  The register allocator will do its best to map both
    891    vregs to the same real register, so the copies will often disappear
    892    later in the game.
    893 
    894    This should handle expressions of 64, 32, 16 and 8-bit type.  All
    895    results are returned in a 64-bit register.  For 32-, 16- and 8-bit
    896    expressions, the upper 32/48/56 bits are arbitrary, so you should
    897    mask or sign extend partial values if necessary.
    898 */
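
/* Concretely: a caller that needs defined upper bits must normalise
   the result itself; e.g. the Iop_Shr8 case below ANDs with 0xFF
   before shifting, precisely because the upper 56 bits returned here
   are undefined. */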
    899 
    900 static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
    901 {
    902    HReg r = iselIntExpr_R_wrk(env, e);
    903    /* sanity checks ... */
    904 #  if 0
    905    vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
    906 #  endif
    907    vassert(hregClass(r) == HRcInt64);
    908    vassert(hregIsVirtual(r));
    909    return r;
    910 }
    911 
    912 /* DO NOT CALL THIS DIRECTLY ! */
    913 static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
    914 {
    915    /* Used for unary/binary SIMD64 ops. */
    916    HWord fn = 0;
    917    Bool second_is_UInt;
    918 
    919    MatchInfo mi;
    920    DECLARE_PATTERN(p_1Uto8_64to1);
    921    DECLARE_PATTERN(p_LDle8_then_8Uto64);
    922    DECLARE_PATTERN(p_LDle16_then_16Uto64);
    923 
    924    IRType ty = typeOfIRExpr(env->type_env,e);
    925    switch (ty) {
    926       case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break;
    927       default: vassert(0);
    928    }
    929 
    930    switch (e->tag) {
    931 
    932    /* --------- TEMP --------- */
    933    case Iex_RdTmp: {
    934       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
    935    }
    936 
    937    /* --------- LOAD --------- */
    938    case Iex_Load: {
    939       HReg dst = newVRegI(env);
    940       AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
    941 
    942       /* We can't handle big-endian loads, nor load-linked. */
    943       if (e->Iex.Load.end != Iend_LE)
    944          goto irreducible;
    945 
    946       if (ty == Ity_I64) {
    947          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
    948                                          AMD64RMI_Mem(amode), dst) );
    949          return dst;
    950       }
    951       if (ty == Ity_I32) {
    952          addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
    953          return dst;
    954       }
    955       if (ty == Ity_I16) {
    956          addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
    957          return dst;
    958       }
    959       if (ty == Ity_I8) {
    960          addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
    961          return dst;
    962       }
    963       break;
    964    }
    965 
    966    /* --------- BINARY OP --------- */
    967    case Iex_Binop: {
    968       AMD64AluOp   aluOp;
    969       AMD64ShiftOp shOp;
    970 
    971       /* Pattern: Sub64(0,x) */
    972       /*     and: Sub32(0,x) */
    973       if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
    974           || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
    975          HReg dst = newVRegI(env);
    976          HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
    977          addInstr(env, mk_iMOVsd_RR(reg,dst));
    978          addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
    979          return dst;
    980       }
    981 
    982       /* Is it an addition or logical style op? */
    983       switch (e->Iex.Binop.op) {
    984          case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
    985             aluOp = Aalu_ADD; break;
    986          case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
    987             aluOp = Aalu_SUB; break;
    988          case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
    989             aluOp = Aalu_AND; break;
    990          case Iop_Or8:  case Iop_Or16:  case Iop_Or32:  case Iop_Or64:
    991             aluOp = Aalu_OR; break;
    992          case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
    993             aluOp = Aalu_XOR; break;
    994          case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
    995             aluOp = Aalu_MUL; break;
    996          default:
    997             aluOp = Aalu_INVALID; break;
    998       }
    999       /* For commutative ops we assume any literal
   1000          values are on the second operand. */
   1001       if (aluOp != Aalu_INVALID) {
   1002          HReg dst      = newVRegI(env);
   1003          HReg reg      = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1004          AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   1005          addInstr(env, mk_iMOVsd_RR(reg,dst));
   1006          addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
   1007          return dst;
   1008       }
   1009 
   1010       /* Perhaps a shift op? */
   1011       switch (e->Iex.Binop.op) {
   1012          case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
   1013             shOp = Ash_SHL; break;
   1014          case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
   1015             shOp = Ash_SHR; break;
   1016          case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
   1017             shOp = Ash_SAR; break;
   1018          default:
   1019             shOp = Ash_INVALID; break;
   1020       }
   1021       if (shOp != Ash_INVALID) {
   1022          HReg dst = newVRegI(env);
   1023 
   1024          /* regL = the value to be shifted */
   1025          HReg regL   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1026          addInstr(env, mk_iMOVsd_RR(regL,dst));
   1027 
   1028          /* Do any necessary widening for 32/16/8 bit operands */
   1029          switch (e->Iex.Binop.op) {
   1030             case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
   1031                break;
   1032             case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
   1033                break;
   1034             case Iop_Shr8:
   1035                addInstr(env, AMD64Instr_Alu64R(
   1036                                 Aalu_AND, AMD64RMI_Imm(0xFF), dst));
   1037                break;
   1038             case Iop_Shr16:
   1039                addInstr(env, AMD64Instr_Alu64R(
   1040                                 Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
   1041                break;
   1042             case Iop_Shr32:
   1043                addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
   1044                break;
   1045             case Iop_Sar8:
   1046                addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
   1047                addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
   1048                break;
   1049             case Iop_Sar16:
   1050                addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
   1051                addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
   1052                break;
   1053             case Iop_Sar32:
   1054                addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
   1055                break;
   1056             default:
   1057                ppIROp(e->Iex.Binop.op);
   1058                vassert(0);
   1059          }
   1060 
   1061          /* Now consider the shift amount.  If it's a literal, we
   1062             can do a much better job than the general case. */
   1063          if (e->Iex.Binop.arg2->tag == Iex_Const) {
   1064             /* assert that the IR is well-typed */
   1065             Int nshift;
   1066             vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
   1067             nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
   1068             vassert(nshift >= 0);
   1069             if (nshift > 0)
   1070                /* Can't allow nshift==0 since that means %cl */
   1071                addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
   1072          } else {
   1073             /* General case; we have to force the amount into %cl. */
   1074             HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1075             addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
   1076             addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
   1077          }
   1078          return dst;
   1079       }
   1080 
   1081       /* Deal with 64-bit SIMD binary ops */
   1082       second_is_UInt = False;
   1083       switch (e->Iex.Binop.op) {
   1084          case Iop_Add8x8:
   1085             fn = (HWord)h_generic_calc_Add8x8; break;
   1086          case Iop_Add16x4:
   1087             fn = (HWord)h_generic_calc_Add16x4; break;
   1088          case Iop_Add32x2:
   1089             fn = (HWord)h_generic_calc_Add32x2; break;
   1090 
   1091          case Iop_Avg8Ux8:
   1092             fn = (HWord)h_generic_calc_Avg8Ux8; break;
   1093          case Iop_Avg16Ux4:
   1094             fn = (HWord)h_generic_calc_Avg16Ux4; break;
   1095 
   1096          case Iop_CmpEQ8x8:
   1097             fn = (HWord)h_generic_calc_CmpEQ8x8; break;
   1098          case Iop_CmpEQ16x4:
   1099             fn = (HWord)h_generic_calc_CmpEQ16x4; break;
   1100          case Iop_CmpEQ32x2:
   1101             fn = (HWord)h_generic_calc_CmpEQ32x2; break;
   1102 
   1103          case Iop_CmpGT8Sx8:
   1104             fn = (HWord)h_generic_calc_CmpGT8Sx8; break;
   1105          case Iop_CmpGT16Sx4:
   1106             fn = (HWord)h_generic_calc_CmpGT16Sx4; break;
   1107          case Iop_CmpGT32Sx2:
   1108             fn = (HWord)h_generic_calc_CmpGT32Sx2; break;
   1109 
   1110          case Iop_InterleaveHI8x8:
   1111             fn = (HWord)h_generic_calc_InterleaveHI8x8; break;
   1112          case Iop_InterleaveLO8x8:
   1113             fn = (HWord)h_generic_calc_InterleaveLO8x8; break;
   1114          case Iop_InterleaveHI16x4:
   1115             fn = (HWord)h_generic_calc_InterleaveHI16x4; break;
   1116          case Iop_InterleaveLO16x4:
   1117             fn = (HWord)h_generic_calc_InterleaveLO16x4; break;
   1118          case Iop_InterleaveHI32x2:
   1119             fn = (HWord)h_generic_calc_InterleaveHI32x2; break;
   1120          case Iop_InterleaveLO32x2:
   1121             fn = (HWord)h_generic_calc_InterleaveLO32x2; break;
   1122          case Iop_CatOddLanes16x4:
   1123             fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
   1124          case Iop_CatEvenLanes16x4:
   1125             fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
   1126          case Iop_Perm8x8:
   1127             fn = (HWord)h_generic_calc_Perm8x8; break;
   1128 
   1129          case Iop_Max8Ux8:
   1130             fn = (HWord)h_generic_calc_Max8Ux8; break;
   1131          case Iop_Max16Sx4:
   1132             fn = (HWord)h_generic_calc_Max16Sx4; break;
   1133          case Iop_Min8Ux8:
   1134             fn = (HWord)h_generic_calc_Min8Ux8; break;
   1135          case Iop_Min16Sx4:
   1136             fn = (HWord)h_generic_calc_Min16Sx4; break;
   1137 
   1138          case Iop_Mul16x4:
   1139             fn = (HWord)h_generic_calc_Mul16x4; break;
   1140          case Iop_Mul32x2:
   1141             fn = (HWord)h_generic_calc_Mul32x2; break;
   1142          case Iop_MulHi16Sx4:
   1143             fn = (HWord)h_generic_calc_MulHi16Sx4; break;
   1144          case Iop_MulHi16Ux4:
   1145             fn = (HWord)h_generic_calc_MulHi16Ux4; break;
   1146 
   1147          case Iop_QAdd8Sx8:
   1148             fn = (HWord)h_generic_calc_QAdd8Sx8; break;
   1149          case Iop_QAdd16Sx4:
   1150             fn = (HWord)h_generic_calc_QAdd16Sx4; break;
   1151          case Iop_QAdd8Ux8:
   1152             fn = (HWord)h_generic_calc_QAdd8Ux8; break;
   1153          case Iop_QAdd16Ux4:
   1154             fn = (HWord)h_generic_calc_QAdd16Ux4; break;
   1155 
   1156          case Iop_QNarrowBin32Sto16Sx4:
   1157             fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break;
   1158          case Iop_QNarrowBin16Sto8Sx8:
   1159             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break;
   1160          case Iop_QNarrowBin16Sto8Ux8:
   1161             fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break;
   1162          case Iop_NarrowBin16to8x8:
   1163             fn = (HWord)h_generic_calc_NarrowBin16to8x8; break;
   1164          case Iop_NarrowBin32to16x4:
   1165             fn = (HWord)h_generic_calc_NarrowBin32to16x4; break;
   1166 
   1167          case Iop_QSub8Sx8:
   1168             fn = (HWord)h_generic_calc_QSub8Sx8; break;
   1169          case Iop_QSub16Sx4:
   1170             fn = (HWord)h_generic_calc_QSub16Sx4; break;
   1171          case Iop_QSub8Ux8:
   1172             fn = (HWord)h_generic_calc_QSub8Ux8; break;
   1173          case Iop_QSub16Ux4:
   1174             fn = (HWord)h_generic_calc_QSub16Ux4; break;
   1175 
   1176          case Iop_Sub8x8:
   1177             fn = (HWord)h_generic_calc_Sub8x8; break;
   1178          case Iop_Sub16x4:
   1179             fn = (HWord)h_generic_calc_Sub16x4; break;
   1180          case Iop_Sub32x2:
   1181             fn = (HWord)h_generic_calc_Sub32x2; break;
   1182 
   1183          case Iop_ShlN32x2:
   1184             fn = (HWord)h_generic_calc_ShlN32x2;
   1185             second_is_UInt = True;
   1186             break;
   1187          case Iop_ShlN16x4:
   1188             fn = (HWord)h_generic_calc_ShlN16x4;
   1189             second_is_UInt = True;
   1190             break;
   1191          case Iop_ShlN8x8:
   1192             fn = (HWord)h_generic_calc_ShlN8x8;
   1193             second_is_UInt = True;
   1194             break;
   1195          case Iop_ShrN32x2:
   1196             fn = (HWord)h_generic_calc_ShrN32x2;
   1197             second_is_UInt = True;
   1198             break;
   1199          case Iop_ShrN16x4:
   1200             fn = (HWord)h_generic_calc_ShrN16x4;
   1201             second_is_UInt = True;
   1202             break;
   1203          case Iop_SarN32x2:
   1204             fn = (HWord)h_generic_calc_SarN32x2;
   1205             second_is_UInt = True;
   1206             break;
   1207          case Iop_SarN16x4:
   1208             fn = (HWord)h_generic_calc_SarN16x4;
   1209             second_is_UInt = True;
   1210             break;
   1211          case Iop_SarN8x8:
   1212             fn = (HWord)h_generic_calc_SarN8x8;
   1213             second_is_UInt = True;
   1214             break;
   1215 
   1216          default:
   1217             fn = (HWord)0; break;
   1218       }
   1219       if (fn != (HWord)0) {
   1220          /* Note: the following assumes all helpers are of signature
   1221                ULong fn ( ULong, ULong ), and they are
   1222             not marked as regparm functions.
   1223          */
   1224          HReg dst  = newVRegI(env);
   1225          HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1226          HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1227          if (second_is_UInt)
   1228             addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
   1229          addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
   1230          addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
   1231          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2,
   1232                                         mk_RetLoc_simple(RLPri_Int) ));
   1233          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
   1234          return dst;
   1235       }
   1236 
   1237       /* Handle misc other ops. */
   1238 
   1239       if (e->Iex.Binop.op == Iop_Max32U) {
   1240          HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1241          HReg dst  = newVRegI(env);
   1242          HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1243          addInstr(env, mk_iMOVsd_RR(src1, dst));
   1244          addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst));
   1245          addInstr(env, AMD64Instr_CMov64(Acc_B, AMD64RM_Reg(src2), dst));
   1246          return dst;
   1247       }
   1248 
   1249       if (e->Iex.Binop.op == Iop_DivModS64to32
   1250           || e->Iex.Binop.op == Iop_DivModU64to32) {
   1251          /* 64 x 32 -> (32(rem),32(div)) division */
   1252          /* Get the 64-bit operand into edx:eax, and the other into
   1253             any old R/M. */
   1254          HReg      rax     = hregAMD64_RAX();
   1255          HReg      rdx     = hregAMD64_RDX();
   1256          HReg      dst     = newVRegI(env);
   1257          Bool      syned   = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
   1258          AMD64RM*  rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
   1259          /* Compute the left operand into a reg, and then
   1260             put the top half in edx and the bottom in eax. */
   1261          HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1262          addInstr(env, mk_iMOVsd_RR(left64, rdx));
   1263          addInstr(env, mk_iMOVsd_RR(left64, rax));
   1264          addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
   1265          addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
   1266 	 addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
   1267 	 addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
   1268          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
   1269          addInstr(env, mk_iMOVsd_RR(rax, dst));
   1270          addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
   1271          return dst;
   1272       }
   1273 
   1274       if (e->Iex.Binop.op == Iop_32HLto64) {
   1275          HReg hi32  = newVRegI(env);
   1276          HReg lo32  = newVRegI(env);
   1277          HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1278          HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1279          addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
   1280          addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
   1281          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
   1282 	 addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
   1283          addInstr(env, AMD64Instr_Alu64R(
   1284                           Aalu_OR, AMD64RMI_Reg(lo32), hi32));
   1285          return hi32;
   1286       }
   1287 
   1288       if (e->Iex.Binop.op == Iop_16HLto32) {
   1289          HReg hi16  = newVRegI(env);
   1290          HReg lo16  = newVRegI(env);
   1291          HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1292          HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1293          addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
   1294          addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
   1295          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
   1296          addInstr(env, AMD64Instr_Alu64R(
   1297                           Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
   1298          addInstr(env, AMD64Instr_Alu64R(
   1299                           Aalu_OR, AMD64RMI_Reg(lo16), hi16));
   1300          return hi16;
   1301       }
   1302 
   1303       if (e->Iex.Binop.op == Iop_8HLto16) {
   1304          HReg hi8  = newVRegI(env);
   1305          HReg lo8  = newVRegI(env);
   1306          HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1307          HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1308          addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
   1309          addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
   1310          addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
   1311          addInstr(env, AMD64Instr_Alu64R(
   1312                           Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
   1313          addInstr(env, AMD64Instr_Alu64R(
   1314                           Aalu_OR, AMD64RMI_Reg(lo8), hi8));
   1315          return hi8;
   1316       }
   1317 
   1318       if (e->Iex.Binop.op == Iop_MullS32
   1319           || e->Iex.Binop.op == Iop_MullS16
   1320           || e->Iex.Binop.op == Iop_MullS8
   1321           || e->Iex.Binop.op == Iop_MullU32
   1322           || e->Iex.Binop.op == Iop_MullU16
   1323           || e->Iex.Binop.op == Iop_MullU8) {
   1324          HReg a32   = newVRegI(env);
   1325          HReg b32   = newVRegI(env);
   1326          HReg a32s  = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1327          HReg b32s  = iselIntExpr_R(env, e->Iex.Binop.arg2);
   1328          Int          shift  = 0;
   1329          AMD64ShiftOp shr_op = Ash_SHR;
   1330          switch (e->Iex.Binop.op) {
   1331             case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
   1332             case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
   1333             case Iop_MullS8:  shr_op = Ash_SAR; shift = 56; break;
   1334             case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
   1335             case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
   1336             case Iop_MullU8:  shr_op = Ash_SHR; shift = 56; break;
   1337             default: vassert(0);
   1338          }
   1339 
   1340          addInstr(env, mk_iMOVsd_RR(a32s, a32));
   1341          addInstr(env, mk_iMOVsd_RR(b32s, b32));
   1342          addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
   1343          addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
   1344          addInstr(env, AMD64Instr_Sh64(shr_op,  shift, a32));
   1345          addInstr(env, AMD64Instr_Sh64(shr_op,  shift, b32));
   1346          addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
   1347          return b32;
   1348       }
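               /* A sketch of why the shifting works: each operand arrives
                  in a 64-bit vreg with only its low 8/16/32 bits meaningful;
                  the SHL followed by SAR (signed) or SHR (unsigned) of the
                  same amount re-extends it to a full 64-bit value, after
                  which one 64-bit multiply leaves the correct double-width
                  product in the low bits.  E.g. for Iop_MullS8, 0x80 (-128)
                  times 0x02 gives 0x...FF00, whose low 16 bits are the
                  expected I16 result, -256. */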
   1349 
   1350       if (e->Iex.Binop.op == Iop_CmpF64) {
   1351          HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
   1352          HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
   1353          HReg dst = newVRegI(env);
   1354          addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
   1355          /* Mask out irrelevant parts of the result so as to conform
   1356             to the CmpF64 definition. */
   1357          addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
   1358          return dst;
   1359       }
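               /* For reference: SseUComIS does a ucomisd and copies %rflags
                  into dst, and the 0x45 mask keeps just CF, PF and ZF (bits
                  0, 2 and 6).  That gives 0x45 for unordered, 0x40 for
                  equal, 0x01 for less and 0x00 for greater, which matches
                  the IRCmpF64Result encoding in libvex_ir.h. */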
   1360 
   1361       if (e->Iex.Binop.op == Iop_F64toI32S
   1362           || e->Iex.Binop.op == Iop_F64toI64S) {
   1363          Int  szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
   1364          HReg rf  = iselDblExpr(env, e->Iex.Binop.arg2);
   1365          HReg dst = newVRegI(env);
   1366          set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
   1367          addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
   1368          set_SSE_rounding_default(env);
   1369          return dst;
   1370       }
   1371 
   1372       break;
   1373    }
   1374 
   1375    /* --------- UNARY OP --------- */
   1376    case Iex_Unop: {
   1377 
   1378       /* 1Uto8(64to1(expr64)) */
   1379       {
   1380          DEFINE_PATTERN( p_1Uto8_64to1,
   1381                          unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
   1382          if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
   1383             IRExpr* expr64 = mi.bindee[0];
   1384             HReg    dst    = newVRegI(env);
   1385             HReg    src    = iselIntExpr_R(env, expr64);
   1386             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1387             addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   1388                                             AMD64RMI_Imm(1), dst));
   1389             return dst;
   1390          }
   1391       }
   1392 
   1393       /* 8Uto64(LDle(expr64)) */
   1394       {
   1395          DEFINE_PATTERN(p_LDle8_then_8Uto64,
   1396                         unop(Iop_8Uto64,
   1397                              IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
   1398          if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
   1399             HReg dst = newVRegI(env);
   1400             AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
   1401             addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
   1402             return dst;
   1403          }
   1404       }
   1405 
   1406       /* 16Uto64(LDle(expr64)) */
   1407       {
   1408          DEFINE_PATTERN(p_LDle16_then_16Uto64,
   1409                         unop(Iop_16Uto64,
   1410                              IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
   1411          if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
   1412             HReg dst = newVRegI(env);
   1413             AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
   1414             addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
   1415             return dst;
   1416          }
   1417       }
   1418 
   1419       /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) )
   1420          Use 32 bit arithmetic and let the default zero-extend rule
   1421          do the 32Uto64 for free. */
   1422       if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) {
   1423          IROp    opi  = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */
   1424          IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1;
   1425          IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2;
   1426          AMD64AluOp aluOp = Aalu_INVALID;
   1427          switch (opi) {
   1428             case Iop_Add32: aluOp = Aalu_ADD; break;
   1429             case Iop_Sub32: aluOp = Aalu_SUB; break;
   1430             case Iop_And32: aluOp = Aalu_AND; break;
   1431             case Iop_Or32:  aluOp = Aalu_OR;  break;
   1432             case Iop_Xor32: aluOp = Aalu_XOR; break;
   1433             default: break;
   1434          }
   1435          if (aluOp != Aalu_INVALID) {
   1436             /* For commutative ops we assume any literal values are on
   1437                the second operand. */
   1438             HReg dst      = newVRegI(env);
   1439             HReg reg      = iselIntExpr_R(env, argL);
   1440             AMD64RMI* rmi = iselIntExpr_RMI(env, argR);
   1441             addInstr(env, mk_iMOVsd_RR(reg,dst));
   1442             addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst));
   1443             return dst;
   1444          }
   1445          /* just fall through to normal handling for Iop_32Uto64 */
   1446       }
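               /* The "default zero-extend rule" relied on above is the
                  amd64 convention that any 32-bit ALU op zeroes bits 63:32
                  of its destination register, so e.g. 32Uto64(Add32(x,y))
                  needs nothing more than "movq x,dst ; addl <y>,dst". */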
   1447 
   1448       /* Fallback cases */
   1449       switch (e->Iex.Unop.op) {
   1450          case Iop_32Uto64:
   1451          case Iop_32Sto64: {
   1452             HReg dst = newVRegI(env);
   1453             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1454             addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
   1455                                             src, dst) );
   1456             return dst;
   1457          }
   1458          case Iop_128HIto64: {
   1459             HReg rHi, rLo;
   1460             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
   1461             return rHi; /* and abandon rLo */
   1462          }
   1463          case Iop_128to64: {
   1464             HReg rHi, rLo;
   1465             iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
   1466             return rLo; /* and abandon rHi */
   1467          }
   1468          case Iop_8Uto16:
   1469          case Iop_8Uto32:
   1470          case Iop_8Uto64:
   1471          case Iop_16Uto64:
   1472          case Iop_16Uto32: {
   1473             HReg dst     = newVRegI(env);
   1474             HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
   1475             Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
   1476                                    || e->Iex.Unop.op==Iop_16Uto64 );
   1477             UInt mask    = srcIs16 ? 0xFFFF : 0xFF;
   1478             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1479             addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   1480                                             AMD64RMI_Imm(mask), dst));
   1481             return dst;
   1482          }
   1483          case Iop_8Sto16:
   1484          case Iop_8Sto64:
   1485          case Iop_8Sto32:
   1486          case Iop_16Sto32:
   1487          case Iop_16Sto64: {
   1488             HReg dst     = newVRegI(env);
   1489             HReg src     = iselIntExpr_R(env, e->Iex.Unop.arg);
   1490             Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
   1491                                    || e->Iex.Unop.op==Iop_16Sto64 );
   1492             UInt amt     = srcIs16 ? 48 : 56;
   1493             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1494             addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
   1495             addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
   1496             return dst;
   1497          }
    1498          case Iop_Not8:
    1499          case Iop_Not16:
   1500          case Iop_Not32:
   1501          case Iop_Not64: {
   1502             HReg dst = newVRegI(env);
   1503             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1504             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1505             addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
   1506             return dst;
   1507          }
   1508          case Iop_16HIto8:
   1509          case Iop_32HIto16:
   1510          case Iop_64HIto32: {
   1511             HReg dst  = newVRegI(env);
   1512             HReg src  = iselIntExpr_R(env, e->Iex.Unop.arg);
   1513             Int shift = 0;
   1514             switch (e->Iex.Unop.op) {
   1515                case Iop_16HIto8:  shift = 8;  break;
   1516                case Iop_32HIto16: shift = 16; break;
   1517                case Iop_64HIto32: shift = 32; break;
   1518                default: vassert(0);
   1519             }
   1520             addInstr(env, mk_iMOVsd_RR(src,dst) );
   1521             addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
   1522             return dst;
   1523          }
   1524          case Iop_1Uto64:
   1525          case Iop_1Uto32:
   1526          case Iop_1Uto8: {
   1527             HReg dst           = newVRegI(env);
   1528             AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
   1529             addInstr(env, AMD64Instr_Set64(cond,dst));
   1530             return dst;
   1531          }
   1532          case Iop_1Sto8:
   1533          case Iop_1Sto16:
   1534          case Iop_1Sto32:
   1535          case Iop_1Sto64: {
   1536             /* could do better than this, but for now ... */
   1537             HReg dst           = newVRegI(env);
   1538             AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
   1539             addInstr(env, AMD64Instr_Set64(cond,dst));
   1540             addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
   1541             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
   1542             return dst;
   1543          }
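                  /* (Set64 leaves 0 or 1 in dst; shifting left by 63 and
                     then arithmetically right by 63 smears that bit across
                     the whole register, giving 0 or all-ones.  The narrower
                     1Sto8/16/32 forms rely on callers ignoring the unused
                     high bits.) */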
   1544          case Iop_Ctz64: {
   1545             /* Count trailing zeroes, implemented by amd64 'bsfq' */
   1546             HReg dst = newVRegI(env);
   1547             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1548             addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
   1549             return dst;
   1550          }
   1551          case Iop_Clz64: {
   1552             /* Count leading zeroes.  Do 'bsrq' to establish the index
   1553                of the highest set bit, and subtract that value from
   1554                63. */
   1555             HReg tmp = newVRegI(env);
   1556             HReg dst = newVRegI(env);
   1557             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1558             addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
   1559             addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
   1560                                             AMD64RMI_Imm(63), dst));
   1561             addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
   1562                                             AMD64RMI_Reg(tmp), dst));
   1563             return dst;
   1564          }
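                  /* Worked example: src == 1 gives a bsrq result of 0, so
                     dst = 63 - 0 = 63.  As with bsrq itself, the result is
                     undefined for src == 0. */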
   1565 
   1566          case Iop_CmpwNEZ64: {
   1567             HReg dst = newVRegI(env);
   1568             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1569             addInstr(env, mk_iMOVsd_RR(src,dst));
   1570             addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
   1571             addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
   1572                                             AMD64RMI_Reg(src), dst));
   1573             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
   1574             return dst;
   1575          }
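                  /* I.e. dst = (src | -src) >>signed 63: bit 63 of
                     (src | -src) is set exactly when src != 0, so the
                     arithmetic shift yields all-zeroes for src == 0 and
                     all-ones otherwise (e.g. src == 5 gives
                     0xFFFFFFFFFFFFFFFF). */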
   1576 
   1577          case Iop_CmpwNEZ32: {
   1578             HReg src = newVRegI(env);
   1579             HReg dst = newVRegI(env);
   1580             HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
   1581             addInstr(env, mk_iMOVsd_RR(pre,src));
   1582             addInstr(env, AMD64Instr_MovxLQ(False, src, src));
   1583             addInstr(env, mk_iMOVsd_RR(src,dst));
   1584             addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
   1585             addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
   1586                                             AMD64RMI_Reg(src), dst));
   1587             addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
   1588             return dst;
   1589          }
   1590 
   1591          case Iop_Left8:
   1592          case Iop_Left16:
   1593          case Iop_Left32:
   1594          case Iop_Left64: {
   1595             HReg dst = newVRegI(env);
   1596             HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   1597             addInstr(env, mk_iMOVsd_RR(src, dst));
   1598             addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
   1599             addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
   1600             return dst;
   1601          }
   1602 
   1603          case Iop_V128to32: {
   1604             HReg        dst     = newVRegI(env);
   1605             HReg        vec     = iselVecExpr(env, e->Iex.Unop.arg);
   1606             AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
   1607             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
   1608             addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
   1609             return dst;
   1610          }
   1611 
   1612          /* V128{HI}to64 */
   1613          case Iop_V128HIto64:
   1614          case Iop_V128to64: {
   1615             HReg dst = newVRegI(env);
   1616             Int  off = e->Iex.Unop.op==Iop_V128HIto64 ? -8 : -16;
   1617             HReg rsp = hregAMD64_RSP();
   1618             HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
   1619             AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   1620             AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
   1621             addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
   1622                                              16, vec, m16_rsp));
   1623             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
   1624                                              AMD64RMI_Mem(off_rsp), dst ));
   1625             return dst;
   1626          }
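                  /* I.e. the whole vector is bounced through the 16 bytes
                     at -16(%rsp), and the wanted half is reloaded from
                     -16(%rsp) (low 64 bits) or -8(%rsp) (high 64 bits);
                     the V256to64_x cases below reuse the same scheme. */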
   1627 
   1628          case Iop_V256to64_0: case Iop_V256to64_1:
   1629          case Iop_V256to64_2: case Iop_V256to64_3: {
   1630             HReg vHi, vLo, vec;
   1631             iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
   1632             /* Do the first part of the selection by deciding which of
    1633                the two 128-bit registers to look at, and the second part
    1634                using the same scheme as for V128{HI}to64 above. */
   1635             Int off = 0;
   1636             switch (e->Iex.Unop.op) {
   1637                case Iop_V256to64_0: vec = vLo; off = -16; break;
   1638                case Iop_V256to64_1: vec = vLo; off =  -8; break;
   1639                case Iop_V256to64_2: vec = vHi; off = -16; break;
   1640                case Iop_V256to64_3: vec = vHi; off =  -8; break;
   1641                default: vassert(0);
   1642             }
   1643             HReg        dst     = newVRegI(env);
   1644             HReg        rsp     = hregAMD64_RSP();
   1645             AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   1646             AMD64AMode* off_rsp = AMD64AMode_IR(off, rsp);
   1647             addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
   1648                                              16, vec, m16_rsp));
   1649             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
   1650                                              AMD64RMI_Mem(off_rsp), dst ));
   1651             return dst;
   1652          }
   1653 
   1654          /* ReinterpF64asI64(e) */
   1655          /* Given an IEEE754 double, produce an I64 with the same bit
   1656             pattern. */
   1657          case Iop_ReinterpF64asI64: {
   1658             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   1659             HReg        dst    = newVRegI(env);
   1660             HReg        src    = iselDblExpr(env, e->Iex.Unop.arg);
   1661             /* paranoia */
   1662             set_SSE_rounding_default(env);
   1663             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
   1664             addInstr(env, AMD64Instr_Alu64R(
   1665                              Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
   1666             return dst;
   1667          }
   1668 
   1669          /* ReinterpF32asI32(e) */
   1670          /* Given an IEEE754 single, produce an I64 with the same bit
   1671             pattern in the lower half. */
   1672          case Iop_ReinterpF32asI32: {
   1673             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   1674             HReg        dst    = newVRegI(env);
   1675             HReg        src    = iselFltExpr(env, e->Iex.Unop.arg);
   1676             /* paranoia */
   1677             set_SSE_rounding_default(env);
   1678             addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
   1679             addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
   1680             return dst;
   1681          }
   1682 
   1683          case Iop_16to8:
   1684          case Iop_32to8:
   1685          case Iop_64to8:
   1686          case Iop_32to16:
   1687          case Iop_64to16:
   1688          case Iop_64to32:
   1689             /* These are no-ops. */
   1690             return iselIntExpr_R(env, e->Iex.Unop.arg);
   1691 
   1692          case Iop_GetMSBs8x8: {
   1693             /* Note: the following assumes the helper is of
   1694                signature
   1695                   UInt fn ( ULong ), and is not a regparm fn.
   1696             */
   1697             HReg dst = newVRegI(env);
   1698             HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
   1699             fn = (HWord)h_generic_calc_GetMSBs8x8;
   1700             addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
   1701             addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
   1702                                            1, mk_RetLoc_simple(RLPri_Int) ));
   1703             /* MovxLQ is not exactly the right thing here.  We just
   1704                need to get the bottom 8 bits of RAX into dst, and zero
   1705                out everything else.  Assuming that the helper returns
   1706                a UInt with the top 24 bits zeroed out, it'll do,
   1707                though. */
   1708             addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
   1709             return dst;
   1710          }
   1711 
   1712          case Iop_GetMSBs8x16: {
   1713             /* Note: the following assumes the helper is of signature
   1714                   UInt fn ( ULong w64hi, ULong w64Lo ),
   1715                and is not a regparm fn. */
   1716             HReg dst = newVRegI(env);
   1717             HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
   1718             HReg rsp = hregAMD64_RSP();
   1719             fn = (HWord)h_generic_calc_GetMSBs8x16;
   1720             AMD64AMode* m8_rsp  = AMD64AMode_IR( -8, rsp);
   1721             AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   1722             addInstr(env, AMD64Instr_SseLdSt(False/*store*/,
   1723                                              16, vec, m16_rsp));
   1724             /* hi 64 bits into RDI -- the first arg */
   1725             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
   1726                                              AMD64RMI_Mem(m8_rsp),
   1727                                              hregAMD64_RDI() )); /* 1st arg */
   1728             /* lo 64 bits into RSI -- the 2nd arg */
   1729             addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
   1730                                              AMD64RMI_Mem(m16_rsp),
   1731                                              hregAMD64_RSI() )); /* 2nd arg */
   1732             addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
   1733                                            2, mk_RetLoc_simple(RLPri_Int) ));
   1734             /* MovxLQ is not exactly the right thing here.  We just
   1735                need to get the bottom 16 bits of RAX into dst, and zero
   1736                out everything else.  Assuming that the helper returns
   1737                a UInt with the top 16 bits zeroed out, it'll do,
   1738                though. */
   1739             addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
   1740             return dst;
   1741          }
   1742 
   1743          default:
   1744             break;
   1745       }
   1746 
   1747       /* Deal with unary 64-bit SIMD ops. */
   1748       switch (e->Iex.Unop.op) {
   1749          case Iop_CmpNEZ32x2:
   1750             fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
   1751          case Iop_CmpNEZ16x4:
   1752             fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
   1753          case Iop_CmpNEZ8x8:
   1754             fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
   1755          default:
   1756             fn = (HWord)0; break;
   1757       }
   1758       if (fn != (HWord)0) {
   1759          /* Note: the following assumes all helpers are of
   1760             signature
   1761                ULong fn ( ULong ), and they are
   1762             not marked as regparm functions.
   1763          */
   1764          HReg dst = newVRegI(env);
   1765          HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
   1766          addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
   1767          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1,
   1768                                         mk_RetLoc_simple(RLPri_Int) ));
   1769          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
   1770          return dst;
   1771       }
   1772 
   1773       break;
   1774    }
   1775 
   1776    /* --------- GET --------- */
   1777    case Iex_Get: {
   1778       if (ty == Ity_I64) {
   1779          HReg dst = newVRegI(env);
   1780          addInstr(env, AMD64Instr_Alu64R(
   1781                           Aalu_MOV,
   1782                           AMD64RMI_Mem(
   1783                              AMD64AMode_IR(e->Iex.Get.offset,
   1784                                            hregAMD64_RBP())),
   1785                           dst));
   1786          return dst;
   1787       }
   1788       if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
   1789          HReg dst = newVRegI(env);
   1790          addInstr(env, AMD64Instr_LoadEX(
   1791                           toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
   1792                           False,
   1793                           AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
   1794                           dst));
   1795          return dst;
   1796       }
   1797       break;
   1798    }
   1799 
   1800    case Iex_GetI: {
   1801       AMD64AMode* am
   1802          = genGuestArrayOffset(
   1803               env, e->Iex.GetI.descr,
   1804                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   1805       HReg dst = newVRegI(env);
   1806       if (ty == Ity_I8) {
   1807          addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
   1808          return dst;
   1809       }
   1810       if (ty == Ity_I64) {
   1811          addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
   1812          return dst;
   1813       }
   1814       break;
   1815    }
   1816 
   1817    /* --------- CCALL --------- */
   1818    case Iex_CCall: {
   1819       HReg    dst = newVRegI(env);
   1820       vassert(ty == e->Iex.CCall.retty);
   1821 
   1822       /* be very restrictive for now.  Only 64-bit ints allowed for
   1823          args, and 64 or 32 bits for return type. */
   1824       if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
   1825          goto irreducible;
   1826 
   1827       /* Marshal args, do the call. */
   1828       UInt   addToSp = 0;
   1829       RetLoc rloc    = mk_RetLoc_INVALID();
   1830       doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
   1831                     e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args );
   1832       vassert(is_sane_RetLoc(rloc));
   1833       vassert(rloc.pri == RLPri_Int);
   1834       vassert(addToSp == 0);
   1835 
   1836       /* Move to dst, and zero out the top 32 bits if the result type is
   1837          Ity_I32.  Probably overkill, but still .. */
   1838       if (e->Iex.CCall.retty == Ity_I64)
   1839          addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
   1840       else
   1841          addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
   1842 
   1843       return dst;
   1844    }
   1845 
   1846    /* --------- LITERAL --------- */
   1847    /* 64/32/16/8-bit literals */
   1848    case Iex_Const:
   1849       if (ty == Ity_I64) {
   1850          HReg r = newVRegI(env);
   1851          addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
   1852          return r;
   1853       } else {
   1854          AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
   1855          HReg      r   = newVRegI(env);
   1856          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
   1857          return r;
   1858       }
   1859 
   1860    /* --------- MULTIPLEX --------- */
   1861    case Iex_ITE: { // VFD
   1862       if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
   1863           && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) {
   1864          HReg     r1  = iselIntExpr_R(env, e->Iex.ITE.iftrue);
   1865          AMD64RM* r0  = iselIntExpr_RM(env, e->Iex.ITE.iffalse);
   1866          HReg     dst = newVRegI(env);
   1867          addInstr(env, mk_iMOVsd_RR(r1,dst));
   1868          AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
   1869          addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst));
   1870          return dst;
   1871       }
   1872       break;
   1873    }
   1874 
   1875    /* --------- TERNARY OP --------- */
   1876    case Iex_Triop: {
   1877       IRTriop *triop = e->Iex.Triop.details;
   1878       /* C3210 flags following FPU partial remainder (fprem), both
   1879          IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
   1880       if (triop->op == Iop_PRemC3210F64
   1881           || triop->op == Iop_PRem1C3210F64) {
   1882          AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   1883          HReg        arg1   = iselDblExpr(env, triop->arg2);
   1884          HReg        arg2   = iselDblExpr(env, triop->arg3);
   1885          HReg        dst    = newVRegI(env);
   1886          addInstr(env, AMD64Instr_A87Free(2));
   1887 
   1888          /* one arg -> top of x87 stack */
   1889          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
   1890          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   1891 
   1892          /* other arg -> top of x87 stack */
   1893          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
   1894          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   1895 
   1896          switch (triop->op) {
   1897             case Iop_PRemC3210F64:
   1898                addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
   1899                break;
   1900             case Iop_PRem1C3210F64:
   1901                addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
   1902                break;
   1903             default:
   1904                vassert(0);
   1905          }
   1906          /* Ignore the result, and instead make off with the FPU's
    1907             C3210 flags (in the status word). */
   1908          addInstr(env, AMD64Instr_A87StSW(m8_rsp));
   1909          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
   1910          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
   1911          return dst;
   1912       }
   1913       break;
   1914    }
   1915 
   1916    default:
    1917       break;
   1918    } /* switch (e->tag) */
   1919 
   1920    /* We get here if no pattern matched. */
   1921   irreducible:
   1922    ppIRExpr(e);
   1923    vpanic("iselIntExpr_R(amd64): cannot reduce tree");
   1924 }
   1925 
   1926 
   1927 /*---------------------------------------------------------*/
   1928 /*--- ISEL: Integer expression auxiliaries              ---*/
   1929 /*---------------------------------------------------------*/
   1930 
   1931 /* --------------------- AMODEs --------------------- */
   1932 
   1933 /* Return an AMode which computes the value of the specified
   1934    expression, possibly also adding insns to the code list as a
    1935    result.  The expression may only be a 64-bit one.
   1936 */
   1937 
   1938 static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
   1939 {
   1940    AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
   1941    vassert(sane_AMode(am));
   1942    return am;
   1943 }
   1944 
   1945 /* DO NOT CALL THIS DIRECTLY ! */
   1946 static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
   1947 {
   1948    MatchInfo mi;
   1949    DECLARE_PATTERN(p_complex);
   1950    IRType ty = typeOfIRExpr(env->type_env,e);
   1951    vassert(ty == Ity_I64);
   1952 
   1953    /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
   1954    /*              bind0        bind1  bind2   bind3   */
   1955    DEFINE_PATTERN(p_complex,
   1956       binop( Iop_Add64,
   1957              binop( Iop_Add64,
   1958                     bind(0),
   1959                     binop(Iop_Shl64, bind(1), bind(2))
   1960                   ),
   1961              bind(3)
   1962            )
   1963    );
   1964    if (matchIRExpr(&mi, p_complex, e)) {
   1965       IRExpr* expr1  = mi.bindee[0];
   1966       IRExpr* expr2  = mi.bindee[1];
   1967       IRExpr* imm8   = mi.bindee[2];
   1968       IRExpr* simm32 = mi.bindee[3];
   1969       if (imm8->tag == Iex_Const
   1970           && imm8->Iex.Const.con->tag == Ico_U8
   1971           && imm8->Iex.Const.con->Ico.U8 < 4
   1972           /* imm8 is OK, now check simm32 */
   1973           && simm32->tag == Iex_Const
   1974           && simm32->Iex.Const.con->tag == Ico_U64
   1975           && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
   1976          UInt shift = imm8->Iex.Const.con->Ico.U8;
   1977          UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
   1978          HReg r1 = iselIntExpr_R(env, expr1);
   1979          HReg r2 = iselIntExpr_R(env, expr2);
   1980          vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
   1981          return AMD64AMode_IRRS(offset, r1, r2, shift);
   1982       }
   1983    }
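            /* Worked example (hypothetical IR): for
               Add64(Add64(t1, Shl64(t2, 3:I8)), 0x30:I64) the binds are
               expr1=t1, expr2=t2, imm8=3, simm32=0x30, and the whole tree
               collapses to the single amode 0x30(t1,t2,8), i.e.
               base + index*2^shift + offset. */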
   1984 
   1985    /* Add64(expr1, Shl64(expr2, imm)) */
   1986    if (e->tag == Iex_Binop
   1987        && e->Iex.Binop.op == Iop_Add64
   1988        && e->Iex.Binop.arg2->tag == Iex_Binop
   1989        && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
   1990        && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
   1991        && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
   1992       UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
   1993       if (shift == 1 || shift == 2 || shift == 3) {
   1994          HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   1995          HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
   1996          return AMD64AMode_IRRS(0, r1, r2, shift);
   1997       }
   1998    }
   1999 
   2000    /* Add64(expr,i) */
   2001    if (e->tag == Iex_Binop
   2002        && e->Iex.Binop.op == Iop_Add64
   2003        && e->Iex.Binop.arg2->tag == Iex_Const
   2004        && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
   2005        && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
   2006       HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2007       return AMD64AMode_IR(
   2008                 toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
   2009                 r1
   2010              );
   2011    }
   2012 
   2013    /* Doesn't match anything in particular.  Generate it into
   2014       a register and use that. */
   2015    {
   2016       HReg r1 = iselIntExpr_R(env, e);
   2017       return AMD64AMode_IR(0, r1);
   2018    }
   2019 }
   2020 
   2021 
   2022 /* --------------------- RMIs --------------------- */
   2023 
    2024 /* Similarly, calculate an expression into an AMD64RMI operand.  As with
    2025    iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits.  */
   2026 
   2027 static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
   2028 {
   2029    AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
   2030    /* sanity checks ... */
   2031    switch (rmi->tag) {
   2032       case Armi_Imm:
   2033          return rmi;
   2034       case Armi_Reg:
   2035          vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
   2036          vassert(hregIsVirtual(rmi->Armi.Reg.reg));
   2037          return rmi;
   2038       case Armi_Mem:
   2039          vassert(sane_AMode(rmi->Armi.Mem.am));
   2040          return rmi;
   2041       default:
   2042          vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
   2043    }
   2044 }
   2045 
   2046 /* DO NOT CALL THIS DIRECTLY ! */
   2047 static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
   2048 {
   2049    IRType ty = typeOfIRExpr(env->type_env,e);
   2050    vassert(ty == Ity_I64 || ty == Ity_I32
   2051            || ty == Ity_I16 || ty == Ity_I8);
   2052 
   2053    /* special case: immediate 64/32/16/8 */
   2054    if (e->tag == Iex_Const) {
   2055       switch (e->Iex.Const.con->tag) {
    2056          case Ico_U64:
    2057             if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
    2058                return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
    2059             }
    2060             break;
    2061          case Ico_U32:
    2062             return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32);
    2063          case Ico_U16:
    2064             return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
    2065          case Ico_U8:
    2066             return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
   2067          default:
   2068             vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
   2069       }
   2070    }
   2071 
   2072    /* special case: 64-bit GET */
   2073    if (e->tag == Iex_Get && ty == Ity_I64) {
   2074       return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
   2075                                         hregAMD64_RBP()));
   2076    }
   2077 
   2078    /* special case: 64-bit load from memory */
   2079    if (e->tag == Iex_Load && ty == Ity_I64
   2080        && e->Iex.Load.end == Iend_LE) {
   2081       AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2082       return AMD64RMI_Mem(am);
   2083    }
   2084 
   2085    /* default case: calculate into a register and return that */
   2086    {
   2087       HReg r = iselIntExpr_R ( env, e );
   2088       return AMD64RMI_Reg(r);
   2089    }
   2090 }
   2091 
   2092 
   2093 /* --------------------- RIs --------------------- */
   2094 
   2095 /* Calculate an expression into an AMD64RI operand.  As with
   2096    iselIntExpr_R, the expression can have type 64, 32, 16 or 8
   2097    bits. */
   2098 
   2099 static AMD64RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
   2100 {
   2101    AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
   2102    /* sanity checks ... */
   2103    switch (ri->tag) {
   2104       case Ari_Imm:
   2105          return ri;
   2106       case Ari_Reg:
   2107          vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
   2108          vassert(hregIsVirtual(ri->Ari.Reg.reg));
   2109          return ri;
   2110       default:
   2111          vpanic("iselIntExpr_RI: unknown amd64 RI tag");
   2112    }
   2113 }
   2114 
   2115 /* DO NOT CALL THIS DIRECTLY ! */
   2116 static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
   2117 {
   2118    IRType ty = typeOfIRExpr(env->type_env,e);
   2119    vassert(ty == Ity_I64 || ty == Ity_I32
   2120            || ty == Ity_I16 || ty == Ity_I8);
   2121 
   2122    /* special case: immediate */
   2123    if (e->tag == Iex_Const) {
   2124       switch (e->Iex.Const.con->tag) {
    2125          case Ico_U64:
    2126             if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
    2127                return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
    2128             }
    2129             break;
   2130          case Ico_U32:
   2131             return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
   2132          case Ico_U16:
   2133             return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
   2134          case Ico_U8:
   2135             return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
   2136          default:
   2137             vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
   2138       }
   2139    }
   2140 
   2141    /* default case: calculate into a register and return that */
   2142    {
   2143       HReg r = iselIntExpr_R ( env, e );
   2144       return AMD64RI_Reg(r);
   2145    }
   2146 }
   2147 
   2148 
   2149 /* --------------------- RMs --------------------- */
   2150 
   2151 /* Similarly, calculate an expression into an AMD64RM operand.  As
   2152    with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
   2153    bits.  */
   2154 
   2155 static AMD64RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
   2156 {
   2157    AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
   2158    /* sanity checks ... */
   2159    switch (rm->tag) {
   2160       case Arm_Reg:
   2161          vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
   2162          vassert(hregIsVirtual(rm->Arm.Reg.reg));
   2163          return rm;
   2164       case Arm_Mem:
   2165          vassert(sane_AMode(rm->Arm.Mem.am));
   2166          return rm;
   2167       default:
   2168          vpanic("iselIntExpr_RM: unknown amd64 RM tag");
   2169    }
   2170 }
   2171 
   2172 /* DO NOT CALL THIS DIRECTLY ! */
   2173 static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
   2174 {
   2175    IRType ty = typeOfIRExpr(env->type_env,e);
   2176    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
   2177 
   2178    /* special case: 64-bit GET */
   2179    if (e->tag == Iex_Get && ty == Ity_I64) {
   2180       return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
   2181                                        hregAMD64_RBP()));
   2182    }
   2183 
   2184    /* special case: load from memory */
   2185 
   2186    /* default case: calculate into a register and return that */
   2187    {
   2188       HReg r = iselIntExpr_R ( env, e );
   2189       return AMD64RM_Reg(r);
   2190    }
   2191 }
   2192 
   2193 
   2194 /* --------------------- CONDCODE --------------------- */
   2195 
    2196 /* Generate code to evaluate a bit-typed expression, returning the
    2197    condition code which would correspond to the expression notionally
    2198    having returned 1. */
   2199 
   2200 static AMD64CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
   2201 {
   2202    /* Uh, there's nothing we can sanity check here, unfortunately. */
   2203    return iselCondCode_wrk(env,e);
   2204 }
   2205 
   2206 /* DO NOT CALL THIS DIRECTLY ! */
   2207 static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
   2208 {
   2209    MatchInfo mi;
   2210 
   2211    vassert(e);
   2212    vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
   2213 
   2214    /* var */
   2215    if (e->tag == Iex_RdTmp) {
   2216       HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2217       HReg dst = newVRegI(env);
   2218       addInstr(env, mk_iMOVsd_RR(r64,dst));
   2219       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
   2220       return Acc_NZ;
   2221    }
   2222 
   2223    /* Constant 1:Bit */
   2224    if (e->tag == Iex_Const) {
   2225       HReg r;
   2226       vassert(e->Iex.Const.con->tag == Ico_U1);
   2227       vassert(e->Iex.Const.con->Ico.U1 == True
   2228               || e->Iex.Const.con->Ico.U1 == False);
   2229       r = newVRegI(env);
   2230       addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
   2231       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
   2232       return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
   2233    }
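            /* A sketch of why this works: the xor of %r with itself always
               sets ZF, so returning Acc_Z for a True constant (taken, since
               ZF is set) and Acc_NZ for False (not taken) makes the
               condition evaluate to the constant; the mov of 0 beforehand
               just initialises %r before the self-xor. */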
   2234 
   2235    /* Not1(...) */
   2236    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
   2237       /* Generate code for the arg, and negate the test condition */
   2238       return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
   2239    }
   2240 
   2241    /* --- patterns rooted at: 64to1 --- */
   2242 
   2243    /* 64to1 */
   2244    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
   2245       HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
   2246       addInstr(env, AMD64Instr_Test64(1,reg));
   2247       return Acc_NZ;
   2248    }
   2249 
   2250    /* --- patterns rooted at: 32to1 --- */
   2251 
   2252    /* 32to1 */
   2253    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) {
   2254       HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
   2255       addInstr(env, AMD64Instr_Test64(1,reg));
   2256       return Acc_NZ;
   2257    }
   2258 
   2259    /* --- patterns rooted at: CmpNEZ8 --- */
   2260 
   2261    /* CmpNEZ8(x) */
   2262    if (e->tag == Iex_Unop
   2263        && e->Iex.Unop.op == Iop_CmpNEZ8) {
   2264       HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
   2265       addInstr(env, AMD64Instr_Test64(0xFF,r));
   2266       return Acc_NZ;
   2267    }
   2268 
   2269    /* --- patterns rooted at: CmpNEZ16 --- */
   2270 
   2271    /* CmpNEZ16(x) */
   2272    if (e->tag == Iex_Unop
   2273        && e->Iex.Unop.op == Iop_CmpNEZ16) {
   2274       HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
   2275       addInstr(env, AMD64Instr_Test64(0xFFFF,r));
   2276       return Acc_NZ;
   2277    }
   2278 
   2279    /* --- patterns rooted at: CmpNEZ32 --- */
   2280 
   2281    /* CmpNEZ32(x) */
   2282    if (e->tag == Iex_Unop
   2283        && e->Iex.Unop.op == Iop_CmpNEZ32) {
   2284       HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
   2285       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
   2286       addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
   2287       return Acc_NZ;
   2288    }
   2289 
   2290    /* --- patterns rooted at: CmpNEZ64 --- */
   2291 
   2292    /* CmpNEZ64(Or64(x,y)) */
   2293    {
   2294       DECLARE_PATTERN(p_CmpNEZ64_Or64);
   2295       DEFINE_PATTERN(p_CmpNEZ64_Or64,
   2296                      unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
   2297       if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
   2298          HReg      r0   = iselIntExpr_R(env, mi.bindee[0]);
   2299          AMD64RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
   2300          HReg      tmp  = newVRegI(env);
   2301          addInstr(env, mk_iMOVsd_RR(r0, tmp));
   2302          addInstr(env, AMD64Instr_Alu64R(Aalu_OR,rmi1,tmp));
   2303          return Acc_NZ;
   2304       }
   2305    }
   2306 
   2307    /* CmpNEZ64(x) */
   2308    if (e->tag == Iex_Unop
   2309        && e->Iex.Unop.op == Iop_CmpNEZ64) {
   2310       HReg      r1   = iselIntExpr_R(env, e->Iex.Unop.arg);
   2311       AMD64RMI* rmi2 = AMD64RMI_Imm(0);
   2312       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
   2313       return Acc_NZ;
   2314    }
   2315 
   2316    /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
   2317 
   2318    /* CmpEQ8 / CmpNE8 */
   2319    if (e->tag == Iex_Binop
   2320        && (e->Iex.Binop.op == Iop_CmpEQ8
   2321            || e->Iex.Binop.op == Iop_CmpNE8
   2322            || e->Iex.Binop.op == Iop_CasCmpEQ8
   2323            || e->Iex.Binop.op == Iop_CasCmpNE8)) {
   2324       if (isZeroU8(e->Iex.Binop.arg2)) {
   2325          HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2326          addInstr(env, AMD64Instr_Test64(0xFF,r1));
   2327          switch (e->Iex.Binop.op) {
   2328             case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
   2329             case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
   2330             default: vpanic("iselCondCode(amd64): CmpXX8(expr,0:I8)");
   2331          }
   2332       } else {
   2333          HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2334          AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2335          HReg      r    = newVRegI(env);
   2336          addInstr(env, mk_iMOVsd_RR(r1,r));
   2337          addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
   2338          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
   2339          switch (e->Iex.Binop.op) {
   2340             case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
   2341             case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
   2342             default: vpanic("iselCondCode(amd64): CmpXX8(expr,expr)");
   2343          }
   2344       }
   2345    }
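            /* In the non-zero-arg2 case the idea is: r = arg1 ^ arg2, then
               r &= 0xFF, which sets ZF exactly when the low bytes were
               equal; e.g. 0x1234 vs 0x5634 xors to 0x4400 and ands to 0,
               so ZF is set and the bytes compare equal.  Hence Acc_Z for
               the EQ forms and Acc_NZ for the NE forms. */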
   2346 
   2347    /* CmpEQ16 / CmpNE16 */
   2348    if (e->tag == Iex_Binop
   2349        && (e->Iex.Binop.op == Iop_CmpEQ16
   2350            || e->Iex.Binop.op == Iop_CmpNE16
   2351            || e->Iex.Binop.op == Iop_CasCmpEQ16
   2352            || e->Iex.Binop.op == Iop_CasCmpNE16)) {
   2353       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2354       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2355       HReg      r    = newVRegI(env);
   2356       addInstr(env, mk_iMOVsd_RR(r1,r));
   2357       addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
   2358       addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
   2359       switch (e->Iex.Binop.op) {
   2360          case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
   2361          case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
   2362          default: vpanic("iselCondCode(amd64): CmpXX16");
   2363       }
   2364    }
   2365 
   2366    /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation).
   2367       Saves a "movq %rax, %tmp" compared to the default route. */
   2368    if (e->tag == Iex_Binop
   2369        && e->Iex.Binop.op == Iop_CmpNE64
   2370        && e->Iex.Binop.arg1->tag == Iex_CCall
   2371        && e->Iex.Binop.arg2->tag == Iex_Const) {
   2372       IRExpr* cal = e->Iex.Binop.arg1;
   2373       IRExpr* con = e->Iex.Binop.arg2;
   2374       HReg    tmp = newVRegI(env);
   2375       /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */
   2376       vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */
   2377       vassert(con->Iex.Const.con->tag == Ico_U64);
   2378       /* Marshal args, do the call. */
   2379       UInt   addToSp = 0;
   2380       RetLoc rloc    = mk_RetLoc_INVALID();
   2381       doHelperCall( &addToSp, &rloc, env, NULL/*guard*/,
   2382                     cal->Iex.CCall.cee,
   2383                     cal->Iex.CCall.retty, cal->Iex.CCall.args );
   2384       vassert(is_sane_RetLoc(rloc));
   2385       vassert(rloc.pri == RLPri_Int);
   2386       vassert(addToSp == 0);
   2387       /* */
   2388       addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp));
   2389       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,
   2390                                       AMD64RMI_Reg(hregAMD64_RAX()), tmp));
   2391       return Acc_NZ;
   2392    }
   2393 
   2394    /* Cmp*64*(x,y) */
   2395    if (e->tag == Iex_Binop
   2396        && (e->Iex.Binop.op == Iop_CmpEQ64
   2397            || e->Iex.Binop.op == Iop_CmpNE64
   2398            || e->Iex.Binop.op == Iop_CmpLT64S
   2399            || e->Iex.Binop.op == Iop_CmpLT64U
   2400            || e->Iex.Binop.op == Iop_CmpLE64S
   2401            || e->Iex.Binop.op == Iop_CmpLE64U
   2402            || e->Iex.Binop.op == Iop_CasCmpEQ64
   2403            || e->Iex.Binop.op == Iop_CasCmpNE64
   2404            || e->Iex.Binop.op == Iop_ExpCmpNE64)) {
   2405       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2406       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2407       addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
   2408       switch (e->Iex.Binop.op) {
   2409          case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
   2410          case Iop_CmpNE64:
   2411          case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ;
    2412          case Iop_CmpLT64S: return Acc_L;
    2413          case Iop_CmpLT64U: return Acc_B;
    2414          case Iop_CmpLE64S: return Acc_LE;
   2415          case Iop_CmpLE64U: return Acc_BE;
   2416          default: vpanic("iselCondCode(amd64): CmpXX64");
   2417       }
   2418    }
   2419 
   2420    /* Cmp*32*(x,y) */
   2421    if (e->tag == Iex_Binop
   2422        && (e->Iex.Binop.op == Iop_CmpEQ32
   2423            || e->Iex.Binop.op == Iop_CmpNE32
   2424            || e->Iex.Binop.op == Iop_CmpLT32S
   2425            || e->Iex.Binop.op == Iop_CmpLT32U
   2426            || e->Iex.Binop.op == Iop_CmpLE32S
   2427            || e->Iex.Binop.op == Iop_CmpLE32U
   2428            || e->Iex.Binop.op == Iop_CasCmpEQ32
   2429            || e->Iex.Binop.op == Iop_CasCmpNE32
   2430            || e->Iex.Binop.op == Iop_ExpCmpNE32)) {
   2431       HReg      r1   = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2432       AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   2433       addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1));
   2434       switch (e->Iex.Binop.op) {
   2435          case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
   2436          case Iop_CmpNE32:
   2437          case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ;
    2438          case Iop_CmpLT32S: return Acc_L;
    2439          case Iop_CmpLT32U: return Acc_B;
    2440          case Iop_CmpLE32S: return Acc_LE;
   2441          case Iop_CmpLE32U: return Acc_BE;
   2442          default: vpanic("iselCondCode(amd64): CmpXX32");
   2443       }
   2444    }
   2445 
   2446    ppIRExpr(e);
   2447    vpanic("iselCondCode(amd64)");
   2448 }
   2449 
   2450 
   2451 /*---------------------------------------------------------*/
   2452 /*--- ISEL: Integer expressions (128 bit)               ---*/
   2453 /*---------------------------------------------------------*/
   2454 
   2455 /* Compute a 128-bit value into a register pair, which is returned as
   2456    the first two parameters.  As with iselIntExpr_R, these may be
   2457    either real or virtual regs; in any case they must not be changed
   2458    by subsequent code emitted by the caller.  */
   2459 
   2460 static void iselInt128Expr ( HReg* rHi, HReg* rLo,
   2461                              ISelEnv* env, IRExpr* e )
   2462 {
   2463    iselInt128Expr_wrk(rHi, rLo, env, e);
   2464 #  if 0
   2465    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2466 #  endif
   2467    vassert(hregClass(*rHi) == HRcInt64);
   2468    vassert(hregIsVirtual(*rHi));
   2469    vassert(hregClass(*rLo) == HRcInt64);
   2470    vassert(hregIsVirtual(*rLo));
   2471 }
   2472 
   2473 /* DO NOT CALL THIS DIRECTLY ! */
   2474 static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
   2475                                  ISelEnv* env, IRExpr* e )
   2476 {
   2477    vassert(e);
   2478    vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
   2479 
   2480    /* read 128-bit IRTemp */
   2481    if (e->tag == Iex_RdTmp) {
   2482       lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
   2483       return;
   2484    }
   2485 
   2486    /* --------- BINARY ops --------- */
   2487    if (e->tag == Iex_Binop) {
   2488       switch (e->Iex.Binop.op) {
   2489          /* 64 x 64 -> 128 multiply */
   2490          case Iop_MullU64:
   2491          case Iop_MullS64: {
    2492             /* Get one operand into %rax, and the other into an R/M.
    2493                We need to make an educated guess about which operand is
    2494                better placed in which position. */
   2495             HReg     tLo    = newVRegI(env);
   2496             HReg     tHi    = newVRegI(env);
   2497             Bool     syned  = toBool(e->Iex.Binop.op == Iop_MullS64);
   2498             AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
   2499             HReg     rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2500             addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
   2501             addInstr(env, AMD64Instr_MulL(syned, rmLeft));
   2502             /* Result is now in RDX:RAX.  Tell the caller. */
   2503             addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
   2504             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
   2505             *rHi = tHi;
   2506             *rLo = tLo;
   2507             return;
   2508          }
   2509 
   2510          /* 128 x 64 -> (64(rem),64(div)) division */
   2511          case Iop_DivModU128to64:
   2512          case Iop_DivModS128to64: {
   2513             /* Get the 128-bit operand into rdx:rax, and the other into
   2514                any old R/M. */
   2515             HReg sHi, sLo;
   2516             HReg     tLo     = newVRegI(env);
   2517             HReg     tHi     = newVRegI(env);
   2518             Bool     syned   = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
   2519             AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
   2520             iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
   2521             addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
   2522             addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
   2523             addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
   2524             addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
   2525             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
   2526             *rHi = tHi;
   2527             *rLo = tLo;
   2528             return;
   2529          }
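                  /* For reference: divq/idivq divides %rdx:%rax by the
                     r/m operand, leaving the quotient in %rax and the
                     remainder in %rdx -- hence the copies into tLo (div)
                     and tHi (rem). */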
   2530 
   2531          /* 64HLto128(e1,e2) */
   2532          case Iop_64HLto128:
   2533             *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
   2534             *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2535             return;
   2536 
   2537          default:
   2538             break;
   2539       }
   2540    } /* if (e->tag == Iex_Binop) */
   2541 
   2542    ppIRExpr(e);
   2543    vpanic("iselInt128Expr");
   2544 }
   2545 
   2546 
   2547 /*---------------------------------------------------------*/
   2548 /*--- ISEL: Floating point expressions (32 bit)         ---*/
   2549 /*---------------------------------------------------------*/
   2550 
   2551 /* Nothing interesting here; really just wrappers for
   2552    64-bit stuff. */
   2553 
   2554 static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
   2555 {
   2556    HReg r = iselFltExpr_wrk( env, e );
   2557 #  if 0
   2558    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2559 #  endif
   2560    vassert(hregClass(r) == HRcVec128);
   2561    vassert(hregIsVirtual(r));
   2562    return r;
   2563 }
   2564 
   2565 /* DO NOT CALL THIS DIRECTLY */
   2566 static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
   2567 {
   2568    IRType ty = typeOfIRExpr(env->type_env,e);
   2569    vassert(ty == Ity_F32);
   2570 
   2571    if (e->tag == Iex_RdTmp) {
   2572       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2573    }
   2574 
   2575    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2576       AMD64AMode* am;
   2577       HReg res = newVRegV(env);
   2578       vassert(e->Iex.Load.ty == Ity_F32);
   2579       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2580       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
   2581       return res;
   2582    }
   2583 
   2584    if (e->tag == Iex_Binop
   2585        && e->Iex.Binop.op == Iop_F64toF32) {
   2586       /* Although the result is still held in a standard SSE register,
   2587          we need to round it to reflect the loss of accuracy/range
   2588          entailed in casting it to a 32-bit float. */
   2589       HReg dst = newVRegV(env);
   2590       HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
   2591       set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
   2592       addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
   2593       set_SSE_rounding_default( env );
   2594       return dst;
   2595    }
   2596 
   2597    if (e->tag == Iex_Get) {
   2598       AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
   2599                                        hregAMD64_RBP() );
   2600       HReg res = newVRegV(env);
   2601       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
   2602       return res;
   2603    }
   2604 
   2605    if (e->tag == Iex_Unop
   2606        && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
   2607        /* Given an I32, produce an IEEE754 float with the same bit
   2608           pattern. */
   2609        HReg        dst    = newVRegV(env);
   2610        HReg        src    = iselIntExpr_R(env, e->Iex.Unop.arg);
   2611        AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
   2612        addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
   2613        addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
   2614        return dst;
   2615    }
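   /* Reference model for the reinterpretation above (sketch only, not part
      of the build; the host's 'float'/'UInt' stand in for Ity_F32/Ity_I32
      and the _ref name is just for illustration): */
   #  if 0
   static float reinterpI32asF32_ref ( UInt bits )
   {
      union { UInt u; float f; } u;
      u.u = bits;
      return u.f;      /* same 32 bits, now viewed as an IEEE754 float */
   }
   #  endif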
   2616 
   2617    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
   2618       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   2619       HReg        arg    = iselFltExpr(env, e->Iex.Binop.arg2);
   2620       HReg        dst    = newVRegV(env);
   2621 
    2622       /* arg now holds the value to be rounded.  The first thing to do
    2623          is set the FPU's rounding mode accordingly. */
   2624 
   2625       /* Set host x87 rounding mode */
   2626       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2627 
   2628       addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
   2629       addInstr(env, AMD64Instr_A87Free(1));
   2630       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
   2631       addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
   2632       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
   2633       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
   2634 
   2635       /* Restore default x87 rounding. */
   2636       set_FPU_rounding_default( env );
   2637 
   2638       return dst;
   2639    }
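   /* In C terms the case above behaves roughly like nearbyintf(): round to
      an integral value, still in F32 format, honouring whichever rounding
      mode arg1 selected (sketch only; assumes <math.h>): */
   #  if 0
   static float roundF32toInt_ref ( float x )
   {
      return nearbyintf(x);   /* rounds per the current FP rounding mode */
   }
   #  endif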
   2640 
   2641    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) {
   2642       /* Sigh ... very rough code.  Could do much better. */
   2643       /* Get the 128-bit literal 00---0 10---0 into a register
   2644          and xor it with the value to be negated. */
   2645       HReg r1  = newVRegI(env);
   2646       HReg dst = newVRegV(env);
   2647       HReg tmp = newVRegV(env);
   2648       HReg src = iselFltExpr(env, e->Iex.Unop.arg);
   2649       AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   2650       addInstr(env, mk_vMOVsd_RR(src,tmp));
   2651       addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
   2652       addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 ));
   2653       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
   2654       addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
   2655       addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
   2656       add_to_rsp(env, 16);
   2657       return dst;
   2658    }
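   /* Bit-level effect of the XOR above on the low 32-bit lane (sketch only;
      UInt stands in for the F32 bit pattern): */
   #  if 0
   static UInt negF32_bits_ref ( UInt x )
   {
      return x ^ (1u << 31);   /* flip just the sign bit */
   }
   #  endif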
   2659 
   2660    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) {
   2661       IRQop *qop = e->Iex.Qop.details;
   2662       HReg dst  = newVRegV(env);
   2663       HReg argX = iselFltExpr(env, qop->arg2);
   2664       HReg argY = iselFltExpr(env, qop->arg3);
   2665       HReg argZ = iselFltExpr(env, qop->arg4);
   2666       /* XXXROUNDINGFIXME */
   2667       /* set roundingmode here */
   2668       /* subq $16, %rsp         -- make a space*/
   2669       sub_from_rsp(env, 16);
   2670       /* Prepare 4 arg regs:
   2671          leaq 0(%rsp), %rdi
   2672          leaq 4(%rsp), %rsi
   2673          leaq 8(%rsp), %rdx
   2674          leaq 12(%rsp), %rcx
   2675       */
   2676       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
   2677                                      hregAMD64_RDI()));
   2678       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()),
   2679                                      hregAMD64_RSI()));
   2680       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
   2681                                      hregAMD64_RDX()));
   2682       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()),
   2683                                      hregAMD64_RCX()));
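      /* The 16-byte scratch area is thus laid out as:
            0(%rsp)   result (written by the helper, reloaded below)
            4(%rsp)   argX
            8(%rsp)   argY
           12(%rsp)   argZ
         with %rdi..%rcx carrying the four addresses. */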
   2684       /* Store the three args, at (%rsi), (%rdx) and (%rcx):
   2685          movss  %argX, 0(%rsi)
   2686          movss  %argY, 0(%rdx)
   2687          movss  %argZ, 0(%rcx)
   2688          */
   2689       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX,
   2690                                        AMD64AMode_IR(0, hregAMD64_RSI())));
   2691       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY,
   2692                                        AMD64AMode_IR(0, hregAMD64_RDX())));
   2693       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ,
   2694                                        AMD64AMode_IR(0, hregAMD64_RCX())));
   2695       /* call the helper */
   2696       addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
   2697                                      (ULong)(HWord)h_generic_calc_MAddF32,
   2698                                      4, mk_RetLoc_simple(RLPri_None) ));
    2699       /* fetch the result back from 0(%rsp), which is where the
    2700          helper was told (via %rdi) to write it. */
   2701       addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst,
   2702                                        AMD64AMode_IR(0, hregAMD64_RSP())));
   2703       /* and finally, clear the space */
   2704       add_to_rsp(env, 16);
   2705       return dst;
   2706    }
   2707 
   2708    ppIRExpr(e);
   2709    vpanic("iselFltExpr_wrk");
   2710 }
   2711 
   2712 
   2713 /*---------------------------------------------------------*/
   2714 /*--- ISEL: Floating point expressions (64 bit)         ---*/
   2715 /*---------------------------------------------------------*/
   2716 
   2717 /* Compute a 64-bit floating point value into the lower half of an xmm
   2718    register, the identity of which is returned.  As with
   2719    iselIntExpr_R, the returned reg will be virtual, and it must not be
   2720    changed by subsequent code emitted by the caller.
   2721 */
   2722 
   2723 /* IEEE 754 formats.  From http://www.freesoft.org/CIE/RFC/1832/32.htm:
   2724 
   2725     Type                  S (1 bit)   E (11 bits)   F (52 bits)
   2726     ----                  ---------   -----------   -----------
   2727     signalling NaN        u           2047 (max)    .0uuuuu---u
   2728                                                     (with at least
   2729                                                      one 1 bit)
   2730     quiet NaN             u           2047 (max)    .1uuuuu---u
   2731 
   2732     negative infinity     1           2047 (max)    .000000---0
   2733 
   2734     positive infinity     0           2047 (max)    .000000---0
   2735 
   2736     negative zero         1           0             .000000---0
   2737 
   2738     positive zero         0           0             .000000---0
   2739 */
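         /* Some concrete 64-bit encodings of the above, for quick reference:
         
               0x7FF0000000000000   +infinity    (S=0, E=2047, F=0)
               0xFFF0000000000000   -infinity    (S=1, E=2047, F=0)
               0x7FF8000000000000   a quiet NaN  (S=0, E=2047, F=.100---0)
               0x8000000000000000   -0.0         (S=1, E=0,    F=0)
               0x3FF0000000000000   +1.0         (S=0, E=1023, F=0)
         */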
   2740 
   2741 static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
   2742 {
   2743    HReg r = iselDblExpr_wrk( env, e );
   2744 #  if 0
   2745    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   2746 #  endif
   2747    vassert(hregClass(r) == HRcVec128);
   2748    vassert(hregIsVirtual(r));
   2749    return r;
   2750 }
   2751 
   2752 /* DO NOT CALL THIS DIRECTLY */
   2753 static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
   2754 {
   2755    IRType ty = typeOfIRExpr(env->type_env,e);
   2756    vassert(e);
   2757    vassert(ty == Ity_F64);
   2758 
   2759    if (e->tag == Iex_RdTmp) {
   2760       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   2761    }
   2762 
   2763    if (e->tag == Iex_Const) {
   2764       union { ULong u64; Double f64; } u;
   2765       HReg res = newVRegV(env);
   2766       HReg tmp = newVRegI(env);
   2767       vassert(sizeof(u) == 8);
   2768       vassert(sizeof(u.u64) == 8);
   2769       vassert(sizeof(u.f64) == 8);
   2770 
   2771       if (e->Iex.Const.con->tag == Ico_F64) {
   2772          u.f64 = e->Iex.Const.con->Ico.F64;
   2773       }
   2774       else if (e->Iex.Const.con->tag == Ico_F64i) {
   2775          u.u64 = e->Iex.Const.con->Ico.F64i;
   2776       }
   2777       else
   2778          vpanic("iselDblExpr(amd64): const");
   2779 
   2780       addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
   2781       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
   2782       addInstr(env, AMD64Instr_SseLdSt(
   2783                        True/*load*/, 8, res,
   2784                        AMD64AMode_IR(0, hregAMD64_RSP())
   2785               ));
   2786       add_to_rsp(env, 8);
   2787       return res;
   2788    }
   2789 
   2790    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   2791       AMD64AMode* am;
   2792       HReg res = newVRegV(env);
   2793       vassert(e->Iex.Load.ty == Ity_F64);
   2794       am = iselIntExpr_AMode(env, e->Iex.Load.addr);
   2795       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
   2796       return res;
   2797    }
   2798 
   2799    if (e->tag == Iex_Get) {
   2800       AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
   2801                                       hregAMD64_RBP() );
   2802       HReg res = newVRegV(env);
   2803       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
   2804       return res;
   2805    }
   2806 
   2807    if (e->tag == Iex_GetI) {
   2808       AMD64AMode* am
   2809          = genGuestArrayOffset(
   2810               env, e->Iex.GetI.descr,
   2811                    e->Iex.GetI.ix, e->Iex.GetI.bias );
   2812       HReg res = newVRegV(env);
   2813       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
   2814       return res;
   2815    }
   2816 
   2817    if (e->tag == Iex_Triop) {
   2818       IRTriop *triop = e->Iex.Triop.details;
   2819       AMD64SseOp op = Asse_INVALID;
   2820       switch (triop->op) {
   2821          case Iop_AddF64: op = Asse_ADDF; break;
   2822          case Iop_SubF64: op = Asse_SUBF; break;
   2823          case Iop_MulF64: op = Asse_MULF; break;
   2824          case Iop_DivF64: op = Asse_DIVF; break;
   2825          default: break;
   2826       }
   2827       if (op != Asse_INVALID) {
   2828          HReg dst  = newVRegV(env);
   2829          HReg argL = iselDblExpr(env, triop->arg2);
   2830          HReg argR = iselDblExpr(env, triop->arg3);
   2831          addInstr(env, mk_vMOVsd_RR(argL, dst));
   2832          /* XXXROUNDINGFIXME */
   2833          /* set roundingmode here */
   2834          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
   2835          return dst;
   2836       }
   2837    }
   2838 
   2839    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) {
   2840       IRQop *qop = e->Iex.Qop.details;
   2841       HReg dst  = newVRegV(env);
   2842       HReg argX = iselDblExpr(env, qop->arg2);
   2843       HReg argY = iselDblExpr(env, qop->arg3);
   2844       HReg argZ = iselDblExpr(env, qop->arg4);
   2845       /* XXXROUNDINGFIXME */
   2846       /* set roundingmode here */
   2847       /* subq $32, %rsp         -- make a space*/
   2848       sub_from_rsp(env, 32);
   2849       /* Prepare 4 arg regs:
   2850          leaq 0(%rsp), %rdi
   2851          leaq 8(%rsp), %rsi
   2852          leaq 16(%rsp), %rdx
   2853          leaq 24(%rsp), %rcx
   2854       */
   2855       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()),
   2856                                      hregAMD64_RDI()));
   2857       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()),
   2858                                      hregAMD64_RSI()));
   2859       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()),
   2860                                      hregAMD64_RDX()));
   2861       addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()),
   2862                                      hregAMD64_RCX()));
   2863       /* Store the three args, at (%rsi), (%rdx) and (%rcx):
   2864          movsd  %argX, 0(%rsi)
   2865          movsd  %argY, 0(%rdx)
   2866          movsd  %argZ, 0(%rcx)
   2867          */
   2868       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX,
   2869                                        AMD64AMode_IR(0, hregAMD64_RSI())));
   2870       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY,
   2871                                        AMD64AMode_IR(0, hregAMD64_RDX())));
   2872       addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ,
   2873                                        AMD64AMode_IR(0, hregAMD64_RCX())));
   2874       /* call the helper */
   2875       addInstr(env, AMD64Instr_Call( Acc_ALWAYS,
   2876                                      (ULong)(HWord)h_generic_calc_MAddF64,
   2877                                      4, mk_RetLoc_simple(RLPri_None) ));
    2878       /* fetch the result back from 0(%rsp), which is where the
    2879          helper was told (via %rdi) to write it. */
   2880       addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst,
   2881                                        AMD64AMode_IR(0, hregAMD64_RSP())));
   2882       /* and finally, clear the space */
   2883       add_to_rsp(env, 32);
   2884       return dst;
   2885    }
   2886 
   2887    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
   2888       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   2889       HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
   2890       HReg        dst    = newVRegV(env);
   2891 
    2892       /* arg now holds the value to be rounded.  The first thing to do
    2893          is set the FPU's rounding mode accordingly. */
   2894 
   2895       /* Set host x87 rounding mode */
   2896       set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
   2897 
   2898       addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
   2899       addInstr(env, AMD64Instr_A87Free(1));
   2900       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   2901       addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
   2902       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   2903       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   2904 
   2905       /* Restore default x87 rounding. */
   2906       set_FPU_rounding_default( env );
   2907 
   2908       return dst;
   2909    }
   2910 
    2911    IRTriop *triop = e->tag == Iex_Triop ? e->Iex.Triop.details : NULL;
   2912    if (e->tag == Iex_Triop
   2913        && (triop->op == Iop_ScaleF64
   2914            || triop->op == Iop_AtanF64
   2915            || triop->op == Iop_Yl2xF64
   2916            || triop->op == Iop_Yl2xp1F64
   2917            || triop->op == Iop_PRemF64
   2918            || triop->op == Iop_PRem1F64)
   2919       ) {
   2920       AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   2921       HReg        arg1   = iselDblExpr(env, triop->arg2);
   2922       HReg        arg2   = iselDblExpr(env, triop->arg3);
   2923       HReg        dst    = newVRegV(env);
   2924       Bool     arg2first = toBool(triop->op == Iop_ScaleF64
   2925                                   || triop->op == Iop_PRemF64
   2926                                   || triop->op == Iop_PRem1F64);
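            /* (arg2first means arg2 is pushed first, below, and so ends up
               in ST(1): fscale, fprem and fprem1 take their scale factor /
               divisor from ST(1) and operate on ST(0).) */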
   2927       addInstr(env, AMD64Instr_A87Free(2));
   2928 
   2929       /* one arg -> top of x87 stack */
   2930       addInstr(env, AMD64Instr_SseLdSt(
   2931                        False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
   2932       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   2933 
   2934       /* other arg -> top of x87 stack */
   2935       addInstr(env, AMD64Instr_SseLdSt(
   2936                        False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
   2937       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   2938 
   2939       /* do it */
   2940       /* XXXROUNDINGFIXME */
   2941       /* set roundingmode here */
   2942       switch (triop->op) {
   2943          case Iop_ScaleF64:
   2944             addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
   2945             break;
   2946          case Iop_AtanF64:
   2947             addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
   2948             break;
   2949          case Iop_Yl2xF64:
   2950             addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
   2951             break;
   2952          case Iop_Yl2xp1F64:
   2953             addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
   2954             break;
   2955          case Iop_PRemF64:
   2956             addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
   2957             break;
   2958          case Iop_PRem1F64:
   2959             addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
   2960             break;
   2961          default:
   2962             vassert(0);
   2963       }
   2964 
   2965       /* save result */
   2966       addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   2967       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   2968       return dst;
   2969    }
   2970 
   2971    if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
   2972       HReg dst = newVRegV(env);
   2973       HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
   2974       set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
   2975       addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
   2976       set_SSE_rounding_default( env );
   2977       return dst;
   2978    }
   2979 
   2980    if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
   2981       HReg dst = newVRegV(env);
   2982       HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
   2983       set_SSE_rounding_default( env );
   2984       addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
   2985       return dst;
   2986    }
   2987 
   2988    if (e->tag == Iex_Unop
   2989        && (e->Iex.Unop.op == Iop_NegF64
   2990            || e->Iex.Unop.op == Iop_AbsF64)) {
   2991       /* Sigh ... very rough code.  Could do much better. */
   2992       /* Get the 128-bit literal 00---0 10---0 into a register
   2993          and xor/nand it with the value to be negated. */
   2994       HReg r1  = newVRegI(env);
   2995       HReg dst = newVRegV(env);
   2996       HReg tmp = newVRegV(env);
   2997       HReg src = iselDblExpr(env, e->Iex.Unop.arg);
   2998       AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   2999       addInstr(env, mk_vMOVsd_RR(src,tmp));
   3000       addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
   3001       addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
   3002       addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
   3003       addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
   3004 
   3005       if (e->Iex.Unop.op == Iop_NegF64)
   3006          addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
   3007       else
   3008          addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
   3009 
   3010       add_to_rsp(env, 16);
   3011       return dst;
   3012    }
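            /* Bit-level effect on the F64 lane (sketch only; ULong stands in
               for the F64 bit pattern.  Note ANDN computes (~dst) & src, so
               with the mask sitting in dst it clears just the sign bit): */
            #  if 0
            static ULong negF64_bits_ref ( ULong x ) { return x ^  (1ULL << 63); }
            static ULong absF64_bits_ref ( ULong x ) { return x & ~(1ULL << 63); }
            #  endif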
   3013 
   3014    if (e->tag == Iex_Binop) {
   3015       A87FpOp fpop = Afp_INVALID;
   3016       switch (e->Iex.Binop.op) {
   3017          case Iop_SqrtF64: fpop = Afp_SQRT; break;
   3018          case Iop_SinF64:  fpop = Afp_SIN;  break;
   3019          case Iop_CosF64:  fpop = Afp_COS;  break;
   3020          case Iop_TanF64:  fpop = Afp_TAN;  break;
   3021          case Iop_2xm1F64: fpop = Afp_2XM1; break;
   3022          default: break;
   3023       }
   3024       if (fpop != Afp_INVALID) {
   3025          AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   3026          HReg        arg    = iselDblExpr(env, e->Iex.Binop.arg2);
   3027          HReg        dst    = newVRegV(env);
   3028          Int     nNeeded    = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
   3029          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
   3030          addInstr(env, AMD64Instr_A87Free(nNeeded));
   3031          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
   3032          /* XXXROUNDINGFIXME */
   3033          /* set roundingmode here */
   3034          /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition
   3035             codes.  I don't think that matters, since this insn
   3036             selector never generates such an instruction intervening
   3037             between an flag-setting instruction and a flag-using
   3038             instruction. */
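                  /* For reference (sketch): Afp_SQRT -> fsqrt, Afp_SIN -> fsin,
                     Afp_COS -> fcos, Afp_TAN -> fptan, Afp_2XM1 -> f2xm1.
                     fptan pushes an extra 1.0 above its result, which is
                     presumably why nNeeded is 2 for Iop_TanF64. */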
   3039          addInstr(env, AMD64Instr_A87FpOp(fpop));
   3040          addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
   3041          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   3042          return dst;
   3043       }
   3044    }
   3045 
   3046    if (e->tag == Iex_Unop) {
   3047       switch (e->Iex.Unop.op) {
   3048 //..          case Iop_I32toF64: {
   3049 //..             HReg dst = newVRegF(env);
   3050 //..             HReg ri  = iselIntExpr_R(env, e->Iex.Unop.arg);
   3051 //..             addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
   3052 //..             set_FPU_rounding_default(env);
   3053 //..             addInstr(env, X86Instr_FpLdStI(
   3054 //..                              True/*load*/, 4, dst,
   3055 //..                              X86AMode_IR(0, hregX86_ESP())));
   3056 //..             add_to_esp(env, 4);
   3057 //..             return dst;
   3058 //..          }
   3059          case Iop_ReinterpI64asF64: {
   3060             /* Given an I64, produce an IEEE754 double with the same
   3061                bit pattern. */
   3062             AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
   3063             HReg        dst    = newVRegV(env);
   3064             AMD64RI*    src    = iselIntExpr_RI(env, e->Iex.Unop.arg);
   3065             /* paranoia */
   3066             set_SSE_rounding_default(env);
   3067             addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
   3068             addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
   3069             return dst;
   3070          }
   3071          case Iop_F32toF64: {
   3072             HReg f32;
   3073             HReg f64 = newVRegV(env);
   3074             /* this shouldn't be necessary, but be paranoid ... */
   3075             set_SSE_rounding_default(env);
   3076             f32 = iselFltExpr(env, e->Iex.Unop.arg);
   3077             addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
   3078             return f64;
   3079          }
   3080          default:
   3081             break;
   3082       }
   3083    }
   3084 
   3085    /* --------- MULTIPLEX --------- */
   3086    if (e->tag == Iex_ITE) { // VFD
   3087       HReg r1, r0, dst;
   3088       vassert(ty == Ity_F64);
   3089       vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1);
   3090       r1  = iselDblExpr(env, e->Iex.ITE.iftrue);
   3091       r0  = iselDblExpr(env, e->Iex.ITE.iffalse);
   3092       dst = newVRegV(env);
   3093       addInstr(env, mk_vMOVsd_RR(r1,dst));
   3094       AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
   3095       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
   3096       return dst;
   3097    }
   3098 
   3099    ppIRExpr(e);
   3100    vpanic("iselDblExpr_wrk");
   3101 }
   3102 
   3103 
   3104 /*---------------------------------------------------------*/
   3105 /*--- ISEL: SIMD (Vector) expressions, 128 bit.         ---*/
   3106 /*---------------------------------------------------------*/
   3107 
   3108 static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
   3109 {
   3110    HReg r = iselVecExpr_wrk( env, e );
   3111 #  if 0
   3112    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   3113 #  endif
   3114    vassert(hregClass(r) == HRcVec128);
   3115    vassert(hregIsVirtual(r));
   3116    return r;
   3117 }
   3118 
   3119 
   3120 /* DO NOT CALL THIS DIRECTLY */
   3121 static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
   3122 {
   3123    HWord      fn = 0; /* address of helper fn, if required */
   3124    Bool       arg1isEReg = False;
   3125    AMD64SseOp op = Asse_INVALID;
   3126    IRType     ty = typeOfIRExpr(env->type_env,e);
   3127    vassert(e);
   3128    vassert(ty == Ity_V128);
   3129 
   3130    if (e->tag == Iex_RdTmp) {
   3131       return lookupIRTemp(env, e->Iex.RdTmp.tmp);
   3132    }
   3133 
   3134    if (e->tag == Iex_Get) {
   3135       HReg dst = newVRegV(env);
   3136       addInstr(env, AMD64Instr_SseLdSt(
   3137                        True/*load*/,
   3138                        16,
   3139                        dst,
   3140                        AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
   3141                     )
   3142               );
   3143       return dst;
   3144    }
   3145 
   3146    if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
   3147       HReg        dst = newVRegV(env);
   3148       AMD64AMode* am  = iselIntExpr_AMode(env, e->Iex.Load.addr);
   3149       addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
   3150       return dst;
   3151    }
   3152 
   3153    if (e->tag == Iex_Const) {
   3154       HReg dst = newVRegV(env);
   3155       vassert(e->Iex.Const.con->tag == Ico_V128);
   3156       switch (e->Iex.Const.con->Ico.V128) {
   3157          case 0x0000:
   3158             dst = generate_zeroes_V128(env);
   3159             break;
   3160          case 0xFFFF:
   3161             dst = generate_ones_V128(env);
   3162             break;
   3163          default: {
   3164             AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   3165             /* do push_uimm64 twice, first time for the high-order half. */
   3166             push_uimm64(env, bitmask8_to_bytemask64(
   3167                                 (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
   3168                        ));
   3169             push_uimm64(env, bitmask8_to_bytemask64(
   3170                                 (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
   3171                        ));
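                     /* Example (sketch): for Ico_V128 == 0x00A5 the two pushes
                        leave, at 0(%rsp),
                           high 64 bits: bitmask8_to_bytemask64(0x00) = 0
                           low  64 bits: bitmask8_to_bytemask64(0xA5)
                                         = 0xFF00FF0000FF00FF
                        i.e. bit i of the 16-bit constant selects 0xFF or 0x00
                        for byte i of the 128-bit value. */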
   3172             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
   3173             add_to_rsp(env, 16);
   3174             break;
   3175          }
   3176       }
   3177       return dst;
   3178    }
   3179 
   3180    if (e->tag == Iex_Unop) {
   3181    switch (e->Iex.Unop.op) {
   3182 
   3183       case Iop_NotV128: {
   3184          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3185          return do_sse_NotV128(env, arg);
   3186       }
   3187 
   3188       case Iop_CmpNEZ64x2: {
   3189          /* We can use SSE2 instructions for this. */
   3190          /* Ideally, we want to do a 64Ix2 comparison against zero of
   3191             the operand.  Problem is no such insn exists.  Solution
   3192             therefore is to do a 32Ix4 comparison instead, and bitwise-
   3193             negate (NOT) the result.  Let a,b,c,d be 32-bit lanes, and
   3194             let the not'd result of this initial comparison be a:b:c:d.
   3195             What we need to compute is (a|b):(a|b):(c|d):(c|d).  So, use
   3196             pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
   3197             giving the required result.
   3198 
   3199             The required selection sequence is 2,3,0,1, which
   3200             according to Intel's documentation means the pshufd
   3201             literal value is 0xB1, that is,
   3202             (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
   3203          */
   3204          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
   3205          HReg tmp  = generate_zeroes_V128(env);
   3206          HReg dst  = newVRegV(env);
   3207          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
   3208          tmp = do_sse_NotV128(env, tmp);
   3209          addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
   3210          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
   3211          return dst;
   3212       }
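               /* Reference model of what the case above computes, per 64-bit
                  lane (sketch only, not part of the build): */
               #  if 0
               static void cmpNEZ64x2_ref ( ULong* res, const ULong* arg )
               {
                  res[0] = arg[0] == 0 ? 0 : ~0ULL;
                  res[1] = arg[1] == 0 ? 0 : ~0ULL;
               }
               #  endif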
   3213 
   3214       case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
   3215       case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
   3216       case Iop_CmpNEZ8x16: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
   3217       do_CmpNEZ_vector:
   3218       {
   3219          HReg arg  = iselVecExpr(env, e->Iex.Unop.arg);
   3220          HReg tmp  = newVRegV(env);
   3221          HReg zero = generate_zeroes_V128(env);
   3222          HReg dst;
   3223          addInstr(env, mk_vMOVsd_RR(arg, tmp));
   3224          addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
   3225          dst = do_sse_NotV128(env, tmp);
   3226          return dst;
   3227       }
   3228 
   3229       case Iop_Recip32Fx4: op = Asse_RCPF;   goto do_32Fx4_unary;
   3230       case Iop_RSqrt32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
   3231       case Iop_Sqrt32Fx4:  op = Asse_SQRTF;  goto do_32Fx4_unary;
   3232       do_32Fx4_unary:
   3233       {
   3234          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3235          HReg dst = newVRegV(env);
   3236          addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
   3237          return dst;
   3238       }
   3239 
   3240       case Iop_Sqrt64Fx2:  op = Asse_SQRTF;  goto do_64Fx2_unary;
   3241       do_64Fx2_unary:
   3242       {
   3243          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3244          HReg dst = newVRegV(env);
   3245          addInstr(env, AMD64Instr_Sse64Fx2(op, arg, dst));
   3246          return dst;
   3247       }
   3248 
   3249       case Iop_Recip32F0x4: op = Asse_RCPF;   goto do_32F0x4_unary;
   3250       case Iop_RSqrt32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
   3251       case Iop_Sqrt32F0x4:  op = Asse_SQRTF;  goto do_32F0x4_unary;
   3252       do_32F0x4_unary:
   3253       {
    3254          /* A bit subtle.  We have to copy the arg to the result
    3255             register first, because actually doing the SSE scalar insn
    3256             leaves the upper 3/4 of the destination register
    3257             unchanged, whereas the required semantics of these
    3258             primops are that the upper 3/4 is simply copied in from
    3259             the argument. */
   3260          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3261          HReg dst = newVRegV(env);
   3262          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3263          addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
   3264          return dst;
   3265       }
   3266 
   3267       case Iop_Sqrt64F0x2:  op = Asse_SQRTF;  goto do_64F0x2_unary;
   3268       do_64F0x2_unary:
   3269       {
    3270          /* A bit subtle.  We have to copy the arg to the result
    3271             register first, because actually doing the SSE scalar insn
    3272             leaves the upper half of the destination register
    3273             unchanged, whereas the required semantics of these
    3274             primops are that the upper half is simply copied in from
    3275             the argument. */
   3276          HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
   3277          HReg dst = newVRegV(env);
   3278          addInstr(env, mk_vMOVsd_RR(arg, dst));
   3279          addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
   3280          return dst;
   3281       }
   3282 
   3283       case Iop_32UtoV128: {
   3284          HReg        dst     = newVRegV(env);
   3285          AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
   3286          AMD64RI*    ri      = iselIntExpr_RI(env, e->Iex.Unop.arg);
   3287          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
   3288          addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
   3289          return dst;
   3290       }
   3291 
   3292       case Iop_64UtoV128: {
   3293          HReg        dst  = newVRegV(env);
   3294          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   3295          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Unop.arg);
   3296          addInstr(env, AMD64Instr_Push(rmi));
   3297          addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
   3298          add_to_rsp(env, 8);
   3299          return dst;
   3300       }
   3301 
   3302       case Iop_V256toV128_0:
   3303       case Iop_V256toV128_1: {
   3304          HReg vHi, vLo;
   3305          iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg);
   3306          return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo;
   3307       }
   3308 
   3309       default:
   3310          break;
   3311    } /* switch (e->Iex.Unop.op) */
   3312    } /* if (e->tag == Iex_Unop) */
   3313 
   3314    if (e->tag == Iex_Binop) {
   3315    switch (e->Iex.Binop.op) {
   3316 
   3317       /* FIXME: could we generate MOVQ here? */
   3318       case Iop_SetV128lo64: {
   3319          HReg dst  = newVRegV(env);
   3320          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3321          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3322          AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
   3323          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
   3324          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
   3325          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
   3326          return dst;
   3327       }
   3328 
   3329       /* FIXME: could we generate MOVD here? */
   3330       case Iop_SetV128lo32: {
   3331          HReg dst  = newVRegV(env);
   3332          HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
   3333          HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3334          AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
   3335          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
   3336          addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
   3337          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
   3338          return dst;
   3339       }
   3340 
   3341       case Iop_64HLtoV128: {
   3342          HReg        rsp     = hregAMD64_RSP();
   3343          AMD64AMode* m8_rsp  = AMD64AMode_IR(-8, rsp);
   3344          AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   3345          AMD64RI*    qHi = iselIntExpr_RI(env, e->Iex.Binop.arg1);
   3346          AMD64RI*    qLo = iselIntExpr_RI(env, e->Iex.Binop.arg2);
   3347          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qHi, m8_rsp));
   3348          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, qLo, m16_rsp));
   3349          HReg        dst = newVRegV(env);
   3350          /* One store-forwarding stall coming up, oh well :-( */
   3351          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, m16_rsp));
   3352          return dst;
   3353       }
   3354 
   3355       case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
   3356       case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
   3357       case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
   3358       case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
   3359       case Iop_Max32Fx4:   op = Asse_MAXF;   goto do_32Fx4;
   3360       case Iop_Min32Fx4:   op = Asse_MINF;   goto do_32Fx4;
   3361       do_32Fx4:
   3362       {
   3363          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3364          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3365          HReg dst = newVRegV(env);
   3366          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3367          addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
   3368          return dst;
   3369       }
   3370 
   3371       case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
   3372       case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
   3373       case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
   3374       case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
   3375       case Iop_Max64Fx2:   op = Asse_MAXF;   goto do_64Fx2;
   3376       case Iop_Min64Fx2:   op = Asse_MINF;   goto do_64Fx2;
   3377       do_64Fx2:
   3378       {
   3379          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3380          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3381          HReg dst = newVRegV(env);
   3382          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3383          addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
   3384          return dst;
   3385       }
   3386 
   3387       case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
   3388       case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
   3389       case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
   3390       case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
   3391       case Iop_Add32F0x4:   op = Asse_ADDF;   goto do_32F0x4;
   3392       case Iop_Div32F0x4:   op = Asse_DIVF;   goto do_32F0x4;
   3393       case Iop_Max32F0x4:   op = Asse_MAXF;   goto do_32F0x4;
   3394       case Iop_Min32F0x4:   op = Asse_MINF;   goto do_32F0x4;
   3395       case Iop_Mul32F0x4:   op = Asse_MULF;   goto do_32F0x4;
   3396       case Iop_Sub32F0x4:   op = Asse_SUBF;   goto do_32F0x4;
   3397       do_32F0x4: {
   3398          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3399          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3400          HReg dst = newVRegV(env);
   3401          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3402          addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
   3403          return dst;
   3404       }
   3405 
   3406       case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
   3407       case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
   3408       case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
   3409       case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
   3410       case Iop_Add64F0x2:   op = Asse_ADDF;   goto do_64F0x2;
   3411       case Iop_Div64F0x2:   op = Asse_DIVF;   goto do_64F0x2;
   3412       case Iop_Max64F0x2:   op = Asse_MAXF;   goto do_64F0x2;
   3413       case Iop_Min64F0x2:   op = Asse_MINF;   goto do_64F0x2;
   3414       case Iop_Mul64F0x2:   op = Asse_MULF;   goto do_64F0x2;
   3415       case Iop_Sub64F0x2:   op = Asse_SUBF;   goto do_64F0x2;
   3416       do_64F0x2: {
   3417          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3418          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3419          HReg dst = newVRegV(env);
   3420          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3421          addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
   3422          return dst;
   3423       }
   3424 
   3425       case Iop_QNarrowBin32Sto16Sx8:
   3426          op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
   3427       case Iop_QNarrowBin16Sto8Sx16:
   3428          op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
   3429       case Iop_QNarrowBin16Sto8Ux16:
   3430          op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
   3431 
   3432       case Iop_InterleaveHI8x16:
   3433          op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
   3434       case Iop_InterleaveHI16x8:
   3435          op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
   3436       case Iop_InterleaveHI32x4:
   3437          op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
   3438       case Iop_InterleaveHI64x2:
   3439          op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
   3440 
   3441       case Iop_InterleaveLO8x16:
   3442          op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
   3443       case Iop_InterleaveLO16x8:
   3444          op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
   3445       case Iop_InterleaveLO32x4:
   3446          op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
   3447       case Iop_InterleaveLO64x2:
   3448          op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
   3449 
   3450       case Iop_AndV128:    op = Asse_AND;      goto do_SseReRg;
   3451       case Iop_OrV128:     op = Asse_OR;       goto do_SseReRg;
   3452       case Iop_XorV128:    op = Asse_XOR;      goto do_SseReRg;
   3453       case Iop_Add8x16:    op = Asse_ADD8;     goto do_SseReRg;
   3454       case Iop_Add16x8:    op = Asse_ADD16;    goto do_SseReRg;
   3455       case Iop_Add32x4:    op = Asse_ADD32;    goto do_SseReRg;
   3456       case Iop_Add64x2:    op = Asse_ADD64;    goto do_SseReRg;
   3457       case Iop_QAdd8Sx16:  op = Asse_QADD8S;   goto do_SseReRg;
   3458       case Iop_QAdd16Sx8:  op = Asse_QADD16S;  goto do_SseReRg;
   3459       case Iop_QAdd8Ux16:  op = Asse_QADD8U;   goto do_SseReRg;
   3460       case Iop_QAdd16Ux8:  op = Asse_QADD16U;  goto do_SseReRg;
   3461       case Iop_Avg8Ux16:   op = Asse_AVG8U;    goto do_SseReRg;
   3462       case Iop_Avg16Ux8:   op = Asse_AVG16U;   goto do_SseReRg;
   3463       case Iop_CmpEQ8x16:  op = Asse_CMPEQ8;   goto do_SseReRg;
   3464       case Iop_CmpEQ16x8:  op = Asse_CMPEQ16;  goto do_SseReRg;
   3465       case Iop_CmpEQ32x4:  op = Asse_CMPEQ32;  goto do_SseReRg;
   3466       case Iop_CmpGT8Sx16: op = Asse_CMPGT8S;  goto do_SseReRg;
   3467       case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
   3468       case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
   3469       case Iop_Max16Sx8:   op = Asse_MAX16S;   goto do_SseReRg;
   3470       case Iop_Max8Ux16:   op = Asse_MAX8U;    goto do_SseReRg;
   3471       case Iop_Min16Sx8:   op = Asse_MIN16S;   goto do_SseReRg;
   3472       case Iop_Min8Ux16:   op = Asse_MIN8U;    goto do_SseReRg;
   3473       case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
   3474       case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
   3475       case Iop_Mul16x8:    op = Asse_MUL16;    goto do_SseReRg;
   3476       case Iop_Sub8x16:    op = Asse_SUB8;     goto do_SseReRg;
   3477       case Iop_Sub16x8:    op = Asse_SUB16;    goto do_SseReRg;
   3478       case Iop_Sub32x4:    op = Asse_SUB32;    goto do_SseReRg;
   3479       case Iop_Sub64x2:    op = Asse_SUB64;    goto do_SseReRg;
   3480       case Iop_QSub8Sx16:  op = Asse_QSUB8S;   goto do_SseReRg;
   3481       case Iop_QSub16Sx8:  op = Asse_QSUB16S;  goto do_SseReRg;
   3482       case Iop_QSub8Ux16:  op = Asse_QSUB8U;   goto do_SseReRg;
   3483       case Iop_QSub16Ux8:  op = Asse_QSUB16U;  goto do_SseReRg;
   3484       do_SseReRg: {
   3485          HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
   3486          HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
   3487          HReg dst = newVRegV(env);
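                  /* For the pack/unpack cases above the two-operand SSE form
                     is not commutative, and the IR's first argument has to
                     become the E (second) operand; hence arg1isEReg and the
                     operand swap below. */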
   3488          if (arg1isEReg) {
   3489             addInstr(env, mk_vMOVsd_RR(arg2, dst));
   3490             addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
   3491          } else {
   3492             addInstr(env, mk_vMOVsd_RR(arg1, dst));
   3493             addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
   3494          }
   3495          return dst;
   3496       }
   3497 
   3498       case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
   3499       case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
   3500       case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
   3501       case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
   3502       case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
   3503       case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
   3504       case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
   3505       case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
   3506       do_SseShift: {
   3507          HReg        greg = iselVecExpr(env, e->Iex.Binop.arg1);
   3508          AMD64RMI*   rmi  = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   3509          AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
   3510          HReg        ereg = newVRegV(env);
   3511          HReg        dst  = newVRegV(env);
   3512          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
   3513          addInstr(env, AMD64Instr_Push(rmi));
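                  /* The SSE shift-by-register forms take the count from the
                     low 64 bits of the E register, so build a 128-bit value
                     0:count on the stack and load it into ereg. */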
   3514          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
   3515          addInstr(env, mk_vMOVsd_RR(greg, dst));
   3516          addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
   3517          add_to_rsp(env, 16);
   3518          return dst;
   3519       }
   3520 
   3521       case Iop_Mul32x4:    fn = (HWord)h_generic_calc_Mul32x4;
   3522                            goto do_SseAssistedBinary;
   3523       case Iop_Max32Sx4:   fn = (HWord)h_generic_calc_Max32Sx4;
   3524                            goto do_SseAssistedBinary;
   3525       case Iop_Min32Sx4:   fn = (HWord)h_generic_calc_Min32Sx4;
   3526                            goto do_SseAssistedBinary;
   3527       case Iop_Max32Ux4:   fn = (HWord)h_generic_calc_Max32Ux4;
   3528                            goto do_SseAssistedBinary;
   3529       case Iop_Min32Ux4:   fn = (HWord)h_generic_calc_Min32Ux4;
   3530                            goto do_SseAssistedBinary;
   3531       case Iop_Max16Ux8:   fn = (HWord)h_generic_calc_Max16Ux8;
   3532                            goto do_SseAssistedBinary;
   3533       case Iop_Min16Ux8:   fn = (HWord)h_generic_calc_Min16Ux8;
   3534                            goto do_SseAssistedBinary;
   3535       case Iop_Max8Sx16:   fn = (HWord)h_generic_calc_Max8Sx16;
   3536                            goto do_SseAssistedBinary;
   3537       case Iop_Min8Sx16:   fn = (HWord)h_generic_calc_Min8Sx16;
   3538                            goto do_SseAssistedBinary;
   3539       case Iop_CmpEQ64x2:  fn = (HWord)h_generic_calc_CmpEQ64x2;
   3540                            goto do_SseAssistedBinary;
   3541       case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
   3542                            goto do_SseAssistedBinary;
   3543       case Iop_Perm32x4:   fn = (HWord)h_generic_calc_Perm32x4;
   3544                            goto do_SseAssistedBinary;
   3545       case Iop_QNarrowBin32Sto16Ux8:
   3546                            fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8;
   3547                            goto do_SseAssistedBinary;
   3548       case Iop_NarrowBin16to8x16:
   3549                            fn = (HWord)h_generic_calc_NarrowBin16to8x16;
   3550                            goto do_SseAssistedBinary;
   3551       case Iop_NarrowBin32to16x8:
   3552                            fn = (HWord)h_generic_calc_NarrowBin32to16x8;
   3553                            goto do_SseAssistedBinary;
   3554       do_SseAssistedBinary: {
   3555          /* RRRufff!  RRRufff code is what we're generating here.  Oh
   3556             well. */
   3557          vassert(fn != 0);
   3558          HReg dst = newVRegV(env);
   3559          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3560          HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
   3561          HReg argp = newVRegI(env);
   3562          /* subq $112, %rsp         -- make a space*/
   3563          sub_from_rsp(env, 112);
   3564          /* leaq 48(%rsp), %r_argp  -- point into it */
   3565          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
   3566                                         argp));
   3567          /* andq $-16, %r_argp      -- 16-align the pointer */
   3568          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   3569                                          AMD64RMI_Imm( ~(UInt)15 ),
   3570                                          argp));
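                  /* (~(UInt)15 is 0xFFFFFFF0, i.e. -16 once sign-extended to
                     64 bits, so the AND clears the low 4 bits of %r_argp and
                     rounds it down to a 16-byte boundary -- the 48 bytes of
                     slack above leave room for that.) */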
   3571          /* Prepare 3 arg regs:
   3572             leaq 0(%r_argp), %rdi
   3573             leaq 16(%r_argp), %rsi
   3574             leaq 32(%r_argp), %rdx
   3575          */
   3576          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
   3577                                         hregAMD64_RDI()));
   3578          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
   3579                                         hregAMD64_RSI()));
   3580          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
   3581                                         hregAMD64_RDX()));
   3582          /* Store the two args, at (%rsi) and (%rdx):
   3583             movupd  %argL, 0(%rsi)
   3584             movupd  %argR, 0(%rdx)
   3585          */
   3586          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
   3587                                           AMD64AMode_IR(0, hregAMD64_RSI())));
   3588          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
   3589                                           AMD64AMode_IR(0, hregAMD64_RDX())));
   3590          /* call the helper */
   3591          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
   3592                                         3, mk_RetLoc_simple(RLPri_None) ));
   3593          /* fetch the result from memory, using %r_argp, which the
   3594             register allocator will keep alive across the call. */
   3595          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
   3596                                           AMD64AMode_IR(0, argp)));
   3597          /* and finally, clear the space */
   3598          add_to_rsp(env, 112);
   3599          return dst;
   3600       }
   3601 
   3602       case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2;
   3603                          goto do_SseAssistedVectorAndScalar;
   3604       case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16;
   3605                          goto do_SseAssistedVectorAndScalar;
   3606       do_SseAssistedVectorAndScalar: {
   3607          /* RRRufff!  RRRufff code is what we're generating here.  Oh
   3608             well. */
   3609          vassert(fn != 0);
   3610          HReg dst = newVRegV(env);
   3611          HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
   3612          HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
   3613          HReg argp = newVRegI(env);
   3614          /* subq $112, %rsp         -- make a space*/
   3615          sub_from_rsp(env, 112);
   3616          /* leaq 48(%rsp), %r_argp  -- point into it */
   3617          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
   3618                                         argp));
   3619          /* andq $-16, %r_argp      -- 16-align the pointer */
   3620          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   3621                                          AMD64RMI_Imm( ~(UInt)15 ),
   3622                                          argp));
   3623          /* Prepare 2 vector arg regs:
   3624             leaq 0(%r_argp), %rdi
   3625             leaq 16(%r_argp), %rsi
   3626          */
   3627          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
   3628                                         hregAMD64_RDI()));
   3629          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
   3630                                         hregAMD64_RSI()));
   3631          /* Store the vector arg, at (%rsi):
   3632             movupd  %argL, 0(%rsi)
   3633          */
   3634          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
   3635                                           AMD64AMode_IR(0, hregAMD64_RSI())));
   3636          /* And get the scalar value into rdx */
   3637          addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX()));
   3638 
   3639          /* call the helper */
   3640          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn,
   3641                                         3, mk_RetLoc_simple(RLPri_None) ));
   3642          /* fetch the result from memory, using %r_argp, which the
   3643             register allocator will keep alive across the call. */
   3644          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
   3645                                           AMD64AMode_IR(0, argp)));
   3646          /* and finally, clear the space */
   3647          add_to_rsp(env, 112);
   3648          return dst;
   3649       }
   3650 
   3651       default:
   3652          break;
   3653    } /* switch (e->Iex.Binop.op) */
   3654    } /* if (e->tag == Iex_Binop) */
   3655 
   3656    if (e->tag == Iex_Triop) {
   3657    IRTriop *triop = e->Iex.Triop.details;
   3658    switch (triop->op) {
   3659 
   3660       case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm;
   3661       case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm;
   3662       case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm;
   3663       case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm;
   3664       do_64Fx2_w_rm:
   3665       {
   3666          HReg argL = iselVecExpr(env, triop->arg2);
   3667          HReg argR = iselVecExpr(env, triop->arg3);
   3668          HReg dst = newVRegV(env);
   3669          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3670          /* XXXROUNDINGFIXME */
   3671          /* set roundingmode here */
   3672          addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
   3673          return dst;
   3674       }
   3675 
   3676       case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm;
   3677       case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm;
   3678       case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm;
   3679       case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm;
   3680       do_32Fx4_w_rm:
   3681       {
   3682          HReg argL = iselVecExpr(env, triop->arg2);
   3683          HReg argR = iselVecExpr(env, triop->arg3);
   3684          HReg dst = newVRegV(env);
   3685          addInstr(env, mk_vMOVsd_RR(argL, dst));
   3686          /* XXXROUNDINGFIXME */
   3687          /* set roundingmode here */
   3688          addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
   3689          return dst;
   3690       }
   3691 
   3692       default:
   3693          break;
   3694    } /* switch (triop->op) */
   3695    } /* if (e->tag == Iex_Triop) */
   3696 
   3697    if (e->tag == Iex_ITE) { // VFD
   3698       HReg r1  = iselVecExpr(env, e->Iex.ITE.iftrue);
   3699       HReg r0  = iselVecExpr(env, e->Iex.ITE.iffalse);
   3700       HReg dst = newVRegV(env);
   3701       addInstr(env, mk_vMOVsd_RR(r1,dst));
   3702       AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
   3703       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst));
   3704       return dst;
   3705    }
   3706 
   3707    //vec_fail:
   3708    vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
   3709               LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
   3710    ppIRExpr(e);
   3711    vpanic("iselVecExpr_wrk");
   3712 }
   3713 
   3714 
   3715 /*---------------------------------------------------------*/
   3716 /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs.    --*/
   3717 /*---------------------------------------------------------*/
   3718 
   3719 static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
   3720                            ISelEnv* env, IRExpr* e )
   3721 {
   3722    iselDVecExpr_wrk( rHi, rLo, env, e );
   3723 #  if 0
   3724    vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
   3725 #  endif
   3726    vassert(hregClass(*rHi) == HRcVec128);
   3727    vassert(hregClass(*rLo) == HRcVec128);
   3728    vassert(hregIsVirtual(*rHi));
   3729    vassert(hregIsVirtual(*rLo));
   3730 }
   3731 
   3732 
   3733 /* DO NOT CALL THIS DIRECTLY */
   3734 static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo,
   3735                                ISelEnv* env, IRExpr* e )
   3736 {
   3737    HWord fn = 0; /* address of helper fn, if required */
   3738    vassert(e);
   3739    IRType ty = typeOfIRExpr(env->type_env,e);
   3740    vassert(ty == Ity_V256);
   3741 
   3742    AMD64SseOp op = Asse_INVALID;
   3743 
   3744    /* read 256-bit IRTemp */
   3745    if (e->tag == Iex_RdTmp) {
   3746       lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
   3747       return;
   3748    }
   3749 
   3750    if (e->tag == Iex_Get) {
   3751       HReg        vHi  = newVRegV(env);
   3752       HReg        vLo  = newVRegV(env);
   3753       HReg        rbp  = hregAMD64_RBP();
   3754       AMD64AMode* am0  = AMD64AMode_IR(e->Iex.Get.offset + 0,  rbp);
   3755       AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp);
   3756       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
   3757       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
   3758       *rHi = vHi;
   3759       *rLo = vLo;
   3760       return;
   3761    }
   3762 
   3763    if (e->tag == Iex_Load) {
   3764       HReg        vHi  = newVRegV(env);
   3765       HReg        vLo  = newVRegV(env);
   3766       HReg        rA   = iselIntExpr_R(env, e->Iex.Load.addr);
   3767       AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
   3768       AMD64AMode* am16 = AMD64AMode_IR(16, rA);
   3769       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0));
   3770       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16));
   3771       *rHi = vHi;
   3772       *rLo = vLo;
   3773       return;
   3774    }
   3775 
   3776    if (e->tag == Iex_Const) {
   3777       vassert(e->Iex.Const.con->tag == Ico_V256);
   3778       switch (e->Iex.Const.con->Ico.V256) {
   3779          case 0x00000000: {
   3780             HReg vHi = generate_zeroes_V128(env);
   3781             HReg vLo = newVRegV(env);
   3782             addInstr(env, mk_vMOVsd_RR(vHi, vLo));
   3783             *rHi = vHi;
   3784             *rLo = vLo;
   3785             return;
   3786          }
   3787          default:
   3788             break; /* give up; handle other constants if and when it becomes necessary. */
   3789       }
   3790    }
   3791 
   3792    if (e->tag == Iex_Unop) {
   3793    switch (e->Iex.Unop.op) {
   3794 
   3795       case Iop_NotV256: {
   3796          HReg argHi, argLo;
   3797          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3798          *rHi = do_sse_NotV128(env, argHi);
   3799          *rLo = do_sse_NotV128(env, argLo);
   3800          return;
   3801       }
   3802 
   3803       case Iop_Recip32Fx8: op = Asse_RCPF;   goto do_32Fx8_unary;
   3804       case Iop_Sqrt32Fx8:  op = Asse_SQRTF;  goto do_32Fx8_unary;
   3805       case Iop_RSqrt32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary;
   3806       do_32Fx8_unary:
   3807       {
   3808          HReg argHi, argLo;
   3809          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3810          HReg dstHi = newVRegV(env);
   3811          HReg dstLo = newVRegV(env);
   3812          addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi));
   3813          addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo));
   3814          *rHi = dstHi;
   3815          *rLo = dstLo;
   3816          return;
   3817       }
   3818 
   3819       case Iop_Sqrt64Fx4:  op = Asse_SQRTF;  goto do_64Fx4_unary;
   3820       do_64Fx4_unary:
   3821       {
   3822          HReg argHi, argLo;
   3823          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3824          HReg dstHi = newVRegV(env);
   3825          HReg dstLo = newVRegV(env);
   3826          addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi));
   3827          addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo));
   3828          *rHi = dstHi;
   3829          *rLo = dstLo;
   3830          return;
   3831       }
   3832 
   3833       case Iop_CmpNEZ64x4: {
   3834          /* We can use SSE2 instructions for this. */
   3835          /* Same scheme as Iop_CmpNEZ64x2, except twice as wide
   3836             (obviously).  See comment on Iop_CmpNEZ64x2 for
   3837             explanation of what's going on here. */
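                 /* In outline, per 128-bit half: compare 32-bit lanes
                    against zero (CMPEQ32), invert, then use a 0xB1 shuffle
                    to swap the two 32-bit halves of each 64-bit lane and OR
                    the swapped copy back in, so each 64-bit lane ends up
                    all-ones iff it was nonzero. */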
   3838          HReg argHi, argLo;
   3839          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3840          HReg tmpHi  = generate_zeroes_V128(env);
   3841          HReg tmpLo  = newVRegV(env);
   3842          addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo));
   3843          HReg dstHi  = newVRegV(env);
   3844          HReg dstLo  = newVRegV(env);
   3845          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi));
   3846          addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo));
   3847          tmpHi = do_sse_NotV128(env, tmpHi);
   3848          tmpLo = do_sse_NotV128(env, tmpLo);
   3849          addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi));
   3850          addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo));
   3851          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi));
   3852          addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo));
   3853          *rHi = dstHi;
   3854          *rLo = dstLo;
   3855          return;
   3856       }
   3857 
   3858       case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
   3859       case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
   3860       case Iop_CmpNEZ8x32: op = Asse_CMPEQ8;  goto do_CmpNEZ_vector;
   3861       do_CmpNEZ_vector:
   3862       {
   3863          HReg argHi, argLo;
   3864          iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg);
   3865          HReg tmpHi = newVRegV(env);
   3866          HReg tmpLo = newVRegV(env);
   3867          HReg zero  = generate_zeroes_V128(env);
   3868          HReg dstHi, dstLo;
   3869          addInstr(env, mk_vMOVsd_RR(argHi, tmpHi));
   3870          addInstr(env, mk_vMOVsd_RR(argLo, tmpLo));
   3871          addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi));
   3872          addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo));
   3873          dstHi = do_sse_NotV128(env, tmpHi);
   3874          dstLo = do_sse_NotV128(env, tmpLo);
   3875          *rHi = dstHi;
   3876          *rLo = dstLo;
   3877          return;
   3878       }
   3879 
   3880       default:
   3881          break;
   3882    } /* switch (e->Iex.Unop.op) */
   3883    } /* if (e->tag == Iex_Unop) */
   3884 
   3885    if (e->tag == Iex_Binop) {
   3886    switch (e->Iex.Binop.op) {
   3887 
   3888       case Iop_Max64Fx4:   op = Asse_MAXF;   goto do_64Fx4;
   3889       case Iop_Min64Fx4:   op = Asse_MINF;   goto do_64Fx4;
   3890       do_64Fx4:
   3891       {
   3892          HReg argLhi, argLlo, argRhi, argRlo;
   3893          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   3894          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   3895          HReg dstHi = newVRegV(env);
   3896          HReg dstLo = newVRegV(env);
   3897          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   3898          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   3899          addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
   3900          addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
   3901          *rHi = dstHi;
   3902          *rLo = dstLo;
   3903          return;
   3904       }
   3905 
   3906       case Iop_Max32Fx8:   op = Asse_MAXF;   goto do_32Fx8;
   3907       case Iop_Min32Fx8:   op = Asse_MINF;   goto do_32Fx8;
   3908       do_32Fx8:
   3909       {
   3910          HReg argLhi, argLlo, argRhi, argRlo;
   3911          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   3912          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   3913          HReg dstHi = newVRegV(env);
   3914          HReg dstLo = newVRegV(env);
   3915          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   3916          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   3917          addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
   3918          addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
   3919          *rHi = dstHi;
   3920          *rLo = dstLo;
   3921          return;
   3922       }
   3923 
   3924       case Iop_AndV256:    op = Asse_AND;      goto do_SseReRg;
   3925       case Iop_OrV256:     op = Asse_OR;       goto do_SseReRg;
   3926       case Iop_XorV256:    op = Asse_XOR;      goto do_SseReRg;
   3927       case Iop_Add8x32:    op = Asse_ADD8;     goto do_SseReRg;
   3928       case Iop_Add16x16:   op = Asse_ADD16;    goto do_SseReRg;
   3929       case Iop_Add32x8:    op = Asse_ADD32;    goto do_SseReRg;
   3930       case Iop_Add64x4:    op = Asse_ADD64;    goto do_SseReRg;
   3931       case Iop_QAdd8Sx32:  op = Asse_QADD8S;   goto do_SseReRg;
   3932       case Iop_QAdd16Sx16: op = Asse_QADD16S;  goto do_SseReRg;
   3933       case Iop_QAdd8Ux32:  op = Asse_QADD8U;   goto do_SseReRg;
   3934       case Iop_QAdd16Ux16: op = Asse_QADD16U;  goto do_SseReRg;
   3935       case Iop_Avg8Ux32:   op = Asse_AVG8U;    goto do_SseReRg;
   3936       case Iop_Avg16Ux16:  op = Asse_AVG16U;   goto do_SseReRg;
   3937       case Iop_CmpEQ8x32:  op = Asse_CMPEQ8;   goto do_SseReRg;
   3938       case Iop_CmpEQ16x16: op = Asse_CMPEQ16;  goto do_SseReRg;
   3939       case Iop_CmpEQ32x8:  op = Asse_CMPEQ32;  goto do_SseReRg;
   3940       case Iop_CmpGT8Sx32: op = Asse_CMPGT8S;  goto do_SseReRg;
   3941       case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg;
   3942       case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg;
   3943       case Iop_Max16Sx16:  op = Asse_MAX16S;   goto do_SseReRg;
   3944       case Iop_Max8Ux32:   op = Asse_MAX8U;    goto do_SseReRg;
   3945       case Iop_Min16Sx16:  op = Asse_MIN16S;   goto do_SseReRg;
   3946       case Iop_Min8Ux32:   op = Asse_MIN8U;    goto do_SseReRg;
   3947       case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg;
   3948       case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg;
   3949       case Iop_Mul16x16:   op = Asse_MUL16;    goto do_SseReRg;
   3950       case Iop_Sub8x32:    op = Asse_SUB8;     goto do_SseReRg;
   3951       case Iop_Sub16x16:   op = Asse_SUB16;    goto do_SseReRg;
   3952       case Iop_Sub32x8:    op = Asse_SUB32;    goto do_SseReRg;
   3953       case Iop_Sub64x4:    op = Asse_SUB64;    goto do_SseReRg;
   3954       case Iop_QSub8Sx32:  op = Asse_QSUB8S;   goto do_SseReRg;
   3955       case Iop_QSub16Sx16: op = Asse_QSUB16S;  goto do_SseReRg;
   3956       case Iop_QSub8Ux32:  op = Asse_QSUB8U;   goto do_SseReRg;
   3957       case Iop_QSub16Ux16: op = Asse_QSUB16U;  goto do_SseReRg;
   3958       do_SseReRg:
   3959       {
   3960          HReg argLhi, argLlo, argRhi, argRlo;
   3961          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   3962          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   3963          HReg dstHi = newVRegV(env);
   3964          HReg dstLo = newVRegV(env);
   3965          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   3966          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   3967          addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi));
   3968          addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo));
   3969          *rHi = dstHi;
   3970          *rLo = dstLo;
   3971          return;
   3972       }
   3973 
   3974       case Iop_ShlN16x16: op = Asse_SHL16; goto do_SseShift;
   3975       case Iop_ShlN32x8:  op = Asse_SHL32; goto do_SseShift;
   3976       case Iop_ShlN64x4:  op = Asse_SHL64; goto do_SseShift;
   3977       case Iop_SarN16x16: op = Asse_SAR16; goto do_SseShift;
   3978       case Iop_SarN32x8:  op = Asse_SAR32; goto do_SseShift;
   3979       case Iop_ShrN16x16: op = Asse_SHR16; goto do_SseShift;
   3980       case Iop_ShrN32x8:  op = Asse_SHR32; goto do_SseShift;
   3981       case Iop_ShrN64x4:  op = Asse_SHR64; goto do_SseShift;
   3982       do_SseShift: {
   3983          HReg gregHi, gregLo;
   3984          iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1);
   3985          AMD64RMI*   rmi   = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
   3986          AMD64AMode* rsp0  = AMD64AMode_IR(0, hregAMD64_RSP());
   3987          HReg        ereg  = newVRegV(env);
   3988          HReg        dstHi = newVRegV(env);
   3989          HReg        dstLo = newVRegV(env);
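                 /* The SSE shift-by-register forms take the count from the
                    low 64 bits of an XMM register, so build a 128-bit value
                    on the stack -- count in the low 64 bits, zero in the
                    high 64 -- with two pushes, and load it into ereg. */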
   3990          addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
   3991          addInstr(env, AMD64Instr_Push(rmi));
   3992          addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
   3993          addInstr(env, mk_vMOVsd_RR(gregHi, dstHi));
   3994          addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi));
   3995          addInstr(env, mk_vMOVsd_RR(gregLo, dstLo));
   3996          addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo));
   3997          add_to_rsp(env, 16);
   3998          *rHi = dstHi;
   3999          *rLo = dstLo;
   4000          return;
   4001       }
   4002 
   4003       case Iop_V128HLtoV256: {
   4004          *rHi = iselVecExpr(env, e->Iex.Binop.arg1);
   4005          *rLo = iselVecExpr(env, e->Iex.Binop.arg2);
   4006          return;
   4007       }
   4008 
   4009       case Iop_Mul32x8:    fn = (HWord)h_generic_calc_Mul32x4;
   4010                            goto do_SseAssistedBinary;
   4011       case Iop_Max32Sx8:   fn = (HWord)h_generic_calc_Max32Sx4;
   4012                            goto do_SseAssistedBinary;
   4013       case Iop_Min32Sx8:   fn = (HWord)h_generic_calc_Min32Sx4;
   4014                            goto do_SseAssistedBinary;
   4015       case Iop_Max32Ux8:   fn = (HWord)h_generic_calc_Max32Ux4;
   4016                            goto do_SseAssistedBinary;
   4017       case Iop_Min32Ux8:   fn = (HWord)h_generic_calc_Min32Ux4;
   4018                            goto do_SseAssistedBinary;
   4019       case Iop_Max16Ux16:  fn = (HWord)h_generic_calc_Max16Ux8;
   4020                            goto do_SseAssistedBinary;
   4021       case Iop_Min16Ux16:  fn = (HWord)h_generic_calc_Min16Ux8;
   4022                            goto do_SseAssistedBinary;
   4023       case Iop_Max8Sx32:   fn = (HWord)h_generic_calc_Max8Sx16;
   4024                            goto do_SseAssistedBinary;
   4025       case Iop_Min8Sx32:   fn = (HWord)h_generic_calc_Min8Sx16;
   4026                            goto do_SseAssistedBinary;
   4027       case Iop_CmpEQ64x4:  fn = (HWord)h_generic_calc_CmpEQ64x2;
   4028                            goto do_SseAssistedBinary;
   4029       case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2;
   4030                            goto do_SseAssistedBinary;
   4031       do_SseAssistedBinary: {
   4032          /* RRRufff!  RRRufff code is what we're generating here.  Oh
   4033             well. */
   4034          vassert(fn != 0);
   4035          HReg dstHi = newVRegV(env);
   4036          HReg dstLo = newVRegV(env);
   4037          HReg argLhi, argLlo, argRhi, argRlo;
   4038          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   4039          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   4040          HReg argp = newVRegI(env);
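                 /* Scratch-area layout, as implied by the leaq/movupd
                    offsets below (the 128-bit helper is called twice, once
                    per half):
                       0(argp)..15(argp)    result of the hi-half call
                       16(argp)..31(argp)   argLhi
                       32(argp)..47(argp)   argRhi
                       48(argp)..63(argp)   result of the lo-half call
                       64(argp)..79(argp)   argLlo
                       80(argp)..95(argp)   argRlo                       */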
   4041          /* subq $160, %rsp         -- make a space */
   4042          sub_from_rsp(env, 160);
   4043          /* leaq 48(%rsp), %r_argp  -- point into it */
   4044          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
   4045                                         argp));
   4046          /* andq $-16, %r_argp      -- 16-align the pointer */
   4047          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   4048                                          AMD64RMI_Imm( ~(UInt)15 ),
   4049                                          argp));
   4050          /* Prepare 3 arg regs:
   4051             leaq 0(%r_argp), %rdi
   4052             leaq 16(%r_argp), %rsi
   4053             leaq 32(%r_argp), %rdx
   4054          */
   4055          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
   4056                                         hregAMD64_RDI()));
   4057          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
   4058                                         hregAMD64_RSI()));
   4059          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
   4060                                         hregAMD64_RDX()));
   4061          /* Store the two high args, at (%rsi) and (%rdx):
   4062             movupd  %argLhi, 0(%rsi)
   4063             movupd  %argRhi, 0(%rdx)
   4064          */
   4065          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
   4066                                           AMD64AMode_IR(0, hregAMD64_RSI())));
   4067          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
   4068                                           AMD64AMode_IR(0, hregAMD64_RDX())));
   4069          /* Store the two low args, at 48(%rsi) and 48(%rdx):
   4070             movupd  %argLlo, 48(%rsi)
   4071             movupd  %argRlo, 48(%rdx)
   4072          */
   4073          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
   4074                                           AMD64AMode_IR(48, hregAMD64_RSI())));
   4075          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
   4076                                           AMD64AMode_IR(48, hregAMD64_RDX())));
   4077          /* call the helper */
   4078          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
   4079                                         mk_RetLoc_simple(RLPri_None) ));
   4080          /* Prepare 3 arg regs:
   4081             leaq 48(%r_argp), %rdi
   4082             leaq 64(%r_argp), %rsi
   4083             leaq 80(%r_argp), %rdx
   4084          */
   4085          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp),
   4086                                         hregAMD64_RDI()));
   4087          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
   4088                                         hregAMD64_RSI()));
   4089          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp),
   4090                                         hregAMD64_RDX()));
   4091          /* call the helper */
   4092          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
   4093                                         mk_RetLoc_simple(RLPri_None) ));
   4094          /* fetch the result from memory, using %r_argp, which the
   4095             register allocator will keep alive across the call. */
   4096          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
   4097                                           AMD64AMode_IR(0, argp)));
   4098          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
   4099                                           AMD64AMode_IR(48, argp)));
   4100          /* and finally, clear the space */
   4101          add_to_rsp(env, 160);
   4102          *rHi = dstHi;
   4103          *rLo = dstLo;
   4104          return;
   4105       }
   4106 
   4107       case Iop_Perm32x8:   fn = (HWord)h_generic_calc_Perm32x8;
   4108                            goto do_SseAssistedBinary256;
   4109       do_SseAssistedBinary256: {
   4110          /* RRRufff!  RRRufff code is what we're generating here.  Oh
   4111             well. */
   4112          vassert(fn != 0);
   4113          HReg dstHi = newVRegV(env);
   4114          HReg dstLo = newVRegV(env);
   4115          HReg argLhi, argLlo, argRhi, argRlo;
   4116          iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1);
   4117          iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2);
   4118          HReg argp = newVRegI(env);
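                 /* Scratch-area layout, as implied by the leaq/movupd
                    offsets below (a single call on the whole 256-bit
                    values, lo half at the lower address):
                       0(argp)..31(argp)    result
                       32(argp)..63(argp)   argL
                       64(argp)..95(argp)   argR                         */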
   4119          /* subq $160, %rsp         -- make a space */
   4120          sub_from_rsp(env, 160);
   4121          /* leaq 48(%rsp), %r_argp  -- point into it */
   4122          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
   4123                                         argp));
   4124          /* andq $-16, %r_argp      -- 16-align the pointer */
   4125          addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
   4126                                          AMD64RMI_Imm( ~(UInt)15 ),
   4127                                          argp));
   4128          /* Prepare 3 arg regs:
   4129             leaq 0(%r_argp), %rdi
   4130             leaq 32(%r_argp), %rsi
   4131             leaq 64(%r_argp), %rdx
   4132          */
   4133          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
   4134                                         hregAMD64_RDI()));
   4135          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
   4136                                         hregAMD64_RSI()));
   4137          addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp),
   4138                                         hregAMD64_RDX()));
   4139          /* Store the two args, at (%rsi) and (%rdx):
   4140             movupd  %argLlo, 0(%rsi)
   4141             movupd  %argLhi, 16(%rsi)
   4142             movupd  %argRlo, 0(%rdx)
   4143             movupd  %argRhi, 16(%rdx)
   4144          */
   4145          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo,
   4146                                           AMD64AMode_IR(0, hregAMD64_RSI())));
   4147          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi,
   4148                                           AMD64AMode_IR(16, hregAMD64_RSI())));
   4149          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo,
   4150                                           AMD64AMode_IR(0, hregAMD64_RDX())));
   4151          addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi,
   4152                                           AMD64AMode_IR(16, hregAMD64_RDX())));
   4153          /* call the helper */
   4154          addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3,
   4155                                         mk_RetLoc_simple(RLPri_None) ));
   4156          /* fetch the result from memory, using %r_argp, which the
   4157             register allocator will keep alive across the call. */
   4158          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo,
   4159                                           AMD64AMode_IR(0, argp)));
   4160          addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi,
   4161                                           AMD64AMode_IR(16, argp)));
   4162          /* and finally, clear the space */
   4163          add_to_rsp(env, 160);
   4164          *rHi = dstHi;
   4165          *rLo = dstLo;
   4166          return;
   4167       }
   4168 
   4169       default:
   4170          break;
   4171    } /* switch (e->Iex.Binop.op) */
   4172    } /* if (e->tag == Iex_Binop) */
   4173 
   4174    if (e->tag == Iex_Triop) {
   4175    IRTriop *triop = e->Iex.Triop.details;
   4176    switch (triop->op) {
   4177 
   4178       case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm;
   4179       case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm;
   4180       case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm;
   4181       case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm;
   4182       do_64Fx4_w_rm:
   4183       {
   4184          HReg argLhi, argLlo, argRhi, argRlo;
   4185          iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
   4186          iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
   4187          HReg dstHi = newVRegV(env);
   4188          HReg dstLo = newVRegV(env);
   4189          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   4190          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   4191          /* XXXROUNDINGFIXME */
   4192          /* set roundingmode here */
   4193          addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi));
   4194          addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo));
   4195          *rHi = dstHi;
   4196          *rLo = dstLo;
   4197          return;
   4198       }
   4199 
   4200       case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm;
   4201       case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm;
   4202       case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm;
   4203       case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm;
   4204       do_32Fx8_w_rm:
   4205       {
   4206          HReg argLhi, argLlo, argRhi, argRlo;
   4207          iselDVecExpr(&argLhi, &argLlo, env, triop->arg2);
   4208          iselDVecExpr(&argRhi, &argRlo, env, triop->arg3);
   4209          HReg dstHi = newVRegV(env);
   4210          HReg dstLo = newVRegV(env);
   4211          addInstr(env, mk_vMOVsd_RR(argLhi, dstHi));
   4212          addInstr(env, mk_vMOVsd_RR(argLlo, dstLo));
   4213          /* XXXROUNDINGFIXME */
   4214          /* set roundingmode here */
   4215          addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi));
   4216          addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo));
   4217          *rHi = dstHi;
   4218          *rLo = dstLo;
   4219          return;
   4220       }
   4221 
   4222       default:
   4223          break;
   4224    } /* switch (triop->op) */
   4225    } /* if (e->tag == Iex_Triop) */
   4226 
   4227 
   4228    if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) {
   4229       HReg        rsp     = hregAMD64_RSP();
   4230       HReg        vHi     = newVRegV(env);
   4231       HReg        vLo     = newVRegV(env);
   4232       AMD64AMode* m8_rsp  = AMD64AMode_IR(-8, rsp);
   4233       AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp);
   4234       /* arg1 is the most significant (Q3), arg4 the least (Q0) */
   4235       /* Get all the args into regs, before messing with the stack. */
   4236       AMD64RI* q3  = iselIntExpr_RI(env, e->Iex.Qop.details->arg1);
   4237       AMD64RI* q2  = iselIntExpr_RI(env, e->Iex.Qop.details->arg2);
   4238       AMD64RI* q1  = iselIntExpr_RI(env, e->Iex.Qop.details->arg3);
   4239       AMD64RI* q0  = iselIntExpr_RI(env, e->Iex.Qop.details->arg4);
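              /* Each 128-bit half is built by storing its two 64-bit lanes
                 just below %rsp and pulling them back with one 16-byte SSE
                 load; the lane written to the lower address becomes the
                 less significant half of the XMM value. */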
   4240       /* less significant lane (Q2) at the lower address (-16(rsp)) */
   4241       addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q3, m8_rsp));
   4242       addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q2, m16_rsp));
   4243       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, m16_rsp));
   4244       /* and then the lower half .. */
   4245       addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q1, m8_rsp));
   4246       addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, q0, m16_rsp));
   4247       addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, m16_rsp));
   4248       *rHi = vHi;
   4249       *rLo = vLo;
   4250       return;
   4251    }
   4252 
   4253    if (e->tag == Iex_ITE) {
   4254       HReg r1Hi, r1Lo, r0Hi, r0Lo;
   4255       iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue);
   4256       iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse);
   4257       HReg dstHi = newVRegV(env);
   4258       HReg dstLo = newVRegV(env);
   4259       addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi));
   4260       addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo));
   4261       AMD64CondCode cc = iselCondCode(env, e->Iex.ITE.cond);
   4262       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi));
   4263       addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo));
   4264       *rHi = dstHi;
   4265       *rLo = dstLo;
   4266       return;
   4267    }
   4268 
   4269    //avx_fail:
   4270    vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n",
   4271               LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
   4272    ppIRExpr(e);
   4273    vpanic("iselDVecExpr_wrk");
   4274 }
   4275 
   4276 
   4277 /*---------------------------------------------------------*/
   4278 /*--- ISEL: Statements                                  ---*/
   4279 /*---------------------------------------------------------*/
   4280 
   4281 static void iselStmt ( ISelEnv* env, IRStmt* stmt )
   4282 {
   4283    if (vex_traceflags & VEX_TRACE_VCODE) {
   4284       vex_printf("\n-- ");
   4285       ppIRStmt(stmt);
   4286       vex_printf("\n");
   4287    }
   4288 
   4289    switch (stmt->tag) {
   4290 
   4291    /* --------- STORE --------- */
   4292    case Ist_Store: {
   4293       IRType    tya   = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
   4294       IRType    tyd   = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
   4295       IREndness end   = stmt->Ist.Store.end;
   4296 
   4297       if (tya != Ity_I64 || end != Iend_LE)
   4298          goto stmt_fail;
   4299 
   4300       if (tyd == Ity_I64) {
   4301          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   4302          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
   4303          addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
   4304          return;
   4305       }
   4306       if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
   4307          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   4308          HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
   4309          addInstr(env, AMD64Instr_Store(
   4310                           toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
   4311                           r,am));
   4312          return;
   4313       }
   4314       if (tyd == Ity_F64) {
   4315          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   4316          HReg r = iselDblExpr(env, stmt->Ist.Store.data);
   4317          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
   4318          return;
   4319       }
   4320       if (tyd == Ity_F32) {
   4321          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   4322          HReg r = iselFltExpr(env, stmt->Ist.Store.data);
   4323          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
   4324          return;
   4325       }
   4326       if (tyd == Ity_V128) {
   4327          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
   4328          HReg r = iselVecExpr(env, stmt->Ist.Store.data);
   4329          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
   4330          return;
   4331       }
   4332       if (tyd == Ity_V256) {
   4333          HReg        rA   = iselIntExpr_R(env, stmt->Ist.Store.addr);
   4334          AMD64AMode* am0  = AMD64AMode_IR(0,  rA);
   4335          AMD64AMode* am16 = AMD64AMode_IR(16, rA);
   4336          HReg vHi, vLo;
   4337          iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data);
   4338          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
   4339          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
   4340          return;
   4341       }
   4342       break;
   4343    }
   4344 
   4345    /* --------- PUT --------- */
   4346    case Ist_Put: {
   4347       IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
   4348       if (ty == Ity_I64) {
   4349          /* We're going to write to memory, so compute the RHS into an
   4350             AMD64RI. */
   4351          AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
   4352          addInstr(env,
   4353                   AMD64Instr_Alu64M(
   4354                      Aalu_MOV,
   4355                      ri,
   4356                      AMD64AMode_IR(stmt->Ist.Put.offset,
   4357                                    hregAMD64_RBP())
   4358                  ));
   4359          return;
   4360       }
   4361       if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
   4362          HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
   4363          addInstr(env, AMD64Instr_Store(
   4364                           toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
   4365                           r,
   4366                           AMD64AMode_IR(stmt->Ist.Put.offset,
   4367                                         hregAMD64_RBP())));
   4368          return;
   4369       }
   4370       if (ty == Ity_F32) {
   4371          HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
   4372          AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
   4373          set_SSE_rounding_default(env); /* paranoia */
   4374          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
   4375          return;
   4376       }
   4377       if (ty == Ity_F64) {
   4378          HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
   4379          AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
   4380                                          hregAMD64_RBP() );
   4381          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
   4382          return;
   4383       }
   4384       if (ty == Ity_V128) {
   4385          HReg        vec = iselVecExpr(env, stmt->Ist.Put.data);
   4386          AMD64AMode* am  = AMD64AMode_IR(stmt->Ist.Put.offset,
   4387                                          hregAMD64_RBP());
   4388          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
   4389          return;
   4390       }
   4391       if (ty == Ity_V256) {
   4392          HReg vHi, vLo;
   4393          iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data);
   4394          HReg        rbp  = hregAMD64_RBP();
   4395          AMD64AMode* am0  = AMD64AMode_IR(stmt->Ist.Put.offset + 0,  rbp);
   4396          AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp);
   4397          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0));
   4398          addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16));
   4399          return;
   4400       }
   4401       break;
   4402    }
   4403 
   4404    /* --------- Indexed PUT --------- */
   4405    case Ist_PutI: {
   4406       IRPutI *puti = stmt->Ist.PutI.details;
   4407 
   4408       AMD64AMode* am
   4409          = genGuestArrayOffset(
   4410               env, puti->descr,
   4411                    puti->ix, puti->bias );
   4412 
   4413       IRType ty = typeOfIRExpr(env->type_env, puti->data);
   4414       if (ty == Ity_F64) {
   4415          HReg val = iselDblExpr(env, puti->data);
   4416          addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
   4417          return;
   4418       }
   4419       if (ty == Ity_I8) {
   4420          HReg r = iselIntExpr_R(env, puti->data);
   4421          addInstr(env, AMD64Instr_Store( 1, r, am ));
   4422          return;
   4423       }
   4424       if (ty == Ity_I64) {
   4425          AMD64RI* ri = iselIntExpr_RI(env, puti->data);
   4426          addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
   4427          return;
   4428       }
   4429       break;
   4430    }
   4431 
   4432    /* --------- TMP --------- */
   4433    case Ist_WrTmp: {
   4434       IRTemp tmp = stmt->Ist.WrTmp.tmp;
   4435       IRType ty = typeOfIRTemp(env->type_env, tmp);
   4436 
   4437       /* Optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
   4438          compute it into an AMode and then use LEA.  This usually
   4439          produces fewer instructions, often because (for
   4440          Memcheck-created IR) we get t = address-expression, with
   4441          t later used twice, so doing this naturally turns the
   4442          address-expression back into an AMD64 amode. */
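              /* Illustrative example (not from the source): for
                    t3 = Add64(t1, 0x20:I64)
                 iselIntExpr_AMode can produce the amode 32(%r_t1), so a
                 single
                    leaq 32(%r_t1), %r_t3
                 is emitted rather than a move followed by an add. */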
   4443       if (ty == Ity_I64
   4444           && stmt->Ist.WrTmp.data->tag == Iex_Binop
   4445           && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
   4446          AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
   4447          HReg dst = lookupIRTemp(env, tmp);
   4448          if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
   4449             /* Hmm, iselIntExpr_AMode wimped out and just computed the
   4450                value into a register.  Just emit a normal reg-reg move
   4451                so reg-alloc can coalesce it away in the usual way. */
   4452             HReg src = am->Aam.IR.reg;
   4453             addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
   4454          } else {
   4455             addInstr(env, AMD64Instr_Lea64(am,dst));
   4456          }
   4457          return;
   4458       }
   4459 
   4460       if (ty == Ity_I64 || ty == Ity_I32
   4461           || ty == Ity_I16 || ty == Ity_I8) {
   4462          AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
   4463          HReg dst = lookupIRTemp(env, tmp);
   4464          addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
   4465          return;
   4466       }
   4467       if (ty == Ity_I128) {
   4468          HReg rHi, rLo, dstHi, dstLo;
   4469          iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
   4470          lookupIRTempPair( &dstHi, &dstLo, env, tmp);
   4471          addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
   4472          addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
   4473          return;
   4474       }
   4475       if (ty == Ity_I1) {
   4476          AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
   4477          HReg dst = lookupIRTemp(env, tmp);
   4478          addInstr(env, AMD64Instr_Set64(cond, dst));
   4479          return;
   4480       }
   4481       if (ty == Ity_F64) {
   4482          HReg dst = lookupIRTemp(env, tmp);
   4483          HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
   4484          addInstr(env, mk_vMOVsd_RR(src, dst));
   4485          return;
   4486       }
   4487       if (ty == Ity_F32) {
   4488          HReg dst = lookupIRTemp(env, tmp);
   4489          HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
   4490          addInstr(env, mk_vMOVsd_RR(src, dst));
   4491          return;
   4492       }
   4493       if (ty == Ity_V128) {
   4494          HReg dst = lookupIRTemp(env, tmp);
   4495          HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
   4496          addInstr(env, mk_vMOVsd_RR(src, dst));
   4497          return;
   4498       }
   4499       if (ty == Ity_V256) {
   4500          HReg rHi, rLo, dstHi, dstLo;
   4501          iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
   4502          lookupIRTempPair( &dstHi, &dstLo, env, tmp);
   4503          addInstr(env, mk_vMOVsd_RR(rHi,dstHi) );
   4504          addInstr(env, mk_vMOVsd_RR(rLo,dstLo) );
   4505          return;
   4506       }
   4507       break;
   4508    }
   4509 
   4510    /* --------- Call to DIRTY helper --------- */
   4511    case Ist_Dirty: {
   4512       IRDirty* d = stmt->Ist.Dirty.details;
   4513 
   4514       /* Figure out the return type, if any. */
   4515       IRType retty = Ity_INVALID;
   4516       if (d->tmp != IRTemp_INVALID)
   4517          retty = typeOfIRTemp(env->type_env, d->tmp);
   4518 
   4519       /* Throw out any return types we don't know about. */
   4520       Bool retty_ok = False;
   4521       switch (retty) {
   4522          case Ity_INVALID: /* function doesn't return anything */
   4523          case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
   4524          case Ity_V128: case Ity_V256:
   4525             retty_ok = True; break;
   4526          default:
   4527             break;
   4528       }
   4529       if (!retty_ok)
   4530          break; /* will go to stmt_fail: */
   4531 
   4532       /* Marshal args, do the call, and set the return value to
   4533          0x555..555 if this is a conditional call that returns a value
   4534          and the call is skipped. */
   4535       UInt   addToSp = 0;
   4536       RetLoc rloc    = mk_RetLoc_INVALID();
   4537       doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args );
   4538       vassert(is_sane_RetLoc(rloc));
   4539 
   4540       /* Now figure out what to do with the returned value, if any. */
   4541       switch (retty) {
   4542          case Ity_INVALID: {
   4543             /* No return value.  Nothing to do. */
   4544             vassert(d->tmp == IRTemp_INVALID);
   4545             vassert(rloc.pri == RLPri_None);
   4546             vassert(addToSp == 0);
   4547             return;
   4548          }
   4549          case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: {
   4550             /* The returned value is in %rax.  Park it in the register
   4551                associated with tmp. */
   4552             vassert(rloc.pri == RLPri_Int);
   4553             vassert(addToSp == 0);
   4554             HReg dst = lookupIRTemp(env, d->tmp);
   4555             addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
   4556             return;
   4557          }
   4558          case Ity_V128: {
   4559             /* The returned value is on the stack, and rloc.spOff
   4560                tells us where.  Fish it off the stack and then move
   4561                the stack pointer upwards to clear it, as directed by
   4562                doHelperCall. */
   4563             vassert(rloc.pri == RLPri_V128SpRel);
   4564             vassert(addToSp >= 16);
   4565             HReg        dst = lookupIRTemp(env, d->tmp);
   4566             AMD64AMode* am  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
   4567             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
   4568             add_to_rsp(env, addToSp);
   4569             return;
   4570          }
   4571          case Ity_V256: {
   4572             /* See comments for Ity_V128. */
   4573             vassert(rloc.pri == RLPri_V256SpRel);
   4574             vassert(addToSp >= 32);
   4575             HReg        dstLo, dstHi;
   4576             lookupIRTempPair(&dstHi, &dstLo, env, d->tmp);
   4577             AMD64AMode* amLo  = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP());
   4578             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo ));
   4579             AMD64AMode* amHi  = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP());
   4580             addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi ));
   4581             add_to_rsp(env, addToSp);
   4582             return;
   4583          }
   4584          default:
   4585             /*NOTREACHED*/
   4586             vassert(0);
   4587       }
   4588       break;
   4589    }
   4590 
   4591    /* --------- MEM FENCE --------- */
   4592    case Ist_MBE:
   4593       switch (stmt->Ist.MBE.event) {
   4594          case Imbe_Fence:
   4595             addInstr(env, AMD64Instr_MFence());
   4596             return;
   4597          default:
   4598             break;
   4599       }
   4600       break;
   4601 
   4602    /* --------- ACAS --------- */
   4603    case Ist_CAS:
   4604       if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
   4605          /* "normal" singleton CAS */
   4606          UChar  sz;
   4607          IRCAS* cas = stmt->Ist.CAS.details;
   4608          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
   4609          /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
   4610          AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
   4611          HReg rData = iselIntExpr_R(env, cas->dataLo);
   4612          HReg rExpd = iselIntExpr_R(env, cas->expdLo);
   4613          HReg rOld  = lookupIRTemp(env, cas->oldLo);
   4614          vassert(cas->expdHi == NULL);
   4615          vassert(cas->dataHi == NULL);
   4616          addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
   4617          addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
   4618          addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
   4619          switch (ty) {
   4620             case Ity_I64: sz = 8; break;
   4621             case Ity_I32: sz = 4; break;
   4622             case Ity_I16: sz = 2; break;
   4623             case Ity_I8:  sz = 1; break;
   4624             default: goto unhandled_cas;
   4625          }
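                 /* AMD64Instr_ACAS is a locked cmpxchg on (am): the
                    expected value sits in %rax and the proposed new value
                    in %rbx; afterwards %rax holds the value that was in
                    memory and ZF says whether the swap happened.  rOld
                    already holds the expected value, so it only needs to be
                    overwritten from %rax when the comparison failed --
                    hence the Acc_NZ conditional move below. */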
   4626          addInstr(env, AMD64Instr_ACAS(am, sz));
   4627          addInstr(env, AMD64Instr_CMov64(
   4628                           Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOld));
   4629          return;
   4630       } else {
   4631          /* double CAS */
   4632          UChar  sz;
   4633          IRCAS* cas = stmt->Ist.CAS.details;
   4634          IRType ty  = typeOfIRExpr(env->type_env, cas->dataLo);
   4635          /* only 32-bit and 64-bit allowed in this case */
   4636          /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
   4637          /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
   4638          AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
   4639          HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
   4640          HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
   4641          HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
   4642          HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
   4643          HReg rOldHi  = lookupIRTemp(env, cas->oldHi);
   4644          HReg rOldLo  = lookupIRTemp(env, cas->oldLo);
   4645          switch (ty) {
   4646             case Ity_I64:
   4647                if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
   4648                   goto unhandled_cas; /* we'd have to generate
   4649                                          cmpxchg16b, but the host
   4650                                          doesn't support that */
   4651                sz = 8;
   4652                break;
   4653             case Ity_I32:
   4654                sz = 4;
   4655                break;
   4656             default:
   4657                goto unhandled_cas;
   4658          }
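                 /* AMD64Instr_DACAS is a locked cmpxchg8b/16b: expected
                    value in %rdx:%rax, proposed new value in %rcx:%rbx.  On
                    failure the value found in memory is left in %rdx:%rax
                    with ZF clear, hence the two Acc_NZ conditional moves
                    below. */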
   4659          addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
   4660          addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
   4661          addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
   4662          addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
   4663          addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
   4664          addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
   4665          addInstr(env, AMD64Instr_DACAS(am, sz));
   4666          addInstr(env,
   4667                   AMD64Instr_CMov64(
   4668                      Acc_NZ, AMD64RM_Reg(hregAMD64_RDX()), rOldHi));
   4669          addInstr(env,
   4670                   AMD64Instr_CMov64(
   4671                      Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOldLo));
   4672          return;
   4673       }
   4674       unhandled_cas:
   4675       break;
   4676 
   4677    /* --------- INSTR MARK --------- */
   4678    /* Doesn't generate any executable code ... */
   4679    case Ist_IMark:
   4680        return;
   4681 
   4682    /* --------- ABI HINT --------- */
   4683    /* These have no meaning (denotation in the IR) and so we ignore
   4684       them ... if any actually made it this far. */
   4685    case Ist_AbiHint:
   4686        return;
   4687 
   4688    /* --------- NO-OP --------- */
   4689    case Ist_NoOp:
   4690        return;
   4691 
   4692    /* --------- EXIT --------- */
   4693    case Ist_Exit: {
   4694       if (stmt->Ist.Exit.dst->tag != Ico_U64)
   4695          vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
   4696 
   4697       AMD64CondCode cc    = iselCondCode(env, stmt->Ist.Exit.guard);
   4698       AMD64AMode*   amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP,
   4699                                           hregAMD64_RBP());
   4700 
   4701       /* Case: boring transfer to known address */
   4702       if (stmt->Ist.Exit.jk == Ijk_Boring) {
   4703          if (env->chainingAllowed) {
   4704             /* .. almost always true .. */
   4705             /* Skip the event check at the dst if this is a forwards
   4706                edge. */
   4707             Bool toFastEP
   4708                = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga;
   4709             if (0) vex_printf("%s", toFastEP ? "Y" : ",");
   4710             addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64,
   4711                                              amRIP, cc, toFastEP));
   4712          } else {
   4713             /* .. very occasionally .. */
   4714             /* We can't use chaining, so ask for an assisted transfer,
   4715                as that's the only alternative that is allowable. */
   4716             HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
   4717             addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring));
   4718          }
   4719          return;
   4720       }
   4721 
   4722       /* Case: assisted transfer to arbitrary address */
   4723       switch (stmt->Ist.Exit.jk) {
   4724          /* Keep this list in sync with that in iselNext below */
   4725          case Ijk_ClientReq:
   4726          case Ijk_EmWarn:
   4727          case Ijk_NoDecode:
   4728          case Ijk_NoRedir:
   4729          case Ijk_SigSEGV:
   4730          case Ijk_SigTRAP:
   4731          case Ijk_Sys_syscall:
   4732          case Ijk_InvalICache:
   4733          case Ijk_Yield:
   4734          {
   4735             HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
   4736             addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk));
   4737             return;
   4738          }
   4739          default:
   4740             break;
   4741       }
   4742 
   4743       /* Do we ever expect to see any other kind? */
   4744       goto stmt_fail;
   4745    }
   4746 
   4747    default: break;
   4748    }
   4749   stmt_fail:
   4750    ppIRStmt(stmt);
   4751    vpanic("iselStmt(amd64)");
   4752 }
   4753 
   4754 
   4755 /*---------------------------------------------------------*/
   4756 /*--- ISEL: Basic block terminators (Nexts)             ---*/
   4757 /*---------------------------------------------------------*/
   4758 
   4759 static void iselNext ( ISelEnv* env,
   4760                        IRExpr* next, IRJumpKind jk, Int offsIP )
   4761 {
   4762    if (vex_traceflags & VEX_TRACE_VCODE) {
   4763       vex_printf( "\n-- PUT(%d) = ", offsIP);
   4764       ppIRExpr( next );
   4765       vex_printf( "; exit-");
   4766       ppIRJumpKind(jk);
   4767       vex_printf( "\n");
   4768    }
   4769 
   4770    /* Case: boring transfer to known address */
   4771    if (next->tag == Iex_Const) {
   4772       IRConst* cdst = next->Iex.Const.con;
   4773       vassert(cdst->tag == Ico_U64);
   4774       if (jk == Ijk_Boring || jk == Ijk_Call) {
   4775          /* Boring transfer to known address */
   4776          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
   4777          if (env->chainingAllowed) {
   4778             /* .. almost always true .. */
   4779             /* Skip the event check at the dst if this is a forwards
   4780                edge. */
   4781             Bool toFastEP
   4782                = ((Addr64)cdst->Ico.U64) > env->max_ga;
   4783             if (0) vex_printf("%s", toFastEP ? "X" : ".");
   4784             addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64,
   4785                                              amRIP, Acc_ALWAYS,
   4786                                              toFastEP));
   4787          } else {
   4788             /* .. very occasionally .. */
   4789             /* We can't use chaining, so ask for an indirect transfer,
   4790                as that's the cheapest alternative that is
   4791                allowable. */
   4792             HReg r = iselIntExpr_R(env, next);
   4793             addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
   4794                                                Ijk_Boring));
   4795          }
   4796          return;
   4797       }
   4798    }
   4799 
   4800    /* Case: call/return (==boring) transfer to any address */
   4801    switch (jk) {
   4802       case Ijk_Boring: case Ijk_Ret: case Ijk_Call: {
   4803          HReg        r     = iselIntExpr_R(env, next);
   4804          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
   4805          if (env->chainingAllowed) {
   4806             addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS));
   4807          } else {
   4808             addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS,
   4809                                                Ijk_Boring));
   4810          }
   4811          return;
   4812       }
   4813       default:
   4814          break;
   4815    }
   4816 
   4817    /* Case: assisted transfer to arbitrary address */
   4818    switch (jk) {
   4819       /* Keep this list in sync with that for Ist_Exit above */
   4820       case Ijk_ClientReq:
   4821       case Ijk_EmWarn:
   4822       case Ijk_NoDecode:
   4823       case Ijk_NoRedir:
   4824       case Ijk_SigSEGV:
   4825       case Ijk_SigTRAP:
   4826       case Ijk_Sys_syscall:
   4827       case Ijk_InvalICache:
   4828       case Ijk_Yield: {
   4829          HReg        r     = iselIntExpr_R(env, next);
   4830          AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP());
   4831          addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk));
   4832          return;
   4833       }
   4834       default:
   4835          break;
   4836    }
   4837 
   4838    vex_printf( "\n-- PUT(%d) = ", offsIP);
   4839    ppIRExpr( next );
   4840    vex_printf( "; exit-");
   4841    ppIRJumpKind(jk);
   4842    vex_printf( "\n");
   4843    vassert(0); // are we expecting any other kind?
   4844 }
   4845 
   4846 
   4847 /*---------------------------------------------------------*/
   4848 /*--- Insn selector top-level                           ---*/
   4849 /*---------------------------------------------------------*/
   4850 
   4851 /* Translate an entire SB to amd64 code. */
   4852 
   4853 HInstrArray* iselSB_AMD64 ( IRSB* bb,
   4854                             VexArch      arch_host,
   4855                             VexArchInfo* archinfo_host,
   4856                             VexAbiInfo*  vbi/*UNUSED*/,
   4857                             Int offs_Host_EvC_Counter,
   4858                             Int offs_Host_EvC_FailAddr,
   4859                             Bool chainingAllowed,
   4860                             Bool addProfInc,
   4861                             Addr64 max_ga )
   4862 {
   4863    Int        i, j;
   4864    HReg       hreg, hregHI;
   4865    ISelEnv*   env;
   4866    UInt       hwcaps_host = archinfo_host->hwcaps;
   4867    AMD64AMode *amCounter, *amFailAddr;
   4868 
   4869    /* sanity ... */
   4870    vassert(arch_host == VexArchAMD64);
   4871    vassert(0 == (hwcaps_host
   4872                  & ~(VEX_HWCAPS_AMD64_SSE3
   4873                      | VEX_HWCAPS_AMD64_CX16
   4874                      | VEX_HWCAPS_AMD64_LZCNT
   4875                      | VEX_HWCAPS_AMD64_AVX
   4876                      | VEX_HWCAPS_AMD64_RDTSCP
   4877                      | VEX_HWCAPS_AMD64_BMI
   4878                      | VEX_HWCAPS_AMD64_AVX2)));
   4879 
   4880    /* Make up an initial environment to use. */
   4881    env = LibVEX_Alloc(sizeof(ISelEnv));
   4882    env->vreg_ctr = 0;
   4883 
   4884    /* Set up output code array. */
   4885    env->code = newHInstrArray();
   4886 
   4887    /* Copy BB's type env. */
   4888    env->type_env = bb->tyenv;
   4889 
   4890    /* Make up an IRTemp -> virtual HReg mapping.  This doesn't
   4891       change as we go along. */
   4892    env->n_vregmap = bb->tyenv->types_used;
   4893    env->vregmap   = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   4894    env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
   4895 
   4896    /* and finally ... */
   4897    env->chainingAllowed = chainingAllowed;
   4898    env->hwcaps          = hwcaps_host;
   4899    env->max_ga          = max_ga;
   4900 
   4901    /* For each IR temporary, allocate a suitably-kinded virtual
   4902       register. */
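           /* Note that Ity_I128 and Ity_V256 temps each get a pair of
              vregs; the high half lives in vregmapHI, which stays
              INVALID_HREG for all other types. */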
   4903    j = 0;
   4904    for (i = 0; i < env->n_vregmap; i++) {
   4905       hregHI = hreg = INVALID_HREG;
   4906       switch (bb->tyenv->types[i]) {
   4907          case Ity_I1:
   4908          case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64:
   4909             hreg = mkHReg(j++, HRcInt64, True);
   4910             break;
   4911          case Ity_I128:
   4912             hreg   = mkHReg(j++, HRcInt64, True);
   4913             hregHI = mkHReg(j++, HRcInt64, True);
   4914             break;
   4915          case Ity_F32:
   4916          case Ity_F64:
   4917          case Ity_V128:
   4918             hreg = mkHReg(j++, HRcVec128, True);
   4919             break;
   4920          case Ity_V256:
   4921             hreg   = mkHReg(j++, HRcVec128, True);
   4922             hregHI = mkHReg(j++, HRcVec128, True);
   4923             break;
   4924          default:
   4925             ppIRType(bb->tyenv->types[i]);
   4926             vpanic("iselBB(amd64): IRTemp type");
   4927       }
   4928       env->vregmap[i]   = hreg;
   4929       env->vregmapHI[i] = hregHI;
   4930    }
   4931    env->vreg_ctr = j;
   4932 
   4933    /* The very first instruction must be an event check. */
   4934    amCounter  = AMD64AMode_IR(offs_Host_EvC_Counter,  hregAMD64_RBP());
   4935    amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP());
   4936    addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr));
   4937 
   4938    /* Possibly a block counter increment (for profiling).  At this
   4939       point we don't know the address of the counter, so just pretend
   4940       it is zero.  It will have to be patched later, but before this
   4941       translation is used, by a call to LibVEX_patchProfCtr. */
   4942    if (addProfInc) {
   4943       addInstr(env, AMD64Instr_ProfInc());
   4944    }
   4945 
   4946    /* Ok, finally we can iterate over the statements. */
   4947    for (i = 0; i < bb->stmts_used; i++)
   4948       if (bb->stmts[i])
   4949          iselStmt(env, bb->stmts[i]);
   4950 
   4951    iselNext(env, bb->next, bb->jumpkind, bb->offsIP);
   4952 
   4953    /* record the number of vregs we used. */
   4954    env->code->n_vregs = env->vreg_ctr;
   4955    return env->code;
   4956 }
   4957 
   4958 
   4959 /*---------------------------------------------------------------*/
   4960 /*--- end                                   host_amd64_isel.c ---*/
   4961 /*---------------------------------------------------------------*/
   4962