
/*--------------------------------------------------------------------*/
/*--- begin                                     guest_amd64_toIR.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2015 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Translates AMD64 code to IR. */

/* TODO:

   All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
   to ensure a 64-bit value is being written.

   x87 FP Limitations:

   * all arithmetic done at 64 bits

   * no FP exceptions, except for handling stack over/underflow

   * FP rounding mode observed only for float->int conversions and
     int->float conversions which could lose accuracy, and for
     float-to-float rounding.  For all other operations,
     round-to-nearest is used, regardless.

   * some of the FCOM cases could do with testing -- not convinced
     that the args are the right way round.

   * FSAVE does not re-initialise the FPU; it should do so

   * FINIT not only initialises the FPU environment, it also zeroes
     all the FP registers.  It should leave the registers unchanged.

    SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
    per Intel docs this bit has no meaning anyway.  Since PUSHF is the
    only way to observe eflags[1], a proper fix would be to make that
    bit be set by PUSHF.

    This module uses global variables and so is not MT-safe (if that
    should ever become relevant).
*/

/* Notes re address size overrides (0x67).

   According to the AMD documentation (24594 Rev 3.09, Sept 2003,
   "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
   and System Instructions"), Section 1.2.3 ("Address-Size Override
   Prefix"):

   0x67 applies to all explicit memory references, causing the top
   32 bits of the effective address to become zero.

   0x67 has no effect on stack references (push/pop); these always
   use a 64-bit address.

   0x67 changes the interpretation of instructions which implicitly
   reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
   instead.  These are:

      cmp{s,sb,sw,sd,sq}
      in{s,sb,sw,sd}
      jcxz, jecxz, jrcxz
      lod{s,sb,sw,sd,sq}
      loop{,e,bz,be,z}
      mov{s,sb,sw,sd,sq}
      out{s,sb,sw,sd}
      rep{,e,ne,nz}
      sca{s,sb,sw,sd,sq}
      sto{s,sb,sw,sd,sq}
      xlat{,b} */

/* "Special" instructions.

   This instruction decoder can decode four special instructions
   which mean nothing natively (are no-ops as far as regs/mem are
   concerned) but have meaning for supporting Valgrind.  A special
   instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
   48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
   $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
   Following that, one of the following 4 is allowed (standard
   interpretation in parentheses):

      4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
      4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
      4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
      4887F6 (xchgq %rsi,%rsi)   IR injection

   Any other bytes following the 16-byte preamble are illegal and
   constitute a failure in instruction decoding.  This all assumes
   that the preamble will never occur except in specific code
   fragments designed for Valgrind to catch.

   No prefixes may precede a "Special" instruction.
*/
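
/* Purely as an illustrative sketch (not part of this module; the
   helper name is hypothetical): recognising the 16-byte preamble
   described above amounts to a bytewise compare against the four
   rolq encodings. */
#if 0
static Bool looksLikeSpecialPreamble ( const UChar* code )
{
   /* rolq $3, %rdi ; rolq $13, %rdi ; rolq $61, %rdi ; rolq $51, %rdi */
   static const UChar preamble[16] = {
      0x48,0xC1,0xC7,0x03, 0x48,0xC1,0xC7,0x0D,
      0x48,0xC1,0xC7,0x3D, 0x48,0xC1,0xC7,0x33
   };
   Int i;
   for (i = 0; i < 16; i++)
      if (code[i] != preamble[i])
         return False;
   return True;
}
#endif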

/* casLE (implementation of lock-prefixed insns) and rep-prefixed
   insns: the side-exit back to the start of the insn is done with
   Ijk_Boring.  This is quite wrong, it should be done with
   Ijk_NoRedir, since otherwise the side exit, which is intended to
   restart the instruction for whatever reason, could go somewhere
   entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
   no-redir jumps performance critical, at least for rep-prefixed
   instructions, since all iterations thereof would involve such a
   jump.  It's not such a big deal with casLE since the side exit is
   only taken if the CAS fails, that is, the location is contended,
   which is relatively unlikely.

   Note also, the test for CAS success vs failure is done using
   Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
   Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
   shouldn't definedness-check these comparisons.  See
   COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
   background/rationale.
*/

/* LOCK prefixed instructions.  These are translated using IR-level
   CAS statements (IRCAS) and are believed to preserve atomicity, even
   from the point of view of some other process racing against a
   simulated one (presumably they communicate via a shared memory
   segment).

   Handlers which are aware of LOCK prefixes are:
      dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
      dis_cmpxchg_G_E  (cmpxchg)
      dis_Grp1         (add, or, adc, sbb, and, sub, xor)
      dis_Grp3         (not, neg)
      dis_Grp4         (inc, dec)
      dis_Grp5         (inc, dec)
      dis_Grp8_Imm     (bts, btc, btr)
      dis_bt_G_E       (bts, btc, btr)
      dis_xadd_G_E     (xadd)
*/


#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"
#include "libvex_guest_amd64.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_generic_x87.h"
#include "guest_amd64_defs.h"


/*------------------------------------------------------------*/
/*--- Globals                                              ---*/
/*------------------------------------------------------------*/

/* These are set at the start of the translation of an insn, right
   down in disInstr_AMD64, so that we don't have to pass them around
   endlessly.  They are all constant during the translation of any
   given insn. */

/* These are set at the start of the translation of a BB, so
   that we don't have to pass them around endlessly. */

/* We need to know this to do sub-register accesses correctly. */
static VexEndness host_endness;

/* Pointer to the guest code area (points to start of BB, not to the
   insn being processed). */
static const UChar* guest_code;

/* The guest address corresponding to guest_code[0]. */
static Addr64 guest_RIP_bbstart;

/* The guest address for the instruction currently being
   translated. */
static Addr64 guest_RIP_curr_instr;

/* The IRSB* into which we're generating code. */
static IRSB* irsb;

/* For ensuring that %rip-relative addressing is done right.  A read
   of %rip generates the address of the next instruction.  It may be
   that we don't conveniently know that inside disAMode().  For sanity
   checking, if the next insn %rip is needed, we make a guess at what
   it is, record that guess here, and set the accompanying Bool to
   indicate that -- after this insn's decode is finished -- that guess
   needs to be checked.  */

/* At the start of each insn decode, is set to (0, False).
   After the decode, if _mustcheck is now True, _assumed is
   checked. */

static Addr64 guest_RIP_next_assumed;
static Bool   guest_RIP_next_mustcheck;


/*------------------------------------------------------------*/
/*--- Helpers for constructing IR.                         ---*/
/*------------------------------------------------------------*/

/* Generate a new temporary of the given type. */
static IRTemp newTemp ( IRType ty )
{
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}

/* Add a statement to the list held by "irsb". */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

/* Generate a statement "dst := e". */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}

static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}

static IRExpr* mkU8 ( ULong i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

static IRExpr* mkU16 ( ULong i )
{
   vassert(i < 0x10000ULL);
   return IRExpr_Const(IRConst_U16( (UShort)i ));
}

static IRExpr* mkU32 ( ULong i )
{
   vassert(i < 0x100000000ULL);
   return IRExpr_Const(IRConst_U32( (UInt)i ));
}

static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}

static IRExpr* mkU ( IRType ty, ULong i )
{
   switch (ty) {
      case Ity_I8:  return mkU8(i);
      case Ity_I16: return mkU16(i);
      case Ity_I32: return mkU32(i);
      case Ity_I64: return mkU64(i);
      default: vpanic("mkU(amd64)");
   }
}

static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}

static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}
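
/* An illustrative sketch only (not part of this module; the function
   name is hypothetical): how the helpers above compose.  It emits IR
   which increments the 64-bit value at the address held in the
   temporary 'addr'. */
#if 0
static void example_incQWordAt ( IRTemp addr )
{
   IRTemp old = newTemp(Ity_I64);
   assign( old, loadLE(Ity_I64, mkexpr(addr)) );
   storeLE( mkexpr(addr), binop(Iop_Add64, mkexpr(old), mkU64(1)) );
}
#endif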

static IROp mkSizedOp ( IRType ty, IROp op8 )
{
   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
           || op8 == Iop_Mul8
           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
           || op8 == Iop_CasCmpNE8
           || op8 == Iop_Not8 );
   switch (ty) {
      case Ity_I8:  return 0 +op8;
      case Ity_I16: return 1 +op8;
      case Ity_I32: return 2 +op8;
      case Ity_I64: return 3 +op8;
      default: vpanic("mkSizedOp(amd64)");
   }
}
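
/* Note (added for clarity): the arithmetic above relies on the 8-,
   16-, 32- and 64-bit variants of each operation being declared
   consecutively in libvex_ir.h, so that, for example,
   mkSizedOp(Ity_I32, Iop_Add8) == Iop_Add32. */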

static
IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
{
   if (szSmall == 1 && szBig == 4) {
      return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
   }
   if (szSmall == 1 && szBig == 2) {
      return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
   }
   if (szSmall == 2 && szBig == 4) {
      return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
   }
   if (szSmall == 1 && szBig == 8 && !signd) {
      return unop(Iop_8Uto64, src);
   }
   if (szSmall == 1 && szBig == 8 && signd) {
      return unop(Iop_8Sto64, src);
   }
   if (szSmall == 2 && szBig == 8 && !signd) {
      return unop(Iop_16Uto64, src);
   }
   if (szSmall == 2 && szBig == 8 && signd) {
      return unop(Iop_16Sto64, src);
   }
   vpanic("doScalarWidening(amd64)");
}

static
void putGuarded ( Int gstOffB, IRExpr* guard, IRExpr* value )
{
   IRType ty = typeOfIRExpr(irsb->tyenv, value);
   stmt( IRStmt_Put(gstOffB,
                    IRExpr_ITE(guard, value, IRExpr_Get(gstOffB, ty))) );
}


/*------------------------------------------------------------*/
/*--- Debugging output                                     ---*/
/*------------------------------------------------------------*/

/* Bomb out if we can't handle something. */
__attribute__ ((noreturn))
static void unimplemented ( const HChar* str )
{
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}

#define DIP(format, args...)           \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_printf(format, ## args)

#define DIS(buf, format, args...)      \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_sprintf(buf, format, ## args)


/*------------------------------------------------------------*/
/*--- Offsets of various parts of the amd64 guest state.   ---*/
/*------------------------------------------------------------*/

#define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
#define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
#define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
#define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
#define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
#define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
#define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
#define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
#define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
#define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
#define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
#define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
#define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
#define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
#define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
#define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)

#define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)

#define OFFB_FS_CONST  offsetof(VexGuestAMD64State,guest_FS_CONST)
#define OFFB_GS_CONST  offsetof(VexGuestAMD64State,guest_GS_CONST)

#define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
#define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
#define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
#define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)

#define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
#define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
#define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
#define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
#define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
#define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
#define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
#define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)

#define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
#define OFFB_YMM0      offsetof(VexGuestAMD64State,guest_YMM0)
#define OFFB_YMM1      offsetof(VexGuestAMD64State,guest_YMM1)
#define OFFB_YMM2      offsetof(VexGuestAMD64State,guest_YMM2)
#define OFFB_YMM3      offsetof(VexGuestAMD64State,guest_YMM3)
#define OFFB_YMM4      offsetof(VexGuestAMD64State,guest_YMM4)
#define OFFB_YMM5      offsetof(VexGuestAMD64State,guest_YMM5)
#define OFFB_YMM6      offsetof(VexGuestAMD64State,guest_YMM6)
#define OFFB_YMM7      offsetof(VexGuestAMD64State,guest_YMM7)
#define OFFB_YMM8      offsetof(VexGuestAMD64State,guest_YMM8)
#define OFFB_YMM9      offsetof(VexGuestAMD64State,guest_YMM9)
#define OFFB_YMM10     offsetof(VexGuestAMD64State,guest_YMM10)
#define OFFB_YMM11     offsetof(VexGuestAMD64State,guest_YMM11)
#define OFFB_YMM12     offsetof(VexGuestAMD64State,guest_YMM12)
#define OFFB_YMM13     offsetof(VexGuestAMD64State,guest_YMM13)
#define OFFB_YMM14     offsetof(VexGuestAMD64State,guest_YMM14)
#define OFFB_YMM15     offsetof(VexGuestAMD64State,guest_YMM15)
#define OFFB_YMM16     offsetof(VexGuestAMD64State,guest_YMM16)

#define OFFB_EMNOTE    offsetof(VexGuestAMD64State,guest_EMNOTE)
#define OFFB_CMSTART   offsetof(VexGuestAMD64State,guest_CMSTART)
#define OFFB_CMLEN     offsetof(VexGuestAMD64State,guest_CMLEN)

#define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)


/*------------------------------------------------------------*/
/*--- Helper bits and pieces for deconstructing the        ---*/
/*--- amd64 insn stream.                                   ---*/
/*------------------------------------------------------------*/

/* This is the AMD64 register encoding -- integer regs. */
#define R_RAX 0
#define R_RCX 1
#define R_RDX 2
#define R_RBX 3
#define R_RSP 4
#define R_RBP 5
#define R_RSI 6
#define R_RDI 7
#define R_R8  8
#define R_R9  9
#define R_R10 10
#define R_R11 11
#define R_R12 12
#define R_R13 13
#define R_R14 14
#define R_R15 15

/* This is the Intel register encoding -- segment regs. */
#define R_ES 0
#define R_CS 1
#define R_SS 2
#define R_DS 3
#define R_FS 4
#define R_GS 5


/* Various simple conversions */

static ULong extend_s_8to64 ( UChar x )
{
   return (ULong)((Long)(((ULong)x) << 56) >> 56);
}

static ULong extend_s_16to64 ( UShort x )
{
   return (ULong)((Long)(((ULong)x) << 48) >> 48);
}

static ULong extend_s_32to64 ( UInt x )
{
   return (ULong)((Long)(((ULong)x) << 32) >> 32);
}
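
/* Worked example (added for clarity): extend_s_8to64(0xFE) shifts
   0xFE into the top byte, and the signed shift right then replicates
   the sign bit, yielding 0xFFFFFFFFFFFFFFFE, i.e. -2 as a 64-bit
   value. */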

/* Figure out whether the mod and rm parts of a modRM byte refer to a
   register (rather than memory).  If so, the byte will have the form
   11XXXYYY, where YYY is the register number. */
inline
static Bool epartIsReg ( UChar mod_reg_rm )
{
   return toBool(0xC0 == (mod_reg_rm & 0xC0));
}

/* Extract the 'g' field from a modRM byte.  This only produces 3
   bits, which is not a complete register number.  You should avoid
   this function if at all possible. */
inline
static Int gregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)( (mod_reg_rm >> 3) & 7 );
}

/* Ditto the 'e' field of a modRM byte. */
inline
static Int eregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)(mod_reg_rm & 0x7);
}

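/* Worked example (added for clarity): the modRM byte 0xD1 is
   11 010 001 in binary, so epartIsReg(0xD1) is True,
   gregLO3ofRM(0xD1) is 2 and eregLO3ofRM(0xD1) is 1. */
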
/* Get an 8/16/32-bit unsigned value out of the insn stream. */

static inline UChar getUChar ( Long delta )
{
   UChar v = guest_code[delta+0];
   return v;
}

static UInt getUDisp16 ( Long delta )
{
   UInt v = guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v & 0xFFFF;
}

//.. static UInt getUDisp ( Int size, Long delta )
//.. {
//..    switch (size) {
//..       case 4: return getUDisp32(delta);
//..       case 2: return getUDisp16(delta);
//..       case 1: return getUChar(delta);
//..       default: vpanic("getUDisp(x86)");
//..    }
//..    return 0; /*notreached*/
//.. }


/* Get a byte value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp8 ( Long delta )
{
   return extend_s_8to64( guest_code[delta] );
}

/* Get a 16-bit value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp16 ( Long delta )
{
   UInt v = guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return extend_s_16to64( (UShort)v );
}

/* Get a 32-bit value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp32 ( Long delta )
{
   UInt v = guest_code[delta+3]; v <<= 8;
   v |= guest_code[delta+2]; v <<= 8;
   v |= guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return extend_s_32to64( v );
}

/* Get a 64-bit value out of the insn stream. */
static Long getDisp64 ( Long delta )
{
   ULong v = 0;
   v |= guest_code[delta+7]; v <<= 8;
   v |= guest_code[delta+6]; v <<= 8;
   v |= guest_code[delta+5]; v <<= 8;
   v |= guest_code[delta+4]; v <<= 8;
   v |= guest_code[delta+3]; v <<= 8;
   v |= guest_code[delta+2]; v <<= 8;
   v |= guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v;
}

/* Note: because AMD64 doesn't allow 64-bit literals, it is an error
   if this is called with size==8.  Should not happen. */
static Long getSDisp ( Int size, Long delta )
{
   switch (size) {
      case 4: return getSDisp32(delta);
      case 2: return getSDisp16(delta);
      case 1: return getSDisp8(delta);
      default: vpanic("getSDisp(amd64)");
   }
}

static ULong mkSizeMask ( Int sz )
{
   switch (sz) {
      case 1: return 0x00000000000000FFULL;
      case 2: return 0x000000000000FFFFULL;
      case 4: return 0x00000000FFFFFFFFULL;
      case 8: return 0xFFFFFFFFFFFFFFFFULL;
      default: vpanic("mkSizeMask(amd64)");
   }
}

static Int imin ( Int a, Int b )
{
   return (a < b) ? a : b;
}

static IRType szToITy ( Int n )
{
   switch (n) {
      case 1: return Ity_I8;
      case 2: return Ity_I16;
      case 4: return Ity_I32;
      case 8: return Ity_I64;
      default: vex_printf("\nszToITy(%d)\n", n);
               vpanic("szToITy(amd64)");
   }
}


/*------------------------------------------------------------*/
/*--- For dealing with prefixes.                           ---*/
/*------------------------------------------------------------*/

/* The idea is to pass around an int holding a bitmask summarising
   info from the prefixes seen on the current instruction, including
   info from the REX byte.  This info is used in various places, but
   most especially when making sense of register fields in
   instructions.

   The top 8 bits of the prefix are 0x55, just as a hacky way to
   ensure it really is a valid prefix.

   Things you can safely assume about a well-formed prefix:
   * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
   * if REX is not present then REXW,REXR,REXX,REXB will read
     as zero.
   * F2 and F3 will not both be 1.
*/

typedef UInt  Prefix;

#define PFX_ASO    (1<<0)    /* address-size override present (0x67) */
#define PFX_66     (1<<1)    /* operand-size override-to-16 present (0x66) */
#define PFX_REX    (1<<2)    /* REX byte present (0x40 to 0x4F) */
#define PFX_REXW   (1<<3)    /* REX W bit, if REX present, else 0 */
#define PFX_REXR   (1<<4)    /* REX R bit, if REX present, else 0 */
#define PFX_REXX   (1<<5)    /* REX X bit, if REX present, else 0 */
#define PFX_REXB   (1<<6)    /* REX B bit, if REX present, else 0 */
#define PFX_LOCK   (1<<7)    /* bus LOCK prefix present (0xF0) */
#define PFX_F2     (1<<8)    /* REPNE/REPNZ prefix present (0xF2) */
#define PFX_F3     (1<<9)    /* REP/REPE/REPZ prefix present (0xF3) */
#define PFX_CS     (1<<10)   /* CS segment prefix present (0x2E) */
#define PFX_DS     (1<<11)   /* DS segment prefix present (0x3E) */
#define PFX_ES     (1<<12)   /* ES segment prefix present (0x26) */
#define PFX_FS     (1<<13)   /* FS segment prefix present (0x64) */
#define PFX_GS     (1<<14)   /* GS segment prefix present (0x65) */
#define PFX_SS     (1<<15)   /* SS segment prefix present (0x36) */
#define PFX_VEX    (1<<16)   /* VEX prefix present (0xC4 or 0xC5) */
#define PFX_VEXL   (1<<17)   /* VEX L bit, if VEX present, else 0 */
/* The extra register field VEX.vvvv is encoded (after not-ing it) as
   PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit
   positions. */
#define PFX_VEXnV0 (1<<18)   /* ~VEX vvvv[0], if VEX present, else 0 */
#define PFX_VEXnV1 (1<<19)   /* ~VEX vvvv[1], if VEX present, else 0 */
#define PFX_VEXnV2 (1<<20)   /* ~VEX vvvv[2], if VEX present, else 0 */
#define PFX_VEXnV3 (1<<21)   /* ~VEX vvvv[3], if VEX present, else 0 */


#define PFX_EMPTY 0x55000000

static Bool IS_VALID_PFX ( Prefix pfx ) {
   return toBool((pfx & 0xFF000000) == PFX_EMPTY);
}
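
/* Worked example (added for clarity): an instruction carrying a 0x66
   prefix and a 0x48 REX byte (REX.W=1) would be summarised as
   PFX_EMPTY | PFX_66 | PFX_REX | PFX_REXW, for which IS_VALID_PFX
   returns True. */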

static Bool haveREX ( Prefix pfx ) {
   return toBool(pfx & PFX_REX);
}

static Int getRexW ( Prefix pfx ) {
   return (pfx & PFX_REXW) ? 1 : 0;
}
static Int getRexR ( Prefix pfx ) {
   return (pfx & PFX_REXR) ? 1 : 0;
}
static Int getRexX ( Prefix pfx ) {
   return (pfx & PFX_REXX) ? 1 : 0;
}
static Int getRexB ( Prefix pfx ) {
   return (pfx & PFX_REXB) ? 1 : 0;
}

/* Check a prefix doesn't have F2 or F3 set in it, since usually that
   completely changes what instruction it really is. */
static Bool haveF2orF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
}
static Bool haveF2andF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) == (PFX_F2|PFX_F3));
}
static Bool haveF2 ( Prefix pfx ) {
   return toBool((pfx & PFX_F2) > 0);
}
static Bool haveF3 ( Prefix pfx ) {
   return toBool((pfx & PFX_F3) > 0);
}

static Bool have66 ( Prefix pfx ) {
   return toBool((pfx & PFX_66) > 0);
}
static Bool haveASO ( Prefix pfx ) {
   return toBool((pfx & PFX_ASO) > 0);
}
static Bool haveLOCK ( Prefix pfx ) {
   return toBool((pfx & PFX_LOCK) > 0);
}

/* Return True iff pfx has 66 set and F2 and F3 clear */
static Bool have66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
}

/* Return True iff pfx has F2 set and 66 and F3 clear */
static Bool haveF2no66noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has F3 set and 66 and F2 clear */
static Bool haveF3no66noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F3 set and F2 clear */
static Bool haveF3noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F2 set and F3 clear */
static Bool haveF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has 66, F2 and F3 clear */
static Bool haveNo66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
}

/* Return True iff pfx has any of 66, F2 and F3 set */
static Bool have66orF2orF3 ( Prefix pfx )
{
  return toBool( ! haveNo66noF2noF3(pfx) );
}

/* Return True iff pfx has 66 or F3 set */
static Bool have66orF3 ( Prefix pfx )
{
   return toBool((pfx & (PFX_66|PFX_F3)) > 0);
}

/* Clear all the segment-override bits in a prefix. */
static Prefix clearSegBits ( Prefix p )
{
   return
      p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
}

/* Get the (inverted, hence back to "normal") VEX.vvvv field. */
static UInt getVexNvvvv ( Prefix pfx ) {
   UInt r = (UInt)pfx;
   r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */
   return r & 0xF;
}
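
/* Worked example (added for clarity): the VEX encoding stores vvvv
   inverted, so register 3 appears in the instruction as 0b1100.  The
   decoder nots it when building the Prefix, so the PFX_VEXnV3..0 bits
   hold 0b0011 and getVexNvvvv returns 3. */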

static Bool haveVEX ( Prefix pfx ) {
   return toBool(pfx & PFX_VEX);
}

static Int getVexL ( Prefix pfx ) {
   return (pfx & PFX_VEXL) ? 1 : 0;
}


/*------------------------------------------------------------*/
/*--- For dealing with escapes                             ---*/
/*------------------------------------------------------------*/


/* Escapes come after the prefixes, but before the primary opcode
   byte.  They escape the primary opcode byte into a bigger space.
   The 0xF0000000 isn't significant, except so as to make it not
   overlap valid Prefix values, for sanity checking.
*/

typedef
   enum {
      ESC_NONE=0xF0000000, // none
      ESC_0F,              // 0F
      ESC_0F38,            // 0F 38
      ESC_0F3A             // 0F 3A
   }
   Escape;


/*------------------------------------------------------------*/
/*--- For dealing with integer registers                   ---*/
/*------------------------------------------------------------*/

/* This is somewhat complex.  The rules are:

   For 64, 32 and 16 bit register references, the e or g fields in the
   modrm bytes supply the low 3 bits of the register number.  The
   fourth (most-significant) bit of the register number is supplied by
   the REX byte, if it is present; else that bit is taken to be zero.

   The REX.R bit supplies the high bit corresponding to the g register
   field, and the REX.B bit supplies the high bit corresponding to the
   e register field (when the mod part of modrm indicates that modrm's
   e component refers to a register and not to memory).

   The REX.X bit supplies a high register bit for certain registers
   in SIB address modes, and is generally rarely used.

   For 8 bit register references, the presence of the REX byte itself
   has significance.  If there is no REX present, then the 3-bit
   number extracted from the modrm e or g field is treated as an index
   into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
   old x86 encoding scheme.

   But if there is a REX present, the register reference is
   interpreted in the same way as for 64/32/16-bit references: a high
   bit is extracted from REX, giving a 4-bit number, and the denoted
   register is the lowest 8 bits of whichever of the 16 integer
   registers that number selects.  In particular, values 4 through 7
   of this sequence do not refer to %ah %ch %dh %bh but instead to the
   lowest 8 bits of %rsp %rbp %rsi %rdi.

   The REX.W bit has no bearing at all on register numbers.  Instead
   its presence indicates that the operand size is to be overridden
   from its default value (32 bits) to 64 bits instead.  This is in
   the same fashion that an 0x66 prefix indicates the operand size is
   to be overridden from 32 bits down to 16 bits.  When both REX.W and
   0x66 are present there is a conflict, and REX.W takes precedence.

   Rather than try to handle this complexity using a single huge
   function, several smaller ones are provided.  The aim is to make it
   as difficult as possible to screw up register decoding in a subtle
   and hard-to-track-down way.

   Because these routines fish around in the host's memory (that is,
   in the guest state area) for sub-parts of guest registers, their
   correctness depends on the host's endianness.  So far these
   routines only work for little-endian hosts.  Those for which
   endianness is important have assertions to ensure sanity.
*/

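/* Worked example (added for clarity): with no REX byte, an e/g field
   of 4 in a 1-byte access names %ah; if any REX byte is present (even
   0x40, which sets none of W/R/X/B), the same field names %spl. */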

/* About the simplest question you can ask: where do the 64-bit
   integer registers live (in the guest state) ? */

static Int integerGuestReg64Offset ( UInt reg )
{
   switch (reg) {
      case R_RAX: return OFFB_RAX;
      case R_RCX: return OFFB_RCX;
      case R_RDX: return OFFB_RDX;
      case R_RBX: return OFFB_RBX;
      case R_RSP: return OFFB_RSP;
      case R_RBP: return OFFB_RBP;
      case R_RSI: return OFFB_RSI;
      case R_RDI: return OFFB_RDI;
      case R_R8:  return OFFB_R8;
      case R_R9:  return OFFB_R9;
      case R_R10: return OFFB_R10;
      case R_R11: return OFFB_R11;
      case R_R12: return OFFB_R12;
      case R_R13: return OFFB_R13;
      case R_R14: return OFFB_R14;
      case R_R15: return OFFB_R15;
      default: vpanic("integerGuestReg64Offset(amd64)");
   }
}


/* Produce the name of an integer register, for printing purposes.
   reg is a number in the range 0 .. 15 that has been generated from a
   3-bit reg-field number and a REX extension bit.  irregular denotes
   the case where sz==1 and no REX byte is present. */

static
const HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
{
   static const HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   static const HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
         "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
   static const HChar* ireg16_names[16]
     = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
         "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
   static const HChar* ireg8_names[16]
     = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
         "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
   static const HChar* ireg8_irregular[8]
     = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };

   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   switch (sz) {
      case 8: return ireg64_names[reg];
      case 4: return ireg32_names[reg];
      case 2: return ireg16_names[reg];
      case 1: if (irregular) {
                 return ireg8_irregular[reg];
              } else {
                 return ireg8_names[reg];
              }
      default: vpanic("nameIReg(amd64)");
   }
}

/* Using the same argument conventions as nameIReg, produce the
   guest state offset of an integer register. */

static
Int offsetIReg ( Int sz, UInt reg, Bool irregular )
{
   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   /* Deal with irregular case -- sz==1 and no REX present */
   if (sz == 1 && irregular) {
      switch (reg) {
         case R_RSP: return 1+ OFFB_RAX;
         case R_RBP: return 1+ OFFB_RCX;
         case R_RSI: return 1+ OFFB_RDX;
         case R_RDI: return 1+ OFFB_RBX;
         default:    break; /* use the normal case */
      }
   }

   /* Normal case */
   return integerGuestReg64Offset(reg);
}
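
/* Worked example (added for clarity): offsetIReg(1, R_RBP, True) hits
   the irregular case and yields 1+ OFFB_RCX, i.e. the %ch byte, while
   offsetIReg(1, R_RBP, False) yields OFFB_RBP, i.e. %bpl. */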


/* Read the %CL register :: Ity_I8, for shift/rotate operations. */

static IRExpr* getIRegCL ( void )
{
   vassert(host_endness == VexEndnessLE);
   return IRExpr_Get( OFFB_RCX, Ity_I8 );
}


/* Write to the %AH register. */

static void putIRegAH ( IRExpr* e )
{
   vassert(host_endness == VexEndnessLE);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
   stmt( IRStmt_Put( OFFB_RAX+1, e ) );
}


/* Read/write various widths of %RAX, as it has various
   special-purpose uses. */

static const HChar* nameIRegRAX ( Int sz )
{
   switch (sz) {
      case 1: return "%al";
      case 2: return "%ax";
      case 4: return "%eax";
      case 8: return "%rax";
      default: vpanic("nameIRegRAX(amd64)");
   }
}

static IRExpr* getIRegRAX ( Int sz )
{
   vassert(host_endness == VexEndnessLE);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
      default: vpanic("getIRegRAX(amd64)");
   }
}

static void putIRegRAX ( Int sz, IRExpr* e )
{
   IRType ty = typeOfIRExpr(irsb->tyenv, e);
   vassert(host_endness == VexEndnessLE);
   switch (sz) {
      case 8: vassert(ty == Ity_I64);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 4: vassert(ty == Ity_I32);
              stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
              break;
      case 2: vassert(ty == Ity_I16);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 1: vassert(ty == Ity_I8);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      default: vpanic("putIRegRAX(amd64)");
   }
}


/* Read/write various widths of %RDX, as it has various
   special-purpose uses. */

static const HChar* nameIRegRDX ( Int sz )
{
   switch (sz) {
      case 1: return "%dl";
      case 2: return "%dx";
      case 4: return "%edx";
      case 8: return "%rdx";
      default: vpanic("nameIRegRDX(amd64)");
   }
}

static IRExpr* getIRegRDX ( Int sz )
{
   vassert(host_endness == VexEndnessLE);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
      default: vpanic("getIRegRDX(amd64)");
   }
}

static void putIRegRDX ( Int sz, IRExpr* e )
{
   vassert(host_endness == VexEndnessLE);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   switch (sz) {
      case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
              break;
      case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      default: vpanic("putIRegRDX(amd64)");
   }
}


/* Simplistic functions to deal with the integer registers as a
   straightforward bank of 16 64-bit regs. */

static IRExpr* getIReg64 ( UInt regno )
{
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I64 );
}

static void putIReg64 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
}

static const HChar* nameIReg64 ( UInt regno )
{
   return nameIReg( 8, regno, False );
}


/* Simplistic functions to deal with the lower halves of integer
   registers as a straightforward bank of 16 32-bit regs. */

static IRExpr* getIReg32 ( UInt regno )
{
   vassert(host_endness == VexEndnessLE);
   return unop(Iop_64to32,
               IRExpr_Get( integerGuestReg64Offset(regno),
                           Ity_I64 ));
}

static void putIReg32 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_32Uto64,e) ) );
}

static const HChar* nameIReg32 ( UInt regno )
{
   return nameIReg( 4, regno, False );
}


/* Simplistic functions to deal with the lower quarters of integer
   registers as a straightforward bank of 16 16-bit regs. */

static IRExpr* getIReg16 ( UInt regno )
{
   vassert(host_endness == VexEndnessLE);
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I16 );
}

static void putIReg16 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_16Uto64,e) ) );
}

static const HChar* nameIReg16 ( UInt regno )
{
   return nameIReg( 2, regno, False );
}


/* Sometimes what we know is a 3-bit register number, a REX byte, and
   which field of the REX byte is to be used to extend to a 4-bit
   number.  These functions cater for that situation.
*/
static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return getIReg64( lo3bits | (getRexX(pfx) << 3) );
}

static const HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
}

static const HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
                        toBool(sz==1 && !haveREX(pfx)) );
}

static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get(
                     offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                                     False/*!irregular*/ ),
                     szToITy(sz)
                 )
             );
   } else {
      return IRExpr_Get(
                offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                                toBool(sz==1 && !haveREX(pfx)) ),
                szToITy(sz)
             );
   }
}

static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   stmt( IRStmt_Put(
            offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                            toBool(sz==1 && !haveREX(pfx)) ),
            sz==4 ? unop(Iop_32Uto64,e) : e
   ));
}


/* Functions for getting register numbers from modrm bytes and REX
   when we don't have to consider the complexities of integer subreg
   accesses.
*/
/* Extract the g reg field from a modRM byte, and augment it using the
   REX.R bit from the supplied REX byte.  The R bit usually is
   associated with the g register field.
*/
static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
{
   Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   reg += (pfx & PFX_REXR) ? 8 : 0;
   return reg;
}

/* Extract the e reg field from a modRM byte, and augment it using the
   REX.B bit from the supplied REX byte.  The B bit usually is
   associated with the e register field (when modrm indicates e is a
   register, that is).
*/
static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
{
   Int rm;
   vassert(epartIsReg(mod_reg_rm));
   rm = (Int)(mod_reg_rm & 0x7);
   rm += (pfx & PFX_REXB) ? 8 : 0;
   return rm;
}
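
/* Worked example (added for clarity): given a REX byte of 0x44
   (REX.R=1) and a modRM byte of 0xC1 (binary 11 000 001),
   gregOfRexRM yields 8 (%r8) and eregOfRexRM yields 1 (%rcx). */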


/* General functions for dealing with integer register access. */

/* Produce the guest state offset for a reference to the 'g' register
   field in a modrm byte, taking into account REX (or its absence),
   and the size of the access.
*/
static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(host_endness == VexEndnessLE);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = gregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}

static
IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}

static
void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
}

static
const HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}


static
IRExpr* getIRegV ( Int sz, Prefix pfx )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
                         szToITy(sz) );
   }
}

static
void putIRegV ( Int sz, Prefix pfx, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIReg( sz, getVexNvvvv(pfx), False ), e ) );
}

static
const HChar* nameIRegV ( Int sz, Prefix pfx )
{
   return nameIReg( sz, getVexNvvvv(pfx), False );
}



/* Produce the guest state offset for a reference to the 'e' register
   field in a modrm byte, taking into account REX (or its absence),
   and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   denotes a memory access rather than a register access.
*/
static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(host_endness == VexEndnessLE);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = eregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}

static
IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}

static
void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
}

static
const HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}


/*------------------------------------------------------------*/
/*--- For dealing with XMM registers                       ---*/
/*------------------------------------------------------------*/

static Int ymmGuestRegOffset ( UInt ymmreg )
{
   switch (ymmreg) {
      case 0:  return OFFB_YMM0;
      case 1:  return OFFB_YMM1;
      case 2:  return OFFB_YMM2;
      case 3:  return OFFB_YMM3;
      case 4:  return OFFB_YMM4;
      case 5:  return OFFB_YMM5;
      case 6:  return OFFB_YMM6;
      case 7:  return OFFB_YMM7;
      case 8:  return OFFB_YMM8;
      case 9:  return OFFB_YMM9;
      case 10: return OFFB_YMM10;
      case 11: return OFFB_YMM11;
      case 12: return OFFB_YMM12;
      case 13: return OFFB_YMM13;
      case 14: return OFFB_YMM14;
      case 15: return OFFB_YMM15;
      default: vpanic("ymmGuestRegOffset(amd64)");
   }
}

static Int xmmGuestRegOffset ( UInt xmmreg )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   return ymmGuestRegOffset( xmmreg );
}

/* Lanes of vector registers are always numbered from zero, lane zero
   being the least significant (rightmost in the register). */
   1417 
   1418 static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
   1419 {
   1420    /* Correct for little-endian host only. */
   1421    vassert(host_endness == VexEndnessLE);
   1422    vassert(laneno >= 0 && laneno < 8);
   1423    return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
   1424 }
   1425 
   1426 static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
   1427 {
   1428    /* Correct for little-endian host only. */
   1429    vassert(host_endness == VexEndnessLE);
   1430    vassert(laneno >= 0 && laneno < 4);
   1431    return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
   1432 }
   1433 
   1434 static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
   1435 {
   1436    /* Correct for little-endian host only. */
   1437    vassert(host_endness == VexEndnessLE);
   1438    vassert(laneno >= 0 && laneno < 2);
   1439    return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
   1440 }
   1441 
   1442 static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno )
   1443 {
   1444    /* Correct for little-endian host only. */
   1445    vassert(host_endness == VexEndnessLE);
   1446    vassert(laneno >= 0 && laneno < 2);
   1447    return ymmGuestRegOffset( ymmreg ) + 16 * laneno;
   1448 }
   1449 
   1450 static Int ymmGuestRegLane64offset ( UInt ymmreg, Int laneno )
   1451 {
   1452    /* Correct for little-endian host only. */
   1453    vassert(host_endness == VexEndnessLE);
   1454    vassert(laneno >= 0 && laneno < 4);
   1455    return ymmGuestRegOffset( ymmreg ) + 8 * laneno;
   1456 }
   1457 
   1458 static Int ymmGuestRegLane32offset ( UInt ymmreg, Int laneno )
   1459 {
   1460    /* Correct for little-endian host only. */
   1461    vassert(host_endness == VexEndnessLE);
   1462    vassert(laneno >= 0 && laneno < 8);
   1463    return ymmGuestRegOffset( ymmreg ) + 4 * laneno;
   1464 }
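
        /* For instance, ymmGuestRegLane64offset(3, 2) denotes bits 191:128
           of %ymm3, at byte offset OFFB_YMM3 + 16; lane 0 is always the
           least significant part, per the lane-numbering rule above. */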
   1465 
   1466 static IRExpr* getXMMReg ( UInt xmmreg )
   1467 {
   1468    return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
   1469 }
   1470 
   1471 static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
   1472 {
   1473    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
   1474 }
   1475 
   1476 static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
   1477 {
   1478    return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
   1479 }
   1480 
   1481 static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
   1482 {
   1483    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
   1484 }
   1485 
   1486 static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
   1487 {
   1488    return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
   1489 }
   1490 
   1491 static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
   1492 {
   1493    return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
   1494 }
   1495 
   1496 static void putXMMReg ( UInt xmmreg, IRExpr* e )
   1497 {
   1498    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   1499    stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
   1500 }
   1501 
   1502 static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
   1503 {
   1504    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   1505    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
   1506 }
   1507 
   1508 static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
   1509 {
   1510    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   1511    stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
   1512 }
   1513 
   1514 static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
   1515 {
   1516    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   1517    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
   1518 }
   1519 
   1520 static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
   1521 {
   1522    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   1523    stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
   1524 }
   1525 
   1526 static IRExpr* getYMMReg ( UInt ymmreg )
   1527 {
   1528    return IRExpr_Get( ymmGuestRegOffset(ymmreg), Ity_V256 );
   1529 }
   1530 
   1531 static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno )
   1532 {
   1533    return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 );
   1534 }
   1535 
   1536 static IRExpr* getYMMRegLane64 ( UInt ymmreg, Int laneno )
   1537 {
   1538    return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_I64 );
   1539 }
   1540 
   1541 static IRExpr* getYMMRegLane32 ( UInt ymmreg, Int laneno )
   1542 {
   1543    return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_I32 );
   1544 }
   1545 
   1546 static void putYMMReg ( UInt ymmreg, IRExpr* e )
   1547 {
   1548    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256);
   1549    stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) );
   1550 }
   1551 
   1552 static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e )
   1553 {
   1554    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   1555    stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) );
   1556 }
   1557 
   1558 static void putYMMRegLane64F ( UInt ymmreg, Int laneno, IRExpr* e )
   1559 {
   1560    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   1561    stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
   1562 }
   1563 
   1564 static void putYMMRegLane64 ( UInt ymmreg, Int laneno, IRExpr* e )
   1565 {
   1566    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   1567    stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
   1568 }
   1569 
   1570 static void putYMMRegLane32F ( UInt ymmreg, Int laneno, IRExpr* e )
   1571 {
   1572    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   1573    stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
   1574 }
   1575 
   1576 static void putYMMRegLane32 ( UInt ymmreg, Int laneno, IRExpr* e )
   1577 {
   1578    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   1579    stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
   1580 }
   1581 
   1582 static IRExpr* mkV128 ( UShort mask )
   1583 {
   1584    return IRExpr_Const(IRConst_V128(mask));
   1585 }
   1586 
   1587 /* Write the low half of a YMM reg and zero out the upper half. */
   1588 static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e )
   1589 {
   1590    putYMMRegLane128( ymmreg, 0, e );
   1591    putYMMRegLane128( ymmreg, 1, mkV128(0) );
   1592 }
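
        /* This matches the VEX.128 semantics: a VEX-encoded 128-bit write
           to an XMM register zeroes bits 255:128 of the containing YMM
           register (e.g. "vmovaps %xmm1,%xmm0" clears %ymm0[255:128]),
           whereas a legacy SSE write leaves those bits unchanged. */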
   1593 
   1594 static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
   1595 {
   1596    vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   1597    vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   1598    return unop(Iop_64to1,
   1599                binop(Iop_And64,
   1600                      unop(Iop_1Uto64,x),
   1601                      unop(Iop_1Uto64,y)));
   1602 }
   1603 
   1604 /* Generate a compare-and-swap operation, operating on memory at
   1605    'addr'.  The expected value is 'expVal' and the new value is
   1606    'newVal'.  If the operation fails, then transfer control (with a
   1607    no-redir jump (XXX no -- see comment at top of this file)) to
   1608    'restart_point', which is presumably the address of the guest
   1609    instruction again -- retrying, essentially. */
   1610 static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
   1611                     Addr64 restart_point )
   1612 {
   1613    IRCAS* cas;
   1614    IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   1615    IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   1616    IRTemp oldTmp = newTemp(tyE);
   1617    IRTemp expTmp = newTemp(tyE);
   1618    vassert(tyE == tyN);
   1619    vassert(tyE == Ity_I64 || tyE == Ity_I32
   1620            || tyE == Ity_I16 || tyE == Ity_I8);
   1621    assign(expTmp, expVal);
   1622    cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
   1623                   NULL, mkexpr(expTmp), NULL, newVal );
   1624    stmt( IRStmt_CAS(cas) );
   1625    stmt( IRStmt_Exit(
   1626             binop( mkSizedOp(tyE,Iop_CasCmpNE8),
   1627                    mkexpr(oldTmp), mkexpr(expTmp) ),
   1628             Ijk_Boring, /*Ijk_NoRedir*/
   1629             IRConst_U64( restart_point ),
   1630             OFFB_RIP
   1631          ));
   1632 }
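
        /* Sketch of intended use, for a LOCK-prefixed read-modify-write
           such as "lock addl $1, (%rax)" (temp names here are only
           illustrative):

              assign( t_old, loadLE(Ity_I32, mkexpr(t_addr)) );
              assign( t_new, binop(Iop_Add32, mkexpr(t_old), mkU32(1)) );
              casLE( mkexpr(t_addr), mkexpr(t_old)/*expd*/,
                     mkexpr(t_new)/*new*/, guest_RIP_curr_instr );

           If the location no longer holds the expected value when the CAS
           executes, the store is abandoned and the whole instruction is
           retried from restart_point. */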
   1633 
   1634 
   1635 /*------------------------------------------------------------*/
   1636 /*--- Helpers for %rflags.                                 ---*/
   1637 /*------------------------------------------------------------*/
   1638 
   1639 /* -------------- Evaluating the flags-thunk. -------------- */
   1640 
   1641 /* Build IR to calculate all the eflags from stored
   1642    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1643    Ity_I64. */
   1644 static IRExpr* mk_amd64g_calculate_rflags_all ( void )
   1645 {
   1646    IRExpr** args
   1647       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1648                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1649                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1650                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1651    IRExpr* call
   1652       = mkIRExprCCall(
   1653            Ity_I64,
   1654            0/*regparm*/,
   1655            "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
   1656            args
   1657         );
   1658    /* Exclude OP and NDEP from definedness checking.  We're only
   1659       interested in DEP1 and DEP2. */
   1660    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1661    return call;
   1662 }
   1663 
   1664 /* Build IR to calculate some particular condition from stored
   1665    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   1666    Ity_I1. */
   1667 static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
   1668 {
   1669    IRExpr** args
   1670       = mkIRExprVec_5( mkU64(cond),
   1671                        IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1672                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1673                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1674                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1675    IRExpr* call
   1676       = mkIRExprCCall(
   1677            Ity_I64,
   1678            0/*regparm*/,
   1679            "amd64g_calculate_condition", &amd64g_calculate_condition,
   1680            args
   1681         );
   1682    /* Exclude the requested condition, OP and NDEP from definedness
   1683       checking.  We're only interested in DEP1 and DEP2. */
   1684    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   1685    return unop(Iop_64to1, call);
   1686 }
   1687 
   1688 /* Build IR to calculate just the carry flag from stored
   1689    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
   1690 static IRExpr* mk_amd64g_calculate_rflags_c ( void )
   1691 {
   1692    IRExpr** args
   1693       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
   1694                        IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
   1695                        IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
   1696                        IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   1697    IRExpr* call
   1698       = mkIRExprCCall(
   1699            Ity_I64,
   1700            0/*regparm*/,
   1701            "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
   1702            args
   1703         );
   1704    /* Exclude OP and NDEP from definedness checking.  We're only
   1705       interested in DEP1 and DEP2. */
   1706    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   1707    return call;
   1708 }
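
        /* In all three builders above, mcx_mask tells Memcheck which
           arguments of the clean-helper call to exclude from definedness
           checking: bit i set means argument i is not checked.  Hence
           (1<<0)|(1<<3) skips CC_OP and CC_NDEP in the 4-argument calls,
           and (1<<0)|(1<<1)|(1<<4) skips the condition code, CC_OP and
           CC_NDEP in the 5-argument condition call. */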
   1709 
   1710 
   1711 /* -------------- Building the flags-thunk. -------------- */
   1712 
   1713 /* The machinery in this section builds the flag-thunk following a
   1714    flag-setting operation.  Hence the various setFlags_* functions.
   1715 */
   1716 
   1717 static Bool isAddSub ( IROp op8 )
   1718 {
   1719    return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
   1720 }
   1721 
   1722 static Bool isLogic ( IROp op8 )
   1723 {
   1724    return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
   1725 }
   1726 
   1727 /* U-widen 1/8/16/32/64 bit int expr to 64. */
   1728 static IRExpr* widenUto64 ( IRExpr* e )
   1729 {
   1730    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1731       case Ity_I64: return e;
   1732       case Ity_I32: return unop(Iop_32Uto64, e);
   1733       case Ity_I16: return unop(Iop_16Uto64, e);
   1734       case Ity_I8:  return unop(Iop_8Uto64, e);
   1735       case Ity_I1:  return unop(Iop_1Uto64, e);
   1736       default: vpanic("widenUto64");
   1737    }
   1738 }
   1739 
   1740 /* S-widen 8/16/32/64 bit int expr to 64. */
   1741 static IRExpr* widenSto64 ( IRExpr* e )
   1742 {
   1743    switch (typeOfIRExpr(irsb->tyenv,e)) {
   1744       case Ity_I64: return e;
   1745       case Ity_I32: return unop(Iop_32Sto64, e);
   1746       case Ity_I16: return unop(Iop_16Sto64, e);
   1747       case Ity_I8:  return unop(Iop_8Sto64, e);
   1748       default: vpanic("widenSto64");
   1749    }
   1750 }
   1751 
   1752 /* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
   1753    of these combinations make sense. */
   1754 static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
   1755 {
   1756    IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
   1757    if (src_ty == dst_ty)
   1758       return e;
   1759    if (src_ty == Ity_I32 && dst_ty == Ity_I16)
   1760       return unop(Iop_32to16, e);
   1761    if (src_ty == Ity_I32 && dst_ty == Ity_I8)
   1762       return unop(Iop_32to8, e);
   1763    if (src_ty == Ity_I64 && dst_ty == Ity_I32)
   1764       return unop(Iop_64to32, e);
   1765    if (src_ty == Ity_I64 && dst_ty == Ity_I16)
   1766       return unop(Iop_64to16, e);
   1767    if (src_ty == Ity_I64 && dst_ty == Ity_I8)
   1768       return unop(Iop_64to8, e);
   1769 
   1770    vex_printf("\nsrc, dst tys are: ");
   1771    ppIRType(src_ty);
   1772    vex_printf(", ");
   1773    ppIRType(dst_ty);
   1774    vex_printf("\n");
   1775    vpanic("narrowTo(amd64)");
   1776 }
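
        /* For example, narrowTo(Ity_I8, e) with e :: Ity_I64 yields
           Iop_64to8(e); a widening request such as narrowTo(Ity_I64, e8)
           panics. */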
   1777 
   1778 
   1779 /* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
   1780    auto-sized up to the real op. */
   1781 
   1782 static
   1783 void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
   1784 {
   1785    Int ccOp = 0;
   1786    switch (ty) {
   1787       case Ity_I8:  ccOp = 0; break;
   1788       case Ity_I16: ccOp = 1; break;
   1789       case Ity_I32: ccOp = 2; break;
   1790       case Ity_I64: ccOp = 3; break;
   1791       default: vassert(0);
   1792    }
   1793    switch (op8) {
   1794       case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
   1795       case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
   1796       default:       ppIROp(op8);
   1797                      vpanic("setFlags_DEP1_DEP2(amd64)");
   1798    }
   1799    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1800    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1801    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
   1802 }
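
        /* The +0/+1/+2/+3 size adjustment relies on the AMD64G_CC_OP_*
           constants for each operation kind being consecutive in B, W, L,
           Q order, so that e.g. a 32-bit add produces
           AMD64G_CC_OP_ADDB + 2 == AMD64G_CC_OP_ADDL. */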
   1803 
   1804 
   1805 /* Set the OP and DEP1 fields only, and write zero to DEP2. */
   1806 
   1807 static
   1808 void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
   1809 {
   1810    Int ccOp = 0;
   1811    switch (ty) {
   1812       case Ity_I8:  ccOp = 0; break;
   1813       case Ity_I16: ccOp = 1; break;
   1814       case Ity_I32: ccOp = 2; break;
   1815       case Ity_I64: ccOp = 3; break;
   1816       default: vassert(0);
   1817    }
   1818    switch (op8) {
   1819       case Iop_Or8:
   1820       case Iop_And8:
   1821       case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
   1822       default:       ppIROp(op8);
   1823                      vpanic("setFlags_DEP1(amd64)");
   1824    }
   1825    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1826    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   1827    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1828 }
   1829 
   1830 
   1831 /* For shift operations, we put in the result and the undershifted
   1832    result.  If the shift amount is zero, however, the thunk is left
   1833    unchanged. */
   1834 
   1835 static void setFlags_DEP1_DEP2_shift ( IROp    op64,
   1836                                        IRTemp  res,
   1837                                        IRTemp  resUS,
   1838                                        IRType  ty,
   1839                                        IRTemp  guard )
   1840 {
   1841    Int ccOp = 0;
   1842    switch (ty) {
   1843       case Ity_I8:  ccOp = 0; break;
   1844       case Ity_I16: ccOp = 1; break;
   1845       case Ity_I32: ccOp = 2; break;
   1846       case Ity_I64: ccOp = 3; break;
   1847       default: vassert(0);
   1848    }
   1849 
   1850    vassert(guard);
   1851 
   1852    /* Both kinds of right shifts are handled by the same thunk
   1853       operation. */
   1854    switch (op64) {
   1855       case Iop_Shr64:
   1856       case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
   1857       case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
   1858       default:        ppIROp(op64);
   1859                       vpanic("setFlags_DEP1_DEP2_shift(amd64)");
   1860    }
   1861 
   1862    /* guard :: Ity_I8.  We need to convert it to I1. */
   1863    IRTemp guardB = newTemp(Ity_I1);
   1864    assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );
   1865 
   1866    /* DEP1 contains the result, DEP2 contains the undershifted value. */
   1867    stmt( IRStmt_Put( OFFB_CC_OP,
   1868                      IRExpr_ITE( mkexpr(guardB),
   1869                                  mkU64(ccOp),
   1870                                  IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
   1871    stmt( IRStmt_Put( OFFB_CC_DEP1,
   1872                      IRExpr_ITE( mkexpr(guardB),
   1873                                  widenUto64(mkexpr(res)),
   1874                                  IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
   1875    stmt( IRStmt_Put( OFFB_CC_DEP2,
   1876                      IRExpr_ITE( mkexpr(guardB),
   1877                                  widenUto64(mkexpr(resUS)),
   1878                                  IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
   1879 }
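
        /* The ITEs implement the architectural rule that a shift by zero
           leaves the flags untouched: for "shl %cl, %eax" with %cl == 0,
           none of the thunk fields may change, hence every Put is guarded
           by guardB (shift amount != 0). */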
   1880 
   1881 
   1882 /* For the inc/dec case, we store in DEP1 the result value and in NDEP
   1883    the former value of the carry flag, which unfortunately we have to
   1884    compute. */
   1885 
   1886 static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
   1887 {
   1888    Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;
   1889 
   1890    switch (ty) {
   1891       case Ity_I8:  ccOp += 0; break;
   1892       case Ity_I16: ccOp += 1; break;
   1893       case Ity_I32: ccOp += 2; break;
   1894       case Ity_I64: ccOp += 3; break;
   1895       default: vassert(0);
   1896    }
   1897 
   1898    /* This has to come first, because calculating the C flag
   1899       may require reading all four thunk fields. */
   1900    stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
   1901    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   1902    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
   1903    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
   1904 }
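
        /* INC and DEC update all the arithmetic flags except CF, so the
           pre-existing carry is parked in NDEP, from where the flags
           helper can reproduce it unchanged. */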
   1905 
   1906 
   1907 /* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   1908    two arguments. */
   1909 
   1910 static
   1911 void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
   1912 {
   1913    switch (ty) {
   1914       case Ity_I8:
   1915          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
   1916          break;
   1917       case Ity_I16:
   1918          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
   1919          break;
   1920       case Ity_I32:
   1921          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
   1922          break;
   1923       case Ity_I64:
   1924          stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
   1925          break;
   1926       default:
   1927          vpanic("setFlags_MUL(amd64)");
   1928    }
   1929    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
   1930    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
   1931 }
   1932 
   1933 
   1934 /* -------------- Condition codes. -------------- */
   1935 
   1936 /* Condition codes, using the AMD encoding.  */
   1937 
   1938 static const HChar* name_AMD64Condcode ( AMD64Condcode cond )
   1939 {
   1940    switch (cond) {
   1941       case AMD64CondO:      return "o";
   1942       case AMD64CondNO:     return "no";
   1943       case AMD64CondB:      return "b";
   1944       case AMD64CondNB:     return "ae"; /*"nb";*/
   1945       case AMD64CondZ:      return "e"; /*"z";*/
   1946       case AMD64CondNZ:     return "ne"; /*"nz";*/
   1947       case AMD64CondBE:     return "be";
   1948       case AMD64CondNBE:    return "a"; /*"nbe";*/
   1949       case AMD64CondS:      return "s";
   1950       case AMD64CondNS:     return "ns";
   1951       case AMD64CondP:      return "p";
   1952       case AMD64CondNP:     return "np";
   1953       case AMD64CondL:      return "l";
   1954       case AMD64CondNL:     return "ge"; /*"nl";*/
   1955       case AMD64CondLE:     return "le";
   1956       case AMD64CondNLE:    return "g"; /*"nle";*/
   1957       case AMD64CondAlways: return "ALWAYS";
   1958       default: vpanic("name_AMD64Condcode");
   1959    }
   1960 }
   1961 
   1962 static
   1963 AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
   1964                                           /*OUT*/Bool*   needInvert )
   1965 {
   1966    vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
   1967    if (cond & 1) {
   1968       *needInvert = True;
   1969       return cond-1;
   1970    } else {
   1971       *needInvert = False;
   1972       return cond;
   1973    }
   1974 }
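
        /* This works because the AMD encoding pairs each condition with
           its negation: even values are the positive sense and value|1 is
           the inverse.  E.g. for AMD64CondNZ (value 5) this returns
           AMD64CondZ (value 4) with *needInvert set to True. */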
   1975 
   1976 
   1977 /* -------------- Helpers for ADD/SUB with carry. -------------- */
   1978 
   1979 /* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   1980    appropriately.
   1981 
   1982    Optionally, generate a store for the 'tres' value.  This can either
   1983    be a normal store, or it can be a cas-with-possible-failure style
   1984    store:
   1985 
   1986    if taddr is IRTemp_INVALID, then no store is generated.
   1987 
   1988    if taddr is not IRTemp_INVALID, then a store (using taddr as
   1989    the address) is generated:
   1990 
   1991      if texpVal is IRTemp_INVALID then a normal store is
   1992      generated, and restart_point must be zero (it is irrelevant).
   1993 
   1994      if texpVal is not IRTemp_INVALID then a cas-style store is
   1995      generated.  texpVal is the expected value, restart_point
   1996      is the restart point if the store fails, and texpVal must
   1997      have the same type as tres.
   1998 
   1999 */
   2000 static void helper_ADC ( Int sz,
   2001                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   2002                          /* info about optional store: */
   2003                          IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
   2004 {
   2005    UInt    thunkOp;
   2006    IRType  ty    = szToITy(sz);
   2007    IRTemp  oldc  = newTemp(Ity_I64);
   2008    IRTemp  oldcn = newTemp(ty);
   2009    IROp    plus  = mkSizedOp(ty, Iop_Add8);
   2010    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   2011 
   2012    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   2013 
   2014    switch (sz) {
   2015       case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
   2016       case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
   2017       case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
   2018       case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
   2019       default: vassert(0);
   2020    }
   2021 
   2022    /* oldc = old carry flag, 0 or 1 */
   2023    assign( oldc,  binop(Iop_And64,
   2024                         mk_amd64g_calculate_rflags_c(),
   2025                         mkU64(1)) );
   2026 
   2027    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   2028 
   2029    assign( tres, binop(plus,
   2030                        binop(plus,mkexpr(ta1),mkexpr(ta2)),
   2031                        mkexpr(oldcn)) );
   2032 
   2033    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   2034       start of this function. */
   2035    if (taddr != IRTemp_INVALID) {
   2036       if (texpVal == IRTemp_INVALID) {
   2037          vassert(restart_point == 0);
   2038          storeLE( mkexpr(taddr), mkexpr(tres) );
   2039       } else {
   2040          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   2041          /* .. and hence 'texpVal' has the same type as 'tres'. */
   2042          casLE( mkexpr(taddr),
   2043                 mkexpr(texpVal), mkexpr(tres), restart_point );
   2044       }
   2045    }
   2046 
   2047    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   2048    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   2049    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
   2050                                                          mkexpr(oldcn)) )) );
   2051    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   2052 }
   2053 
   2054 
   2055 /* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   2056    appropriately.  As with helper_ADC, possibly generate a store of
   2057    the result -- see comments on helper_ADC for details.
   2058 */
   2059 static void helper_SBB ( Int sz,
   2060                          IRTemp tres, IRTemp ta1, IRTemp ta2,
   2061                          /* info about optional store: */
   2062                          IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
   2063 {
   2064    UInt    thunkOp;
   2065    IRType  ty    = szToITy(sz);
   2066    IRTemp  oldc  = newTemp(Ity_I64);
   2067    IRTemp  oldcn = newTemp(ty);
   2068    IROp    minus = mkSizedOp(ty, Iop_Sub8);
   2069    IROp    xor   = mkSizedOp(ty, Iop_Xor8);
   2070 
   2071    vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
   2072 
   2073    switch (sz) {
   2074       case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
   2075       case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
   2076       case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
   2077       case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
   2078       default: vassert(0);
   2079    }
   2080 
   2081    /* oldc = old carry flag, 0 or 1 */
   2082    assign( oldc, binop(Iop_And64,
   2083                        mk_amd64g_calculate_rflags_c(),
   2084                        mkU64(1)) );
   2085 
   2086    assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
   2087 
   2088    assign( tres, binop(minus,
   2089                        binop(minus,mkexpr(ta1),mkexpr(ta2)),
   2090                        mkexpr(oldcn)) );
   2091 
   2092    /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
   2093       start of this function. */
   2094    if (taddr != IRTemp_INVALID) {
   2095       if (texpVal == IRTemp_INVALID) {
   2096          vassert(restart_point == 0);
   2097          storeLE( mkexpr(taddr), mkexpr(tres) );
   2098       } else {
   2099          vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
   2100          /* .. and hence 'texpVal' has the same type as 'tres'. */
   2101          casLE( mkexpr(taddr),
   2102                 mkexpr(texpVal), mkexpr(tres), restart_point );
   2103       }
   2104    }
   2105 
   2106    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   2107    stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
   2108    stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
   2109                                                          mkexpr(oldcn)) )) );
   2110    stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
   2111 }
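
        /* In both helper_ADC and helper_SBB, DEP2 holds arg2 XOR the old
           carry rather than arg2 itself.  Since the old carry sits in
           NDEP, the flags helper can recover the original arg2 as
           DEP2 ^ (NDEP & 1); the XOR encoding also keeps the thunk's
           dependence on the old carry visible to tools such as Memcheck. */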
   2112 
   2113 
   2114 /* -------------- Helpers for disassembly printing. -------------- */
   2115 
   2116 static const HChar* nameGrp1 ( Int opc_aux )
   2117 {
   2118    static const HChar* grp1_names[8]
   2119      = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   2120    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
   2121    return grp1_names[opc_aux];
   2122 }
   2123 
   2124 static const HChar* nameGrp2 ( Int opc_aux )
   2125 {
   2126    static const HChar* grp2_names[8]
   2127      = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   2128    if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
   2129    return grp2_names[opc_aux];
   2130 }
   2131 
   2132 static const HChar* nameGrp4 ( Int opc_aux )
   2133 {
   2134    static const HChar* grp4_names[8]
   2135      = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   2136    if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
   2137    return grp4_names[opc_aux];
   2138 }
   2139 
   2140 static const HChar* nameGrp5 ( Int opc_aux )
   2141 {
   2142    static const HChar* grp5_names[8]
   2143      = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   2144    if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
   2145    return grp5_names[opc_aux];
   2146 }
   2147 
   2148 static const HChar* nameGrp8 ( Int opc_aux )
   2149 {
   2150    static const HChar* grp8_names[8]
   2151       = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   2152    if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
   2153    return grp8_names[opc_aux];
   2154 }
   2155 
   2156 static const HChar* nameSReg ( UInt sreg )
   2157 {
   2158    switch (sreg) {
   2159       case R_ES: return "%es";
   2160       case R_CS: return "%cs";
   2161       case R_SS: return "%ss";
   2162       case R_DS: return "%ds";
   2163       case R_FS: return "%fs";
   2164       case R_GS: return "%gs";
   2165       default: vpanic("nameSReg(amd64)");
   2166    }
   2167 }
   2168 
   2169 static const HChar* nameMMXReg ( Int mmxreg )
   2170 {
   2171    static const HChar* mmx_names[8]
   2172      = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   2173    if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
   2174    return mmx_names[mmxreg];
   2175 }
   2176 
   2177 static const HChar* nameXMMReg ( Int xmmreg )
   2178 {
   2179    static const HChar* xmm_names[16]
   2180      = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
   2181          "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
   2182          "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
   2183          "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
   2184    if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
   2185    return xmm_names[xmmreg];
   2186 }
   2187 
   2188 static const HChar* nameMMXGran ( Int gran )
   2189 {
   2190    switch (gran) {
   2191       case 0: return "b";
   2192       case 1: return "w";
   2193       case 2: return "d";
   2194       case 3: return "q";
   2195       default: vpanic("nameMMXGran(amd64,guest)");
   2196    }
   2197 }
   2198 
   2199 static HChar nameISize ( Int size )
   2200 {
   2201    switch (size) {
   2202       case 8: return 'q';
   2203       case 4: return 'l';
   2204       case 2: return 'w';
   2205       case 1: return 'b';
   2206       default: vpanic("nameISize(amd64)");
   2207    }
   2208 }
   2209 
   2210 static const HChar* nameYMMReg ( Int ymmreg )
   2211 {
   2212    static const HChar* ymm_names[16]
   2213      = { "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
   2214          "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
   2215          "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
   2216          "%ymm12", "%ymm13", "%ymm14", "%ymm15" };
   2217    if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)");
   2218    return ymm_names[ymmreg];
   2219 }
   2220 
   2221 
   2222 /*------------------------------------------------------------*/
   2223 /*--- JMP helpers                                          ---*/
   2224 /*------------------------------------------------------------*/
   2225 
   2226 static void jmp_lit( /*MOD*/DisResult* dres,
   2227                      IRJumpKind kind, Addr64 d64 )
   2228 {
   2229    vassert(dres->whatNext    == Dis_Continue);
   2230    vassert(dres->len         == 0);
   2231    vassert(dres->continueAt  == 0);
   2232    vassert(dres->jk_StopHere == Ijk_INVALID);
   2233    dres->whatNext    = Dis_StopHere;
   2234    dres->jk_StopHere = kind;
   2235    stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
   2236 }
   2237 
   2238 static void jmp_treg( /*MOD*/DisResult* dres,
   2239                       IRJumpKind kind, IRTemp t )
   2240 {
   2241    vassert(dres->whatNext    == Dis_Continue);
   2242    vassert(dres->len         == 0);
   2243    vassert(dres->continueAt  == 0);
   2244    vassert(dres->jk_StopHere == Ijk_INVALID);
   2245    dres->whatNext    = Dis_StopHere;
   2246    dres->jk_StopHere = kind;
   2247    stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
   2248 }
   2249 
   2250 static
   2251 void jcc_01 ( /*MOD*/DisResult* dres,
   2252               AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
   2253 {
   2254    Bool          invert;
   2255    AMD64Condcode condPos;
   2256    vassert(dres->whatNext    == Dis_Continue);
   2257    vassert(dres->len         == 0);
   2258    vassert(dres->continueAt  == 0);
   2259    vassert(dres->jk_StopHere == Ijk_INVALID);
   2260    dres->whatNext    = Dis_StopHere;
   2261    dres->jk_StopHere = Ijk_Boring;
   2262    condPos = positiveIse_AMD64Condcode ( cond, &invert );
   2263    if (invert) {
   2264       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
   2265                          Ijk_Boring,
   2266                          IRConst_U64(d64_false),
   2267                          OFFB_RIP ) );
   2268       stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
   2269    } else {
   2270       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
   2271                          Ijk_Boring,
   2272                          IRConst_U64(d64_true),
   2273                          OFFB_RIP ) );
   2274       stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
   2275    }
   2276 }
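
        /* For example, "jne target" arrives here as AMD64CondNZ (odd), so
           condPos is AMD64CondZ with invert == True: the side-exit is then
           taken to the fall-through address when Z holds, and otherwise
           control proceeds to the Put of the taken address.  Either way
           the guard tests the positive-sense condition. */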
   2277 
   2278 /* Let new_rsp be the %rsp value after a call/return.  Let nia be the
   2279    guest address of the next instruction to be executed.
   2280 
   2281    This function generates an AbiHint to say that -128(%rsp)
   2282    .. -1(%rsp) should now be regarded as uninitialised.
   2283 */
   2284 static
   2285 void make_redzone_AbiHint ( const VexAbiInfo* vbi,
   2286                             IRTemp new_rsp, IRTemp nia, const HChar* who )
   2287 {
   2288    Int szB = vbi->guest_stack_redzone_size;
   2289    vassert(szB >= 0);
   2290 
   2291    /* A bit of a kludge.  Currently the only ABI for which we guest
   2292       AMD64 is ELF, so just check that the redzone size is the
   2293       expected 128 bytes (paranoia). */
   2294    vassert(szB == 128);
   2295 
   2296    if (0) vex_printf("AbiHint: %s\n", who);
   2297    vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
   2298    vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
   2299    if (szB > 0)
   2300       stmt( IRStmt_AbiHint(
   2301                binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
   2302                szB,
   2303                mkexpr(nia)
   2304             ));
   2305 }
   2306 
   2307 
   2308 /*------------------------------------------------------------*/
   2309 /*--- Disassembling addressing modes                       ---*/
   2310 /*------------------------------------------------------------*/
   2311 
   2312 static
   2313 const HChar* segRegTxt ( Prefix pfx )
   2314 {
   2315    if (pfx & PFX_CS) return "%cs:";
   2316    if (pfx & PFX_DS) return "%ds:";
   2317    if (pfx & PFX_ES) return "%es:";
   2318    if (pfx & PFX_FS) return "%fs:";
   2319    if (pfx & PFX_GS) return "%gs:";
   2320    if (pfx & PFX_SS) return "%ss:";
   2321    return ""; /* no override */
   2322 }
   2323 
   2324 
   2325 /* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   2326    linear address by adding any required segment override as indicated
   2327    by sorb, and also dealing with any address size override
   2328    present. */
   2329 static
   2330 IRExpr* handleAddrOverrides ( const VexAbiInfo* vbi,
   2331                               Prefix pfx, IRExpr* virtual )
   2332 {
   2333    /* Note that the cases below are hacks that rely on the assumption
   2334       that %fs and %gs are constant.
   2335       Typically, %fs is always 0x63 on linux (in the main thread, it
   2336       stays at value 0), %gs is always 0x60 on Darwin, ... */
   2337    /* --- segment overrides --- */
   2338    if (pfx & PFX_FS) {
   2339       if (vbi->guest_amd64_assume_fs_is_const) {
   2340          /* return virtual + guest_FS_CONST. */
   2341          virtual = binop(Iop_Add64, virtual,
   2342                                     IRExpr_Get(OFFB_FS_CONST, Ity_I64));
   2343       } else {
   2344          unimplemented("amd64 %fs segment override");
   2345       }
   2346    }
   2347 
   2348    if (pfx & PFX_GS) {
   2349       if (vbi->guest_amd64_assume_gs_is_const) {
   2350          /* return virtual + guest_GS_CONST. */
   2351          virtual = binop(Iop_Add64, virtual,
   2352                                     IRExpr_Get(OFFB_GS_CONST, Ity_I64));
   2353       } else {
   2354          unimplemented("amd64 %gs segment override");
   2355       }
   2356    }
   2357 
   2358    /* cs, ds, es and ss are simply ignored in 64-bit mode. */
   2359 
   2360    /* --- address size override --- */
   2361    if (haveASO(pfx))
   2362       virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));
   2363 
   2364    return virtual;
   2365 }
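
        /* Example: on Linux, %fs-relative accesses are used for
           thread-local storage, e.g. the stack-protector canary read
           "mov %fs:0x28, %rax".  Under the
           guest_amd64_assume_fs_is_const regime, its address is computed
           as Get(OFFB_FS_CONST) + 0x28. */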
   2366 
   2367 //.. {
   2368 //..    Int    sreg;
   2369 //..    IRType hWordTy;
   2370 //..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
   2371 //..
   2372 //..    if (sorb == 0)
   2373 //..       /* the common case - no override */
   2374 //..       return virtual;
   2375 //..
   2376 //..    switch (sorb) {
   2377 //..       case 0x3E: sreg = R_DS; break;
   2378 //..       case 0x26: sreg = R_ES; break;
   2379 //..       case 0x64: sreg = R_FS; break;
   2380 //..       case 0x65: sreg = R_GS; break;
   2381 //..       default: vpanic("handleAddrOverrides(x86,guest)");
   2382 //..    }
   2383 //..
   2384 //..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
   2385 //..
   2386 //..    seg_selector = newTemp(Ity_I32);
   2387 //..    ldt_ptr      = newTemp(hWordTy);
   2388 //..    gdt_ptr      = newTemp(hWordTy);
   2389 //..    r64          = newTemp(Ity_I64);
   2390 //..
   2391 //..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
   2392 //..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
   2393 //..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
   2394 //..
   2395 //..    /*
   2396 //..    Call this to do the translation and limit checks:
   2397 //..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
   2398 //..                                  UInt seg_selector, UInt virtual_addr )
   2399 //..    */
   2400 //..    assign(
   2401 //..       r64,
   2402 //..       mkIRExprCCall(
   2403 //..          Ity_I64,
   2404 //..          0/*regparms*/,
   2405 //..          "x86g_use_seg_selector",
   2406 //..          &x86g_use_seg_selector,
   2407 //..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
   2408 //..                         mkexpr(seg_selector), virtual)
   2409 //..       )
   2410 //..    );
   2411 //..
   2412 //..    /* If the high 32 of the result are non-zero, there was a
   2413 //..       failure in address translation.  In which case, make a
   2414 //..       quick exit.
   2415 //..    */
   2416 //..    stmt(
   2417 //..       IRStmt_Exit(
   2418 //..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
   2419 //..          Ijk_MapFail,
   2420 //..          IRConst_U32( guest_eip_curr_instr )
   2421 //..       )
   2422 //..    );
   2423 //..
   2424 //..    /* otherwise, here's the translated result. */
   2425 //..    return unop(Iop_64to32, mkexpr(r64));
   2426 //.. }
   2427 
   2428 
   2429 /* Generate IR to calculate an address indicated by a ModRM and
   2430    following SIB bytes.  The expression, and the number of bytes in
   2431    the address mode, are returned (the latter in *len).  Note that
   2432    this fn should not be called if the R/M part of the address denotes
   2433    a register instead of memory.  Text describing the addressing mode
   2434    is placed in buf.
   2435 
   2436    The computed address is stored in a new tempreg, and the
   2437    identity of the tempreg is returned.
   2438 
   2439    extra_bytes holds the number of bytes after the amode, as supplied
   2440    by the caller.  This is needed to make sense of %rip-relative
   2441    addresses.  Note that the value that *len is set to is only the
   2442    length of the amode itself and does not include the value supplied
   2443    in extra_bytes.
   2444  */
   2445 
   2446 static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
   2447 {
   2448    IRTemp tmp = newTemp(Ity_I64);
   2449    assign( tmp, addr64 );
   2450    return tmp;
   2451 }
   2452 
   2453 static
   2454 IRTemp disAMode ( /*OUT*/Int* len,
   2455                   const VexAbiInfo* vbi, Prefix pfx, Long delta,
   2456                   /*OUT*/HChar* buf, Int extra_bytes )
   2457 {
   2458    UChar mod_reg_rm = getUChar(delta);
   2459    delta++;
   2460 
   2461    buf[0] = (UChar)0;
   2462    vassert(extra_bytes >= 0 && extra_bytes < 10);
   2463 
   2464    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2465       jump table seems a bit excessive.
   2466    */
   2467    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2468    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2469                                                /* is now XX0XXYYY */
   2470    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   2471    switch (mod_reg_rm) {
   2472 
   2473       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2474          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2475       */
   2476       case 0x00: case 0x01: case 0x02: case 0x03:
   2477       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2478          { UChar rm = toUChar(mod_reg_rm & 7);
   2479            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
   2480            *len = 1;
   2481            return disAMode_copy2tmp(
   2482                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
   2483          }
   2484 
   2485       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2486          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2487       */
   2488       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2489       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2490          { UChar rm = toUChar(mod_reg_rm & 7);
   2491            Long d   = getSDisp8(delta);
   2492            if (d == 0) {
   2493               DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
   2494            } else {
   2495               DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
   2496            }
   2497            *len = 2;
   2498            return disAMode_copy2tmp(
   2499                   handleAddrOverrides(vbi, pfx,
   2500                      binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
   2501          }
   2502 
   2503       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2504          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2505       */
   2506       case 0x10: case 0x11: case 0x12: case 0x13:
   2507       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2508          { UChar rm = toUChar(mod_reg_rm & 7);
   2509            Long  d  = getSDisp32(delta);
   2510            DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
   2511            *len = 5;
   2512            return disAMode_copy2tmp(
   2513                   handleAddrOverrides(vbi, pfx,
   2514                      binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
   2515          }
   2516 
   2517       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
   2518       /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
   2519       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2520       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2521          vpanic("disAMode(amd64): not an addr!");
   2522 
   2523       /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
   2524          correctly at the start of handling each instruction. */
   2525       case 0x05:
   2526          { Long d = getSDisp32(delta);
   2527            *len = 5;
   2528            DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
   2529            /* We need to know the next instruction's start address.
   2530               Try and figure out what it is, record the guess, and ask
   2531               the top-level driver logic (bbToIR_AMD64) to check we
   2532               guessed right, after the instruction is completely
   2533               decoded. */
   2534            guest_RIP_next_mustcheck = True;
   2535            guest_RIP_next_assumed = guest_RIP_bbstart
   2536                                     + delta+4 + extra_bytes;
   2537            return disAMode_copy2tmp(
   2538                      handleAddrOverrides(vbi, pfx,
   2539                         binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
   2540                                          mkU64(d))));
   2541          }
   2542 
   2543       case 0x04: {
   2544          /* SIB, with no displacement.  Special cases:
   2545             -- %rsp cannot act as an index value.
   2546                If index_r indicates %rsp, zero is used for the index.
   2547             -- when mod is zero and base indicates RBP or R13, base is
   2548                instead a 32-bit sign-extended literal.
   2549             It's all madness, I tell you.  Extract %index, %base and
   2550             scale from the SIB byte.  The value denoted is then:
   2551                | %index == %RSP && (%base == %RBP || %base == %R13)
   2552                = d32 following SIB byte
   2553                | %index == %RSP && !(%base == %RBP || %base == %R13)
   2554                = %base
   2555                | %index != %RSP && (%base == %RBP || %base == %R13)
   2556                = d32 following SIB byte + (%index << scale)
   2557                | %index != %RSP && !(%base == %RBP || %base == %R13)
   2558                = %base + (%index << scale)
   2559          */
   2560          UChar sib     = getUChar(delta);
   2561          UChar scale   = toUChar((sib >> 6) & 3);
   2562          UChar index_r = toUChar((sib >> 3) & 7);
   2563          UChar base_r  = toUChar(sib & 7);
   2564          /* correct since #(R13) == 8 + #(RBP) */
   2565          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2566          Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
   2567          delta++;
   2568 
   2569          if ((!index_is_SP) && (!base_is_BPor13)) {
   2570             if (scale == 0) {
   2571                DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
   2572                          nameIRegRexB(8,pfx,base_r),
   2573                          nameIReg64rexX(pfx,index_r));
   2574             } else {
   2575                DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
   2576                          nameIRegRexB(8,pfx,base_r),
   2577                          nameIReg64rexX(pfx,index_r), 1<<scale);
   2578             }
   2579             *len = 2;
   2580             return
   2581                disAMode_copy2tmp(
   2582                handleAddrOverrides(vbi, pfx,
   2583                   binop(Iop_Add64,
   2584                         getIRegRexB(8,pfx,base_r),
   2585                         binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
   2586                               mkU8(scale)))));
   2587          }
   2588 
   2589          if ((!index_is_SP) && base_is_BPor13) {
   2590             Long d = getSDisp32(delta);
   2591             DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
   2592                       nameIReg64rexX(pfx,index_r), 1<<scale);
   2593             *len = 6;
   2594             return
   2595                disAMode_copy2tmp(
   2596                handleAddrOverrides(vbi, pfx,
   2597                   binop(Iop_Add64,
   2598                         binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
   2599                                          mkU8(scale)),
   2600                         mkU64(d))));
   2601          }
   2602 
   2603          if (index_is_SP && (!base_is_BPor13)) {
   2604             DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
   2605             *len = 2;
   2606             return disAMode_copy2tmp(
   2607                    handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
   2608          }
   2609 
   2610          if (index_is_SP && base_is_BPor13) {
   2611             Long d = getSDisp32(delta);
   2612             DIS(buf, "%s%lld", segRegTxt(pfx), d);
   2613             *len = 6;
   2614             return disAMode_copy2tmp(
   2615                    handleAddrOverrides(vbi, pfx, mkU64(d)));
   2616          }
   2617 
   2618          vassert(0);
   2619       }
   2620 
   2621       /* SIB, with 8-bit displacement.  Special cases:
   2622          -- %rsp cannot act as an index value.
   2623             If index_r indicates %rsp, zero is used for the index.
   2624          Denoted value is:
   2625             | %index == %RSP
   2626             = d8 + %base
   2627             | %index != %RSP
   2628             = d8 + %base + (%index << scale)
   2629       */
   2630       case 0x0C: {
   2631          UChar sib     = getUChar(delta);
   2632          UChar scale   = toUChar((sib >> 6) & 3);
   2633          UChar index_r = toUChar((sib >> 3) & 7);
   2634          UChar base_r  = toUChar(sib & 7);
   2635          Long d        = getSDisp8(delta+1);
   2636 
   2637          if (index_r == R_RSP && 0==getRexX(pfx)) {
   2638             DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
   2639                                    d, nameIRegRexB(8,pfx,base_r));
   2640             *len = 3;
   2641             return disAMode_copy2tmp(
   2642                    handleAddrOverrides(vbi, pfx,
   2643                       binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
   2644          } else {
   2645             if (scale == 0) {
   2646                DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
   2647                          nameIRegRexB(8,pfx,base_r),
   2648                          nameIReg64rexX(pfx,index_r));
   2649             } else {
   2650                DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
   2651                          nameIRegRexB(8,pfx,base_r),
   2652                          nameIReg64rexX(pfx,index_r), 1<<scale);
   2653             }
   2654             *len = 3;
   2655             return
   2656                 disAMode_copy2tmp(
   2657                 handleAddrOverrides(vbi, pfx,
   2658                   binop(Iop_Add64,
   2659                         binop(Iop_Add64,
   2660                               getIRegRexB(8,pfx,base_r),
   2661                               binop(Iop_Shl64,
   2662                                     getIReg64rexX(pfx,index_r), mkU8(scale))),
   2663                         mkU64(d))));
   2664          }
   2665          vassert(0); /*NOTREACHED*/
   2666       }
   2667 
   2668       /* SIB, with 32-bit displacement.  Special cases:
   2669          -- %rsp cannot act as an index value.
   2670             If index_r indicates %rsp, zero is used for the index.
   2671          Denoted value is:
   2672             | %index == %RSP
   2673             = d32 + %base
   2674             | %index != %RSP
   2675             = d32 + %base + (%index << scale)
   2676       */
   2677       case 0x14: {
   2678          UChar sib     = getUChar(delta);
   2679          UChar scale   = toUChar((sib >> 6) & 3);
   2680          UChar index_r = toUChar((sib >> 3) & 7);
   2681          UChar base_r  = toUChar(sib & 7);
   2682          Long d        = getSDisp32(delta+1);
   2683 
   2684          if (index_r == R_RSP && 0==getRexX(pfx)) {
   2685             DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
   2686                                    d, nameIRegRexB(8,pfx,base_r));
   2687             *len = 6;
   2688             return disAMode_copy2tmp(
   2689                    handleAddrOverrides(vbi, pfx,
   2690                       binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
   2691          } else {
   2692             if (scale == 0) {
   2693                DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
   2694                          nameIRegRexB(8,pfx,base_r),
   2695                          nameIReg64rexX(pfx,index_r));
   2696             } else {
   2697                DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
   2698                          nameIRegRexB(8,pfx,base_r),
   2699                          nameIReg64rexX(pfx,index_r), 1<<scale);
   2700             }
   2701             *len = 6;
   2702             return
   2703                 disAMode_copy2tmp(
   2704                 handleAddrOverrides(vbi, pfx,
   2705                   binop(Iop_Add64,
   2706                         binop(Iop_Add64,
   2707                               getIRegRexB(8,pfx,base_r),
   2708                               binop(Iop_Shl64,
   2709                                     getIReg64rexX(pfx,index_r), mkU8(scale))),
   2710                         mkU64(d))));
   2711          }
   2712          vassert(0); /*NOTREACHED*/
   2713       }
   2714 
   2715       default:
   2716          vpanic("disAMode(amd64)");
   2717          return 0; /*notreached*/
   2718    }
   2719 }
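
        /* Worked example: for "movl 8(%rsp),%eax" the bytes are
           8B 44 24 08.  disAMode sees modRM 0x44 (mod=01, rm=100), i.e.
           case 0x0C above: SIB with 8-bit displacement.  The SIB byte 0x24
           gives scale=0, index=100 (%rsp, hence no index) and base=100
           (%rsp); with disp8 = 8 the result is a temp holding %rsp + 8,
           and *len is set to 3 (modRM + SIB + disp8). */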
   2720 
   2721 
   2722 /* Similarly for VSIB addressing.  This returns just the addend,
   2723    and fills in *rI and *vscale with the register number of the vector
   2724    index and its multiplicand.  */
   2725 static
   2726 IRTemp disAVSIBMode ( /*OUT*/Int* len,
   2727                       const VexAbiInfo* vbi, Prefix pfx, Long delta,
   2728                       /*OUT*/HChar* buf, /*OUT*/UInt* rI,
   2729                       IRType ty, /*OUT*/Int* vscale )
   2730 {
   2731    UChar mod_reg_rm = getUChar(delta);
   2732    const HChar *vindex;
   2733 
   2734    *len = 0;
   2735    *rI = 0;
   2736    *vscale = 0;
   2737    buf[0] = (UChar)0;
   2738    if ((mod_reg_rm & 7) != 4 || epartIsReg(mod_reg_rm))
   2739       return IRTemp_INVALID;
   2740 
   2741    UChar sib     = getUChar(delta+1);
   2742    UChar scale   = toUChar((sib >> 6) & 3);
   2743    UChar index_r = toUChar((sib >> 3) & 7);
   2744    UChar base_r  = toUChar(sib & 7);
   2745    Long  d       = 0;
   2746    /* correct since #(R13) == 8 + #(RBP) */
   2747    Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2748    delta += 2;
   2749    *len = 2;
   2750 
   2751    *rI = index_r | (getRexX(pfx) << 3);
   2752    if (ty == Ity_V128)
   2753       vindex = nameXMMReg(*rI);
   2754    else
   2755       vindex = nameYMMReg(*rI);
   2756    *vscale = 1<<scale;
   2757 
   2758    switch (mod_reg_rm >> 6) {
   2759    case 0:
   2760       if (base_is_BPor13) {
   2761          d = getSDisp32(delta);
   2762          *len += 4;
   2763          if (scale == 0) {
   2764             DIS(buf, "%s%lld(,%s)", segRegTxt(pfx), d, vindex);
   2765          } else {
   2766             DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d, vindex, 1<<scale);
   2767          }
   2768          return disAMode_copy2tmp( mkU64(d) );
   2769       } else {
   2770          if (scale == 0) {
   2771             DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
   2772                      nameIRegRexB(8,pfx,base_r), vindex);
   2773          } else {
   2774             DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
   2775                      nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
   2776          }
   2777       }
   2778       break;
   2779    case 1:
   2780       d = getSDisp8(delta);
   2781       *len += 1;
   2782       goto have_disp;
   2783    case 2:
   2784       d = getSDisp32(delta);
   2785       *len += 4;
   2786    have_disp:
   2787       if (scale == 0) {
   2788          DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
   2789                   nameIRegRexB(8,pfx,base_r), vindex);
   2790       } else {
   2791          DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
   2792                   nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
   2793       }
   2794       break;
   2795    }
   2796 
   2797    if (!d)
   2798       return disAMode_copy2tmp( getIRegRexB(8,pfx,base_r) );
   2799    return disAMode_copy2tmp( binop(Iop_Add64, getIRegRexB(8,pfx,base_r),
   2800                                    mkU64(d)) );
   2801 }
   2802 
   2803 
   2804 /* Figure out the number of (insn-stream) bytes constituting the amode
   2805    beginning at delta.  This is useful for getting hold of literals that
   2806    lie beyond the end of the amode before it has been disassembled.  */
   2807 
   2808 static UInt lengthAMode ( Prefix pfx, Long delta )
   2809 {
   2810    UChar mod_reg_rm = getUChar(delta);
   2811    delta++;
   2812 
   2813    /* squeeze out the reg field from mod_reg_rm, since a 256-entry
   2814       jump table seems a bit excessive.
   2815    */
   2816    mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   2817    mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
   2818                                                /* is now XX0XXYYY */
   2819    mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
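           /* Worked example (illustrative): mod_reg_rm = 0x94 = 10 010 100b
              (mod=10, reg=010, rm=100).  AND 0xC7 gives 0x84; OR-ing in
              (0x84 >> 3) gives 0x94 again; AND 0x1F leaves 0x14, the
              SIB-with-disp32 case below, which returns 6. */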
   2820    switch (mod_reg_rm) {
   2821 
   2822       /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
   2823          REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
   2824       */
   2825       case 0x00: case 0x01: case 0x02: case 0x03:
   2826       /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
   2827          return 1;
   2828 
   2829       /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
   2830          REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
   2831       */
   2832       case 0x08: case 0x09: case 0x0A: case 0x0B:
   2833       /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
   2834          return 2;
   2835 
   2836       /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
   2837          REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
   2838       */
   2839       case 0x10: case 0x11: case 0x12: case 0x13:
   2840       /* ! 14 */ case 0x15: case 0x16: case 0x17:
   2841          return 5;
   2842 
   2843       /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
   2844       /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
   2845       /* Not an address, but still handled. */
   2846       case 0x18: case 0x19: case 0x1A: case 0x1B:
   2847       case 0x1C: case 0x1D: case 0x1E: case 0x1F:
   2848          return 1;
   2849 
   2850       /* RIP + disp32. */
   2851       case 0x05:
   2852          return 5;
   2853 
   2854       case 0x04: {
   2855          /* SIB, with no displacement. */
   2856          UChar sib     = getUChar(delta);
   2857          UChar base_r  = toUChar(sib & 7);
   2858          /* correct since #(R13) == 8 + #(RBP) */
   2859          Bool  base_is_BPor13 = toBool(base_r == R_RBP);
   2860 
   2861          if (base_is_BPor13) {
   2862             return 6;
   2863          } else {
   2864             return 2;
   2865          }
   2866       }
   2867 
   2868       /* SIB, with 8-bit displacement. */
   2869       case 0x0C:
   2870          return 3;
   2871 
   2872       /* SIB, with 32-bit displacement. */
   2873       case 0x14:
   2874          return 6;
   2875 
   2876       default:
   2877          vpanic("lengthAMode(amd64)");
   2878          return 0; /*notreached*/
   2879    }
   2880 }
   2881 
   2882 
   2883 /*------------------------------------------------------------*/
   2884 /*--- Disassembling common idioms                          ---*/
   2885 /*------------------------------------------------------------*/
   2886 
   2887 /* Handle binary integer instructions of the form
   2888       op E, G  meaning
   2889       op reg-or-mem, reg
   2890    Is passed a ptr to the modRM byte, the actual operation, and the
   2891    data size.  Returns the address advanced completely over this
   2892    instruction.
   2893 
   2894    E(src) is reg-or-mem
   2895    G(dst) is reg.
   2896 
   2897    If E is reg, -->    GET %G,  tmp
   2898                        OP %E,   tmp
   2899                        PUT tmp, %G
   2900 
   2901    If E is mem and OP is not reversible,
   2902                 -->    (getAddr E) -> tmpa
   2903                        LD (tmpa), tmpa
   2904                        GET %G, tmp2
   2905                        OP tmpa, tmp2
   2906                        PUT tmp2, %G
   2907 
   2908    If E is mem and OP is reversible
   2909                 -->    (getAddr E) -> tmpa
   2910                        LD (tmpa), tmpa
   2911                        OP %G, tmpa
   2912                        PUT tmpa, %G
   2913 */
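        /* For example (a sketch of the generated IR; temp names invented):
           "add (%rax),%rbx" takes the E-is-mem path below, giving roughly
              t_addr = GET %rax ; t_src  = LDle:I64(t_addr)
              t_dst0 = GET %rbx ; t_dst1 = Add64(t_dst0, t_src)
              <flags thunk from t_dst0,t_src> ; PUT t_dst1 -> %rbx      */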
   2914 static
   2915 ULong dis_op2_E_G ( const VexAbiInfo* vbi,
   2916                     Prefix      pfx,
   2917                     Bool        addSubCarry,
   2918                     IROp        op8,
   2919                     Bool        keep,
   2920                     Int         size,
   2921                     Long        delta0,
   2922                     const HChar* t_amd64opc )
   2923 {
   2924    HChar   dis_buf[50];
   2925    Int     len;
   2926    IRType  ty   = szToITy(size);
   2927    IRTemp  dst1 = newTemp(ty);
   2928    IRTemp  src  = newTemp(ty);
   2929    IRTemp  dst0 = newTemp(ty);
   2930    UChar   rm   = getUChar(delta0);
   2931    IRTemp  addr = IRTemp_INVALID;
   2932 
   2933    /* addSubCarry == True indicates the intended operation is
   2934       add-with-carry or subtract-with-borrow. */
   2935    if (addSubCarry) {
   2936       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   2937       vassert(keep);
   2938    }
   2939 
   2940    if (epartIsReg(rm)) {
   2941       /* Specially handle XOR reg,reg, because that doesn't really
   2942          depend on reg, and doing the obvious thing potentially
   2943          generates a spurious value check failure due to the bogus
   2944          dependency.  Ditto SBB reg,reg. */
   2945       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   2946           && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
   2947          if (False && op8 == Iop_Sub8)
   2948             vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
   2949          putIRegG(size,pfx,rm, mkU(ty,0));
   2950       }
   2951 
   2952       assign( dst0, getIRegG(size,pfx,rm) );
   2953       assign( src,  getIRegE(size,pfx,rm) );
   2954 
   2955       if (addSubCarry && op8 == Iop_Add8) {
   2956          helper_ADC( size, dst1, dst0, src,
   2957                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2958          putIRegG(size, pfx, rm, mkexpr(dst1));
   2959       } else
   2960       if (addSubCarry && op8 == Iop_Sub8) {
   2961          helper_SBB( size, dst1, dst0, src,
   2962                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2963          putIRegG(size, pfx, rm, mkexpr(dst1));
   2964       } else {
   2965          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2966          if (isAddSub(op8))
   2967             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2968          else
   2969             setFlags_DEP1(op8, dst1, ty);
   2970          if (keep)
   2971             putIRegG(size, pfx, rm, mkexpr(dst1));
   2972       }
   2973 
   2974       DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
   2975                           nameIRegE(size,pfx,rm),
   2976                           nameIRegG(size,pfx,rm));
   2977       return 1+delta0;
   2978    } else {
   2979       /* E refers to memory */
   2980       addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   2981       assign( dst0, getIRegG(size,pfx,rm) );
   2982       assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
   2983 
   2984       if (addSubCarry && op8 == Iop_Add8) {
   2985          helper_ADC( size, dst1, dst0, src,
   2986                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2987          putIRegG(size, pfx, rm, mkexpr(dst1));
   2988       } else
   2989       if (addSubCarry && op8 == Iop_Sub8) {
   2990          helper_SBB( size, dst1, dst0, src,
   2991                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   2992          putIRegG(size, pfx, rm, mkexpr(dst1));
   2993       } else {
   2994          assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   2995          if (isAddSub(op8))
   2996             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   2997          else
   2998             setFlags_DEP1(op8, dst1, ty);
   2999          if (keep)
   3000             putIRegG(size, pfx, rm, mkexpr(dst1));
   3001       }
   3002 
   3003       DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
   3004                           dis_buf, nameIRegG(size, pfx, rm));
   3005       return len+delta0;
   3006    }
   3007 }
   3008 
   3009 
   3010 
   3011 /* Handle binary integer instructions of the form
   3012       op G, E  meaning
   3013       op reg, reg-or-mem
   3014    Is passed a ptr to the modRM byte, the actual operation, and the
   3015    data size.  Returns the address advanced completely over this
   3016    instruction.
   3017 
   3018    G(src) is reg.
   3019    E(dst) is reg-or-mem
   3020 
   3021    If E is reg, -->    GET %E,  tmp
   3022                        OP %G,   tmp
   3023                        PUT tmp, %E
   3024 
   3025    If E is mem, -->    (getAddr E) -> tmpa
   3026                        LD (tmpa), tmpv
   3027                        OP %G, tmpv
   3028                        ST tmpv, (tmpa)
   3029 */
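        /* For example (a sketch of the generated IR; temp names invented):
           "and %rbx,(%rax)" loads the old value, ANDs in %rbx and stores
           the result back:
              t_addr = GET %rax ; t_dst0 = LDle:I64(t_addr)
              t_src  = GET %rbx ; t_dst1 = And64(t_dst0, t_src)
              STle(t_addr) = t_dst1   -- or a CAS, if LOCK is present   */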
   3030 static
   3031 ULong dis_op2_G_E ( const VexAbiInfo* vbi,
   3032                     Prefix      pfx,
   3033                     Bool        addSubCarry,
   3034                     IROp        op8,
   3035                     Bool        keep,
   3036                     Int         size,
   3037                     Long        delta0,
   3038                     const HChar* t_amd64opc )
   3039 {
   3040    HChar   dis_buf[50];
   3041    Int     len;
   3042    IRType  ty   = szToITy(size);
   3043    IRTemp  dst1 = newTemp(ty);
   3044    IRTemp  src  = newTemp(ty);
   3045    IRTemp  dst0 = newTemp(ty);
   3046    UChar   rm   = getUChar(delta0);
   3047    IRTemp  addr = IRTemp_INVALID;
   3048 
   3049    /* addSubCarry == True indicates the intended operation is
   3050       add-with-carry or subtract-with-borrow. */
   3051    if (addSubCarry) {
   3052       vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
   3053       vassert(keep);
   3054    }
   3055 
   3056    if (epartIsReg(rm)) {
   3057       /* Specially handle XOR reg,reg, because that doesn't really
   3058          depend on reg, and doing the obvious thing potentially
   3059          generates a spurious value check failure due to the bogus
   3060          dependency.  Ditto SBB reg,reg. */
   3061       if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
   3062           && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
   3063          putIRegE(size,pfx,rm, mkU(ty,0));
   3064       }
   3065 
   3066       assign(dst0, getIRegE(size,pfx,rm));
   3067       assign(src,  getIRegG(size,pfx,rm));
   3068 
   3069       if (addSubCarry && op8 == Iop_Add8) {
   3070          helper_ADC( size, dst1, dst0, src,
   3071                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3072          putIRegE(size, pfx, rm, mkexpr(dst1));
   3073       } else
   3074       if (addSubCarry && op8 == Iop_Sub8) {
   3075          helper_SBB( size, dst1, dst0, src,
   3076                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3077          putIRegE(size, pfx, rm, mkexpr(dst1));
   3078       } else {
   3079          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   3080          if (isAddSub(op8))
   3081             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3082          else
   3083             setFlags_DEP1(op8, dst1, ty);
   3084          if (keep)
   3085             putIRegE(size, pfx, rm, mkexpr(dst1));
   3086       }
   3087 
   3088       DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
   3089                           nameIRegG(size,pfx,rm),
   3090                           nameIRegE(size,pfx,rm));
   3091       return 1+delta0;
   3092    }
   3093 
   3094    /* E refers to memory */
   3095    {
   3096       addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3097       assign(dst0, loadLE(ty,mkexpr(addr)));
   3098       assign(src,  getIRegG(size,pfx,rm));
   3099 
   3100       if (addSubCarry && op8 == Iop_Add8) {
   3101          if (haveLOCK(pfx)) {
   3102             /* cas-style store */
   3103             helper_ADC( size, dst1, dst0, src,
   3104                         /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
   3105          } else {
   3106             /* normal store */
   3107             helper_ADC( size, dst1, dst0, src,
   3108                         /*store*/addr, IRTemp_INVALID, 0 );
   3109          }
   3110       } else
   3111       if (addSubCarry && op8 == Iop_Sub8) {
   3112          if (haveLOCK(pfx)) {
   3113             /* cas-style store */
   3114             helper_SBB( size, dst1, dst0, src,
   3115                         /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
   3116          } else {
   3117             /* normal store */
   3118             helper_SBB( size, dst1, dst0, src,
   3119                         /*store*/addr, IRTemp_INVALID, 0 );
   3120          }
   3121       } else {
   3122          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   3123          if (keep) {
   3124             if (haveLOCK(pfx)) {
   3125                if (0) vex_printf("locked case\n" );
   3126                casLE( mkexpr(addr),
   3127                       mkexpr(dst0)/*expval*/,
   3128                       mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
   3129             } else {
   3130                if (0) vex_printf("nonlocked case\n");
   3131                storeLE(mkexpr(addr), mkexpr(dst1));
   3132             }
   3133          }
   3134          if (isAddSub(op8))
   3135             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3136          else
   3137             setFlags_DEP1(op8, dst1, ty);
   3138       }
   3139 
   3140       DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
   3141                           nameIRegG(size,pfx,rm), dis_buf);
   3142       return len+delta0;
   3143    }
   3144 }
   3145 
   3146 
   3147 /* Handle move instructions of the form
   3148       mov E, G  meaning
   3149       mov reg-or-mem, reg
   3150    Is passed a ptr to the modRM byte, and the data size.  Returns
   3151    the address advanced completely over this instruction.
   3152 
   3153    E(src) is reg-or-mem
   3154    G(dst) is reg.
   3155 
   3156    If E is reg, -->    GET %E,  tmpv
   3157                        PUT tmpv, %G
   3158 
   3159    If E is mem  -->    (getAddr E) -> tmpa
   3160                        LD (tmpa), tmpb
   3161                        PUT tmpb, %G
   3162 */
   3163 static
   3164 ULong dis_mov_E_G ( const VexAbiInfo* vbi,
   3165                     Prefix      pfx,
   3166                     Int         size,
   3167                     Long        delta0 )
   3168 {
   3169    Int len;
   3170    UChar rm = getUChar(delta0);
   3171    HChar dis_buf[50];
   3172 
   3173    if (epartIsReg(rm)) {
   3174       putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
   3175       DIP("mov%c %s,%s\n", nameISize(size),
   3176                            nameIRegE(size,pfx,rm),
   3177                            nameIRegG(size,pfx,rm));
   3178       return 1+delta0;
   3179    }
   3180 
   3181    /* E refers to memory */
   3182    {
   3183       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3184       putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
   3185       DIP("mov%c %s,%s\n", nameISize(size),
   3186                            dis_buf,
   3187                            nameIRegG(size,pfx,rm));
   3188       return delta0+len;
   3189    }
   3190 }
   3191 
   3192 
   3193 /* Handle move instructions of the form
   3194       mov G, E  meaning
   3195       mov reg, reg-or-mem
   3196    Is passed a ptr to the modRM byte, and the data size.  Returns
   3197    the address advanced completely over this instruction.
   3198    We have to decide here whether F2 or F3 are acceptable.  F2 never is.
   3199 
   3200    G(src) is reg.
   3201    E(dst) is reg-or-mem
   3202 
   3203    If E is reg, -->    GET %G,  tmp
   3204                        PUT tmp, %E
   3205 
   3206    If E is mem, -->    (getAddr E) -> tmpa
   3207                        GET %G, tmpv
   3208                        ST tmpv, (tmpa)
   3209 */
   3210 static
   3211 ULong dis_mov_G_E ( const VexAbiInfo*  vbi,
   3212                     Prefix       pfx,
   3213                     Int          size,
   3214                     Long         delta0,
   3215                     /*OUT*/Bool* ok )
   3216 {
   3217    Int   len;
   3218    UChar rm = getUChar(delta0);
   3219    HChar dis_buf[50];
   3220 
   3221    *ok = True;
   3222 
   3223    if (epartIsReg(rm)) {
   3224       if (haveF2orF3(pfx)) { *ok = False; return delta0; }
   3225       putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
   3226       DIP("mov%c %s,%s\n", nameISize(size),
   3227                            nameIRegG(size,pfx,rm),
   3228                            nameIRegE(size,pfx,rm));
   3229       return 1+delta0;
   3230    }
   3231 
   3232    /* E refers to memory */
   3233    {
   3234       if (haveF2(pfx)) { *ok = False; return delta0; }
   3235       /* F3(XRELEASE) is acceptable, though. */
   3236       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   3237       storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
   3238       DIP("mov%c %s,%s\n", nameISize(size),
   3239                            nameIRegG(size,pfx,rm),
   3240                            dis_buf);
   3241       return len+delta0;
   3242    }
   3243 }
   3244 
   3245 
   3246 /* op $immediate, AL/AX/EAX/RAX. */
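        /* Note the imin(size,4) below: even at 64-bit operand size the
           encoded immediate is at most 4 bytes and is sign-extended, so
           e.g. "addq $-1,%rax" carries just the 32-bit literal
           0xFFFFFFFF. */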
   3247 static
   3248 ULong dis_op_imm_A ( Int    size,
   3249                      Bool   carrying,
   3250                      IROp   op8,
   3251                      Bool   keep,
   3252                      Long   delta,
   3253                      const HChar* t_amd64opc )
   3254 {
   3255    Int    size4 = imin(size,4);
   3256    IRType ty    = szToITy(size);
   3257    IRTemp dst0  = newTemp(ty);
   3258    IRTemp src   = newTemp(ty);
   3259    IRTemp dst1  = newTemp(ty);
   3260    Long  lit    = getSDisp(size4,delta);
   3261    assign(dst0, getIRegRAX(size));
   3262    assign(src,  mkU(ty,lit & mkSizeMask(size)));
   3263 
   3264    if (isAddSub(op8) && !carrying) {
   3265       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3266       setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3267    }
   3268    else
   3269    if (isLogic(op8)) {
   3270       vassert(!carrying);
   3271       assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
   3272       setFlags_DEP1(op8, dst1, ty);
   3273    }
   3274    else
   3275    if (op8 == Iop_Add8 && carrying) {
   3276       helper_ADC( size, dst1, dst0, src,
   3277                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3278    }
   3279    else
   3280    if (op8 == Iop_Sub8 && carrying) {
   3281       helper_SBB( size, dst1, dst0, src,
   3282                   /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3283    }
   3284    else
   3285       vpanic("dis_op_imm_A(amd64,guest)");
   3286 
   3287    if (keep)
   3288       putIRegRAX(size, mkexpr(dst1));
   3289 
   3290    DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
   3291                            lit, nameIRegRAX(size));
   3292    return delta+size4;
   3293 }
   3294 
   3295 
   3296 /* Sign- and Zero-extending moves. */
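        /* E.g. (illustrative) "movswl %ax,%ebx" is szs=2, szd=4 with
           sign_extend=True, while "movzbq %cl,%rdx" is szs=1, szd=8 with
           sign_extend=False. */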
   3297 static
   3298 ULong dis_movx_E_G ( const VexAbiInfo* vbi,
   3299                      Prefix pfx,
   3300                      Long delta, Int szs, Int szd, Bool sign_extend )
   3301 {
   3302    UChar rm = getUChar(delta);
   3303    if (epartIsReg(rm)) {
   3304       putIRegG(szd, pfx, rm,
   3305                     doScalarWidening(
   3306                        szs,szd,sign_extend,
   3307                        getIRegE(szs,pfx,rm)));
   3308       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3309                                nameISize(szs),
   3310                                nameISize(szd),
   3311                                nameIRegE(szs,pfx,rm),
   3312                                nameIRegG(szd,pfx,rm));
   3313       return 1+delta;
   3314    }
   3315 
   3316    /* E refers to memory */
   3317    {
   3318       Int    len;
   3319       HChar  dis_buf[50];
   3320       IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   3321       putIRegG(szd, pfx, rm,
   3322                     doScalarWidening(
   3323                        szs,szd,sign_extend,
   3324                        loadLE(szToITy(szs),mkexpr(addr))));
   3325       DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
   3326                                nameISize(szs),
   3327                                nameISize(szd),
   3328                                dis_buf,
   3329                                nameIRegG(szd,pfx,rm));
   3330       return len+delta;
   3331    }
   3332 }
   3333 
   3334 
   3335 /* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   3336    the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  */
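        /* Illustrative: for sz==4, "div %ecx" divides EDX:EAX by ECX;
           with EDX:EAX = 0x100000000 and ECX = 0x10, the quotient
           0x10000000 lands in EAX and the remainder 0 in EDX. */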
   3337 static
   3338 void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
   3339 {
   3340    /* special-case the 64-bit case */
   3341    if (sz == 8) {
   3342       IROp   op     = signed_divide ? Iop_DivModS128to64
   3343                                     : Iop_DivModU128to64;
   3344       IRTemp src128 = newTemp(Ity_I128);
   3345       IRTemp dst128 = newTemp(Ity_I128);
   3346       assign( src128, binop(Iop_64HLto128,
   3347                             getIReg64(R_RDX),
   3348                             getIReg64(R_RAX)) );
   3349       assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
   3350       putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
   3351       putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   3352    } else {
   3353       IROp   op    = signed_divide ? Iop_DivModS64to32
   3354                                    : Iop_DivModU64to32;
   3355       IRTemp src64 = newTemp(Ity_I64);
   3356       IRTemp dst64 = newTemp(Ity_I64);
   3357       switch (sz) {
   3358       case 4:
   3359          assign( src64,
   3360                  binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
   3361          assign( dst64,
   3362                  binop(op, mkexpr(src64), mkexpr(t)) );
   3363          putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
   3364          putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
   3365          break;
   3366       case 2: {
   3367          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   3368          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   3369          assign( src64, unop(widen3264,
   3370                              binop(Iop_16HLto32,
   3371                                    getIRegRDX(2),
   3372                                    getIRegRAX(2))) );
   3373          assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
   3374          putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
   3375          putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
   3376          break;
   3377       }
   3378       case 1: {
   3379          IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
   3380          IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
   3381          IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
   3382          assign( src64, unop(widen3264,
   3383                         unop(widen1632, getIRegRAX(2))) );
   3384          assign( dst64,
   3385                  binop(op, mkexpr(src64),
   3386                            unop(widen1632, unop(widen816, mkexpr(t)))) );
   3387          putIRegRAX( 1, unop(Iop_16to8,
   3388                         unop(Iop_32to16,
   3389                         unop(Iop_64to32,mkexpr(dst64)))) );
   3390          putIRegAH( unop(Iop_16to8,
   3391                     unop(Iop_32to16,
   3392                     unop(Iop_64HIto32,mkexpr(dst64)))) );
   3393          break;
   3394       }
   3395       default:
   3396          vpanic("codegen_div(amd64)");
   3397       }
   3398    }
   3399 }
   3400 
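        /* Group 1 extended opcodes: the immediate-operand forms.  The reg
           field of the modRM byte selects the operation: 0=ADD 1=OR 2=ADC
           3=SBB 4=AND 5=SUB 6=XOR 7=CMP.  CMP computes flags only, hence
           the "gregLO3ofRM(modrm) < 7" guards on the writebacks below. */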
   3401 static
   3402 ULong dis_Grp1 ( const VexAbiInfo* vbi,
   3403                  Prefix pfx,
   3404                  Long delta, UChar modrm,
   3405                  Int am_sz, Int d_sz, Int sz, Long d64 )
   3406 {
   3407    Int     len;
   3408    HChar   dis_buf[50];
   3409    IRType  ty   = szToITy(sz);
   3410    IRTemp  dst1 = newTemp(ty);
   3411    IRTemp  src  = newTemp(ty);
   3412    IRTemp  dst0 = newTemp(ty);
   3413    IRTemp  addr = IRTemp_INVALID;
   3414    IROp    op8  = Iop_INVALID;
   3415    ULong   mask = mkSizeMask(sz);
   3416 
   3417    switch (gregLO3ofRM(modrm)) {
   3418       case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
   3419       case 2: break;  // ADC
   3420       case 3: break;  // SBB
   3421       case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
   3422       case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
   3423       /*NOTREACHED*/
   3424       default: vpanic("dis_Grp1(amd64): unhandled case");
   3425    }
   3426 
   3427    if (epartIsReg(modrm)) {
   3428       vassert(am_sz == 1);
   3429 
   3430       assign(dst0, getIRegE(sz,pfx,modrm));
   3431       assign(src,  mkU(ty,d64 & mask));
   3432 
   3433       if (gregLO3ofRM(modrm) == 2 /* ADC */) {
   3434          helper_ADC( sz, dst1, dst0, src,
   3435                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3436       } else
   3437       if (gregLO3ofRM(modrm) == 3 /* SBB */) {
   3438          helper_SBB( sz, dst1, dst0, src,
   3439                      /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   3440       } else {
   3441          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   3442          if (isAddSub(op8))
   3443             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3444          else
   3445             setFlags_DEP1(op8, dst1, ty);
   3446       }
   3447 
   3448       if (gregLO3ofRM(modrm) < 7)
   3449          putIRegE(sz, pfx, modrm, mkexpr(dst1));
   3450 
   3451       delta += (am_sz + d_sz);
   3452       DIP("%s%c $%lld, %s\n",
   3453           nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
   3454           nameIRegE(sz,pfx,modrm));
   3455    } else {
   3456       addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
   3457 
   3458       assign(dst0, loadLE(ty,mkexpr(addr)));
   3459       assign(src, mkU(ty,d64 & mask));
   3460 
   3461       if (gregLO3ofRM(modrm) == 2 /* ADC */) {
   3462          if (haveLOCK(pfx)) {
   3463             /* cas-style store */
   3464             helper_ADC( sz, dst1, dst0, src,
   3465                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
   3466          } else {
   3467             /* normal store */
   3468             helper_ADC( sz, dst1, dst0, src,
   3469                         /*store*/addr, IRTemp_INVALID, 0 );
   3470          }
   3471       } else
   3472       if (gregLO3ofRM(modrm) == 3 /* SBB */) {
   3473          if (haveLOCK(pfx)) {
   3474             /* cas-style store */
   3475             helper_SBB( sz, dst1, dst0, src,
   3476                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
   3477          } else {
   3478             /* normal store */
   3479             helper_SBB( sz, dst1, dst0, src,
   3480                         /*store*/addr, IRTemp_INVALID, 0 );
   3481          }
   3482       } else {
   3483          assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
   3484          if (gregLO3ofRM(modrm) < 7) {
   3485             if (haveLOCK(pfx)) {
   3486                casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
   3487                                     mkexpr(dst1)/*newVal*/,
   3488                                     guest_RIP_curr_instr );
   3489             } else {
   3490                storeLE(mkexpr(addr), mkexpr(dst1));
   3491             }
   3492          }
   3493          if (isAddSub(op8))
   3494             setFlags_DEP1_DEP2(op8, dst0, src, ty);
   3495          else
   3496             setFlags_DEP1(op8, dst1, ty);
   3497       }
   3498 
   3499       delta += (len+d_sz);
   3500       DIP("%s%c $%lld, %s\n",
   3501           nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
   3502           d64, dis_buf);
   3503    }
   3504    return delta;
   3505 }
   3506 
   3507 
   3508 /* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   3509    expression. */
   3510 
   3511 static
   3512 ULong dis_Grp2 ( const VexAbiInfo* vbi,
   3513                  Prefix pfx,
   3514                  Long delta, UChar modrm,
   3515                  Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
   3516                  const HChar* shift_expr_txt, Bool* decode_OK )
   3517 {
   3518    /* delta on entry points at the modrm byte. */
   3519    HChar  dis_buf[50];
   3520    Int    len;
   3521    Bool   isShift, isRotate, isRotateC;
   3522    IRType ty    = szToITy(sz);
   3523    IRTemp dst0  = newTemp(ty);
   3524    IRTemp dst1  = newTemp(ty);
   3525    IRTemp addr  = IRTemp_INVALID;
   3526 
   3527    *decode_OK = True;
   3528 
   3529    vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);
   3530 
   3531    /* Put value to shift/rotate in dst0. */
   3532    if (epartIsReg(modrm)) {
   3533       assign(dst0, getIRegE(sz, pfx, modrm));
   3534       delta += (am_sz + d_sz);
   3535    } else {
   3536       addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
   3537       assign(dst0, loadLE(ty,mkexpr(addr)));
   3538       delta += len + d_sz;
   3539    }
   3540 
   3541    isShift = False;
   3542    switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }
   3543 
   3544    isRotate = False;
   3545    switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }
   3546 
   3547    isRotateC = False;
   3548    switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }
   3549 
   3550    if (!isShift && !isRotate && !isRotateC) {
   3551       /*NOTREACHED*/
   3552       vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   3553    }
   3554 
   3555    if (isRotateC) {
   3556       /* Call a helper; this insn is so ridiculous it does not deserve
   3557          better.  One problem is, the helper has to calculate both the
   3558          new value and the new flags.  This is more than 64 bits, and
   3559          there is no way to return more than 64 bits from the helper.
   3560          Hence the crude and obvious solution is to call it twice,
   3561          using the sign of the sz field to indicate whether it is the
   3562          value or rflags result we want.
   3563       */
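              /* Concretely (illustrative): for a 64-bit RCL the first call
                 below passes sz = 8 and yields the rotated value, while
                 the second passes -8 and yields the new rflags. */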
   3564       Bool     left = toBool(gregLO3ofRM(modrm) == 2);
   3565       IRExpr** argsVALUE;
   3566       IRExpr** argsRFLAGS;
   3567 
   3568       IRTemp new_value  = newTemp(Ity_I64);
   3569       IRTemp new_rflags = newTemp(Ity_I64);
   3570       IRTemp old_rflags = newTemp(Ity_I64);
   3571 
   3572       assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );
   3573 
   3574       argsVALUE
   3575          = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
   3576                           widenUto64(shift_expr),   /* rotate amount */
   3577                           mkexpr(old_rflags),
   3578                           mkU64(sz) );
   3579       assign( new_value,
   3580                  mkIRExprCCall(
   3581                     Ity_I64,
   3582                     0/*regparm*/,
   3583                     left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
   3584                     left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
   3585                     argsVALUE
   3586                  )
   3587             );
   3588 
   3589       argsRFLAGS
   3590          = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
   3591                           widenUto64(shift_expr),   /* rotate amount */
   3592                           mkexpr(old_rflags),
   3593                           mkU64(-sz) );
   3594       assign( new_rflags,
   3595                  mkIRExprCCall(
   3596                     Ity_I64,
   3597                     0/*regparm*/,
   3598                     left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
   3599                     left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
   3600                     argsRFLAGS
   3601                  )
   3602             );
   3603 
   3604       assign( dst1, narrowTo(ty, mkexpr(new_value)) );
   3605       stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   3606       stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
   3607       stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   3608       stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   3609    }
   3610 
   3611    else
   3612    if (isShift) {
   3613 
   3614       IRTemp pre64     = newTemp(Ity_I64);
   3615       IRTemp res64     = newTemp(Ity_I64);
   3616       IRTemp res64ss   = newTemp(Ity_I64);
   3617       IRTemp shift_amt = newTemp(Ity_I8);
   3618       UChar  mask      = toUChar(sz==8 ? 63 : 31);
   3619       IROp   op64;
   3620 
   3621       switch (gregLO3ofRM(modrm)) {
   3622          case 4: op64 = Iop_Shl64; break;
   3623          case 5: op64 = Iop_Shr64; break;
   3624          case 6: op64 = Iop_Shl64; break;
   3625          case 7: op64 = Iop_Sar64; break;
   3626          /*NOTREACHED*/
   3627          default: vpanic("dis_Grp2:shift"); break;
   3628       }
   3629 
   3630       /* Widen the value to be shifted to 64 bits, do the shift, and
   3631          narrow back down.  This seems surprisingly long-winded, but
   3632          unfortunately the AMD semantics requires that 8/16/32-bit
   3633          shifts give defined results for shift values all the way up
   3634          to 32, and this seems the simplest way to do it.  It has the
   3635          advantage that the only IR level shifts generated are of 64
   3636          bit values, and the shift amount is guaranteed to be in the
   3637          range 0 .. 63, thereby observing the IR semantics requiring
   3638          all shift values to be in the range 0 .. 2^word_size-1.
   3639 
   3640          Therefore the shift amount is masked with 63 for 64-bit shifts
   3641          and 31 for all others.
   3642       */
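              /* E.g. (illustrative) "shrb $9, %al" with AL=0x80: the
                 amount 9 survives the AND with 31, AL is zero-widened to
                 64 bits, and 0x80 >> 9 == 0, so AL becomes 0 -- a defined
                 result even though 9 exceeds the 8-bit width. */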
   3643       /* shift_amt = shift_expr & MASK, regardless of operation size */
   3644       assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );
   3645 
   3646       /* suitably widen the value to be shifted to 64 bits. */
   3647       assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
   3648                                      : widenUto64(mkexpr(dst0)) );
   3649 
   3650       /* res64 = pre64 `shift` shift_amt */
   3651       assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );
   3652 
   3653       /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK) */
   3654       assign( res64ss,
   3655               binop(op64,
   3656                     mkexpr(pre64),
   3657                     binop(Iop_And8,
   3658                           binop(Iop_Sub8,
   3659                                 mkexpr(shift_amt), mkU8(1)),
   3660                           mkU8(mask))) );
   3661 
   3662       /* Build the flags thunk. */
   3663       setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);
   3664 
   3665       /* Narrow the result back down. */
   3666       assign( dst1, narrowTo(ty, mkexpr(res64)) );
   3667 
   3668    } /* if (isShift) */
   3669 
   3670    else
   3671    if (isRotate) {
   3672       Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
   3673                                         : (ty==Ity_I32 ? 2 : 3));
   3674       Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
   3675       IRTemp rot_amt   = newTemp(Ity_I8);
   3676       IRTemp rot_amt64 = newTemp(Ity_I8);
   3677       IRTemp oldFlags  = newTemp(Ity_I64);
   3678       UChar  mask      = toUChar(sz==8 ? 63 : 31);
   3679 
   3680       /* rot_amt = shift_expr & mask */
   3681       /* By masking the rotate amount thusly, the IR-level Shl/Shr
   3682          expressions never shift beyond the word size and thus remain
   3683          well defined. */
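              /* Two masks are at work here (illustrative): for
                 "rolb $13, %al", rot_amt64 is 13 & 31 == 13, which being
                 nonzero means the flags thunk is updated below, while
                 rot_amt is 13 & 7 == 5, the amount actually rotated by. */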
   3684       assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));
   3685 
   3686       if (ty == Ity_I64)
   3687          assign(rot_amt, mkexpr(rot_amt64));
   3688       else
   3689          assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));
   3690 
   3691       if (left) {
   3692 
   3693          /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
   3694          assign(dst1,
   3695             binop( mkSizedOp(ty,Iop_Or8),
   3696                    binop( mkSizedOp(ty,Iop_Shl8),
   3697                           mkexpr(dst0),
   3698                           mkexpr(rot_amt)
   3699                    ),
   3700                    binop( mkSizedOp(ty,Iop_Shr8),
   3701                           mkexpr(dst0),
   3702                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   3703                    )
   3704             )
   3705          );
   3706          ccOp += AMD64G_CC_OP_ROLB;
   3707 
   3708       } else { /* right */
   3709 
   3710          /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
   3711          assign(dst1,
   3712             binop( mkSizedOp(ty,Iop_Or8),
   3713                    binop( mkSizedOp(ty,Iop_Shr8),
   3714                           mkexpr(dst0),
   3715                           mkexpr(rot_amt)
   3716                    ),
   3717                    binop( mkSizedOp(ty,Iop_Shl8),
   3718                           mkexpr(dst0),
   3719                           binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
   3720                    )
   3721             )
   3722          );
   3723          ccOp += AMD64G_CC_OP_RORB;
   3724 
   3725       }
   3726 
   3727       /* dst1 now holds the rotated value.  Build flag thunk.  We
   3728          need the resulting value for this, and the previous flags.
   3729          Except don't set it if the rotate count is zero. */
   3730 
   3731       assign(oldFlags, mk_amd64g_calculate_rflags_all());
   3732 
   3733       /* rot_amt64 :: Ity_I8.  We need to convert it to I1. */
   3734       IRTemp rot_amt64b = newTemp(Ity_I1);
   3735       assign(rot_amt64b, binop(Iop_CmpNE8, mkexpr(rot_amt64), mkU8(0)) );
   3736 
   3737       /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
   3738       stmt( IRStmt_Put( OFFB_CC_OP,
   3739                         IRExpr_ITE( mkexpr(rot_amt64b),
   3740                                     mkU64(ccOp),
   3741                                     IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
   3742       stmt( IRStmt_Put( OFFB_CC_DEP1,
   3743                         IRExpr_ITE( mkexpr(rot_amt64b),
   3744                                     widenUto64(mkexpr(dst1)),
   3745                                     IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
   3746       stmt( IRStmt_Put( OFFB_CC_DEP2,
   3747                         IRExpr_ITE( mkexpr(rot_amt64b),
   3748                                     mkU64(0),
   3749                                     IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
   3750       stmt( IRStmt_Put( OFFB_CC_NDEP,
   3751                         IRExpr_ITE( mkexpr(rot_amt64b),
   3752                                     mkexpr(oldFlags),
   3753                                     IRExpr_Get(OFFB_CC_NDEP,Ity_I64) ) ));
   3754    } /* if (isRotate) */
   3755 
   3756    /* Save result, and finish up. */
   3757    if (epartIsReg(modrm)) {
   3758       putIRegE(sz, pfx, modrm, mkexpr(dst1));
   3759       if (vex_traceflags & VEX_TRACE_FE) {
   3760          vex_printf("%s%c ",
   3761                     nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
   3762          if (shift_expr_txt)
   3763             vex_printf("%s", shift_expr_txt);
   3764          else
   3765             ppIRExpr(shift_expr);
   3766          vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
   3767       }
   3768    } else {
   3769       storeLE(mkexpr(addr), mkexpr(dst1));
   3770       if (vex_traceflags & VEX_TRACE_FE) {
   3771          vex_printf("%s%c ",
   3772                     nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
   3773          if (shift_expr_txt)
   3774             vex_printf("%s", shift_expr_txt);
   3775          else
   3776             ppIRExpr(shift_expr);
   3777          vex_printf(", %s\n", dis_buf);
   3778       }
   3779    }
   3780    return delta;
   3781 }
   3782 
   3783 
   3784 /* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
   3785 static
   3786 ULong dis_Grp8_Imm ( const VexAbiInfo* vbi,
   3787                      Prefix pfx,
   3788                      Long delta, UChar modrm,
   3789                      Int am_sz, Int sz, ULong src_val,
   3790                      Bool* decode_OK )
   3791 {
   3792    /* src_val denotes a d8.
   3793       And delta on entry points at the modrm byte. */
   3794 
   3795    IRType ty     = szToITy(sz);
   3796    IRTemp t2     = newTemp(Ity_I64);
   3797    IRTemp t2m    = newTemp(Ity_I64);
   3798    IRTemp t_addr = IRTemp_INVALID;
   3799    HChar  dis_buf[50];
   3800    ULong  mask;
   3801 
   3802    /* we're optimists :-) */
   3803    *decode_OK = True;
   3804 
   3805    /* Check whether F2 or F3 are acceptable. */
   3806    if (epartIsReg(modrm)) {
   3807       /* F2 or F3 are not allowed in the register case. */
   3808       if (haveF2orF3(pfx)) {
   3809          *decode_OK = False;
   3810          return delta;
   3811       }
   3812    } else {
   3813       /* F2 or F3 (but not both) are allowable provided LOCK is also
   3814          present. */
   3815       if (haveF2orF3(pfx)) {
   3816          if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
   3817             *decode_OK = False;
   3818             return delta;
   3819          }
   3820       }
   3821    }
   3822 
   3823    /* Limit src_val -- the bit offset -- to something within a word.
   3824       The Intel docs say that literal offsets larger than a word are
   3825       masked in this way. */
   3826    switch (sz) {
   3827       case 2:  src_val &= 15; break;
   3828       case 4:  src_val &= 31; break;
   3829       case 8:  src_val &= 63; break;
   3830       default: *decode_OK = False; return delta;
   3831    }
   3832 
   3833    /* Invent a mask suitable for the operation. */
   3834    switch (gregLO3ofRM(modrm)) {
   3835       case 4: /* BT */  mask = 0;                  break;
   3836       case 5: /* BTS */ mask = 1ULL << src_val;    break;
   3837       case 6: /* BTR */ mask = ~(1ULL << src_val); break;
   3838       case 7: /* BTC */ mask = 1ULL << src_val;    break;
   3839          /* If this needs to be extended, probably simplest to make a
   3840             new function to handle the other cases (0 .. 3).  The
   3841             Intel docs do however not indicate any use for 0 .. 3, so
   3842             we don't expect this to happen. */
   3843       default: *decode_OK = False; return delta;
   3844    }
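           /* Illustrative: BTS with src_val==3 gives mask 0x8, making t2m
              below t2 | 0x8; BTR gives ~0x8 and t2m = t2 & ~0x8.  In all
              four cases the carry flag is loaded, further down, from
              bit 3 of the old value t2. */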
   3845 
   3846    /* Fetch the value to be tested and modified into t2, which is
   3847       64-bits wide regardless of sz. */
   3848    if (epartIsReg(modrm)) {
   3849       vassert(am_sz == 1);
   3850       assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
   3851       delta += (am_sz + 1);
   3852       DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
   3853                                 nameISize(sz),
   3854                                 src_val, nameIRegE(sz,pfx,modrm));
   3855    } else {
   3856       Int len;
   3857       t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
   3858       delta  += (len+1);
   3859       assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
   3860       DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
   3861                                 nameISize(sz),
   3862                                 src_val, dis_buf);
   3863    }
   3864 
   3865    /* Compute the new value into t2m, if non-BT. */
   3866    switch (gregLO3ofRM(modrm)) {
   3867       case 4: /* BT */
   3868          break;
   3869       case 5: /* BTS */
   3870          assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
   3871          break;
   3872       case 6: /* BTR */
   3873          assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
   3874          break;
   3875       case 7: /* BTC */
   3876          assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
   3877          break;
   3878       default:
   3879          /*NOTREACHED*/ /*the previous switch guards this*/
   3880          vassert(0);
   3881    }
   3882 
   3883    /* Write the result back, if non-BT. */
   3884    if (gregLO3ofRM(modrm) != 4 /* BT */) {
   3885       if (epartIsReg(modrm)) {
   3886          putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
   3887       } else {
   3888          if (haveLOCK(pfx)) {
   3889             casLE( mkexpr(t_addr),
   3890                    narrowTo(ty, mkexpr(t2))/*expd*/,
   3891                    narrowTo(ty, mkexpr(t2m))/*new*/,
   3892                    guest_RIP_curr_instr );
   3893          } else {
   3894             storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
   3895          }
   3896       }
   3897    }
   3898 
   3899    /* Copy relevant bit from t2 into the carry flag. */
   3900    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   3901    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   3902    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   3903    stmt( IRStmt_Put(
   3904             OFFB_CC_DEP1,
   3905             binop(Iop_And64,
   3906                   binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
   3907                   mkU64(1))
   3908        ));
   3909    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   3910       elimination of previous stores to this field work better. */
   3911    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   3912 
   3913    return delta;
   3914 }
   3915 
   3916 
   3917 /* Signed/unsigned widening multiply.  Generate IR to multiply the
   3918    value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
   3919    RDX:RAX/EDX:EAX/DX:AX/AX.
   3920 */
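        /* Illustrative: for sz==4, "mul %ecx" with EAX = 0x80000000 and
           ECX = 2 gives the 64-bit product 0x100000000, so EDX becomes 1
           and EAX becomes 0. */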
   3921 static void codegen_mulL_A_D ( Int sz, Bool syned,
   3922                                IRTemp tmp, const HChar* tmp_txt )
   3923 {
   3924    IRType ty = szToITy(sz);
   3925    IRTemp t1 = newTemp(ty);
   3926 
   3927    assign( t1, getIRegRAX(sz) );
   3928 
   3929    switch (ty) {
   3930       case Ity_I64: {
   3931          IRTemp res128  = newTemp(Ity_I128);
   3932          IRTemp resHi   = newTemp(Ity_I64);
   3933          IRTemp resLo   = newTemp(Ity_I64);
   3934          IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
   3935          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3936          setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
   3937          assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3938          assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
   3939          assign( resLo, unop(Iop_128to64,mkexpr(res128)));
   3940          putIReg64(R_RDX, mkexpr(resHi));
   3941          putIReg64(R_RAX, mkexpr(resLo));
   3942          break;
   3943       }
   3944       case Ity_I32: {
   3945          IRTemp res64   = newTemp(Ity_I64);
   3946          IRTemp resHi   = newTemp(Ity_I32);
   3947          IRTemp resLo   = newTemp(Ity_I32);
   3948          IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
   3949          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3950          setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
   3951          assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3952          assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
   3953          assign( resLo, unop(Iop_64to32,mkexpr(res64)));
   3954          putIRegRDX(4, mkexpr(resHi));
   3955          putIRegRAX(4, mkexpr(resLo));
   3956          break;
   3957       }
   3958       case Ity_I16: {
   3959          IRTemp res32   = newTemp(Ity_I32);
   3960          IRTemp resHi   = newTemp(Ity_I16);
   3961          IRTemp resLo   = newTemp(Ity_I16);
   3962          IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
   3963          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3964          setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
   3965          assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3966          assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
   3967          assign( resLo, unop(Iop_32to16,mkexpr(res32)));
   3968          putIRegRDX(2, mkexpr(resHi));
   3969          putIRegRAX(2, mkexpr(resLo));
   3970          break;
   3971       }
   3972       case Ity_I8: {
   3973          IRTemp res16   = newTemp(Ity_I16);
   3974          IRTemp resHi   = newTemp(Ity_I8);
   3975          IRTemp resLo   = newTemp(Ity_I8);
   3976          IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
   3977          UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
   3978          setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
   3979          assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
   3980          assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
   3981          assign( resLo, unop(Iop_16to8,mkexpr(res16)));
   3982          putIRegRAX(2, mkexpr(res16));
   3983          break;
   3984       }
   3985       default:
   3986          ppIRType(ty);
   3987          vpanic("codegen_mulL_A_D(amd64)");
   3988    }
   3989    DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
   3990 }
   3991 
   3992 
   3993 /* Group 3 extended opcodes.  We have to decide here whether F2 and F3
   3994    might be valid. */
   3995 static
   3996 ULong dis_Grp3 ( const VexAbiInfo* vbi,
   3997                  Prefix pfx, Int sz, Long delta, Bool* decode_OK )
   3998 {
   3999    Long    d64;
   4000    UChar   modrm;
   4001    HChar   dis_buf[50];
   4002    Int     len;
   4003    IRTemp  addr;
   4004    IRType  ty = szToITy(sz);
   4005    IRTemp  t1 = newTemp(ty);
   4006    IRTemp dst1, src, dst0;
   4007    *decode_OK = True;
   4008    modrm = getUChar(delta);
   4009    if (epartIsReg(modrm)) {
   4010       /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
   4011       if (haveF2orF3(pfx)) goto unhandled;
   4012       switch (gregLO3ofRM(modrm)) {
   4013          case 0: { /* TEST */
   4014             delta++;
   4015             d64 = getSDisp(imin(4,sz), delta);
   4016             delta += imin(4,sz);
   4017             dst1 = newTemp(ty);
   4018             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   4019                                getIRegE(sz,pfx,modrm),
   4020                                mkU(ty, d64 & mkSizeMask(sz))));
   4021             setFlags_DEP1( Iop_And8, dst1, ty );
   4022             DIP("test%c $%lld, %s\n",
   4023                 nameISize(sz), d64,
   4024                 nameIRegE(sz, pfx, modrm));
   4025             break;
   4026          }
   4027          case 1:
   4028             *decode_OK = False;
   4029             return delta;
   4030          case 2: /* NOT */
   4031             delta++;
   4032             putIRegE(sz, pfx, modrm,
   4033                               unop(mkSizedOp(ty,Iop_Not8),
   4034                                    getIRegE(sz, pfx, modrm)));
   4035             DIP("not%c %s\n", nameISize(sz),
   4036                               nameIRegE(sz, pfx, modrm));
   4037             break;
   4038          case 3: /* NEG */
   4039             delta++;
   4040             dst0 = newTemp(ty);
   4041             src  = newTemp(ty);
   4042             dst1 = newTemp(ty);
   4043             assign(dst0, mkU(ty,0));
   4044             assign(src,  getIRegE(sz, pfx, modrm));
   4045             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
   4046                                                        mkexpr(src)));
   4047             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   4048             putIRegE(sz, pfx, modrm, mkexpr(dst1));
   4049             DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
   4050             break;
   4051          case 4: /* MUL (unsigned widening) */
   4052             delta++;
   4053             src = newTemp(ty);
   4054             assign(src, getIRegE(sz,pfx,modrm));
   4055             codegen_mulL_A_D ( sz, False, src,
   4056                                nameIRegE(sz,pfx,modrm) );
   4057             break;
   4058          case 5: /* IMUL (signed widening) */
   4059             delta++;
   4060             src = newTemp(ty);
   4061             assign(src, getIRegE(sz,pfx,modrm));
   4062             codegen_mulL_A_D ( sz, True, src,
   4063                                nameIRegE(sz,pfx,modrm) );
   4064             break;
   4065          case 6: /* DIV */
   4066             delta++;
   4067             assign( t1, getIRegE(sz, pfx, modrm) );
   4068             codegen_div ( sz, t1, False );
   4069             DIP("div%c %s\n", nameISize(sz),
   4070                               nameIRegE(sz, pfx, modrm));
   4071             break;
   4072          case 7: /* IDIV */
   4073             delta++;
   4074             assign( t1, getIRegE(sz, pfx, modrm) );
   4075             codegen_div ( sz, t1, True );
   4076             DIP("idiv%c %s\n", nameISize(sz),
   4077                                nameIRegE(sz, pfx, modrm));
   4078             break;
   4079          default:
   4080             /*NOTREACHED*/
   4081             vpanic("Grp3(amd64,R)");
   4082       }
   4083    } else {
   4084       /* Decide if F2/XACQ or F3/XREL might be valid. */
   4085       Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
   4086       if ((gregLO3ofRM(modrm) == 3/*NEG*/ || gregLO3ofRM(modrm) == 2/*NOT*/)
   4087           && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
   4088          validF2orF3 = True;
   4089       }
   4090       if (!validF2orF3) goto unhandled;
   4091       /* */
   4092       addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
   4093                         /* we have to inform disAMode of any immediate
   4094                            bytes used */
   4095                         gregLO3ofRM(modrm)==0/*TEST*/
   4096                            ? imin(4,sz)
   4097                            : 0
   4098                       );
   4099       t1   = newTemp(ty);
   4100       delta += len;
   4101       assign(t1, loadLE(ty,mkexpr(addr)));
   4102       switch (gregLO3ofRM(modrm)) {
   4103          case 0: { /* TEST */
   4104             d64 = getSDisp(imin(4,sz), delta);
   4105             delta += imin(4,sz);
   4106             dst1 = newTemp(ty);
   4107             assign(dst1, binop(mkSizedOp(ty,Iop_And8),
   4108                                mkexpr(t1),
   4109                                mkU(ty, d64 & mkSizeMask(sz))));
   4110             setFlags_DEP1( Iop_And8, dst1, ty );
   4111             DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
   4112             break;
   4113          }
   4114          case 1:
   4115             *decode_OK = False;
   4116             return delta;
   4117          case 2: /* NOT */
   4118             dst1 = newTemp(ty);
   4119             assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
   4120             if (haveLOCK(pfx)) {
   4121                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   4122                                     guest_RIP_curr_instr );
   4123             } else {
   4124                storeLE( mkexpr(addr), mkexpr(dst1) );
   4125             }
   4126             DIP("not%c %s\n", nameISize(sz), dis_buf);
   4127             break;
   4128          case 3: /* NEG */
   4129             dst0 = newTemp(ty);
   4130             src  = newTemp(ty);
   4131             dst1 = newTemp(ty);
   4132             assign(dst0, mkU(ty,0));
   4133             assign(src,  mkexpr(t1));
   4134             assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
   4135                                                        mkexpr(src)));
   4136             if (haveLOCK(pfx)) {
   4137                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
   4138                                     guest_RIP_curr_instr );
   4139             } else {
   4140                storeLE( mkexpr(addr), mkexpr(dst1) );
   4141             }
   4142             setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
   4143             DIP("neg%c %s\n", nameISize(sz), dis_buf);
   4144             break;
   4145          case 4: /* MUL (unsigned widening) */
   4146             codegen_mulL_A_D ( sz, False, t1, dis_buf );
   4147             break;
   4148          case 5: /* IMUL */
   4149             codegen_mulL_A_D ( sz, True, t1, dis_buf );
   4150             break;
   4151          case 6: /* DIV */
   4152             codegen_div ( sz, t1, False );
   4153             DIP("div%c %s\n", nameISize(sz), dis_buf);
   4154             break;
   4155          case 7: /* IDIV */
   4156             codegen_div ( sz, t1, True );
   4157             DIP("idiv%c %s\n", nameISize(sz), dis_buf);
   4158             break;
   4159          default:
   4160             /*NOTREACHED*/
   4161             vpanic("Grp3(amd64,M)");
   4162       }
   4163    }
   4164    return delta;
   4165   unhandled:
   4166    *decode_OK = False;
   4167    return delta;
   4168 }
   4169 
   4170 
   4171 /* Group 4 extended opcodes.  We have to decide here whether F2 and F3
   4172    might be valid. */
   4173 static
   4174 ULong dis_Grp4 ( const VexAbiInfo* vbi,
   4175                  Prefix pfx, Long delta, Bool* decode_OK )
   4176 {
   4177    Int   alen;
   4178    UChar modrm;
   4179    HChar dis_buf[50];
   4180    IRType ty = Ity_I8;
   4181    IRTemp t1 = newTemp(ty);
   4182    IRTemp t2 = newTemp(ty);
   4183 
   4184    *decode_OK = True;
   4185 
   4186    modrm = getUChar(delta);
   4187    if (epartIsReg(modrm)) {
   4188       /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
   4189       if (haveF2orF3(pfx)) goto unhandled;
   4190       assign(t1, getIRegE(1, pfx, modrm));
   4191       switch (gregLO3ofRM(modrm)) {
   4192          case 0: /* INC */
   4193             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   4194             putIRegE(1, pfx, modrm, mkexpr(t2));
   4195             setFlags_INC_DEC( True, t2, ty );
   4196             break;
   4197          case 1: /* DEC */
   4198             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   4199             putIRegE(1, pfx, modrm, mkexpr(t2));
   4200             setFlags_INC_DEC( False, t2, ty );
   4201             break;
   4202          default:
   4203             *decode_OK = False;
   4204             return delta;
   4205       }
   4206       delta++;
   4207       DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
   4208                       nameIRegE(1, pfx, modrm));
   4209    } else {
   4210       /* Decide if F2/XACQ or F3/XREL might be valid. */
   4211       Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
   4212       if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
   4213           && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
   4214          validF2orF3 = True;
   4215       }
   4216       if (!validF2orF3) goto unhandled;
   4217       /* */
   4218       IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   4219       assign( t1, loadLE(ty, mkexpr(addr)) );
   4220       switch (gregLO3ofRM(modrm)) {
   4221          case 0: /* INC */
   4222             assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
   4223             if (haveLOCK(pfx)) {
   4224                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   4225                       guest_RIP_curr_instr );
   4226             } else {
   4227                storeLE( mkexpr(addr), mkexpr(t2) );
   4228             }
   4229             setFlags_INC_DEC( True, t2, ty );
   4230             break;
   4231          case 1: /* DEC */
   4232             assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
   4233             if (haveLOCK(pfx)) {
   4234                casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
   4235                       guest_RIP_curr_instr );
   4236             } else {
   4237                storeLE( mkexpr(addr), mkexpr(t2) );
   4238             }
   4239             setFlags_INC_DEC( False, t2, ty );
   4240             break;
   4241          default:
   4242             *decode_OK = False;
   4243             return delta;
   4244       }
   4245       delta += alen;
   4246       DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
   4247    }
   4248    return delta;
   4249   unhandled:
   4250    *decode_OK = False;
   4251    return delta;
   4252 }
   4253 
   4254 
   4255 /* Group 5 extended opcodes.  We have to decide here whether F2 and F3
   4256    might be valid. */
   4257 static
   4258 ULong dis_Grp5 ( const VexAbiInfo* vbi,
   4259                  Prefix pfx, Int sz, Long delta,
   4260                  /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
   4261 {
   4262    Int     len;
   4263    UChar   modrm;
   4264    HChar   dis_buf[50];
   4265    IRTemp  addr = IRTemp_INVALID;
   4266    IRType  ty = szToITy(sz);
   4267    IRTemp  t1 = newTemp(ty);
   4268    IRTemp  t2 = IRTemp_INVALID;
   4269    IRTemp  t3 = IRTemp_INVALID;
   4270    Bool    showSz = True;
   4271 
   4272    *decode_OK = True;
   4273 
   4274    modrm = getUChar(delta);
   4275    if (epartIsReg(modrm)) {
   4276       /* F2/XACQ and F3/XREL are always invalid in the non-mem case.
   4277          F2/CALL and F2/JMP may have bnd prefix. */
   4278       if (haveF2orF3(pfx)
   4279           && ! (haveF2(pfx)
   4280                 && (gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)))
   4281          goto unhandledR;
   4282       assign(t1, getIRegE(sz,pfx,modrm));
   4283       switch (gregLO3ofRM(modrm)) {
   4284          case 0: /* INC */
   4285             t2 = newTemp(ty);
   4286             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   4287                              mkexpr(t1), mkU(ty,1)));
   4288             setFlags_INC_DEC( True, t2, ty );
   4289             putIRegE(sz,pfx,modrm, mkexpr(t2));
   4290             break;
   4291          case 1: /* DEC */
   4292             t2 = newTemp(ty);
   4293             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   4294                              mkexpr(t1), mkU(ty,1)));
   4295             setFlags_INC_DEC( False, t2, ty );
   4296             putIRegE(sz,pfx,modrm, mkexpr(t2));
   4297             break;
   4298          case 2: /* call Ev */
   4299             /* Ignore any sz value and operate as if sz==8. */
   4300             if (!(sz == 4 || sz == 8)) goto unhandledR;
   4301             if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   4302             sz = 8;
   4303             t3 = newTemp(Ity_I64);
   4304             assign(t3, getIRegE(sz,pfx,modrm));
   4305             t2 = newTemp(Ity_I64);
   4306             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   4307             putIReg64(R_RSP, mkexpr(t2));
   4308             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
   4309             make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
   4310             jmp_treg(dres, Ijk_Call, t3);
   4311             vassert(dres->whatNext == Dis_StopHere);
   4312             showSz = False;
   4313             break;
   4314          case 4: /* jmp Ev */
   4315             /* Ignore any sz value and operate as if sz==8. */
   4316             if (!(sz == 4 || sz == 8)) goto unhandledR;
   4317             if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   4318             sz = 8;
   4319             t3 = newTemp(Ity_I64);
   4320             assign(t3, getIRegE(sz,pfx,modrm));
   4321             jmp_treg(dres, Ijk_Boring, t3);
   4322             vassert(dres->whatNext == Dis_StopHere);
   4323             showSz = False;
   4324             break;
   4325          case 6: /* PUSH Ev */
   4326             /* There is no encoding for 32-bit operand size; hence ... */
   4327             if (sz == 4) sz = 8;
   4328             if (sz == 8 || sz == 2) {
   4329                ty = szToITy(sz); /* redo it, since sz might have changed */
   4330                t3 = newTemp(ty);
   4331                assign(t3, getIRegE(sz,pfx,modrm));
   4332                t2 = newTemp(Ity_I64);
   4333                assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
   4334                putIReg64(R_RSP, mkexpr(t2) );
   4335                storeLE( mkexpr(t2), mkexpr(t3) );
   4336                break;
   4337             } else {
   4338                goto unhandledR; /* awaiting test case */
   4339             }
   4340          default:
   4341          unhandledR:
   4342             *decode_OK = False;
   4343             return delta;
   4344       }
   4345       delta++;
   4346       DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
   4347                        showSz ? nameISize(sz) : ' ',
   4348                        nameIRegE(sz, pfx, modrm));
   4349    } else {
   4350       /* Decide if F2/XACQ, F3/XREL, F2/CALL or F2/JMP might be valid. */
   4351       Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
   4352       if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
   4353           && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
   4354          validF2orF3 = True;
   4355       } else if ((gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)
   4356                  && (haveF2(pfx) && !haveF3(pfx))) {
   4357          validF2orF3 = True;
   4358       }
   4359       if (!validF2orF3) goto unhandledM;
   4360       /* */
   4361       addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   4362       if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
   4363                                   && gregLO3ofRM(modrm) != 6) {
   4364          assign(t1, loadLE(ty,mkexpr(addr)));
   4365       }
   4366       switch (gregLO3ofRM(modrm)) {
   4367          case 0: /* INC */
   4368             t2 = newTemp(ty);
   4369             assign(t2, binop(mkSizedOp(ty,Iop_Add8),
   4370                              mkexpr(t1), mkU(ty,1)));
   4371             if (haveLOCK(pfx)) {
   4372                casLE( mkexpr(addr),
   4373                       mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
   4374             } else {
   4375                storeLE(mkexpr(addr),mkexpr(t2));
   4376             }
   4377             setFlags_INC_DEC( True, t2, ty );
   4378             break;
   4379          case 1: /* DEC */
   4380             t2 = newTemp(ty);
   4381             assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
   4382                              mkexpr(t1), mkU(ty,1)));
   4383             if (haveLOCK(pfx)) {
   4384                casLE( mkexpr(addr),
   4385                       mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
   4386             } else {
   4387                storeLE(mkexpr(addr),mkexpr(t2));
   4388             }
   4389             setFlags_INC_DEC( False, t2, ty );
   4390             break;
   4391          case 2: /* call Ev */
   4392             /* Ignore any sz value and operate as if sz==8. */
   4393             if (!(sz == 4 || sz == 8)) goto unhandledM;
   4394             if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   4395             sz = 8;
   4396             t3 = newTemp(Ity_I64);
   4397             assign(t3, loadLE(Ity_I64,mkexpr(addr)));
   4398             t2 = newTemp(Ity_I64);
   4399             assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
   4400             putIReg64(R_RSP, mkexpr(t2));
   4401             storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
   4402             make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
   4403             jmp_treg(dres, Ijk_Call, t3);
   4404             vassert(dres->whatNext == Dis_StopHere);
   4405             showSz = False;
   4406             break;
   4407          case 4: /* JMP Ev */
   4408             /* Ignore any sz value and operate as if sz==8. */
   4409             if (!(sz == 4 || sz == 8)) goto unhandledM;
   4410             if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
   4411             sz = 8;
   4412             t3 = newTemp(Ity_I64);
   4413             assign(t3, loadLE(Ity_I64,mkexpr(addr)));
   4414             jmp_treg(dres, Ijk_Boring, t3);
   4415             vassert(dres->whatNext == Dis_StopHere);
   4416             showSz = False;
   4417             break;
   4418          case 6: /* PUSH Ev */
   4419             /* There is no encoding for 32-bit operand size; hence ... */
   4420             if (sz == 4) sz = 8;
   4421             if (sz == 8 || sz == 2) {
   4422                ty = szToITy(sz); /* redo it, since sz might have changed */
   4423                t3 = newTemp(ty);
   4424                assign(t3, loadLE(ty,mkexpr(addr)));
   4425                t2 = newTemp(Ity_I64);
   4426                assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
   4427                putIReg64(R_RSP, mkexpr(t2) );
   4428                storeLE( mkexpr(t2), mkexpr(t3) );
   4429                break;
   4430             } else {
   4431                goto unhandledM; /* awaiting test case */
   4432             }
   4433          default:
   4434          unhandledM:
   4435             *decode_OK = False;
   4436             return delta;
   4437       }
   4438       delta += len;
   4439       DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
   4440                        showSz ? nameISize(sz) : ' ',
   4441                        dis_buf);
   4442    }
   4443    return delta;
   4444 }
   4445 
   4446 
   4447 /*------------------------------------------------------------*/
   4448 /*--- Disassembling string ops (including REP prefixes)    ---*/
   4449 /*------------------------------------------------------------*/
   4450 
   4451 /* Code shared by all the string ops */
   4452 static
   4453 void dis_string_op_increment ( Int sz, IRTemp t_inc )
   4454 {
   4455    UChar logSz;
   4456    if (sz == 8 || sz == 4 || sz == 2) {
   4457       logSz = 1;
   4458       if (sz == 4) logSz = 2;
   4459       if (sz == 8) logSz = 3;
   4460       assign( t_inc,
   4461               binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
   4462                                mkU8(logSz) ) );
   4463    } else {
   4464       assign( t_inc,
   4465               IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
   4466    }
   4467 }

/* Worked example: assuming the guest DFLAG field holds +1 or -1 (as
   set by CLD/STD respectively), then for sz == 4 with DFLAG == -1 we
   get t_inc = (-1) << 2 = -4, so each string iteration steps RSI/RDI
   backwards by 4 bytes; with DFLAG == +1 the same shift yields +4.
   For sz == 1 the flag value itself (+1/-1) is the increment. */
   4468 
   4469 static
   4470 void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
   4471                     Int sz, const HChar* name, Prefix pfx )
   4472 {
   4473    IRTemp t_inc = newTemp(Ity_I64);
   4474    /* Really we ought to inspect the override prefixes, but we don't.
   4475       The following assertion catches any resulting silliness. */
   4476    vassert(pfx == clearSegBits(pfx));
   4477    dis_string_op_increment(sz, t_inc);
   4478    dis_OP( sz, t_inc, pfx );
   4479    DIP("%s%c\n", name, nameISize(sz));
   4480 }
   4481 
   4482 static
   4483 void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
   4484 {
   4485    IRType ty = szToITy(sz);
   4486    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4487    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4488    IRExpr *incd, *incs;
   4489 
   4490    if (haveASO(pfx)) {
   4491       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4492       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4493    } else {
   4494       assign( td, getIReg64(R_RDI) );
   4495       assign( ts, getIReg64(R_RSI) );
   4496    }
   4497 
   4498    storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
   4499 
   4500    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4501    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4502    if (haveASO(pfx)) {
   4503       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4504       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4505    }
   4506    putIReg64( R_RDI, incd );
   4507    putIReg64( R_RSI, incs );
   4508 }
   4509 
   4510 static
   4511 void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
   4512 {
   4513    IRType ty = szToITy(sz);
   4514    IRTemp ts = newTemp(Ity_I64);   /* RSI */
   4515    IRExpr *incs;
   4516 
   4517    if (haveASO(pfx))
   4518       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4519    else
   4520       assign( ts, getIReg64(R_RSI) );
   4521 
   4522    putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
   4523 
   4524    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4525    if (haveASO(pfx))
   4526       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4527    putIReg64( R_RSI, incs );
   4528 }
   4529 
   4530 static
   4531 void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
   4532 {
   4533    IRType ty = szToITy(sz);
   4534    IRTemp ta = newTemp(ty);        /* rAX */
   4535    IRTemp td = newTemp(Ity_I64);   /* RDI */
   4536    IRExpr *incd;
   4537 
   4538    assign( ta, getIRegRAX(sz) );
   4539 
   4540    if (haveASO(pfx))
   4541       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4542    else
   4543       assign( td, getIReg64(R_RDI) );
   4544 
   4545    storeLE( mkexpr(td), mkexpr(ta) );
   4546 
   4547    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4548    if (haveASO(pfx))
   4549       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4550    putIReg64( R_RDI, incd );
   4551 }
   4552 
   4553 static
   4554 void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
   4555 {
   4556    IRType ty  = szToITy(sz);
   4557    IRTemp tdv = newTemp(ty);      /* (RDI) */
   4558    IRTemp tsv = newTemp(ty);      /* (RSI) */
   4559    IRTemp td  = newTemp(Ity_I64); /*  RDI  */
   4560    IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
   4561    IRExpr *incd, *incs;
   4562 
   4563    if (haveASO(pfx)) {
   4564       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4565       assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   4566    } else {
   4567       assign( td, getIReg64(R_RDI) );
   4568       assign( ts, getIReg64(R_RSI) );
   4569    }
   4570 
   4571    assign( tdv, loadLE(ty,mkexpr(td)) );
   4572 
   4573    assign( tsv, loadLE(ty,mkexpr(ts)) );
   4574 
   4575    setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
   4576 
   4577    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4578    incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   4579    if (haveASO(pfx)) {
   4580       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4581       incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   4582    }
   4583    putIReg64( R_RDI, incd );
   4584    putIReg64( R_RSI, incs );
   4585 }
   4586 
   4587 static
   4588 void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
   4589 {
   4590    IRType ty  = szToITy(sz);
   4591    IRTemp ta  = newTemp(ty);       /*  rAX  */
   4592    IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
   4593    IRTemp tdv = newTemp(ty);       /* (RDI) */
   4594    IRExpr *incd;
   4595 
   4596    assign( ta, getIRegRAX(sz) );
   4597 
   4598    if (haveASO(pfx))
   4599       assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   4600    else
   4601       assign( td, getIReg64(R_RDI) );
   4602 
   4603    assign( tdv, loadLE(ty,mkexpr(td)) );
   4604 
   4605    setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
   4606 
   4607    incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   4608    if (haveASO(pfx))
   4609       incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   4610    putIReg64( R_RDI, incd );
   4611 }
   4612 
   4613 
   4614 /* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
   4615    the insn is the last one in the basic block, and so emit a jump to
   4616    the next insn, rather than just falling through. */
   4617 static
   4618 void dis_REP_op ( /*MOD*/DisResult* dres,
   4619                   AMD64Condcode cond,
   4620                   void (*dis_OP)(Int, IRTemp, Prefix),
   4621                   Int sz, Addr64 rip, Addr64 rip_next, const HChar* name,
   4622                   Prefix pfx )
   4623 {
   4624    IRTemp t_inc = newTemp(Ity_I64);
   4625    IRTemp tc;
   4626    IRExpr* cmp;
   4627 
   4628    /* Really we ought to inspect the override prefixes, but we don't.
   4629       The following assertion catches any resulting silliness. */
   4630    vassert(pfx == clearSegBits(pfx));
   4631 
   4632    if (haveASO(pfx)) {
   4633       tc = newTemp(Ity_I32);  /*  ECX  */
   4634       assign( tc, getIReg32(R_RCX) );
   4635       cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
   4636    } else {
   4637       tc = newTemp(Ity_I64);  /*  RCX  */
   4638       assign( tc, getIReg64(R_RCX) );
   4639       cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
   4640    }
   4641 
   4642    stmt( IRStmt_Exit( cmp, Ijk_Boring,
   4643                       IRConst_U64(rip_next), OFFB_RIP ) );
   4644 
   4645    if (haveASO(pfx))
   4646       putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   4647    else
   4648       putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );
   4649 
   4650    dis_string_op_increment(sz, t_inc);
   4651    dis_OP (sz, t_inc, pfx);
   4652 
   4653    if (cond == AMD64CondAlways) {
   4654       jmp_lit(dres, Ijk_Boring, rip);
   4655       vassert(dres->whatNext == Dis_StopHere);
   4656    } else {
   4657       stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
   4658                          Ijk_Boring,
   4659                          IRConst_U64(rip),
   4660                          OFFB_RIP ) );
   4661       jmp_lit(dres, Ijk_Boring, rip_next);
   4662       vassert(dres->whatNext == Dis_StopHere);
   4663    }
   4664    DIP("%s%c\n", name, nameISize(sz));
   4665 }
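
/* Sketch of the IR this generates for e.g. "rep movsb" at address
   rip, with the following insn at rip_next:

      if (RCX == 0) goto rip_next;   -- counter already exhausted?
      RCX = RCX - 1;
      <one iteration of the string op>
      goto rip;                      -- re-decode; test again

   (ECX replaces RCX under an address-size override.)  For repe/repne
   forms (cond != AMD64CondAlways), the final jump to rip is instead a
   conditional exit on |cond|, falling through to a jump to rip_next. */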
   4666 
   4667 
   4668 /*------------------------------------------------------------*/
   4669 /*--- Arithmetic, etc.                                     ---*/
   4670 /*------------------------------------------------------------*/
   4671 
   4672 /* IMUL E, G.  Supplied rip points to the modR/M byte. */
   4673 static
   4674 ULong dis_mul_E_G ( const VexAbiInfo* vbi,
   4675                     Prefix      pfx,
   4676                     Int         size,
   4677                     Long        delta0 )
   4678 {
   4679    Int    alen;
   4680    HChar  dis_buf[50];
   4681    UChar  rm = getUChar(delta0);
   4682    IRType ty = szToITy(size);
   4683    IRTemp te = newTemp(ty);
   4684    IRTemp tg = newTemp(ty);
   4685    IRTemp resLo = newTemp(ty);
   4686 
   4687    assign( tg, getIRegG(size, pfx, rm) );
   4688    if (epartIsReg(rm)) {
   4689       assign( te, getIRegE(size, pfx, rm) );
   4690    } else {
   4691       IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
   4692       assign( te, loadLE(ty,mkexpr(addr)) );
   4693    }
   4694 
   4695    setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );
   4696 
   4697    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
   4698 
   4699    putIRegG(size, pfx, rm, mkexpr(resLo) );
   4700 
   4701    if (epartIsReg(rm)) {
   4702       DIP("imul%c %s, %s\n", nameISize(size),
   4703                              nameIRegE(size,pfx,rm),
   4704                              nameIRegG(size,pfx,rm));
   4705       return 1+delta0;
   4706    } else {
   4707       DIP("imul%c %s, %s\n", nameISize(size),
   4708                              dis_buf,
   4709                              nameIRegG(size,pfx,rm));
   4710       return alen+delta0;
   4711    }
   4712 }
   4713 
   4714 
   4715 /* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
   4716 static
   4717 ULong dis_imul_I_E_G ( const VexAbiInfo* vbi,
   4718                        Prefix      pfx,
   4719                        Int         size,
   4720                        Long        delta,
   4721                        Int         litsize )
   4722 {
   4723    Long   d64;
   4724    Int    alen;
   4725    HChar  dis_buf[50];
   4726    UChar  rm = getUChar(delta);
   4727    IRType ty = szToITy(size);
   4728    IRTemp te = newTemp(ty);
   4729    IRTemp tl = newTemp(ty);
   4730    IRTemp resLo = newTemp(ty);
   4731 
   4732    vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);
   4733 
   4734    if (epartIsReg(rm)) {
   4735       assign(te, getIRegE(size, pfx, rm));
   4736       delta++;
   4737    } else {
   4738       IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
   4739                                      imin(4,litsize) );
   4740       assign(te, loadLE(ty, mkexpr(addr)));
   4741       delta += alen;
   4742    }
   4743    d64 = getSDisp(imin(4,litsize),delta);
   4744    delta += imin(4,litsize);
   4745 
   4746    d64 &= mkSizeMask(size);
   4747    assign(tl, mkU(ty,d64));
   4748 
   4749    assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
   4750 
   4751    setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );
   4752 
   4753    putIRegG(size, pfx, rm, mkexpr(resLo));
   4754 
   4755    DIP("imul%c $%lld, %s, %s\n",
   4756        nameISize(size), d64,
   4757        ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
   4758        nameIRegG(size,pfx,rm) );
   4759    return delta;
   4760 }
   4761 
   4762 
   4763 /* Generate an IR sequence to do a popcount operation on the supplied
   4764    IRTemp, and return a new IRTemp holding the result.  'ty' may be
   4765    Ity_I16, Ity_I32 or Ity_I64 only. */
   4766 static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
   4767 {
   4768    Int i;
   4769    if (ty == Ity_I16) {
   4770       IRTemp old = IRTemp_INVALID;
   4771       IRTemp nyu = IRTemp_INVALID;
   4772       IRTemp mask[4]; UInt shift[4];
   4773       for (i = 0; i < 4; i++) {
   4774          mask[i]  = newTemp(ty);
   4775          shift[i] = 1 << i;
   4776       }
   4777       assign(mask[0], mkU16(0x5555));
   4778       assign(mask[1], mkU16(0x3333));
   4779       assign(mask[2], mkU16(0x0F0F));
   4780       assign(mask[3], mkU16(0x00FF));
   4781       old = src;
   4782       for (i = 0; i < 4; i++) {
   4783          nyu = newTemp(ty);
   4784          assign(nyu,
   4785                 binop(Iop_Add16,
   4786                       binop(Iop_And16,
   4787                             mkexpr(old),
   4788                             mkexpr(mask[i])),
   4789                       binop(Iop_And16,
   4790                             binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
   4791                             mkexpr(mask[i]))));
   4792          old = nyu;
   4793       }
   4794       return nyu;
   4795    }
   4796    if (ty == Ity_I32) {
   4797       IRTemp old = IRTemp_INVALID;
   4798       IRTemp nyu = IRTemp_INVALID;
   4799       IRTemp mask[5]; UInt shift[5];
   4800       for (i = 0; i < 5; i++) {
   4801          mask[i]  = newTemp(ty);
   4802          shift[i] = 1 << i;
   4803       }
   4804       assign(mask[0], mkU32(0x55555555));
   4805       assign(mask[1], mkU32(0x33333333));
   4806       assign(mask[2], mkU32(0x0F0F0F0F));
   4807       assign(mask[3], mkU32(0x00FF00FF));
   4808       assign(mask[4], mkU32(0x0000FFFF));
   4809       old = src;
   4810       for (i = 0; i < 5; i++) {
   4811          nyu = newTemp(ty);
   4812          assign(nyu,
   4813                 binop(Iop_Add32,
   4814                       binop(Iop_And32,
   4815                             mkexpr(old),
   4816                             mkexpr(mask[i])),
   4817                       binop(Iop_And32,
   4818                             binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
   4819                             mkexpr(mask[i]))));
   4820          old = nyu;
   4821       }
   4822       return nyu;
   4823    }
   4824    if (ty == Ity_I64) {
   4825       IRTemp old = IRTemp_INVALID;
   4826       IRTemp nyu = IRTemp_INVALID;
   4827       IRTemp mask[6]; UInt shift[6];
   4828       for (i = 0; i < 6; i++) {
   4829          mask[i]  = newTemp(ty);
   4830          shift[i] = 1 << i;
   4831       }
   4832       assign(mask[0], mkU64(0x5555555555555555ULL));
   4833       assign(mask[1], mkU64(0x3333333333333333ULL));
   4834       assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
   4835       assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
   4836       assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
   4837       assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
   4838       old = src;
   4839       for (i = 0; i < 6; i++) {
   4840          nyu = newTemp(ty);
   4841          assign(nyu,
   4842                 binop(Iop_Add64,
   4843                       binop(Iop_And64,
   4844                             mkexpr(old),
   4845                             mkexpr(mask[i])),
   4846                       binop(Iop_And64,
   4847                             binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
   4848                             mkexpr(mask[i]))));
   4849          old = nyu;
   4850       }
   4851       return nyu;
   4852    }
   4853    /*NOTREACHED*/
   4854    vassert(0);
   4855 }
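
/* Worked example of the divide-and-conquer scheme above, for the
   16-bit case with src = 0xFFFF:
      step 0 (mask 0x5555, shift 1): 0x5555 + 0x5555 = 0xAAAA
      step 1 (mask 0x3333, shift 2): 0x2222 + 0x2222 = 0x4444
      step 2 (mask 0x0F0F, shift 4): 0x0404 + 0x0404 = 0x0808
      step 3 (mask 0x00FF, shift 8): 0x0008 + 0x0008 = 0x0010
   giving 16, the number of set bits.  Each step sums adjacent
   2^i-bit fields into 2^(i+1)-bit fields. */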
   4856 
   4857 
   4858 /* Generate an IR sequence to do a count-leading-zeroes operation on
   4859    the supplied IRTemp, and return a new IRTemp holding the result.
   4860    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   4861    the argument is zero, return the number of bits in the word (the
   4862    natural semantics). */
   4863 static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
   4864 {
   4865    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   4866 
   4867    IRTemp src64 = newTemp(Ity_I64);
   4868    assign(src64, widenUto64( mkexpr(src) ));
   4869 
   4870    IRTemp src64x = newTemp(Ity_I64);
   4871    assign(src64x,
   4872           binop(Iop_Shl64, mkexpr(src64),
   4873                            mkU8(64 - 8 * sizeofIRType(ty))));
   4874 
   4875    // Clz64 has undefined semantics when its input is zero, so
   4876    // special-case around that.
   4877    IRTemp res64 = newTemp(Ity_I64);
   4878    assign(res64,
   4879           IRExpr_ITE(
   4880              binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0)),
   4881              mkU64(8 * sizeofIRType(ty)),
   4882              unop(Iop_Clz64, mkexpr(src64x))
   4883    ));
   4884 
   4885    IRTemp res = newTemp(ty);
   4886    assign(res, narrowTo(ty, mkexpr(res64)));
   4887    return res;
   4888 }
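
/* Example: for ty == Ity_I16 and src == 0x0001, src64x becomes
   0x0001 << 48 == 0x0001000000000000, and Clz64 of that is 15 --
   exactly the leading-zero count of 0x0001 at 16 bits.  The pre-shift
   stops the zero-extension bits from being counted; the ITE supplies
   16/32/64 for the all-zeroes input. */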
   4889 
   4890 
   4891 /* Generate an IR sequence to do a count-trailing-zeroes operation on
   4892    the supplied IRTemp, and return a new IRTemp holding the result.
   4893    'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   4894    the argument is zero, return the number of bits in the word (the
   4895    natural semantics). */
   4896 static IRTemp gen_TZCNT ( IRType ty, IRTemp src )
   4897 {
   4898    vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
   4899 
   4900    IRTemp src64 = newTemp(Ity_I64);
   4901    assign(src64, widenUto64( mkexpr(src) ));
   4902 
   4903    // Ctz64 has undefined semantics when its input is zero, so
   4904    // special-case around that.
   4905    IRTemp res64 = newTemp(Ity_I64);
   4906    assign(res64,
   4907           IRExpr_ITE(
   4908              binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0)),
   4909              mkU64(8 * sizeofIRType(ty)),
   4910              unop(Iop_Ctz64, mkexpr(src64))
   4911    ));
   4912 
   4913    IRTemp res = newTemp(ty);
   4914    assign(res, narrowTo(ty, mkexpr(res64)));
   4915    return res;
   4916 }
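
/* Example: for ty == Ity_I32 and src == 0x80000000, the widened value
   is 0x0000000080000000 and Ctz64 gives 31, as required.  No pre-shift
   is needed here, since zero-extension does not disturb the trailing
   bits.  For src == 0 the ITE yields 32, the word size. */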
   4917 
   4918 
   4919 /*------------------------------------------------------------*/
   4920 /*---                                                      ---*/
   4921 /*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
   4922 /*---                                                      ---*/
   4923 /*------------------------------------------------------------*/
   4924 
   4925 /* --- Helper functions for dealing with the register stack. --- */
   4926 
   4927 /* --- Set the emulation-warning pseudo-register. --- */
   4928 
   4929 static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
   4930 {
   4931    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   4932    stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
   4933 }
   4934 
   4935 /* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
   4936 
   4937 static IRExpr* mkQNaN64 ( void )
   4938 {
   4939    /* QNaN is: sign 0, exponent 2047 (all ones), mantissa 1 then 51 zeros
   4940       == 0b0 11111111111 1 0(51 times)
   4941       == 0x7FF8 0000 0000 0000
   4942    */
   4943    return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
   4944 }
   4945 
   4946 /* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
   4947 
   4948 static IRExpr* get_ftop ( void )
   4949 {
   4950    return IRExpr_Get( OFFB_FTOP, Ity_I32 );
   4951 }
   4952 
   4953 static void put_ftop ( IRExpr* e )
   4954 {
   4955    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   4956    stmt( IRStmt_Put( OFFB_FTOP, e ) );
   4957 }
   4958 
   4959 /* --------- Get/put the C3210 bits. --------- */
   4960 
   4961 static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
   4962 {
   4963    return IRExpr_Get( OFFB_FC3210, Ity_I64 );
   4964 }
   4965 
   4966 static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
   4967 {
   4968    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   4969    stmt( IRStmt_Put( OFFB_FC3210, e ) );
   4970 }
   4971 
   4972 /* --------- Get/put the FPU rounding mode. --------- */
   4973 static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
   4974 {
   4975    return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
   4976 }
   4977 
   4978 static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
   4979 {
   4980    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   4981    stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
   4982 }
   4983 
   4984 
   4985 /* --------- Synthesise a 2-bit FPU rounding mode. --------- */
   4986 /* Produces a value in 0 .. 3, which is encoded as per the type
   4987    IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   4988    per IRRoundingMode, we merely need to get it and mask it for
   4989    safety.
   4990 */
   4991 static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
   4992 {
   4993    return binop( Iop_And32, get_fpround(), mkU32(3) );
   4994 }
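
/* For reference, the values 0 .. 3 produced above follow the
   IRRoundingMode encoding: 0 = to-nearest, 1 = towards -infinity,
   2 = towards +infinity, 3 = towards zero.  This coincides with the
   encoding of the x87 FPCW RC field, which is why masking alone
   suffices here. */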
   4995 
   4996 static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
   4997 {
   4998    return mkU32(Irrm_NEAREST);
   4999 }
   5000 
   5001 
   5002 /* --------- Get/set FP register tag bytes. --------- */
   5003 
   5004 /* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
   5005 
   5006 static void put_ST_TAG ( Int i, IRExpr* value )
   5007 {
   5008    IRRegArray* descr;
   5009    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   5010    descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5011    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   5012 }
   5013 
   5014 /* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   5015    zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
   5016 
   5017 static IRExpr* get_ST_TAG ( Int i )
   5018 {
   5019    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   5020    return IRExpr_GetI( descr, get_ftop(), i );
   5021 }
   5022 
   5023 
   5024 /* --------- Get/set FP registers. --------- */
   5025 
   5026 /* Given i, and some expression e, emit 'ST(i) = e' and set the
   5027    register's tag to indicate the register is full.  The previous
   5028    state of the register is not checked. */
   5029 
   5030 static void put_ST_UNCHECKED ( Int i, IRExpr* value )
   5031 {
   5032    IRRegArray* descr;
   5033    vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   5034    descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   5035    stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   5036    /* Mark the register as in-use. */
   5037    put_ST_TAG(i, mkU8(1));
   5038 }
   5039 
   5040 /* Given i, and some expression e, emit
   5041       ST(i) = is_full(i) ? NaN : e
   5042    and set the tag accordingly.
   5043 */
   5044 
   5045 static void put_ST ( Int i, IRExpr* value )
   5046 {
   5047    put_ST_UNCHECKED(
   5048       i,
   5049       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   5050                   /* non-0 means full */
   5051                   mkQNaN64(),
   5052                   /* 0 means empty */
   5053                   value
   5054       )
   5055    );
   5056 }
   5057 
   5058 
   5059 /* Given i, generate an expression yielding 'ST(i)'. */
   5060 
   5061 static IRExpr* get_ST_UNCHECKED ( Int i )
   5062 {
   5063    IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   5064    return IRExpr_GetI( descr, get_ftop(), i );
   5065 }
   5066 
   5067 
   5068 /* Given i, generate an expression yielding
   5069   is_full(i) ? ST(i) : NaN
   5070 */
   5071 
   5072 static IRExpr* get_ST ( Int i )
   5073 {
   5074    return
   5075       IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
   5076                   /* non-0 means full */
   5077                   get_ST_UNCHECKED(i),
   5078                   /* 0 means empty */
   5079                   mkQNaN64());
   5080 }
   5081 
   5082 
   5083 /* Given i, and some expression e, and a condition cond, generate IR
   5084    which has the same effect as put_ST(i,e) when cond is true and has
   5085    no effect when cond is false.  Given the lack of proper
   5086    if-then-else in the IR, this is pretty tricky.
   5087 */
   5088 
   5089 static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
   5090 {
   5091    // new_tag = if cond then FULL else old_tag
   5092    // new_val = if cond then (if old_tag==FULL then NaN else val)
   5093    //                   else old_val
   5094 
   5095    IRTemp old_tag = newTemp(Ity_I8);
   5096    assign(old_tag, get_ST_TAG(i));
   5097    IRTemp new_tag = newTemp(Ity_I8);
   5098    assign(new_tag,
   5099           IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));
   5100 
   5101    IRTemp old_val = newTemp(Ity_F64);
   5102    assign(old_val, get_ST_UNCHECKED(i));
   5103    IRTemp new_val = newTemp(Ity_F64);
   5104    assign(new_val,
   5105           IRExpr_ITE(mkexpr(cond),
   5106                      IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
   5107                                 /* non-0 means full */
   5108                                 mkQNaN64(),
   5109                                 /* 0 means empty */
   5110                                 value),
   5111                      mkexpr(old_val)));
   5112 
   5113    put_ST_UNCHECKED(i, mkexpr(new_val));
   5114    // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
   5115    // now set it to new_tag instead.
   5116    put_ST_TAG(i, mkexpr(new_tag));
   5117 }
   5118 
   5119 /* Adjust FTOP downwards by one register. */
   5120 
   5121 static void fp_push ( void )
   5122 {
   5123    put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
   5124 }
   5125 
   5126 /* Adjust FTOP downwards by one register when COND is 1:I1.  Else
   5127    don't change it. */
   5128 
   5129 static void maybe_fp_push ( IRTemp cond )
   5130 {
   5131    put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
   5132 }
   5133 
   5134 /* Adjust FTOP upwards by one register, and mark the vacated register
   5135    as empty.  */
   5136 
   5137 static void fp_pop ( void )
   5138 {
   5139    put_ST_TAG(0, mkU8(0));
   5140    put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   5141 }
   5142 
   5143 /* Set the C2 bit of the FPU status register to e[0].  Assumes that
   5144    e[63:1] == 0.
   5145 */
   5146 static void set_C2 ( IRExpr* e )
   5147 {
   5148    IRExpr* cleared = binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2));
   5149    put_C3210( binop(Iop_Or64,
   5150                     cleared,
   5151                     binop(Iop_Shl64, e, mkU8(AMD64G_FC_SHIFT_C2))) );
   5152 }
   5153 
   5154 /* Generate code to check that abs(d64) < 2^63 and is finite.  This is
   5155    used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
   5156    test is simple, but the derivation of it is not so simple.
   5157 
   5158    The exponent field for an IEEE754 double is 11 bits.  That means it
   5159    can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
   5160    the number is either a NaN or an Infinity and so is not finite.
   5161    Furthermore, a finite value of exactly 2^63 is the smallest value
   5162    that has exponent value 0x43E.  Hence, what we need to do is
   5163    extract the exponent, ignoring the sign bit and mantissa, and check
   5164    it is < 0x43E, or <= 0x43D.
   5165 
   5166    To make this easily applicable to 32- and 64-bit targets, a
   5167    roundabout approach is used.  First the number is converted to I64,
   5168    then the top 32 bits are taken.  Shifting them right by 20 bits
   5169    places the sign bit and exponent in the bottom 12 bits.  Anding
   5170    with 0x7FF gets rid of the sign bit, leaving just the exponent
   5171    available for comparison.
   5172 */
   5173 static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
   5174 {
   5175    IRTemp i64 = newTemp(Ity_I64);
   5176    assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
   5177    IRTemp exponent = newTemp(Ity_I32);
   5178    assign(exponent,
   5179           binop(Iop_And32,
   5180                 binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
   5181                 mkU32(0x7FF)));
   5182    IRTemp in_range_and_finite = newTemp(Ity_I1);
   5183    assign(in_range_and_finite,
   5184           binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
   5185    return in_range_and_finite;
   5186 }
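
/* Worked example of the test above: 2^63 is 0x43E0000000000000, so the
   extracted exponent is 0x43E and the CmpLE32U against 0x43D fails, as
   intended.  1.0 is 0x3FF0000000000000 (exponent 0x3FF), which passes.
   NaNs and infinities have exponent 0x7FF and hence also fail. */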
   5187 
   5188 /* Invent a plausible-looking FPU status word value:
   5189       ((ftop & 7) << 11) | (c3210 & 0x4700)
   5190  */
   5191 static IRExpr* get_FPU_sw ( void )
   5192 {
   5193    return
   5194       unop(Iop_32to16,
   5195            binop(Iop_Or32,
   5196                  binop(Iop_Shl32,
   5197                        binop(Iop_And32, get_ftop(), mkU32(7)),
   5198                        mkU8(11)),
   5199                  binop(Iop_And32, unop(Iop_64to32, get_C3210()),
   5200                                   mkU32(0x4700))
   5201       ));
   5202 }
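
/* Example: with ftop == 5 and just C3 set (c3210 == 0x4000), this
   yields (5 << 11) | 0x4000 == 0x6800 -- a status word with TOP == 5
   and C3 == 1.  The 0x4700 mask keeps only C3 (bit 14) and C2..C0
   (bits 10..8), which is all the condition-code state we model. */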
   5203 
   5204 
   5205 /* Generate a dirty helper call that initialises the x87 state a la
   5206    FINIT.  If |guard| is NULL, it is done unconditionally.  Otherwise
   5207    |guard| is used as a guarding condition.
   5208 */
   5209 static void gen_FINIT_SEQUENCE ( IRExpr* guard )
   5210 {
   5211    /* Uses dirty helper:
   5212          void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* ) */
   5213    IRDirty* d  = unsafeIRDirty_0_N (
   5214                     0/*regparms*/,
   5215                     "amd64g_dirtyhelper_FINIT",
   5216                     &amd64g_dirtyhelper_FINIT,
   5217                     mkIRExprVec_1( IRExpr_BBPTR() )
   5218                  );
   5219 
   5220    /* declare we're writing guest state */
   5221    d->nFxState = 5;
   5222    vex_bzero(&d->fxState, sizeof(d->fxState));
   5223 
   5224    d->fxState[0].fx     = Ifx_Write;
   5225    d->fxState[0].offset = OFFB_FTOP;
   5226    d->fxState[0].size   = sizeof(UInt);
   5227 
   5228    d->fxState[1].fx     = Ifx_Write;
   5229    d->fxState[1].offset = OFFB_FPREGS;
   5230    d->fxState[1].size   = 8 * sizeof(ULong);
   5231 
   5232    d->fxState[2].fx     = Ifx_Write;
   5233    d->fxState[2].offset = OFFB_FPTAGS;
   5234    d->fxState[2].size   = 8 * sizeof(UChar);
   5235 
   5236    d->fxState[3].fx     = Ifx_Write;
   5237    d->fxState[3].offset = OFFB_FPROUND;
   5238    d->fxState[3].size   = sizeof(ULong);
   5239 
   5240    d->fxState[4].fx     = Ifx_Write;
   5241    d->fxState[4].offset = OFFB_FC3210;
   5242    d->fxState[4].size   = sizeof(ULong);
   5243 
   5244    if (guard)
   5245       d->guard = guard;
   5246 
   5247    stmt( IRStmt_Dirty(d) );
   5248 }
   5249 
   5250 
   5251 /* ------------------------------------------------------- */
   5252 /* Given all that stack-mangling junk, we can now go ahead
   5253    and describe FP instructions.
   5254 */
   5255 
   5256 /* ST(0) = ST(0) `op` mem64/32(addr)
   5257    Need to check ST(0)'s tag on read, but not on write.
   5258 */
   5259 static
   5260 void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   5261                          IROp op, Bool dbl )
   5262 {
   5263    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   5264    if (dbl) {
   5265       put_ST_UNCHECKED(0,
   5266          triop( op,
   5267                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5268                 get_ST(0),
   5269                 loadLE(Ity_F64,mkexpr(addr))
   5270          ));
   5271    } else {
   5272       put_ST_UNCHECKED(0,
   5273          triop( op,
   5274                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5275                 get_ST(0),
   5276                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
   5277          ));
   5278    }
   5279 }
   5280 
   5281 
   5282 /* ST(0) = mem64/32(addr) `op` ST(0)
   5283    Need to check ST(0)'s tag on read, but not on write.
   5284 */
   5285 static
   5286 void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
   5287                             IROp op, Bool dbl )
   5288 {
   5289    DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   5290    if (dbl) {
   5291       put_ST_UNCHECKED(0,
   5292          triop( op,
   5293                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5294                 loadLE(Ity_F64,mkexpr(addr)),
   5295                 get_ST(0)
   5296          ));
   5297    } else {
   5298       put_ST_UNCHECKED(0,
   5299          triop( op,
   5300                 get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5301                 unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
   5302                 get_ST(0)
   5303          ));
   5304    }
   5305 }
   5306 
   5307 
   5308 /* ST(dst) = ST(dst) `op` ST(src).
   5309    Check dst and src tags when reading but not on write.
   5310 */
   5311 static
   5312 void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   5313                       Bool pop_after )
   5314 {
   5315    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   5316    put_ST_UNCHECKED(
   5317       st_dst,
   5318       triop( op,
   5319              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5320              get_ST(st_dst),
   5321              get_ST(st_src) )
   5322    );
   5323    if (pop_after)
   5324       fp_pop();
   5325 }
   5326 
   5327 /* ST(dst) = ST(src) `op` ST(dst).
   5328    Check dst and src tags when reading but not on write.
   5329 */
   5330 static
   5331 void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
   5332                          Bool pop_after )
   5333 {
   5334    DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   5335    put_ST_UNCHECKED(
   5336       st_dst,
   5337       triop( op,
   5338              get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5339              get_ST(st_src),
   5340              get_ST(st_dst) )
   5341    );
   5342    if (pop_after)
   5343       fp_pop();
   5344 }
   5345 
   5346 /* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
   5347 static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
   5348 {
   5349    DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   5350    /* This is a bit of a hack (and isn't really right).  It sets
   5351       Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
   5352       documentation implies A and S are unchanged.
   5353    */
   5354    /* It's also fishy in that it is used both for COMIP and
   5355       UCOMIP, and they aren't the same (although similar). */
   5356    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   5357    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   5358    stmt( IRStmt_Put(
   5359             OFFB_CC_DEP1,
   5360             binop( Iop_And64,
   5361                    unop( Iop_32Uto64,
   5362                          binop(Iop_CmpF64, get_ST(0), get_ST(i))),
   5363                    mkU64(0x45)
   5364         )));
   5365    if (pop_after)
   5366       fp_pop();
   5367 }
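
/* A note on the 0x45 mask above: Iop_CmpF64 produces the
   IRCmpF64Result encoding -- 0x00 (GT), 0x01 (LT), 0x40 (EQ),
   0x45 (unordered).  Anding with 0x45 lands these directly in the
   CF (bit 0), PF (bit 2) and ZF (bit 6) positions, which is what
   (U)COMI is defined to produce. */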
   5368 
   5369 
   5370 /* returns
   5371    32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
   5372 */
   5373 static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
   5374 {
   5375    IRTemp t32 = newTemp(Ity_I32);
   5376    assign( t32, e32 );
   5377    return
   5378       IRExpr_ITE(
   5379          binop(Iop_CmpLT64U,
   5380                unop(Iop_32Uto64,
   5381                     binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
   5382                mkU64(65536)),
   5383          unop(Iop_32to16, mkexpr(t32)),
   5384          mkU16( 0x8000 ) );
   5385 }
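
/* Worked example of the saturation test above: for e32 == 40000, the
   biased value 40000 + 32768 == 72768 is not < 65536, so the result is
   0x8000, the x87 integer indefinite.  For e32 == -5, the biased value
   is 32763 < 65536, so the low 16 bits (0xFFFB) pass through. */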
   5386 
   5387 
   5388 static
   5389 ULong dis_FPU ( /*OUT*/Bool* decode_ok,
   5390                 const VexAbiInfo* vbi, Prefix pfx, Long delta )
   5391 {
   5392    Int    len;
   5393    UInt   r_src, r_dst;
   5394    HChar  dis_buf[50];
   5395    IRTemp t1, t2;
   5396 
   5397    /* On entry, delta points at the second byte of the insn (the modrm
   5398       byte).*/
   5399    UChar first_opcode = getUChar(delta-1);
   5400    UChar modrm        = getUChar(delta+0);
   5401 
   5402    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
   5403 
   5404    if (first_opcode == 0xD8) {
   5405       if (modrm < 0xC0) {
   5406 
   5407          /* bits 5,4,3 are an opcode extension, and the modRM also
   5408            specifies an address. */
   5409          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5410          delta += len;
   5411 
   5412          switch (gregLO3ofRM(modrm)) {
   5413 
   5414             case 0: /* FADD single-real */
   5415                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
   5416                break;
   5417 
   5418             case 1: /* FMUL single-real */
   5419                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
   5420                break;
   5421 
   5422             case 2: /* FCOM single-real */
   5423                DIP("fcoms %s\n", dis_buf);
   5424                /* This forces C1 to zero.  The AMD documentation
   5425                   suggests that forcing C1 to zero is in fact
   5426                   correct (Eliot Moss). */
   5427                put_C3210(
   5428                    unop( Iop_32Uto64,
   5429                        binop( Iop_And32,
   5430                               binop(Iop_Shl32,
   5431                                     binop(Iop_CmpF64,
   5432                                           get_ST(0),
   5433                                           unop(Iop_F32toF64,
   5434                                                loadLE(Ity_F32,mkexpr(addr)))),
   5435                                     mkU8(8)),
   5436                               mkU32(0x4500)
   5437                    )));
   5438                break;
   5439 
   5440             case 3: /* FCOMP single-real */
   5441                DIP("fcomps %s\n", dis_buf);
   5442                /* This forces C1 to zero.  The AMD documentation
   5443                   suggests that forcing C1 to zero is in fact
   5444                   correct (Eliot Moss). */
   5445                put_C3210(
   5446                    unop( Iop_32Uto64,
   5447                        binop( Iop_And32,
   5448                               binop(Iop_Shl32,
   5449                                     binop(Iop_CmpF64,
   5450                                           get_ST(0),
   5451                                           unop(Iop_F32toF64,
   5452                                                loadLE(Ity_F32,mkexpr(addr)))),
   5453                                     mkU8(8)),
   5454                               mkU32(0x4500)
   5455                    )));
   5456                fp_pop();
   5457                break;
   5458 
   5459             case 4: /* FSUB single-real */
   5460                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
   5461                break;
   5462 
   5463             case 5: /* FSUBR single-real */
   5464                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
   5465                break;
   5466 
   5467             case 6: /* FDIV single-real */
   5468                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
   5469                break;
   5470 
   5471             case 7: /* FDIVR single-real */
   5472                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
   5473                break;
   5474 
   5475             default:
   5476                vex_printf("unhandled opc_aux = 0x%2x\n",
   5477                           (UInt)gregLO3ofRM(modrm));
   5478                vex_printf("first_opcode == 0xD8\n");
   5479                goto decode_fail;
   5480          }
   5481       } else {
   5482          delta++;
   5483          switch (modrm) {
   5484 
   5485             case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
   5486                fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
   5487                break;
   5488 
   5489             case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
   5490                fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
   5491                break;
   5492 
   5493             /* Dunno if this is right */
   5494             case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
   5495                r_dst = (UInt)modrm - 0xD0;
   5496                DIP("fcom %%st(0),%%st(%u)\n", r_dst);
   5497                /* This forces C1 to zero, which isn't right. */
   5498                put_C3210(
   5499                    unop(Iop_32Uto64,
   5500                    binop( Iop_And32,
   5501                           binop(Iop_Shl32,
   5502                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5503                                 mkU8(8)),
   5504                           mkU32(0x4500)
   5505                    )));
   5506                break;
   5507 
   5508             /* Dunno if this is right */
   5509             case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
   5510                r_dst = (UInt)modrm - 0xD8;
   5511                DIP("fcomp %%st(0),%%st(%u)\n", r_dst);
   5512                /* This forces C1 to zero, which isn't right. */
   5513                put_C3210(
   5514                    unop(Iop_32Uto64,
   5515                    binop( Iop_And32,
   5516                           binop(Iop_Shl32,
   5517                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   5518                                 mkU8(8)),
   5519                           mkU32(0x4500)
   5520                    )));
   5521                fp_pop();
   5522                break;
   5523 
   5524             case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
   5525                fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
   5526                break;
   5527 
   5528             case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
   5529                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
   5530                break;
   5531 
   5532             case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
   5533                fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
   5534                break;
   5535 
   5536             case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
   5537                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
   5538                break;
   5539 
   5540             default:
   5541                goto decode_fail;
   5542          }
   5543       }
   5544    }
   5545 
   5546    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   5547    else
   5548    if (first_opcode == 0xD9) {
   5549       if (modrm < 0xC0) {
   5550 
   5551          /* bits 5,4,3 are an opcode extension, and the modRM also
   5552             specifies an address. */
   5553          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   5554          delta += len;
   5555 
   5556          switch (gregLO3ofRM(modrm)) {
   5557 
   5558             case 0: /* FLD single-real */
   5559                DIP("flds %s\n", dis_buf);
   5560                fp_push();
   5561                put_ST(0, unop(Iop_F32toF64,
   5562                               loadLE(Ity_F32, mkexpr(addr))));
   5563                break;
   5564 
   5565             case 2: /* FST single-real */
   5566                DIP("fsts %s\n", dis_buf);
   5567                storeLE(mkexpr(addr),
   5568                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5569                break;
   5570 
   5571             case 3: /* FSTP single-real */
   5572                DIP("fstps %s\n", dis_buf);
   5573                storeLE(mkexpr(addr),
   5574                        binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
   5575                fp_pop();
   5576                break;
   5577 
   5578             case 4: { /* FLDENV m28 */
   5579                /* Uses dirty helper:
   5580                      VexEmNote amd64g_dirtyhelper_FLDENV ( VexGuestAMD64State*, HWord ) */
   5581                IRTemp    ew = newTemp(Ity_I32);
   5582                IRTemp   w64 = newTemp(Ity_I64);
   5583                IRDirty*   d = unsafeIRDirty_0_N (
   5584                                  0/*regparms*/,
   5585                                  "amd64g_dirtyhelper_FLDENV",
   5586                                  &amd64g_dirtyhelper_FLDENV,
   5587                                  mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   5588                               );
   5589                d->tmp       = w64;
   5590                /* declare we're reading memory */
   5591                d->mFx   = Ifx_Read;
   5592                d->mAddr = mkexpr(addr);
   5593                d->mSize = 28;
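                       /* 28 bytes is the 32-bit (protected mode) FP
                          environment image: control, status and tag words
                          plus the FIP/FCS/opcode and FDP/FDS fields, each
                          padded to 32 bits. */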
   5594 
   5595                /* declare we're writing guest state */
   5596                d->nFxState = 4;
   5597                vex_bzero(&d->fxState, sizeof(d->fxState));
   5598 
   5599                d->fxState[0].fx     = Ifx_Write;
   5600                d->fxState[0].offset = OFFB_FTOP;
   5601                d->fxState[0].size   = sizeof(UInt);
   5602 
   5603                d->fxState[1].fx     = Ifx_Write;
   5604                d->fxState[1].offset = OFFB_FPTAGS;
   5605                d->fxState[1].size   = 8 * sizeof(UChar);
   5606 
   5607                d->fxState[2].fx     = Ifx_Write;
   5608                d->fxState[2].offset = OFFB_FPROUND;
   5609                d->fxState[2].size   = sizeof(ULong);
   5610 
   5611                d->fxState[3].fx     = Ifx_Write;
   5612                d->fxState[3].offset = OFFB_FC3210;
   5613                d->fxState[3].size   = sizeof(ULong);
   5614 
   5615                stmt( IRStmt_Dirty(d) );
   5616 
   5617                /* ew contains any emulation warning we may need to
   5618                   issue.  If needed, side-exit to the next insn,
   5619                   reporting the warning, so that Valgrind's dispatcher
   5620                   sees the warning. */
   5621                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   5622                put_emwarn( mkexpr(ew) );
   5623                stmt(
   5624                   IRStmt_Exit(
   5625                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5626                      Ijk_EmWarn,
   5627                      IRConst_U64( guest_RIP_bbstart+delta ),
   5628                      OFFB_RIP
   5629                   )
   5630                );
   5631 
   5632                DIP("fldenv %s\n", dis_buf);
   5633                break;
   5634             }
   5635 
   5636             case 5: { /* FLDCW */
   5637                /* The only thing we observe in the control word is the
   5638                   rounding mode.  Therefore, pass the 16-bit value
   5639                   (x87 native-format control word) to a clean helper,
   5640                   getting back a 64-bit value, the lower half of which
   5641                   is the FPROUND value to store, and the upper half of
   5642                   which is the emulation-warning token which may be
   5643                   generated.
   5644                */
   5645                /* ULong amd64g_check_fldcw ( ULong ); */
   5646                IRTemp t64 = newTemp(Ity_I64);
   5647                IRTemp ew = newTemp(Ity_I32);
   5648                DIP("fldcw %s\n", dis_buf);
   5649                assign( t64, mkIRExprCCall(
   5650                                Ity_I64, 0/*regparms*/,
   5651                                "amd64g_check_fldcw",
   5652                                &amd64g_check_fldcw,
   5653                                mkIRExprVec_1(
   5654                                   unop( Iop_16Uto64,
   5655                                         loadLE(Ity_I16, mkexpr(addr)))
   5656                                )
   5657                             )
   5658                      );
   5659 
   5660                put_fpround( unop(Iop_64to32, mkexpr(t64)) );
   5661                assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   5662                put_emwarn( mkexpr(ew) );
   5663                /* Finally, if an emulation warning was reported,
   5664                   side-exit to the next insn, reporting the warning,
   5665                   so that Valgrind's dispatcher sees the warning. */
   5666                stmt(
   5667                   IRStmt_Exit(
   5668                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   5669                      Ijk_EmWarn,
   5670                      IRConst_U64( guest_RIP_bbstart+delta ),
   5671                      OFFB_RIP
   5672                   )
   5673                );
   5674                break;
   5675             }
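
                    /* For reference, a minimal sketch of the check just
                       described (hypothetical -- assumes the real
                       amd64g_check_fldcw, in the helpers file, has this
                       shape):

                          ULong check_fldcw_sketch ( ULong fpucw )
                          {
                             ULong rmode = (fpucw >> 10) & 3; // RC, bits 11:10
                             ULong ew    = 0;                 // no warning
                             if ((fpucw & 0x3F) != 0x3F)
                                ew = 1;  // some exceptions unmasked: warn
                             // high half: warning token; low half: FPROUND
                             return (ew << 32) | rmode;
                          }
                    */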
   5676 
   5677             case 6: { /* FNSTENV m28 */
   5678                /* Uses dirty helper:
   5679                      void amd64g_dirtyhelper_FSTENV ( VexGuestAMD64State*, HWord ) */
   5680                IRDirty* d = unsafeIRDirty_0_N (
   5681                                0/*regparms*/,
   5682                                "amd64g_dirtyhelper_FSTENV",
   5683                                &amd64g_dirtyhelper_FSTENV,
   5684                                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   5685                             );
   5686                /* declare we're writing memory */
   5687                d->mFx   = Ifx_Write;
   5688                d->mAddr = mkexpr(addr);
   5689                d->mSize = 28;
   5690 
   5691                /* declare we're reading guest state */
   5692                d->nFxState = 4;
   5693                vex_bzero(&d->fxState, sizeof(d->fxState));
   5694 
   5695                d->fxState[0].fx     = Ifx_Read;
   5696                d->fxState[0].offset = OFFB_FTOP;
   5697                d->fxState[0].size   = sizeof(UInt);
   5698 
   5699                d->fxState[1].fx     = Ifx_Read;
   5700                d->fxState[1].offset = OFFB_FPTAGS;
   5701                d->fxState[1].size   = 8 * sizeof(UChar);
   5702 
   5703                d->fxState[2].fx     = Ifx_Read;
   5704                d->fxState[2].offset = OFFB_FPROUND;
   5705                d->fxState[2].size   = sizeof(ULong);
   5706 
   5707                d->fxState[3].fx     = Ifx_Read;
   5708                d->fxState[3].offset = OFFB_FC3210;
   5709                d->fxState[3].size   = sizeof(ULong);
   5710 
   5711                stmt( IRStmt_Dirty(d) );
   5712 
   5713                DIP("fnstenv %s\n", dis_buf);
   5714                break;
   5715             }
   5716 
   5717             case 7: /* FNSTCW */
   5718                /* Fake up a native x87 FPU control word.  The only
   5719                   thing it depends on is FPROUND[1:0], so call a clean
   5720                   helper to cook it up. */
   5721                /* ULong amd64g_create_fpucw ( ULong fpround ) */
   5722                DIP("fnstcw %s\n", dis_buf);
   5723                storeLE(
   5724                   mkexpr(addr),
   5725                   unop( Iop_64to16,
   5726                         mkIRExprCCall(
   5727                            Ity_I64, 0/*regp*/,
   5728                            "amd64g_create_fpucw", &amd64g_create_fpucw,
   5729                            mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
   5730                         )
   5731                   )
   5732                );
   5733                break;
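
                    /* A sketch of the cooking step, assuming only the RC
                       field varies (hypothetical -- the real
                       amd64g_create_fpucw lives in the helpers file):

                          ULong create_fpucw_sketch ( ULong fpround )
                          {
                             fpround &= 3;  // keep just the rounding-mode bits
                             // 0x037F = all exceptions masked, 64-bit precision
                             return 0x037F | (fpround << 10);
                          }
                    */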
   5734 
   5735             default:
   5736                vex_printf("unhandled opc_aux = 0x%2x\n",
   5737                           (UInt)gregLO3ofRM(modrm));
   5738                vex_printf("first_opcode == 0xD9\n");
   5739                goto decode_fail;
   5740          }
   5741 
   5742       } else {
   5743          delta++;
   5744          switch (modrm) {
   5745 
   5746             case 0xC0 ... 0xC7: /* FLD %st(?) */
   5747                r_src = (UInt)modrm - 0xC0;
   5748                DIP("fld %%st(%u)\n", r_src);
   5749                t1 = newTemp(Ity_F64);
   5750                assign(t1, get_ST(r_src));
   5751                fp_push();
   5752                put_ST(0, mkexpr(t1));
   5753                break;
   5754 
   5755             case 0xC8 ... 0xCF: /* FXCH %st(?) */
   5756                r_src = (UInt)modrm - 0xC8;
   5757                DIP("fxch %%st(%u)\n", r_src);
   5758                t1 = newTemp(Ity_F64);
   5759                t2 = newTemp(Ity_F64);
   5760                assign(t1, get_ST(0));
   5761                assign(t2, get_ST(r_src));
   5762                put_ST_UNCHECKED(0, mkexpr(t2));
   5763                put_ST_UNCHECKED(r_src, mkexpr(t1));
   5764                break;
   5765 
   5766             case 0xE0: /* FCHS */
   5767                DIP("fchs\n");
   5768                put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
   5769                break;
   5770 
   5771             case 0xE1: /* FABS */
   5772                DIP("fabs\n");
   5773                put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
   5774                break;
   5775 
   5776             case 0xE5: { /* FXAM */
   5777                /* This is an interesting one.  It examines %st(0),
   5778                   regardless of whether the tag says it's empty or not.
   5779                   Here, just pass both the tag (in our format) and the
   5780                   value (as a double, actually a ULong) to a helper
   5781                   function. */
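                       /* Per the FXAM definition, C3:C2:C0 encode the
                          operand class (zero, NaN, infinity, normal,
                          denormal, empty, unsupported) and C1 holds the
                          sign. */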
   5782                IRExpr** args
   5783                   = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
   5784                                    unop(Iop_ReinterpF64asI64,
   5785                                         get_ST_UNCHECKED(0)) );
   5786                put_C3210(mkIRExprCCall(
   5787                             Ity_I64,
   5788                             0/*regparm*/,
   5789                             "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
   5790                             args
   5791                         ));
   5792                DIP("fxam\n");
   5793                break;
   5794             }
   5795 
   5796             case 0xE8: /* FLD1 */
   5797                DIP("fld1\n");
   5798                fp_push();
   5799                /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
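                       /* The F64i form gives the exact IEEE-754 bit
                          pattern: 0x3ff0000000000000 is sign 0, biased
                          exponent 0x3ff (= 2^0), mantissa 0, i.e. exactly
                          1.0, with no dependence on how the compiler
                          rounds a decimal literal. */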
   5800                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
   5801                break;
   5802 
   5803             case 0xE9: /* FLDL2T */
   5804                DIP("fldl2t\n");
   5805                fp_push();
   5806                /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
   5807                put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
   5808                break;
   5809 
   5810             case 0xEA: /* FLDL2E */
   5811                DIP("fldl2e\n");
   5812                fp_push();
   5813                /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
   5814                put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
   5815                break;
   5816 
   5817             case 0xEB: /* FLDPI */
   5818                DIP("fldpi\n");
   5819                fp_push();
   5820                /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
   5821                put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
   5822                break;
   5823 
   5824             case 0xEC: /* FLDLG2 */
   5825                DIP("fldlg2\n");
   5826                fp_push();
   5827                /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
   5828                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
   5829                break;
   5830 
   5831             case 0xED: /* FLDLN2 */
   5832                DIP("fldln2\n");
   5833                fp_push();
   5834                /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
   5835                put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
   5836                break;
   5837 
   5838             case 0xEE: /* FLDZ */
   5839                DIP("fldz\n");
   5840                fp_push();
   5841                /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
   5842                put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
   5843                break;
   5844 
   5845             case 0xF0: /* F2XM1 */
   5846                DIP("f2xm1\n");
   5847                put_ST_UNCHECKED(0,
   5848                   binop(Iop_2xm1F64,
   5849                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5850                         get_ST(0)));
   5851                break;
   5852 
   5853             case 0xF1: /* FYL2X */
   5854                DIP("fyl2x\n");
   5855                put_ST_UNCHECKED(1,
   5856                   triop(Iop_Yl2xF64,
   5857                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5858                         get_ST(1),
   5859                         get_ST(0)));
   5860                fp_pop();
   5861                break;
   5862 
   5863             case 0xF2: { /* FPTAN */
   5864                DIP("fptan\n");
   5865                IRTemp argD = newTemp(Ity_F64);
   5866                assign(argD, get_ST(0));
   5867                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   5868                IRTemp resD = newTemp(Ity_F64);
   5869                assign(resD,
   5870                   IRExpr_ITE(
   5871                      mkexpr(argOK),
   5872                      binop(Iop_TanF64,
   5873                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5874                            mkexpr(argD)),
   5875                      mkexpr(argD))
   5876                );
   5877                put_ST_UNCHECKED(0, mkexpr(resD));
   5878                /* Conditionally push 1.0 on the stack, if the arg is
   5879                   in range */
   5880                maybe_fp_push(argOK);
   5881                maybe_put_ST(argOK, 0,
   5882                             IRExpr_Const(IRConst_F64(1.0)));
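                       /* C2 = !argOK, matching the x87 rule that C2 is
                          set when the operand was out of range and no
                          reduction was performed. */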
   5883                set_C2( binop(Iop_Xor64,
   5884                              unop(Iop_1Uto64, mkexpr(argOK)),
   5885                              mkU64(1)) );
   5886                break;
   5887             }
   5888 
   5889             case 0xF3: /* FPATAN */
   5890                DIP("fpatan\n");
   5891                put_ST_UNCHECKED(1,
   5892                   triop(Iop_AtanF64,
   5893                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5894                         get_ST(1),
   5895                         get_ST(0)));
   5896                fp_pop();
   5897                break;
   5898 
   5899             case 0xF4: { /* FXTRACT */
   5900                IRTemp argF = newTemp(Ity_F64);
   5901                IRTemp sigF = newTemp(Ity_F64);
   5902                IRTemp expF = newTemp(Ity_F64);
   5903                IRTemp argI = newTemp(Ity_I64);
   5904                IRTemp sigI = newTemp(Ity_I64);
   5905                IRTemp expI = newTemp(Ity_I64);
   5906                DIP("fxtract\n");
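                       /* FXTRACT splits ST(0) into its unbiased exponent
                          and its significand (magnitude in [1.0, 2.0)),
                          both delivered as doubles; one helper computes
                          either half, selected by its second argument. */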
   5907                assign( argF, get_ST(0) );
   5908                assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
   5909                assign( sigI,
   5910                        mkIRExprCCall(
   5911                           Ity_I64, 0/*regparms*/,
   5912                           "x86amd64g_calculate_FXTRACT",
   5913                           &x86amd64g_calculate_FXTRACT,
   5914                           mkIRExprVec_2( mkexpr(argI),
   5915                                          mkIRExpr_HWord(0)/*sig*/ ))
   5916                );
   5917                assign( expI,
   5918                        mkIRExprCCall(
   5919                           Ity_I64, 0/*regparms*/,
   5920                           "x86amd64g_calculate_FXTRACT",
   5921                           &x86amd64g_calculate_FXTRACT,
   5922                           mkIRExprVec_2( mkexpr(argI),
   5923                                          mkIRExpr_HWord(1)/*exp*/ ))
   5924                );
   5925                assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
   5926                assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
   5927                /* exponent */
   5928                put_ST_UNCHECKED(0, mkexpr(expF) );
   5929                fp_push();
   5930                /* significand */
   5931                put_ST(0, mkexpr(sigF) );
   5932                break;
   5933             }
   5934 
   5935             case 0xF5: { /* FPREM1 -- IEEE compliant */
   5936                IRTemp a1 = newTemp(Ity_F64);
   5937                IRTemp a2 = newTemp(Ity_F64);
   5938                DIP("fprem1\n");
   5939                /* Do FPREM1 twice, once to get the remainder, and once
   5940                   to get the C3210 flag values. */
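                       /* (An IR expression produces a single value, so
                          the remainder and the flags need separate ops.) */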
   5941                assign( a1, get_ST(0) );
   5942                assign( a2, get_ST(1) );
   5943                put_ST_UNCHECKED(0,
   5944                   triop(Iop_PRem1F64,
   5945                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5946                         mkexpr(a1),
   5947                         mkexpr(a2)));
   5948                put_C3210(
   5949                   unop(Iop_32Uto64,
   5950                   triop(Iop_PRem1C3210F64,
   5951                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5952                         mkexpr(a1),
   5953                         mkexpr(a2)) ));
   5954                break;
   5955             }
   5956 
   5957             case 0xF7: /* FINCSTP */
   5958                DIP("fincstp\n");
   5959                put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
   5960                break;
   5961 
   5962             case 0xF8: { /* FPREM -- not IEEE compliant */
   5963                IRTemp a1 = newTemp(Ity_F64);
   5964                IRTemp a2 = newTemp(Ity_F64);
   5965                DIP("fprem\n");
   5966                /* Do FPREM twice, once to get the remainder, and once
   5967                   to get the C3210 flag values. */
   5968                assign( a1, get_ST(0) );
   5969                assign( a2, get_ST(1) );
   5970                put_ST_UNCHECKED(0,
   5971                   triop(Iop_PRemF64,
   5972                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5973                         mkexpr(a1),
   5974                         mkexpr(a2)));
   5975                put_C3210(
   5976                   unop(Iop_32Uto64,
   5977                   triop(Iop_PRemC3210F64,
   5978                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5979                         mkexpr(a1),
   5980                         mkexpr(a2)) ));
   5981                break;
   5982             }
   5983 
   5984             case 0xF9: /* FYL2XP1 */
   5985                DIP("fyl2xp1\n");
   5986                put_ST_UNCHECKED(1,
   5987                   triop(Iop_Yl2xp1F64,
   5988                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5989                         get_ST(1),
   5990                         get_ST(0)));
   5991                fp_pop();
   5992                break;
   5993 
   5994             case 0xFA: /* FSQRT */
   5995                DIP("fsqrt\n");
   5996                put_ST_UNCHECKED(0,
   5997                   binop(Iop_SqrtF64,
   5998                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   5999                         get_ST(0)));
   6000                break;
   6001 
   6002             case 0xFB: { /* FSINCOS */
   6003                DIP("fsincos\n");
   6004                IRTemp argD = newTemp(Ity_F64);
   6005                assign(argD, get_ST(0));
   6006                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   6007                IRTemp resD = newTemp(Ity_F64);
   6008                assign(resD,
   6009                   IRExpr_ITE(
   6010                      mkexpr(argOK),
   6011                      binop(Iop_SinF64,
   6012                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6013                            mkexpr(argD)),
   6014                      mkexpr(argD))
   6015                );
   6016                put_ST_UNCHECKED(0, mkexpr(resD));
   6017                /* Conditionally push the cos value on the stack, if
   6018                   the arg is in range */
   6019                maybe_fp_push(argOK);
   6020                maybe_put_ST(argOK, 0,
   6021                   binop(Iop_CosF64,
   6022                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6023                         mkexpr(argD)));
   6024                set_C2( binop(Iop_Xor64,
   6025                              unop(Iop_1Uto64, mkexpr(argOK)),
   6026                              mkU64(1)) );
   6027                break;
   6028             }
   6029 
   6030             case 0xFC: /* FRNDINT */
   6031                DIP("frndint\n");
   6032                put_ST_UNCHECKED(0,
   6033                   binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
   6034                break;
   6035 
   6036             case 0xFD: /* FSCALE */
   6037                DIP("fscale\n");
   6038                put_ST_UNCHECKED(0,
   6039                   triop(Iop_ScaleF64,
   6040                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6041                         get_ST(0),
   6042                         get_ST(1)));
   6043                break;
   6044 
   6045             case 0xFE:   /* FSIN */
   6046             case 0xFF: { /* FCOS */
   6047                Bool isSIN = modrm == 0xFE;
   6048                DIP("%s\n", isSIN ? "fsin" : "fcos");
   6049                IRTemp argD = newTemp(Ity_F64);
   6050                assign(argD, get_ST(0));
   6051                IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
   6052                IRTemp resD = newTemp(Ity_F64);
   6053                assign(resD,
   6054                   IRExpr_ITE(
   6055                      mkexpr(argOK),
   6056                      binop(isSIN ? Iop_SinF64 : Iop_CosF64,
   6057                            get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6058                            mkexpr(argD)),
   6059                      mkexpr(argD))
   6060                );
   6061                put_ST_UNCHECKED(0, mkexpr(resD));
   6062                set_C2( binop(Iop_Xor64,
   6063                              unop(Iop_1Uto64, mkexpr(argOK)),
   6064                              mkU64(1)) );
   6065                break;
   6066             }
   6067 
   6068             default:
   6069                goto decode_fail;
   6070          }
   6071       }
   6072    }
   6073 
   6074    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   6075    else
   6076    if (first_opcode == 0xDA) {
   6077 
   6078       if (modrm < 0xC0) {
   6079 
   6080          /* bits 5,4,3 are an opcode extension, and the modRM also
   6081             specifies an address. */
   6082          IROp   fop;
   6083          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6084          delta += len;
   6085          switch (gregLO3ofRM(modrm)) {
   6086 
   6087             case 0: /* FIADD m32int */ /* ST(0) += m32int */
   6088                DIP("fiaddl %s\n", dis_buf);
   6089                fop = Iop_AddF64;
   6090                goto do_fop_m32;
   6091 
   6092             case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
   6093                DIP("fimull %s\n", dis_buf);
   6094                fop = Iop_MulF64;
   6095                goto do_fop_m32;
   6096 
   6097             case 4: /* FISUB m32int */ /* ST(0) -= m32int */
   6098                DIP("fisubl %s\n", dis_buf);
   6099                fop = Iop_SubF64;
   6100                goto do_fop_m32;
   6101 
   6102             case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
   6103                DIP("fisubrl %s\n", dis_buf);
   6104                fop = Iop_SubF64;
   6105                goto do_foprev_m32;
   6106 
   6107             case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
   6108                DIP("fidivl %s\n", dis_buf);
   6109                fop = Iop_DivF64;
   6110                goto do_fop_m32;
   6111 
   6112             case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
   6113                DIP("fidivrl %s\n", dis_buf);
   6114                fop = Iop_DivF64;
   6115                goto do_foprev_m32;
   6116 
   6117             do_fop_m32:
   6118                put_ST_UNCHECKED(0,
   6119                   triop(fop,
   6120                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6121                         get_ST(0),
   6122                         unop(Iop_I32StoF64,
   6123                              loadLE(Ity_I32, mkexpr(addr)))));
   6124                break;
   6125 
   6126             do_foprev_m32:
   6127                put_ST_UNCHECKED(0,
   6128                   triop(fop,
   6129                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6130                         unop(Iop_I32StoF64,
   6131                              loadLE(Ity_I32, mkexpr(addr))),
   6132                         get_ST(0)));
   6133                break;
   6134 
   6135             default:
   6136                vex_printf("unhandled opc_aux = 0x%2x\n",
   6137                           (UInt)gregLO3ofRM(modrm));
   6138                vex_printf("first_opcode == 0xDA\n");
   6139                goto decode_fail;
   6140          }
   6141 
   6142       } else {
   6143 
   6144          delta++;
   6145          switch (modrm) {
   6146 
   6147             case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
   6148                r_src = (UInt)modrm - 0xC0;
   6149                DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
   6150                put_ST_UNCHECKED(0,
   6151                                 IRExpr_ITE(
   6152                                     mk_amd64g_calculate_condition(AMD64CondB),
   6153                                     get_ST(r_src), get_ST(0)) );
   6154                break;
   6155 
   6156             case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
   6157                r_src = (UInt)modrm - 0xC8;
   6158                DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
   6159                put_ST_UNCHECKED(0,
   6160                                 IRExpr_ITE(
   6161                                     mk_amd64g_calculate_condition(AMD64CondZ),
   6162                                     get_ST(r_src), get_ST(0)) );
   6163                break;
   6164 
   6165             case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
   6166                r_src = (UInt)modrm - 0xD0;
   6167                DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
   6168                put_ST_UNCHECKED(0,
   6169                                 IRExpr_ITE(
   6170                                     mk_amd64g_calculate_condition(AMD64CondBE),
   6171                                     get_ST(r_src), get_ST(0)) );
   6172                break;
   6173 
   6174             case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
   6175                r_src = (UInt)modrm - 0xD8;
   6176                DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
   6177                put_ST_UNCHECKED(0,
   6178                                 IRExpr_ITE(
   6179                                     mk_amd64g_calculate_condition(AMD64CondP),
   6180                                     get_ST(r_src), get_ST(0)) );
   6181                break;
   6182 
   6183             case 0xE9: /* FUCOMPP %st(0),%st(1) */
   6184                DIP("fucompp %%st(0),%%st(1)\n");
   6185                /* This forces C1 to zero, which isn't right. */
   6186                put_C3210(
   6187                    unop(Iop_32Uto64,
   6188                    binop( Iop_And32,
   6189                           binop(Iop_Shl32,
   6190                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6191                                 mkU8(8)),
   6192                           mkU32(0x4500)
   6193                    )));
   6194                fp_pop();
   6195                fp_pop();
   6196                break;
   6197 
   6198             default:
   6199                goto decode_fail;
   6200          }
   6201 
   6202       }
   6203    }
   6204 
   6205    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   6206    else
   6207    if (first_opcode == 0xDB) {
   6208       if (modrm < 0xC0) {
   6209 
   6210          /* bits 5,4,3 are an opcode extension, and the modRM also
   6211             specifies an address. */
   6212          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6213          delta += len;
   6214 
   6215          switch (gregLO3ofRM(modrm)) {
   6216 
   6217             case 0: /* FILD m32int */
   6218                DIP("fildl %s\n", dis_buf);
   6219                fp_push();
   6220                put_ST(0, unop(Iop_I32StoF64,
   6221                               loadLE(Ity_I32, mkexpr(addr))));
   6222                break;
   6223 
   6224             case 1: /* FISTTPL m32 (SSE3) */
   6225                DIP("fisttpl %s\n", dis_buf);
   6226                storeLE( mkexpr(addr),
   6227                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
   6228                fp_pop();
   6229                break;
   6230 
   6231             case 2: /* FIST m32 */
   6232                DIP("fistl %s\n", dis_buf);
   6233                storeLE( mkexpr(addr),
   6234                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   6235                break;
   6236 
   6237             case 3: /* FISTP m32 */
   6238                DIP("fistpl %s\n", dis_buf);
   6239                storeLE( mkexpr(addr),
   6240                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
   6241                fp_pop();
   6242                break;
   6243 
   6244             case 5: { /* FLD extended-real */
   6245                /* Uses dirty helper:
   6246                      ULong amd64g_dirtyhelper_loadF80le ( ULong )
   6247                   addr holds the address.  First, do a dirty call to
   6248                   get hold of the data. */
   6249                IRTemp   val  = newTemp(Ity_I64);
   6250                IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
   6251 
   6252                IRDirty* d = unsafeIRDirty_1_N (
   6253                                val,
   6254                                0/*regparms*/,
   6255                                "amd64g_dirtyhelper_loadF80le",
   6256                                &amd64g_dirtyhelper_loadF80le,
   6257                                args
   6258                             );
   6259                /* declare that we're reading memory */
   6260                d->mFx   = Ifx_Read;
   6261                d->mAddr = mkexpr(addr);
   6262                d->mSize = 10;
   6263 
   6264                /* execute the dirty call, dumping the result in val. */
   6265                stmt( IRStmt_Dirty(d) );
   6266                fp_push();
   6267                put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
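                       /* The helper converts the 80-bit value to the
                          nearest double and returns its bits, hence the
                          reinterpret (not a float conversion) here. */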
   6268 
   6269                DIP("fldt %s\n", dis_buf);
   6270                break;
   6271             }
   6272 
   6273             case 7: { /* FSTP extended-real */
   6274                /* Uses dirty helper:
   6275                      void amd64g_dirtyhelper_storeF80le ( ULong addr, ULong data )
   6276                */
   6277                IRExpr** args
   6278                   = mkIRExprVec_2( mkexpr(addr),
   6279                                    unop(Iop_ReinterpF64asI64, get_ST(0)) );
   6280 
   6281                IRDirty* d = unsafeIRDirty_0_N (
   6282                                0/*regparms*/,
   6283                                "amd64g_dirtyhelper_storeF80le",
   6284                                &amd64g_dirtyhelper_storeF80le,
   6285                                args
   6286                             );
   6287                /* declare we're writing memory */
   6288                d->mFx   = Ifx_Write;
   6289                d->mAddr = mkexpr(addr);
   6290                d->mSize = 10;
   6291 
   6292                /* execute the dirty call. */
   6293                stmt( IRStmt_Dirty(d) );
   6294                fp_pop();
   6295 
   6296                DIP("fstpt %s\n", dis_buf);
   6297                break;
   6298             }
   6299 
   6300             default:
   6301                vex_printf("unhandled opc_aux = 0x%2x\n",
   6302                           (UInt)gregLO3ofRM(modrm));
   6303                vex_printf("first_opcode == 0xDB\n");
   6304                goto decode_fail;
   6305          }
   6306 
   6307       } else {
   6308 
   6309          delta++;
   6310          switch (modrm) {
   6311 
   6312             case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
   6313                r_src = (UInt)modrm - 0xC0;
   6314                DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
   6315                put_ST_UNCHECKED(0,
   6316                                 IRExpr_ITE(
   6317                                     mk_amd64g_calculate_condition(AMD64CondNB),
   6318                                     get_ST(r_src), get_ST(0)) );
   6319                break;
   6320 
   6321             case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
   6322                r_src = (UInt)modrm - 0xC8;
   6323                DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
   6324                put_ST_UNCHECKED(
   6325                   0,
   6326                   IRExpr_ITE(
   6327                      mk_amd64g_calculate_condition(AMD64CondNZ),
   6328                      get_ST(r_src),
   6329                      get_ST(0)
   6330                   )
   6331                );
   6332                break;
   6333 
   6334             case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
   6335                r_src = (UInt)modrm - 0xD0;
   6336                DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
   6337                put_ST_UNCHECKED(
   6338                   0,
   6339                   IRExpr_ITE(
   6340                      mk_amd64g_calculate_condition(AMD64CondNBE),
   6341                      get_ST(r_src),
   6342                      get_ST(0)
   6343                   )
   6344                );
   6345                break;
   6346 
   6347             case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
   6348                r_src = (UInt)modrm - 0xD8;
   6349                DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
   6350                put_ST_UNCHECKED(
   6351                   0,
   6352                   IRExpr_ITE(
   6353                      mk_amd64g_calculate_condition(AMD64CondNP),
   6354                      get_ST(r_src),
   6355                      get_ST(0)
   6356                   )
   6357                );
   6358                break;
   6359 
   6360             case 0xE2:
   6361                DIP("fnclex\n");
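                       /* Modelled as a no-op: there is no exception
                          state here to clear. */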
   6362                break;
   6363 
   6364             case 0xE3: {
   6365                gen_FINIT_SEQUENCE(NULL/*no guarding condition*/);
   6366                DIP("fninit\n");
   6367                break;
   6368             }
   6369 
   6370             case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
   6371                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
   6372                break;
   6373 
   6374             case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
   6375                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
   6376                break;
   6377 
   6378             default:
   6379                goto decode_fail;
   6380          }
   6381       }
   6382    }
   6383 
   6384    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   6385    else
   6386    if (first_opcode == 0xDC) {
   6387       if (modrm < 0xC0) {
   6388 
   6389          /* bits 5,4,3 are an opcode extension, and the modRM also
   6390             specifies an address. */
   6391          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6392          delta += len;
   6393 
   6394          switch (gregLO3ofRM(modrm)) {
   6395 
   6396             case 0: /* FADD double-real */
   6397                fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
   6398                break;
   6399 
   6400             case 1: /* FMUL double-real */
   6401                fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
   6402                break;
   6403 
   6404 //..             case 2: /* FCOM double-real */
   6405 //..                DIP("fcoml %s\n", dis_buf);
   6406 //..                /* This forces C1 to zero, which isn't right. */
   6407 //..                put_C3210(
   6408 //..                    binop( Iop_And32,
   6409 //..                           binop(Iop_Shl32,
   6410 //..                                 binop(Iop_CmpF64,
   6411 //..                                       get_ST(0),
   6412 //..                                       loadLE(Ity_F64,mkexpr(addr))),
   6413 //..                                 mkU8(8)),
   6414 //..                           mkU32(0x4500)
   6415 //..                    ));
   6416 //..                break;
   6417 
   6418             case 3: /* FCOMP double-real */
   6419                DIP("fcompl %s\n", dis_buf);
   6420                /* This forces C1 to zero, which isn't right. */
   6421                put_C3210(
   6422                    unop(Iop_32Uto64,
   6423                    binop( Iop_And32,
   6424                           binop(Iop_Shl32,
   6425                                 binop(Iop_CmpF64,
   6426                                       get_ST(0),
   6427                                       loadLE(Ity_F64,mkexpr(addr))),
   6428                                 mkU8(8)),
   6429                           mkU32(0x4500)
   6430                    )));
   6431                fp_pop();
   6432                break;
   6433 
   6434             case 4: /* FSUB double-real */
   6435                fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
   6436                break;
   6437 
   6438             case 5: /* FSUBR double-real */
   6439                fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
   6440                break;
   6441 
   6442             case 6: /* FDIV double-real */
   6443                fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
   6444                break;
   6445 
   6446             case 7: /* FDIVR double-real */
   6447                fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
   6448                break;
   6449 
   6450             default:
   6451                vex_printf("unhandled opc_aux = 0x%2x\n",
   6452                           (UInt)gregLO3ofRM(modrm));
   6453                vex_printf("first_opcode == 0xDC\n");
   6454                goto decode_fail;
   6455          }
   6456 
   6457       } else {
   6458 
   6459          delta++;
   6460          switch (modrm) {
   6461 
   6462             case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
   6463                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
   6464                break;
   6465 
   6466             case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
   6467                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
   6468                break;
   6469 
   6470             case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
   6471                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
   6472                break;
   6473 
   6474             case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
   6475                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
   6476                break;
   6477 
   6478             case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
   6479                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
   6480                break;
   6481 
   6482             case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
   6483                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
   6484                break;
   6485 
   6486             default:
   6487                goto decode_fail;
   6488          }
   6489 
   6490       }
   6491    }
   6492 
   6493    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
   6494    else
   6495    if (first_opcode == 0xDD) {
   6496 
   6497       if (modrm < 0xC0) {
   6498 
   6499          /* bits 5,4,3 are an opcode extension, and the modRM also
   6500             specifies an address. */
   6501          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6502          delta += len;
   6503 
   6504          switch (gregLO3ofRM(modrm)) {
   6505 
   6506             case 0: /* FLD double-real */
   6507                DIP("fldl %s\n", dis_buf);
   6508                fp_push();
   6509                put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
   6510                break;
   6511 
   6512             case 1: /* FISTTPQ m64 (SSE3) */
   6513                DIP("fisttpll %s\n", dis_buf);
   6514                storeLE( mkexpr(addr),
   6515                         binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
   6516                fp_pop();
   6517                break;
   6518 
   6519             case 2: /* FST double-real */
   6520                DIP("fstl %s\n", dis_buf);
   6521                storeLE(mkexpr(addr), get_ST(0));
   6522                break;
   6523 
   6524             case 3: /* FSTP double-real */
   6525                DIP("fstpl %s\n", dis_buf);
   6526                storeLE(mkexpr(addr), get_ST(0));
   6527                fp_pop();
   6528                break;
   6529 
   6530             case 4: { /* FRSTOR m94/m108 */
   6531                IRTemp   ew = newTemp(Ity_I32);
   6532                IRTemp  w64 = newTemp(Ity_I64);
   6533                IRDirty*  d;
   6534                if ( have66(pfx) ) {
   6535                   /* Uses dirty helper:
   6536                      VexEmNote amd64g_dirtyhelper_FRSTORS
   6537                                   ( VexGuestAMD64State*, HWord ) */
   6538                   d = unsafeIRDirty_0_N (
   6539                          0/*regparms*/,
   6540                          "amd64g_dirtyhelper_FRSTORS",
   6541                          &amd64g_dirtyhelper_FRSTORS,
   6542                          mkIRExprVec_1( mkexpr(addr) )
   6543                       );
   6544                   d->mSize = 94;
   6545                } else {
   6546                   /* Uses dirty helper:
   6547                      VexEmNote amd64g_dirtyhelper_FRSTOR
   6548                                   ( VexGuestAMD64State*, HWord ) */
   6549                   d = unsafeIRDirty_0_N (
   6550                          0/*regparms*/,
   6551                          "amd64g_dirtyhelper_FRSTOR",
   6552                          &amd64g_dirtyhelper_FRSTOR,
   6553                          mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   6554                       );
   6555                   d->mSize = 108;
   6556                }
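
                       /* The 66 prefix selects the 16-bit operand-size
                          layout of the x87 state image (94 bytes) rather
                          than the 32-bit one (108 bytes), hence the two
                          mSize values above. */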
   6557 
   6558                d->tmp    = w64;
   6559                /* declare we're reading memory */
   6560                d->mFx   = Ifx_Read;
   6561                d->mAddr = mkexpr(addr);
   6562                /* d->mSize set above */
   6563 
   6564                /* declare we're writing guest state */
   6565                d->nFxState = 5;
   6566                vex_bzero(&d->fxState, sizeof(d->fxState));
   6567 
   6568                d->fxState[0].fx     = Ifx_Write;
   6569                d->fxState[0].offset = OFFB_FTOP;
   6570                d->fxState[0].size   = sizeof(UInt);
   6571 
   6572                d->fxState[1].fx     = Ifx_Write;
   6573                d->fxState[1].offset = OFFB_FPREGS;
   6574                d->fxState[1].size   = 8 * sizeof(ULong);
   6575 
   6576                d->fxState[2].fx     = Ifx_Write;
   6577                d->fxState[2].offset = OFFB_FPTAGS;
   6578                d->fxState[2].size   = 8 * sizeof(UChar);
   6579 
   6580                d->fxState[3].fx     = Ifx_Write;
   6581                d->fxState[3].offset = OFFB_FPROUND;
   6582                d->fxState[3].size   = sizeof(ULong);
   6583 
   6584                d->fxState[4].fx     = Ifx_Write;
   6585                d->fxState[4].offset = OFFB_FC3210;
   6586                d->fxState[4].size   = sizeof(ULong);
   6587 
   6588                stmt( IRStmt_Dirty(d) );
   6589 
   6590                /* ew contains any emulation warning we may need to
   6591                   issue.  If needed, side-exit to the next insn,
   6592                   reporting the warning, so that Valgrind's dispatcher
   6593                   sees the warning. */
   6594                assign(ew, unop(Iop_64to32,mkexpr(w64)) );
   6595                put_emwarn( mkexpr(ew) );
   6596                stmt(
   6597                   IRStmt_Exit(
   6598                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
   6599                      Ijk_EmWarn,
   6600                      IRConst_U64( guest_RIP_bbstart+delta ),
   6601                      OFFB_RIP
   6602                   )
   6603                );
   6604 
   6605                if ( have66(pfx) ) {
   6606                   DIP("frstors %s\n", dis_buf);
   6607                } else {
   6608                   DIP("frstor %s\n", dis_buf);
   6609                }
   6610                break;
   6611             }
   6612 
   6613             case 6: { /* FNSAVE m94/m108 */
   6614                IRDirty *d;
   6615                if ( have66(pfx) ) {
   6616                  /* Uses dirty helper:
   6617                     void amd64g_dirtyhelper_FNSAVES ( VexGuestAMD64State*,
   6618                                                       HWord ) */
   6619                   d = unsafeIRDirty_0_N (
   6620                          0/*regparms*/,
   6621                          "amd64g_dirtyhelper_FNSAVES",
   6622                          &amd64g_dirtyhelper_FNSAVES,
   6623                          mkIRExprVec_1( mkexpr(addr) )
   6624                          );
   6625                   d->mSize = 94;
   6626                } else {
   6627                  /* Uses dirty helper:
   6628                     void amd64g_dirtyhelper_FNSAVE ( VexGuestAMD64State*,
   6629                                                      HWord ) */
   6630                   d = unsafeIRDirty_0_N (
   6631                          0/*regparms*/,
   6632                          "amd64g_dirtyhelper_FNSAVE",
   6633                          &amd64g_dirtyhelper_FNSAVE,
   6634                          mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   6635                       );
   6636                   d->mSize = 108;
   6637                }
   6638 
   6639                /* declare we're writing memory */
   6640                d->mFx   = Ifx_Write;
   6641                d->mAddr = mkexpr(addr);
   6642                /* d->mSize set above */
   6643 
   6644                /* declare we're reading guest state */
   6645                d->nFxState = 5;
   6646                vex_bzero(&d->fxState, sizeof(d->fxState));
   6647 
   6648                d->fxState[0].fx     = Ifx_Read;
   6649                d->fxState[0].offset = OFFB_FTOP;
   6650                d->fxState[0].size   = sizeof(UInt);
   6651 
   6652                d->fxState[1].fx     = Ifx_Read;
   6653                d->fxState[1].offset = OFFB_FPREGS;
   6654                d->fxState[1].size   = 8 * sizeof(ULong);
   6655 
   6656                d->fxState[2].fx     = Ifx_Read;
   6657                d->fxState[2].offset = OFFB_FPTAGS;
   6658                d->fxState[2].size   = 8 * sizeof(UChar);
   6659 
   6660                d->fxState[3].fx     = Ifx_Read;
   6661                d->fxState[3].offset = OFFB_FPROUND;
   6662                d->fxState[3].size   = sizeof(ULong);
   6663 
   6664                d->fxState[4].fx     = Ifx_Read;
   6665                d->fxState[4].offset = OFFB_FC3210;
   6666                d->fxState[4].size   = sizeof(ULong);
   6667 
   6668                stmt( IRStmt_Dirty(d) );
   6669 
   6670                if ( have66(pfx) ) {
   6671                  DIP("fnsaves %s\n", dis_buf);
   6672                } else {
   6673                  DIP("fnsave %s\n", dis_buf);
   6674                }
   6675                break;
   6676             }
   6677 
   6678             case 7: { /* FNSTSW m16 */
   6679                IRExpr* sw = get_FPU_sw();
   6680                vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
   6681                storeLE( mkexpr(addr), sw );
   6682                DIP("fnstsw %s\n", dis_buf);
   6683                break;
   6684             }
   6685 
   6686             default:
   6687                vex_printf("unhandled opc_aux = 0x%2x\n",
   6688                           (UInt)gregLO3ofRM(modrm));
   6689                vex_printf("first_opcode == 0xDD\n");
   6690                goto decode_fail;
   6691          }
   6692       } else {
   6693          delta++;
   6694          switch (modrm) {
   6695 
   6696             case 0xC0 ... 0xC7: /* FFREE %st(?) */
   6697                r_dst = (UInt)modrm - 0xC0;
   6698                DIP("ffree %%st(%u)\n", r_dst);
   6699                put_ST_TAG ( r_dst, mkU8(0) );
   6700                break;
   6701 
   6702             case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
   6703                r_dst = (UInt)modrm - 0xD0;
   6704                DIP("fst %%st(0),%%st(%u)\n", r_dst);
   6705                /* P4 manual says: "If the destination operand is a
   6706                   non-empty register, the invalid-operation exception
   6707                   is not generated."  Hence put_ST_UNCHECKED. */
   6708                put_ST_UNCHECKED(r_dst, get_ST(0));
   6709                break;
   6710 
   6711             case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
   6712                r_dst = (UInt)modrm - 0xD8;
   6713                DIP("fstp %%st(0),%%st(%u)\n", r_dst);
   6714                /* P4 manual says: "If the destination operand is a
   6715                   non-empty register, the invalid-operation exception
   6716                   is not generated."  Hence put_ST_UNCHECKED. */
   6717                put_ST_UNCHECKED(r_dst, get_ST(0));
   6718                fp_pop();
   6719                break;
   6720 
   6721             case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
   6722                r_dst = (UInt)modrm - 0xE0;
   6723                DIP("fucom %%st(0),%%st(%u)\n", r_dst);
   6724                /* This forces C1 to zero, which isn't right. */
   6725                put_C3210(
   6726                    unop(Iop_32Uto64,
   6727                    binop( Iop_And32,
   6728                           binop(Iop_Shl32,
   6729                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6730                                 mkU8(8)),
   6731                           mkU32(0x4500)
   6732                    )));
   6733                break;
   6734 
   6735             case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
   6736                r_dst = (UInt)modrm - 0xE8;
   6737                DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
   6738                /* This forces C1 to zero, which isn't right. */
   6739                put_C3210(
   6740                    unop(Iop_32Uto64,
   6741                    binop( Iop_And32,
   6742                           binop(Iop_Shl32,
   6743                                 binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
   6744                                 mkU8(8)),
   6745                           mkU32(0x4500)
   6746                    )));
   6747                fp_pop();
   6748                break;
   6749 
   6750             default:
   6751                goto decode_fail;
   6752          }
   6753       }
   6754    }
   6755 
   6756    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   6757    else
   6758    if (first_opcode == 0xDE) {
   6759 
   6760       if (modrm < 0xC0) {
   6761 
   6762          /* bits 5,4,3 are an opcode extension, and the modRM also
   6763             specifies an address. */
   6764          IROp   fop;
   6765          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6766          delta += len;
   6767 
   6768          switch (gregLO3ofRM(modrm)) {
   6769 
   6770             case 0: /* FIADD m16int */ /* ST(0) += m16int */
   6771                DIP("fiaddw %s\n", dis_buf);
   6772                fop = Iop_AddF64;
   6773                goto do_fop_m16;
   6774 
   6775             case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
   6776                DIP("fimulw %s\n", dis_buf);
   6777                fop = Iop_MulF64;
   6778                goto do_fop_m16;
   6779 
   6780             case 4: /* FISUB m16int */ /* ST(0) -= m16int */
   6781                DIP("fisubw %s\n", dis_buf);
   6782                fop = Iop_SubF64;
   6783                goto do_fop_m16;
   6784 
   6785             case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
   6786                DIP("fisubrw %s\n", dis_buf);
   6787                fop = Iop_SubF64;
   6788                goto do_foprev_m16;
   6789 
   6790             case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
   6791                DIP("fidivw %s\n", dis_buf);
   6792                fop = Iop_DivF64;
   6793                goto do_fop_m16;
   6794 
   6795             case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
   6796                DIP("fidivrw %s\n", dis_buf);
   6797                fop = Iop_DivF64;
   6798                goto do_foprev_m16;
   6799 
   6800             do_fop_m16:
   6801                put_ST_UNCHECKED(0,
   6802                   triop(fop,
   6803                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6804                         get_ST(0),
   6805                         unop(Iop_I32StoF64,
   6806                              unop(Iop_16Sto32,
   6807                                   loadLE(Ity_I16, mkexpr(addr))))));
   6808                break;
   6809 
   6810             do_foprev_m16:
   6811                put_ST_UNCHECKED(0,
   6812                   triop(fop,
   6813                         get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   6814                         unop(Iop_I32StoF64,
   6815                              unop(Iop_16Sto32,
   6816                                   loadLE(Ity_I16, mkexpr(addr)))),
   6817                         get_ST(0)));
   6818                break;
   6819 
   6820             default:
   6821                vex_printf("unhandled opc_aux = 0x%2x\n",
   6822                           (UInt)gregLO3ofRM(modrm));
   6823                vex_printf("first_opcode == 0xDE\n");
   6824                goto decode_fail;
   6825          }
   6826 
   6827       } else {
   6828 
   6829          delta++;
   6830          switch (modrm) {
   6831 
   6832             case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
   6833                fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
   6834                break;
   6835 
   6836             case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
   6837                fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
   6838                break;
   6839 
   6840             case 0xD9: /* FCOMPP %st(0),%st(1) */
   6841                DIP("fcompp %%st(0),%%st(1)\n");
   6842                /* This forces C1 to zero, which isn't right. */
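                       /* For reference, assuming the usual IRCmpF64Result
                          encoding (GT=0x00, LT=0x01, EQ=0x40, UN=0x45):
                          shifting left by 8 and masking with 0x4500 deposits
                          the result in C0 (bit 8), C2 (bit 10) and C3
                          (bit 14); e.g. EQ gives 0x4000, i.e. C3 set. */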
   6843                put_C3210(
   6844                    unop(Iop_32Uto64,
   6845                    binop( Iop_And32,
   6846                           binop(Iop_Shl32,
   6847                                 binop(Iop_CmpF64, get_ST(0), get_ST(1)),
   6848                                 mkU8(8)),
   6849                           mkU32(0x4500)
   6850                    )));
   6851                fp_pop();
   6852                fp_pop();
   6853                break;
   6854 
   6855             case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
   6856                fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
   6857                break;
   6858 
   6859             case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
   6860                fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
   6861                break;
   6862 
   6863             case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
   6864                fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
   6865                break;
   6866 
   6867             case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
   6868                fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
   6869                break;
   6870 
   6871             default:
   6872                goto decode_fail;
   6873          }
   6874 
   6875       }
   6876    }
   6877 
   6878    /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   6879    else
   6880    if (first_opcode == 0xDF) {
   6881 
   6882       if (modrm < 0xC0) {
   6883 
   6884          /* bits 5,4,3 are an opcode extension, and the modRM also
   6885             specifies an address. */
   6886          IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   6887          delta += len;
   6888 
   6889          switch (gregLO3ofRM(modrm)) {
   6890 
   6891             case 0: /* FILD m16int */
   6892                DIP("fildw %s\n", dis_buf);
   6893                fp_push();
   6894                put_ST(0, unop(Iop_I32StoF64,
   6895                               unop(Iop_16Sto32,
   6896                                    loadLE(Ity_I16, mkexpr(addr)))));
   6897                break;
   6898 
   6899             case 1: /* FISTTPS m16 (SSE3) */
   6900                DIP("fisttps %s\n", dis_buf);
   6901                storeLE( mkexpr(addr),
   6902                         x87ishly_qnarrow_32_to_16(
   6903                         binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
   6904                fp_pop();
   6905                break;
   6906 
   6907             case 2: /* FIST m16 */
   6908                DIP("fists %s\n", dis_buf);
   6909                storeLE( mkexpr(addr),
   6910                         x87ishly_qnarrow_32_to_16(
   6911                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   6912                break;
   6913 
   6914             case 3: /* FISTP m16 */
   6915                DIP("fistps %s\n", dis_buf);
   6916                storeLE( mkexpr(addr),
   6917                         x87ishly_qnarrow_32_to_16(
   6918                         binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
   6919                fp_pop();
   6920                break;
   6921 
   6922             case 5: /* FILD m64 */
   6923                DIP("fildll %s\n", dis_buf);
   6924                fp_push();
   6925                put_ST(0, binop(Iop_I64StoF64,
   6926                                get_roundingmode(),
   6927                                loadLE(Ity_I64, mkexpr(addr))));
   6928                break;
   6929 
   6930             case 7: /* FISTP m64 */
   6931                DIP("fistpll %s\n", dis_buf);
   6932                storeLE( mkexpr(addr),
   6933                         binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
   6934                fp_pop();
   6935                break;
   6936 
   6937             default:
   6938                vex_printf("unhandled opc_aux = 0x%2x\n",
   6939                           (UInt)gregLO3ofRM(modrm));
   6940                vex_printf("first_opcode == 0xDF\n");
   6941                goto decode_fail;
   6942          }
   6943 
   6944       } else {
   6945 
   6946          delta++;
   6947          switch (modrm) {
   6948 
   6949             case 0xC0: /* FFREEP %st(0) */
   6950                DIP("ffreep %%st(%d)\n", 0);
   6951                put_ST_TAG ( 0, mkU8(0) );
   6952                fp_pop();
   6953                break;
   6954 
   6955             case 0xE0: /* FNSTSW %ax */
   6956                DIP("fnstsw %%ax\n");
   6957                /* Invent a plausible-looking FPU status word value and
   6958                   dump it in %AX:
   6959                      ((ftop & 7) << 11) | (c3210 & 0x4700)
   6960                */
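                       /* E.g. with ftop == 5 and c3210 == 0x4000 (C3 set),
                          this computes (5 << 11) | 0x4000 == 0x6800. */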
   6961                putIRegRAX(
   6962                   2,
   6963                   unop(Iop_32to16,
   6964                        binop(Iop_Or32,
   6965                              binop(Iop_Shl32,
   6966                                    binop(Iop_And32, get_ftop(), mkU32(7)),
   6967                                    mkU8(11)),
   6968                              binop(Iop_And32,
   6969                                    unop(Iop_64to32, get_C3210()),
   6970                                    mkU32(0x4700))
   6971                )));
   6972                break;
   6973 
   6974             case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
   6975                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
   6976                break;
   6977 
   6978             case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
   6979                /* not really right since COMIP != UCOMIP */
   6980                fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
   6981                break;
   6982 
   6983             default:
   6984                goto decode_fail;
   6985          }
   6986       }
   6987 
   6988    }
   6989 
   6990    else
   6991       goto decode_fail;
   6992 
   6993    *decode_ok = True;
   6994    return delta;
   6995 
   6996   decode_fail:
   6997    *decode_ok = False;
   6998    return delta;
   6999 }
   7000 
   7001 
   7002 /*------------------------------------------------------------*/
   7003 /*---                                                      ---*/
   7004 /*--- MMX INSTRUCTIONS                                     ---*/
   7005 /*---                                                      ---*/
   7006 /*------------------------------------------------------------*/
   7007 
   7008 /* Effect of MMX insns on x87 FPU state (table 11-2 of
   7009    IA32 arch manual, volume 3):
   7010 
   7011    Read from, or write to MMX register (viz, any insn except EMMS):
   7012    * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   7013    * FP stack pointer set to zero
   7014 
   7015    EMMS:
   7016    * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   7017    * FP stack pointer set to zero
   7018 */
   7019 
   7020 static void do_MMX_preamble ( void )
   7021 {
   7022    Int         i;
   7023    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   7024    IRExpr*     zero  = mkU32(0);
   7025    IRExpr*     tag1  = mkU8(1);
   7026    put_ftop(zero);
   7027    for (i = 0; i < 8; i++)
   7028       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
   7029 }
   7030 
   7031 static void do_EMMS_preamble ( void )
   7032 {
   7033    Int         i;
   7034    IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   7035    IRExpr*     zero  = mkU32(0);
   7036    IRExpr*     tag0  = mkU8(0);
   7037    put_ftop(zero);
   7038    for (i = 0; i < 8; i++)
   7039       stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
   7040 }
   7041 
   7042 
   7043 static IRExpr* getMMXReg ( UInt archreg )
   7044 {
   7045    vassert(archreg < 8);
   7046    return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
   7047 }
   7048 
   7049 
   7050 static void putMMXReg ( UInt archreg, IRExpr* e )
   7051 {
   7052    vassert(archreg < 8);
   7053    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   7054    stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
   7055 }
   7056 
   7057 
   7058 /* Helper for non-shift MMX insns.  Note this is incomplete in the
   7059    sense that it does not first call do_MMX_preamble() -- that is the
   7060    responsibility of its caller. */
   7061 
   7062 static
   7063 ULong dis_MMXop_regmem_to_reg ( const VexAbiInfo* vbi,
   7064                                 Prefix      pfx,
   7065                                 Long        delta,
   7066                                 UChar       opc,
   7067                                 const HChar* name,
   7068                                 Bool        show_granularity )
   7069 {
   7070    HChar   dis_buf[50];
   7071    UChar   modrm = getUChar(delta);
   7072    Bool    isReg = epartIsReg(modrm);
   7073    IRExpr* argL  = NULL;
   7074    IRExpr* argR  = NULL;
   7075    IRExpr* argG  = NULL;
   7076    IRExpr* argE  = NULL;
   7077    IRTemp  res   = newTemp(Ity_I64);
   7078 
   7079    Bool    invG  = False;
   7080    IROp    op    = Iop_INVALID;
   7081    void*   hAddr = NULL;
   7082    const HChar*  hName = NULL;
   7083    Bool    eLeft = False;
   7084 
   7085 #  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
   7086 
   7087    switch (opc) {
   7088       /* Original MMX ones */
   7089       case 0xFC: op = Iop_Add8x8; break;
   7090       case 0xFD: op = Iop_Add16x4; break;
   7091       case 0xFE: op = Iop_Add32x2; break;
   7092 
   7093       case 0xEC: op = Iop_QAdd8Sx8; break;
   7094       case 0xED: op = Iop_QAdd16Sx4; break;
   7095 
   7096       case 0xDC: op = Iop_QAdd8Ux8; break;
   7097       case 0xDD: op = Iop_QAdd16Ux4; break;
   7098 
   7099       case 0xF8: op = Iop_Sub8x8;  break;
   7100       case 0xF9: op = Iop_Sub16x4; break;
   7101       case 0xFA: op = Iop_Sub32x2; break;
   7102 
   7103       case 0xE8: op = Iop_QSub8Sx8; break;
   7104       case 0xE9: op = Iop_QSub16Sx4; break;
   7105 
   7106       case 0xD8: op = Iop_QSub8Ux8; break;
   7107       case 0xD9: op = Iop_QSub16Ux4; break;
   7108 
   7109       case 0xE5: op = Iop_MulHi16Sx4; break;
   7110       case 0xD5: op = Iop_Mul16x4; break;
   7111       case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;
   7112 
   7113       case 0x74: op = Iop_CmpEQ8x8; break;
   7114       case 0x75: op = Iop_CmpEQ16x4; break;
   7115       case 0x76: op = Iop_CmpEQ32x2; break;
   7116 
   7117       case 0x64: op = Iop_CmpGT8Sx8; break;
   7118       case 0x65: op = Iop_CmpGT16Sx4; break;
   7119       case 0x66: op = Iop_CmpGT32Sx2; break;
   7120 
   7121       case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
   7122       case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
   7123       case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
   7124 
   7125       case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
   7126       case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
   7127       case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
   7128 
   7129       case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
   7130       case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
   7131       case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
   7132 
   7133       case 0xDB: op = Iop_And64; break;
   7134       case 0xDF: op = Iop_And64; invG = True; break;
   7135       case 0xEB: op = Iop_Or64; break;
   7136       case 0xEF: /* Possibly do better here if argL and argR are the
   7137                     same reg */
   7138                  op = Iop_Xor64; break;
   7139 
   7140       /* Introduced in SSE1 */
   7141       case 0xE0: op = Iop_Avg8Ux8;    break;
   7142       case 0xE3: op = Iop_Avg16Ux4;   break;
   7143       case 0xEE: op = Iop_Max16Sx4;   break;
   7144       case 0xDE: op = Iop_Max8Ux8;    break;
   7145       case 0xEA: op = Iop_Min16Sx4;   break;
   7146       case 0xDA: op = Iop_Min8Ux8;    break;
   7147       case 0xE4: op = Iop_MulHi16Ux4; break;
   7148       case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;
   7149 
   7150       /* Introduced in SSE2 */
   7151       case 0xD4: op = Iop_Add64; break;
   7152       case 0xFB: op = Iop_Sub64; break;
   7153 
   7154       default:
   7155          vex_printf("\n0x%x\n", (UInt)opc);
   7156          vpanic("dis_MMXop_regmem_to_reg");
   7157    }
   7158 
   7159 #  undef XXX
   7160 
   7161    argG = getMMXReg(gregLO3ofRM(modrm));
   7162    if (invG)
   7163       argG = unop(Iop_Not64, argG);
   7164 
   7165    if (isReg) {
   7166       delta++;
   7167       argE = getMMXReg(eregLO3ofRM(modrm));
   7168    } else {
   7169       Int    len;
   7170       IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7171       delta += len;
   7172       argE = loadLE(Ity_I64, mkexpr(addr));
   7173    }
   7174 
   7175    if (eLeft) {
   7176       argL = argE;
   7177       argR = argG;
   7178    } else {
   7179       argL = argG;
   7180       argR = argE;
   7181    }
   7182 
   7183    if (op != Iop_INVALID) {
   7184       vassert(hName == NULL);
   7185       vassert(hAddr == NULL);
   7186       assign(res, binop(op, argL, argR));
   7187    } else {
   7188       vassert(hName != NULL);
   7189       vassert(hAddr != NULL);
   7190       assign( res,
   7191               mkIRExprCCall(
   7192                  Ity_I64,
   7193                  0/*regparms*/, hName, hAddr,
   7194                  mkIRExprVec_2( argL, argR )
   7195               )
   7196             );
   7197    }
   7198 
   7199    putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
   7200 
   7201    DIP("%s%s %s, %s\n",
   7202        name, show_granularity ? nameMMXGran(opc & 3) : "",
   7203        ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
   7204        nameMMXReg(gregLO3ofRM(modrm)) );
   7205 
   7206    return delta;
   7207 }
   7208 
   7209 
   7210 /* Vector by scalar shift of G by the amount specified at the bottom
   7211    of E.  This is a straight copy of dis_SSE_shiftG_byE. */
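        /* E.g. "psrlw %mm1,%mm0" shifts each 16-bit lane of %mm0 right by
           the amount held in the low 64 bits of %mm1; amounts >= the lane
           width zero the destination (see the size logic below). */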
   7212 
   7213 static ULong dis_MMX_shiftG_byE ( const VexAbiInfo* vbi,
   7214                                   Prefix pfx, Long delta,
   7215                                   const HChar* opname, IROp op )
   7216 {
   7217    HChar   dis_buf[50];
   7218    Int     alen, size;
   7219    IRTemp  addr;
   7220    Bool    shl, shr, sar;
   7221    UChar   rm   = getUChar(delta);
   7222    IRTemp  g0   = newTemp(Ity_I64);
   7223    IRTemp  g1   = newTemp(Ity_I64);
   7224    IRTemp  amt  = newTemp(Ity_I64);
   7225    IRTemp  amt8 = newTemp(Ity_I8);
   7226 
   7227    if (epartIsReg(rm)) {
   7228       assign( amt, getMMXReg(eregLO3ofRM(rm)) );
   7229       DIP("%s %s,%s\n", opname,
   7230                         nameMMXReg(eregLO3ofRM(rm)),
   7231                         nameMMXReg(gregLO3ofRM(rm)) );
   7232       delta++;
   7233    } else {
   7234       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   7235       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   7236       DIP("%s %s,%s\n", opname,
   7237                         dis_buf,
   7238                         nameMMXReg(gregLO3ofRM(rm)) );
   7239       delta += alen;
   7240    }
   7241    assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
   7242    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   7243 
   7244    shl = shr = sar = False;
   7245    size = 0;
   7246    switch (op) {
   7247       case Iop_ShlN16x4: shl = True; size = 16; break;
   7248       case Iop_ShlN32x2: shl = True; size = 32; break;
   7249       case Iop_Shl64:    shl = True; size = 64; break;
   7250       case Iop_ShrN16x4: shr = True; size = 16; break;
   7251       case Iop_ShrN32x2: shr = True; size = 32; break;
   7252       case Iop_Shr64:    shr = True; size = 64; break;
   7253       case Iop_SarN16x4: sar = True; size = 16; break;
   7254       case Iop_SarN32x2: sar = True; size = 32; break;
   7255       default: vassert(0);
   7256    }
   7257 
   7258    if (shl || shr) {
   7259      assign(
   7260         g1,
   7261         IRExpr_ITE(
   7262            binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
   7263            binop(op, mkexpr(g0), mkexpr(amt8)),
   7264            mkU64(0)
   7265         )
   7266      );
   7267    } else
   7268    if (sar) {
   7269      assign(
   7270         g1,
   7271         IRExpr_ITE(
   7272            binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
   7273            binop(op, mkexpr(g0), mkexpr(amt8)),
   7274            binop(op, mkexpr(g0), mkU8(size-1))
   7275         )
   7276      );
   7277    } else {
   7278       vassert(0);
   7279    }
   7280 
   7281    putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
   7282    return delta;
   7283 }
   7284 
   7285 
   7286 /* Vector by scalar shift of E by an immediate byte.  This is a
   7287    straight copy of dis_SSE_shiftE_imm. */
   7288 
   7289 static
   7290 ULong dis_MMX_shiftE_imm ( Long delta, const HChar* opname, IROp op )
   7291 {
   7292    Bool    shl, shr, sar;
   7293    UChar   rm   = getUChar(delta);
   7294    IRTemp  e0   = newTemp(Ity_I64);
   7295    IRTemp  e1   = newTemp(Ity_I64);
   7296    UChar   amt, size;
   7297    vassert(epartIsReg(rm));
   7298    vassert(gregLO3ofRM(rm) == 2
   7299            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   7300    amt = getUChar(delta+1);
   7301    delta += 2;
   7302    DIP("%s $%d,%s\n", opname,
   7303                       (Int)amt,
   7304                       nameMMXReg(eregLO3ofRM(rm)) );
   7305 
   7306    assign( e0, getMMXReg(eregLO3ofRM(rm)) );
   7307 
   7308    shl = shr = sar = False;
   7309    size = 0;
   7310    switch (op) {
   7311       case Iop_ShlN16x4: shl = True; size = 16; break;
   7312       case Iop_ShlN32x2: shl = True; size = 32; break;
   7313       case Iop_Shl64:    shl = True; size = 64; break;
   7314       case Iop_SarN16x4: sar = True; size = 16; break;
   7315       case Iop_SarN32x2: sar = True; size = 32; break;
   7316       case Iop_ShrN16x4: shr = True; size = 16; break;
   7317       case Iop_ShrN32x2: shr = True; size = 32; break;
   7318       case Iop_Shr64:    shr = True; size = 64; break;
   7319       default: vassert(0);
   7320    }
   7321 
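           /* E.g. "psrlw $20, %mm0": amt (20) >= size (16), so every lane
              is zeroed; "psraw $20, %mm0" instead clamps the shift to
              size-1 (15), replicating each lane's sign bit. */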
   7322    if (shl || shr) {
   7323      assign( e1, amt >= size
   7324                     ? mkU64(0)
   7325                     : binop(op, mkexpr(e0), mkU8(amt))
   7326      );
   7327    } else
   7328    if (sar) {
   7329      assign( e1, amt >= size
   7330                     ? binop(op, mkexpr(e0), mkU8(size-1))
   7331                     : binop(op, mkexpr(e0), mkU8(amt))
   7332      );
   7333    } else {
   7334       vassert(0);
   7335    }
   7336 
   7337    putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
   7338    return delta;
   7339 }
   7340 
   7341 
   7342 /* Completely handle all MMX instructions except emms. */
   7343 
   7344 static
   7345 ULong dis_MMX ( Bool* decode_ok,
   7346                 const VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
   7347 {
   7348    Int   len;
   7349    UChar modrm;
   7350    HChar dis_buf[50];
   7351    UChar opc = getUChar(delta);
   7352    delta++;
   7353 
   7354    /* dis_MMX handles all insns except emms. */
   7355    do_MMX_preamble();
   7356 
   7357    switch (opc) {
   7358 
   7359       case 0x6E:
   7360          if (sz == 4) {
   7361             /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
   7362             modrm = getUChar(delta);
   7363             if (epartIsReg(modrm)) {
   7364                delta++;
   7365                putMMXReg(
   7366                   gregLO3ofRM(modrm),
   7367                   binop( Iop_32HLto64,
   7368                          mkU32(0),
   7369                          getIReg32(eregOfRexRM(pfx,modrm)) ) );
   7370                DIP("movd %s, %s\n",
   7371                    nameIReg32(eregOfRexRM(pfx,modrm)),
   7372                    nameMMXReg(gregLO3ofRM(modrm)));
   7373             } else {
   7374                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7375                delta += len;
   7376                putMMXReg(
   7377                   gregLO3ofRM(modrm),
   7378                   binop( Iop_32HLto64,
   7379                          mkU32(0),
   7380                          loadLE(Ity_I32, mkexpr(addr)) ) );
   7381                DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   7382             }
   7383          }
   7384          else
   7385          if (sz == 8) {
   7386             /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
   7387             modrm = getUChar(delta);
   7388             if (epartIsReg(modrm)) {
   7389                delta++;
   7390                putMMXReg( gregLO3ofRM(modrm),
   7391                           getIReg64(eregOfRexRM(pfx,modrm)) );
   7392                DIP("movd %s, %s\n",
   7393                    nameIReg64(eregOfRexRM(pfx,modrm)),
   7394                    nameMMXReg(gregLO3ofRM(modrm)));
   7395             } else {
   7396                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7397                delta += len;
   7398                putMMXReg( gregLO3ofRM(modrm),
   7399                           loadLE(Ity_I64, mkexpr(addr)) );
   7400                DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   7401             }
   7402          }
   7403          else {
   7404             goto mmx_decode_failure;
   7405          }
   7406          break;
   7407 
   7408       case 0x7E:
   7409          if (sz == 4) {
   7410             /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
   7411             modrm = getUChar(delta);
   7412             if (epartIsReg(modrm)) {
   7413                delta++;
   7414                putIReg32( eregOfRexRM(pfx,modrm),
   7415                           unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
   7416                DIP("movd %s, %s\n",
   7417                    nameMMXReg(gregLO3ofRM(modrm)),
   7418                    nameIReg32(eregOfRexRM(pfx,modrm)));
   7419             } else {
   7420                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7421                delta += len;
   7422                storeLE( mkexpr(addr),
   7423                         unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
   7424                DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   7425             }
   7426          }
   7427          else
   7428          if (sz == 8) {
   7429             /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
   7430             modrm = getUChar(delta);
   7431             if (epartIsReg(modrm)) {
   7432                delta++;
   7433                putIReg64( eregOfRexRM(pfx,modrm),
   7434                           getMMXReg(gregLO3ofRM(modrm)) );
   7435                DIP("movd %s, %s\n",
   7436                    nameMMXReg(gregLO3ofRM(modrm)),
   7437                    nameIReg64(eregOfRexRM(pfx,modrm)));
   7438             } else {
   7439                IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7440                delta += len;
   7441                storeLE( mkexpr(addr),
   7442                        getMMXReg(gregLO3ofRM(modrm)) );
   7443                DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   7444             }
   7445          } else {
   7446             goto mmx_decode_failure;
   7447          }
   7448          break;
   7449 
   7450       case 0x6F:
   7451          /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
   7452          if (sz != 4
   7453              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7454             goto mmx_decode_failure;
   7455          modrm = getUChar(delta);
   7456          if (epartIsReg(modrm)) {
   7457             delta++;
   7458             putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
   7459             DIP("movq %s, %s\n",
   7460                 nameMMXReg(eregLO3ofRM(modrm)),
   7461                 nameMMXReg(gregLO3ofRM(modrm)));
   7462          } else {
   7463             IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7464             delta += len;
   7465             putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
   7466             DIP("movq %s, %s\n",
   7467                 dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
   7468          }
   7469          break;
   7470 
   7471       case 0x7F:
   7472          /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
   7473          if (sz != 4
   7474              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7475             goto mmx_decode_failure;
   7476          modrm = getUChar(delta);
   7477          if (epartIsReg(modrm)) {
   7478             delta++;
   7479             putMMXReg( eregLO3ofRM(modrm), getMMXReg(gregLO3ofRM(modrm)) );
   7480             DIP("movq %s, %s\n",
   7481                 nameMMXReg(gregLO3ofRM(modrm)),
   7482                 nameMMXReg(eregLO3ofRM(modrm)));
   7483          } else {
   7484             IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   7485             delta += len;
   7486             storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
   7487             DIP("mov(nt)q %s, %s\n",
   7488                 nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
   7489          }
   7490          break;
   7491 
   7492       case 0xFC:
   7493       case 0xFD:
   7494       case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
   7495          if (sz != 4)
   7496             goto mmx_decode_failure;
   7497          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
   7498          break;
   7499 
   7500       case 0xEC:
   7501       case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7502          if (sz != 4
   7503              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7504             goto mmx_decode_failure;
   7505          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
   7506          break;
   7507 
   7508       case 0xDC:
   7509       case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7510          if (sz != 4)
   7511             goto mmx_decode_failure;
   7512          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
   7513          break;
   7514 
   7515       case 0xF8:
   7516       case 0xF9:
   7517       case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
   7518          if (sz != 4)
   7519             goto mmx_decode_failure;
   7520          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
   7521          break;
   7522 
   7523       case 0xE8:
   7524       case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7525          if (sz != 4)
   7526             goto mmx_decode_failure;
   7527          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
   7528          break;
   7529 
   7530       case 0xD8:
   7531       case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
   7532          if (sz != 4)
   7533             goto mmx_decode_failure;
   7534          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
   7535          break;
   7536 
   7537       case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
   7538          if (sz != 4)
   7539             goto mmx_decode_failure;
   7540          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
   7541          break;
   7542 
   7543       case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
   7544          if (sz != 4)
   7545             goto mmx_decode_failure;
   7546          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
   7547          break;
   7548 
   7549       case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
   7550          vassert(sz == 4);
   7551          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
   7552          break;
   7553 
   7554       case 0x74:
   7555       case 0x75:
   7556       case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
   7557          if (sz != 4)
   7558             goto mmx_decode_failure;
   7559          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
   7560          break;
   7561 
   7562       case 0x64:
   7563       case 0x65:
   7564       case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
   7565          if (sz != 4)
   7566             goto mmx_decode_failure;
   7567          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
   7568          break;
   7569 
   7570       case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
   7571          if (sz != 4)
   7572             goto mmx_decode_failure;
   7573          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
   7574          break;
   7575 
   7576       case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
   7577          if (sz != 4)
   7578             goto mmx_decode_failure;
   7579          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
   7580          break;
   7581 
   7582       case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
   7583          if (sz != 4)
   7584             goto mmx_decode_failure;
   7585          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
   7586          break;
   7587 
   7588       case 0x68:
   7589       case 0x69:
   7590       case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
   7591          if (sz != 4
   7592              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7593             goto mmx_decode_failure;
   7594          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
   7595          break;
   7596 
   7597       case 0x60:
   7598       case 0x61:
   7599       case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7600          if (sz != 4
   7601              && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
   7602             goto mmx_decode_failure;
   7603          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
   7604          break;
   7605 
   7606       case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
   7607          if (sz != 4)
   7608             goto mmx_decode_failure;
   7609          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
   7610          break;
   7611 
   7612       case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
   7613          if (sz != 4)
   7614             goto mmx_decode_failure;
   7615          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
   7616          break;
   7617 
   7618       case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
   7619          if (sz != 4)
   7620             goto mmx_decode_failure;
   7621          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
   7622          break;
   7623 
   7624       case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
   7625          if (sz != 4)
   7626             goto mmx_decode_failure;
   7627          delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
   7628          break;
   7629 
   7630 #     define SHIFT_BY_REG(_name,_op)                                     \
   7631                 delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
   7632                 break;
   7633 
   7634       /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7635       case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
   7636       case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
   7637       case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
   7638 
   7639       /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
   7640       case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
   7641       case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
   7642       case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
   7643 
   7644       /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
   7645       case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
   7646       case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
   7647 
   7648 #     undef SHIFT_BY_REG
   7649 
   7650       case 0x71:
   7651       case 0x72:
   7652       case 0x73: {
   7653          /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
   7654          UChar byte2, subopc;
   7655          if (sz != 4)
   7656             goto mmx_decode_failure;
   7657          byte2  = getUChar(delta);      /* amode / sub-opcode */
   7658          subopc = toUChar( (byte2 >> 3) & 7 );
   7659 
   7660 #        define SHIFT_BY_IMM(_name,_op)                        \
   7661             do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
   7662             } while (0)
   7663 
   7664               if (subopc == 2 /*SRL*/ && opc == 0x71)
   7665                   SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
   7666          else if (subopc == 2 /*SRL*/ && opc == 0x72)
   7667                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
   7668          else if (subopc == 2 /*SRL*/ && opc == 0x73)
   7669                  SHIFT_BY_IMM("psrlq", Iop_Shr64);
   7670 
   7671          else if (subopc == 4 /*SAR*/ && opc == 0x71)
   7672                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
   7673          else if (subopc == 4 /*SAR*/ && opc == 0x72)
   7674                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);
   7675 
   7676          else if (subopc == 6 /*SHL*/ && opc == 0x71)
   7677                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
   7678          else if (subopc == 6 /*SHL*/ && opc == 0x72)
   7679                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
   7680          else if (subopc == 6 /*SHL*/ && opc == 0x73)
   7681                  SHIFT_BY_IMM("psllq", Iop_Shl64);
   7682 
   7683          else goto mmx_decode_failure;
   7684 
   7685 #        undef SHIFT_BY_IMM
   7686          break;
   7687       }
   7688 
   7689       case 0xF7: {
   7690          IRTemp addr    = newTemp(Ity_I64);
   7691          IRTemp regD    = newTemp(Ity_I64);
   7692          IRTemp regM    = newTemp(Ity_I64);
   7693          IRTemp mask    = newTemp(Ity_I64);
   7694          IRTemp olddata = newTemp(Ity_I64);
   7695          IRTemp newdata = newTemp(Ity_I64);
   7696 
   7697          modrm = getUChar(delta);
   7698          if (sz != 4 || (!epartIsReg(modrm)))
   7699             goto mmx_decode_failure;
   7700          delta++;
   7701 
   7702          assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   7703          assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
   7704          assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
   7705          assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
   7706          assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
   7707          assign( newdata,
   7708                  binop(Iop_Or64,
   7709                        binop(Iop_And64,
   7710                              mkexpr(regD),
   7711                              mkexpr(mask) ),
   7712                        binop(Iop_And64,
   7713                              mkexpr(olddata),
   7714                              unop(Iop_Not64, mkexpr(mask)))) );
   7715          storeLE( mkexpr(addr), mkexpr(newdata) );
   7716          DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
   7717                                  nameMMXReg( gregLO3ofRM(modrm) ) );
   7718          break;
   7719       }
   7720 
   7721       /* --- MMX decode failure --- */
   7722       default:
   7723       mmx_decode_failure:
   7724          *decode_ok = False;
   7725          return delta; /* ignored */
   7726 
   7727    }
   7728 
   7729    *decode_ok = True;
   7730    return delta;
   7731 }
   7732 
   7733 
   7734 /*------------------------------------------------------------*/
   7735 /*--- More misc arithmetic and other obscure insns.        ---*/
   7736 /*------------------------------------------------------------*/
   7737 
   7738 /* Generate base << amt with vacated places filled with stuff
   7739    from xtra.  amt guaranteed in 0 .. 63. */
   7740 static
   7741 IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
   7742 {
   7743    /* if   amt == 0
   7744       then base
   7745       else (base << amt) | (xtra >>u (64-amt))
   7746    */
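           /* E.g. base = 0x00000000000000FF, xtra = 0x8000000000000000,
              amt = 4:  (base << 4) | (xtra >>u 60) == 0xFF8, i.e. the
              top bits of xtra fill the vacated low bits. */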
   7747    return
   7748       IRExpr_ITE(
   7749          binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
   7750          binop(Iop_Or64,
   7751                binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
   7752                binop(Iop_Shr64, mkexpr(xtra),
   7753                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7754                ),
   7755          mkexpr(base)
   7756       );
   7757 }
   7758 
   7759 /* Generate base >>u amt with vacated places filled with stuff
   7760    from xtra.  amt guaranteed in 0 .. 63. */
   7761 static
   7762 IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
   7763 {
   7764    /* if   amt == 0
   7765       then base
   7766       else (base >>u amt) | (xtra << (64-amt))
   7767    */
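           /* E.g. base = 0x0000000000000F00, xtra = 0x0000000000000001,
              amt = 4:  (base >>u 4) | (xtra << 60) == 0x10000000000000F0,
              i.e. the low bits of xtra fill the vacated high bits. */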
   7768    return
   7769       IRExpr_ITE(
   7770          binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
   7771          binop(Iop_Or64,
   7772                binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
   7773                binop(Iop_Shl64, mkexpr(xtra),
   7774                                 binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
   7775                ),
   7776          mkexpr(base)
   7777       );
   7778 }
   7779 
   7780 /* Double length left and right shifts.  Apparently only required in
   7781    v-size (no b- variant). */
   7782 static
   7783 ULong dis_SHLRD_Gv_Ev ( const VexAbiInfo* vbi,
   7784                         Prefix pfx,
   7785                         Long delta, UChar modrm,
   7786                         Int sz,
   7787                         IRExpr* shift_amt,
   7788                         Bool amt_is_literal,
   7789                         const HChar* shift_amt_txt,
   7790                         Bool left_shift )
   7791 {
   7792    /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
   7793       for printing it.  And delta on entry points at the modrm byte. */
   7794    Int len;
   7795    HChar dis_buf[50];
   7796 
   7797    IRType ty     = szToITy(sz);
   7798    IRTemp gsrc   = newTemp(ty);
   7799    IRTemp esrc   = newTemp(ty);
   7800    IRTemp addr   = IRTemp_INVALID;
   7801    IRTemp tmpSH  = newTemp(Ity_I8);
   7802    IRTemp tmpSS  = newTemp(Ity_I8);
   7803    IRTemp tmp64  = IRTemp_INVALID;
   7804    IRTemp res64  = IRTemp_INVALID;
   7805    IRTemp rss64  = IRTemp_INVALID;
   7806    IRTemp resTy  = IRTemp_INVALID;
   7807    IRTemp rssTy  = IRTemp_INVALID;
   7808    Int    mask   = sz==8 ? 63 : 31;
   7809 
   7810    vassert(sz == 2 || sz == 4 || sz == 8);
   7811 
   7812    /* The E-part is the destination; this is shifted.  The G-part
   7813       supplies bits to be shifted into the E-part, but is not
   7814       changed.
   7815 
   7816       If shifting left, form a double-length word with E at the top
   7817       and G at the bottom, and shift this left.  The result is then in
   7818       the high part.
   7819 
   7820       If shifting right, form a double-length word with G at the top
   7821       and E at the bottom, and shift this right.  The result is then
   7822       at the bottom.  */
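           /* E.g. for a 32-bit shld with E = 0xAAAAAAAA, G = 0x55555555
              and a shift amount of 8: the double-length word is
              0xAAAAAAAA55555555; shifting it left by 8 and keeping the
              high half yields 0xAAAAAA55, i.e. the top 8 bits of G are
              shifted into the bottom of E. */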
   7823 
   7824    /* Fetch the operands. */
   7825 
   7826    assign( gsrc, getIRegG(sz, pfx, modrm) );
   7827 
   7828    if (epartIsReg(modrm)) {
   7829       delta++;
   7830       assign( esrc, getIRegE(sz, pfx, modrm) );
   7831       DIP("sh%cd%c %s, %s, %s\n",
   7832           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   7833           shift_amt_txt,
   7834           nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
   7835    } else {
   7836       addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
   7837                         /* # bytes following amode */
   7838                         amt_is_literal ? 1 : 0 );
   7839       delta += len;
   7840       assign( esrc, loadLE(ty, mkexpr(addr)) );
   7841       DIP("sh%cd%c %s, %s, %s\n",
   7842           ( left_shift ? 'l' : 'r' ), nameISize(sz),
   7843           shift_amt_txt,
   7844           nameIRegG(sz, pfx, modrm), dis_buf);
   7845    }
   7846 
   7847    /* Calculate the masked shift amount (tmpSH), the masked subshift
   7848       amount (tmpSS, viz. (tmpSH-1) masked), the shifted value (res64)
   7849       and the subshifted value (rss64), used below for the flags thunk. */
   7850 
   7851    assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
   7852    assign( tmpSS, binop(Iop_And8,
   7853                         binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
   7854                         mkU8(mask)));
   7855 
   7856    tmp64 = newTemp(Ity_I64);
   7857    res64 = newTemp(Ity_I64);
   7858    rss64 = newTemp(Ity_I64);
   7859 
   7860    if (sz == 2 || sz == 4) {
   7861 
   7862       /* G is xtra; E is data */
   7863       /* what a freaking nightmare: */
   7864       if (sz == 4 && left_shift) {
   7865          assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
   7866          assign( res64,
   7867                  binop(Iop_Shr64,
   7868                        binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
   7869                        mkU8(32)) );
   7870          assign( rss64,
   7871                  binop(Iop_Shr64,
   7872                        binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
   7873                        mkU8(32)) );
   7874       }
   7875       else
   7876       if (sz == 4 && !left_shift) {
   7877          assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
   7878          assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
   7879          assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
   7880       }
   7881       else
   7882       if (sz == 2 && left_shift) {
   7883          assign( tmp64,
   7884                  binop(Iop_32HLto64,
   7885                        binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
   7886                        binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
   7887          ));
   7888          /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
   7889          assign( res64,
   7890                  binop(Iop_Shr64,
   7891                        binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
   7892                        mkU8(48)) );
   7893          /* subshift formed by shifting [esrc'0000'0000'0000] */
   7894          assign( rss64,
   7895                  binop(Iop_Shr64,
   7896                        binop(Iop_Shl64,
   7897                              binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
   7898                                               mkU8(48)),
   7899                              mkexpr(tmpSS)),
   7900                        mkU8(48)) );
   7901       }
   7902       else
   7903       if (sz == 2 && !left_shift) {
   7904          assign( tmp64,
   7905                  binop(Iop_32HLto64,
   7906                        binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
   7907                        binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
   7908          ));
   7909          /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
   7910          assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
   7911          /* subshift formed by shifting [0000'0000'0000'esrc] */
   7912          assign( rss64, binop(Iop_Shr64,
   7913                               unop(Iop_16Uto64, mkexpr(esrc)),
   7914                               mkexpr(tmpSS)) );
   7915       }
   7916 
   7917    } else {
   7918 
   7919       vassert(sz == 8);
   7920       if (left_shift) {
   7921          assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
   7922          assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
   7923       } else {
   7924          assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
   7925          assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
   7926       }
   7927 
   7928    }
   7929 
   7930    resTy = newTemp(ty);
   7931    rssTy = newTemp(ty);
   7932    assign( resTy, narrowTo(ty, mkexpr(res64)) );
   7933    assign( rssTy, narrowTo(ty, mkexpr(rss64)) );
   7934 
   7935    /* Put result back and write the flags thunk. */
   7936    setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
   7937                               resTy, rssTy, ty, tmpSH );
   7938 
   7939    if (epartIsReg(modrm)) {
   7940       putIRegE(sz, pfx, modrm, mkexpr(resTy));
   7941    } else {
   7942       storeLE( mkexpr(addr), mkexpr(resTy) );
   7943    }
   7944 
   7945    if (amt_is_literal) delta++;
   7946    return delta;
   7947 }
   7948 
   7949 
   7950 /* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   7951    required. */
   7952 
   7953 typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
   7954 
   7955 static const HChar* nameBtOp ( BtOp op )
   7956 {
   7957    switch (op) {
   7958       case BtOpNone:  return "";
   7959       case BtOpSet:   return "s";
   7960       case BtOpReset: return "r";
   7961       case BtOpComp:  return "c";
   7962       default: vpanic("nameBtOp(amd64)");
   7963    }
   7964 }
   7965 
   7966 
   7967 static
   7968 ULong dis_bt_G_E ( const VexAbiInfo* vbi,
   7969                    Prefix pfx, Int sz, Long delta, BtOp op,
   7970                    /*OUT*/Bool* decode_OK )
   7971 {
   7972    HChar  dis_buf[50];
   7973    UChar  modrm;
   7974    Int    len;
   7975    IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
   7976           t_addr1, t_rsp, t_mask, t_new;
   7977 
   7978    vassert(sz == 2 || sz == 4 || sz == 8);
   7979 
   7980    t_fetched = t_bitno0 = t_bitno1 = t_bitno2
   7981              = t_addr0 = t_addr1 = t_rsp
   7982              = t_mask = t_new = IRTemp_INVALID;
   7983 
   7984    t_fetched = newTemp(Ity_I8);
   7985    t_new     = newTemp(Ity_I8);
   7986    t_bitno0  = newTemp(Ity_I64);
   7987    t_bitno1  = newTemp(Ity_I64);
   7988    t_bitno2  = newTemp(Ity_I8);
   7989    t_addr1   = newTemp(Ity_I64);
   7990    modrm     = getUChar(delta);
   7991 
   7992    *decode_OK = True;
   7993    if (epartIsReg(modrm)) {
   7994       /* F2 and F3 are never acceptable. */
   7995       if (haveF2orF3(pfx)) {
   7996          *decode_OK = False;
   7997          return delta;
   7998       }
   7999    } else {
   8000       /* F2 or F3 (but not both) are allowed, provided LOCK is also
   8001          present, and only for the BTC/BTS/BTR cases (not BT). */
   8002       if (haveF2orF3(pfx)) {
   8003          if (haveF2andF3(pfx) || !haveLOCK(pfx) || op == BtOpNone) {
   8004             *decode_OK = False;
   8005             return delta;
   8006          }
   8007       }
   8008    }
   8009 
   8010    assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );
   8011 
   8012    if (epartIsReg(modrm)) {
   8013       delta++;
   8014       /* Get it onto the client's stack.  Oh, this is a horrible
   8015          kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
   8016          Because of the ELF ABI stack redzone, there may be live data
   8017          up to 128 bytes below %RSP.  So we can't just push it on the
   8018          stack, else we may wind up trashing live data, and causing
   8019          impossible-to-find simulation errors.  (Yes, this did
   8020          happen.)  So we need to drop RSP by at least 128 before
   8021          pushing it.  That unfortunately means hitting Memcheck's
   8022          fast-case painting code.  Ideally we should drop more than
   8023          128, to reduce the chances of breaking buggy programs that
   8024          have live data below -128(%RSP).  Memcheck fast-cases moves
   8025          of 288 bytes due to the need to handle ppc64-linux quickly,
   8026          so let's use 288.  Of course the real fix is to get rid of
   8027          this kludge entirely.  */
   8028       t_rsp = newTemp(Ity_I64);
   8029       t_addr0 = newTemp(Ity_I64);
   8030 
   8031       vassert(vbi->guest_stack_redzone_size == 128);
   8032       assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
   8033       putIReg64(R_RSP, mkexpr(t_rsp));
   8034 
   8035       storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );
   8036 
   8037       /* Make t_addr0 point at it. */
   8038       assign( t_addr0, mkexpr(t_rsp) );
   8039 
   8040       /* Mask out upper bits of the shift amount, since we're doing a
   8041          reg. */
   8042       assign( t_bitno1, binop(Iop_And64,
   8043                               mkexpr(t_bitno0),
   8044                               mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );
   8045 
   8046    } else {
   8047       t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
   8048       delta += len;
   8049       assign( t_bitno1, mkexpr(t_bitno0) );
   8050    }
   8051 
   8052    /* At this point: t_addr0 is the address being operated on.  If it
   8053       was a reg, we will have pushed it onto the client's stack.
   8054       t_bitno1 is the bit number, suitably masked in the case of a
   8055       reg.  */
   8056 
   8057    /* Now the main sequence. */
   8058    assign( t_addr1,
   8059            binop(Iop_Add64,
   8060                  mkexpr(t_addr0),
   8061                  binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );
   8062 
   8063    /* t_addr1 now holds effective address */
   8064 
   8065    assign( t_bitno2,
   8066            unop(Iop_64to8,
   8067                 binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );
   8068 
   8069    /* t_bitno2 contains offset of bit within byte */
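           /* E.g. for bit number 37: t_addr1 = t_addr0 + (37 >> 3), i.e.
              t_addr0 + 4, and t_bitno2 = 37 & 7 == 5, so the byte mask
              built below (for BTS/BTR/BTC) is 1 << 5 == 0x20. */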
   8070 
   8071    if (op != BtOpNone) {
   8072       t_mask = newTemp(Ity_I8);
   8073       assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   8074    }
   8075 
   8076    /* t_mask is now a suitable byte mask */
   8077 
   8078    assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
   8079 
   8080    if (op != BtOpNone) {
   8081       switch (op) {
   8082          case BtOpSet:
   8083             assign( t_new,
   8084                     binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
   8085             break;
   8086          case BtOpComp:
   8087             assign( t_new,
   8088                     binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
   8089             break;
   8090          case BtOpReset:
   8091             assign( t_new,
   8092                     binop(Iop_And8, mkexpr(t_fetched),
   8093                                     unop(Iop_Not8, mkexpr(t_mask))) );
   8094             break;
   8095          default:
   8096             vpanic("dis_bt_G_E(amd64)");
   8097       }
   8098       if ((haveLOCK(pfx)) && !epartIsReg(modrm)) {
   8099          casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
   8100                                  mkexpr(t_new)/*new*/,
   8101                                  guest_RIP_curr_instr );
   8102       } else {
   8103          storeLE( mkexpr(t_addr1), mkexpr(t_new) );
   8104       }
   8105    }
   8106 
   8107    /* Side effect done; now get selected bit into Carry flag */
   8108    /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   8109    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   8110    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   8111    stmt( IRStmt_Put(
   8112             OFFB_CC_DEP1,
   8113             binop(Iop_And64,
   8114                   binop(Iop_Shr64,
   8115                         unop(Iop_8Uto64, mkexpr(t_fetched)),
   8116                         mkexpr(t_bitno2)),
   8117                   mkU64(1)))
   8118        );
   8119    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   8120       elimination of previous stores to this field work better. */
   8121    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   8122 
   8123    /* Move reg operand from stack back to reg */
   8124    if (epartIsReg(modrm)) {
   8125       /* t_rsp still points at it. */
   8126       /* only write the reg if actually modifying it; doing otherwise
   8127          zeroes the top half erroneously when doing btl due to
   8128          standard zero-extend rule */
   8129       if (op != BtOpNone)
   8130          putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
   8131       putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
   8132    }
   8133 
   8134    DIP("bt%s%c %s, %s\n",
   8135        nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
   8136        ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );
   8137 
   8138    return delta;
   8139 }
   8140 
   8141 
   8142 
   8143 /* Handle BSF/BSR.  Only v-size seems necessary. */
   8144 static
   8145 ULong dis_bs_E_G ( const VexAbiInfo* vbi,
   8146                    Prefix pfx, Int sz, Long delta, Bool fwds )
   8147 {
   8148    Bool   isReg;
   8149    UChar  modrm;
   8150    HChar  dis_buf[50];
   8151 
   8152    IRType ty    = szToITy(sz);
   8153    IRTemp src   = newTemp(ty);
   8154    IRTemp dst   = newTemp(ty);
   8155    IRTemp src64 = newTemp(Ity_I64);
   8156    IRTemp dst64 = newTemp(Ity_I64);
   8157    IRTemp srcB  = newTemp(Ity_I1);
   8158 
   8159    vassert(sz == 8 || sz == 4 || sz == 2);
   8160 
   8161    modrm = getUChar(delta);
   8162    isReg = epartIsReg(modrm);
   8163    if (isReg) {
   8164       delta++;
   8165       assign( src, getIRegE(sz, pfx, modrm) );
   8166    } else {
   8167       Int    len;
   8168       IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
   8169       delta += len;
   8170       assign( src, loadLE(ty, mkexpr(addr)) );
   8171    }
   8172 
   8173    DIP("bs%c%c %s, %s\n",
   8174        fwds ? 'f' : 'r', nameISize(sz),
   8175        ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
   8176        nameIRegG(sz, pfx, modrm));
   8177 
   8178    /* First, widen src to 64 bits if it is not already. */
   8179    assign( src64, widenUto64(mkexpr(src)) );
   8180 
   8181    /* Generate a bool expression which is zero iff the original is
   8182       zero, and nonzero otherwise.  Ask for a CmpNE version which, if
   8183       instrumented by Memcheck, is instrumented expensively, since
   8184       this may be used on the output of a preceding movmskb insn,
   8185       which has been known to be partially defined, and in need of
   8186       careful handling. */
   8187    assign( srcB, binop(Iop_ExpCmpNE64, mkexpr(src64), mkU64(0)) );
   8188 
   8189    /* Flags: Z is 1 iff source value is zero.  All others
   8190       are undefined -- we force them to zero. */
   8191    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   8192    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   8193    stmt( IRStmt_Put(
   8194             OFFB_CC_DEP1,
   8195             IRExpr_ITE( mkexpr(srcB),
   8196                         /* src!=0 */
   8197                         mkU64(0),
   8198                         /* src==0 */
   8199                         mkU64(AMD64G_CC_MASK_Z)
   8200                         )
   8201        ));
   8202    /* Set NDEP even though it isn't used.  This makes redundant-PUT
   8203       elimination of previous stores to this field work better. */
   8204    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   8205 
   8206    /* Result: iff source value is zero, we can't use
   8207       Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
   8208       But anyway, amd64 semantics say the result is undefined in
   8209       such situations.  Hence handle the zero case specially. */
   8210 
   8211    /* Bleh.  What we compute:
   8212 
   8213           bsf64:  if src == 0 then {dst is unchanged}
   8214                               else Ctz64(src)
   8215 
   8216           bsr64:  if src == 0 then {dst is unchanged}
   8217                               else 63 - Clz64(src)
   8218 
   8219           bsf32:  if src == 0 then {dst is unchanged}
   8220                               else Ctz64(32Uto64(src))
   8221 
   8222           bsr32:  if src == 0 then {dst is unchanged}
   8223                               else 63 - Clz64(32Uto64(src))
   8224 
   8225           bsf16:  if src == 0 then {dst is unchanged}
   8226                               else Ctz64(32Uto64(16Uto32(src)))
   8227 
   8228           bsr16:  if src == 0 then {dst is unchanged}
   8229                               else 63 - Clz64(32Uto64(16Uto32(src)))
   8230    */
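           /* Worked example, added for illustration: for bsr32 with
              src = 0x00008000, src64 = 0x0000000000008000, Clz64 gives
              48, so the result is 63 - 48 = 15 -- the index of the
              highest set bit.  For bsf32 on the same value, Ctz64 also
              gives 15.  The widening is harmless: it only adds leading
              zeroes, which Ctz64 never examines and which the "63 -"
              form compensates for. */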
   8231 
   8232    /* The main computation, guarding against zero. */
   8233    assign( dst64,
   8234            IRExpr_ITE(
   8235               mkexpr(srcB),
   8236               /* src != 0 */
   8237               fwds ? unop(Iop_Ctz64, mkexpr(src64))
   8238                    : binop(Iop_Sub64,
   8239                            mkU64(63),
   8240                            unop(Iop_Clz64, mkexpr(src64))),
   8241               /* src == 0 -- leave dst unchanged */
   8242               widenUto64( getIRegG( sz, pfx, modrm ) )
   8243            )
   8244          );
   8245 
   8246    if (sz == 2)
   8247       assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
   8248    else
   8249    if (sz == 4)
   8250       assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
   8251    else
   8252       assign( dst, mkexpr(dst64) );
   8253 
   8254    /* dump result back */
   8255    putIRegG( sz, pfx, modrm, mkexpr(dst) );
   8256 
   8257    return delta;
   8258 }
   8259 
   8260 
   8261 /* Swap rAX with the register specified by regLo3 and REX.B. */
   8262 static
   8263 void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
   8264 {
   8265    IRType ty = szToITy(sz);
   8266    IRTemp t1 = newTemp(ty);
   8267    IRTemp t2 = newTemp(ty);
   8268    vassert(sz == 2 || sz == 4 || sz == 8);
   8269    vassert(regLo3 < 8);
   8270    if (sz == 8) {
   8271       assign( t1, getIReg64(R_RAX) );
   8272       assign( t2, getIRegRexB(8, pfx, regLo3) );
   8273       putIReg64( R_RAX, mkexpr(t2) );
   8274       putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
   8275    } else if (sz == 4) {
   8276       assign( t1, getIReg32(R_RAX) );
   8277       assign( t2, getIRegRexB(4, pfx, regLo3) );
   8278       putIReg32( R_RAX, mkexpr(t2) );
   8279       putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
   8280    } else {
   8281       assign( t1, getIReg16(R_RAX) );
   8282       assign( t2, getIRegRexB(2, pfx, regLo3) );
   8283       putIReg16( R_RAX, mkexpr(t2) );
   8284       putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
   8285    }
   8286    DIP("xchg%c %s, %s\n",
   8287        nameISize(sz), nameIRegRAX(sz),
   8288                       nameIRegRexB(sz,pfx, regLo3));
   8289 }
   8290 
   8291 
   8292 static
   8293 void codegen_SAHF ( void )
   8294 {
   8295    /* Set the flags to:
   8296       (amd64g_calculate_rflags_all() & AMD64G_CC_MASK_O)
   8297                                     -- retain the old O flag
   8298       | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8299                 |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C))
   8300    */
   8301    ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8302                        |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
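           /* Illustrative note: the AMD64G_CC_MASK_* constants mirror
              the architectural rflags bit positions (C=bit 0, P=bit 2,
              A=bit 4, Z=bit 6, S=bit 7; see guest_amd64_defs.h), so
              mask_SZACP works out to 0xD5.  The PUT below therefore
              keeps exactly those five bits of %AH, ORs in the preserved
              O bit, and hands the result to the COPY thunk. */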
   8303    IRTemp oldflags   = newTemp(Ity_I64);
   8304    assign( oldflags, mk_amd64g_calculate_rflags_all() );
   8305    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   8306    stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   8307    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   8308    stmt( IRStmt_Put( OFFB_CC_DEP1,
   8309          binop(Iop_Or64,
   8310                binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
   8311                binop(Iop_And64,
   8312                      binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
   8313                      mkU64(mask_SZACP))
   8314               )
   8315    ));
   8316 }
   8317 
   8318 
   8319 static
   8320 void codegen_LAHF ( void  )
   8321 {
   8322    /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
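           /* Illustrative example of that layout: with S=1 Z=0 A=1 P=0
              C=1 the byte deposited in AH is 1001 0011b = 0x93.  Bits 5
              and 3 always read as 0 and bit 1 always reads as 1, hence
              the unconditional OR of mkU64(1<<1) below. */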
   8323    IRExpr* rax_with_hole;
   8324    IRExpr* new_byte;
   8325    IRExpr* new_rax;
   8326    ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
   8327                         |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
   8328 
   8329    IRTemp  flags = newTemp(Ity_I64);
   8330    assign( flags, mk_amd64g_calculate_rflags_all() );
   8331 
   8332    rax_with_hole
   8333       = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
   8334    new_byte
   8335       = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
   8336                         mkU64(1<<1));
   8337    new_rax
   8338       = binop(Iop_Or64, rax_with_hole,
   8339                         binop(Iop_Shl64, new_byte, mkU8(8)));
   8340    putIReg64(R_RAX, new_rax);
   8341 }
   8342 
   8343 
   8344 static
   8345 ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
   8346                         const VexAbiInfo*  vbi,
   8347                         Prefix       pfx,
   8348                         Int          size,
   8349                         Long         delta0 )
   8350 {
   8351    HChar dis_buf[50];
   8352    Int   len;
   8353 
   8354    IRType ty    = szToITy(size);
   8355    IRTemp acc   = newTemp(ty);
   8356    IRTemp src   = newTemp(ty);
   8357    IRTemp dest  = newTemp(ty);
   8358    IRTemp dest2 = newTemp(ty);
   8359    IRTemp acc2  = newTemp(ty);
   8360    IRTemp cond  = newTemp(Ity_I1);
   8361    IRTemp addr  = IRTemp_INVALID;
   8362    UChar  rm    = getUChar(delta0);
   8363 
   8364    /* There are 3 cases to consider:
   8365 
   8366       reg-reg: ignore any lock prefix, generate sequence based
   8367                on ITE
   8368 
   8369       reg-mem, not locked: ignore any lock prefix, generate sequence
   8370                            based on ITE
   8371 
   8372       reg-mem, locked: use IRCAS
   8373    */
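           /* For reference, all three cases implement the following
              architectural behaviour (sketch only; the flags are those
              of CMP rAX,E):

                 old = E;
                 if (rAX == old) { ZF = 1; E = G; }     // success
                 else            { ZF = 0; rAX = old; } // failure
           */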
   8374 
   8375    /* Decide whether F2 or F3 is acceptable.  Never for the
   8376       register case; for the memory case, one or the other is OK
   8377       provided LOCK is also present. */
   8378    if (epartIsReg(rm)) {
   8379       if (haveF2orF3(pfx)) {
   8380          *ok = False;
   8381          return delta0;
   8382       }
   8383    } else {
   8384       if (haveF2orF3(pfx)) {
   8385          if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
   8386             *ok = False;
   8387             return delta0;
   8388          }
   8389       }
   8390    }
   8391 
   8392    if (epartIsReg(rm)) {
   8393       /* case 1 */
   8394       assign( dest, getIRegE(size, pfx, rm) );
   8395       delta0++;
   8396       assign( src, getIRegG(size, pfx, rm) );
   8397       assign( acc, getIRegRAX(size) );
   8398       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   8399       assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
   8400       assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
   8401       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   8402       putIRegRAX(size, mkexpr(acc2));
   8403       putIRegE(size, pfx, rm, mkexpr(dest2));
   8404       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   8405                                nameIRegG(size,pfx,rm),
   8406                                nameIRegE(size,pfx,rm) );
   8407    }
   8408    else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
   8409       /* case 2 */
   8410       addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8411       assign( dest, loadLE(ty, mkexpr(addr)) );
   8412       delta0 += len;
   8413       assign( src, getIRegG(size, pfx, rm) );
   8414       assign( acc, getIRegRAX(size) );
   8415       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   8416       assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
   8417       assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
   8418       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   8419       putIRegRAX(size, mkexpr(acc2));
   8420       storeLE( mkexpr(addr), mkexpr(dest2) );
   8421       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   8422                                nameIRegG(size,pfx,rm), dis_buf);
   8423    }
   8424    else if (!epartIsReg(rm) && haveLOCK(pfx)) {
   8425       /* case 3 */
   8426       /* src is new value.  acc is expected value.  dest is old value.
   8427          Compute success from the output of the IRCAS, and steer the
   8428          new value for RAX accordingly: in case of success, RAX is
   8429          unchanged. */
   8430       addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8431       delta0 += len;
   8432       assign( src, getIRegG(size, pfx, rm) );
   8433       assign( acc, getIRegRAX(size) );
   8434       stmt( IRStmt_CAS(
   8435          mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
   8436                   NULL, mkexpr(acc), NULL, mkexpr(src) )
   8437       ));
   8438       setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
   8439       assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
   8440       assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
   8441       putIRegRAX(size, mkexpr(acc2));
   8442       DIP("cmpxchg%c %s,%s\n", nameISize(size),
   8443                                nameIRegG(size,pfx,rm), dis_buf);
   8444    }
   8445    else vassert(0);
   8446 
   8447    *ok = True;
   8448    return delta0;
   8449 }
   8450 
   8451 
   8452 /* Handle conditional move instructions of the form
   8453       cmovcc E(reg-or-mem), G(reg)
   8454 
   8455    E(src) is reg-or-mem
   8456    G(dst) is reg.
   8457 
   8458    If E is reg, -->    GET %E, tmps
   8459                        GET %G, tmpd
   8460                        CMOVcc tmps, tmpd
   8461                        PUT tmpd, %G
   8462 
   8463    If E is mem  -->    (getAddr E) -> tmpa
   8464                        LD (tmpa), tmps
   8465                        GET %G, tmpd
   8466                        CMOVcc tmps, tmpd
   8467                        PUT tmpd, %G
   8468 */
   8469 static
   8470 ULong dis_cmov_E_G ( const VexAbiInfo* vbi,
   8471                      Prefix        pfx,
   8472                      Int           sz,
   8473                      AMD64Condcode cond,
   8474                      Long          delta0 )
   8475 {
   8476    UChar rm  = getUChar(delta0);
   8477    HChar dis_buf[50];
   8478    Int   len;
   8479 
   8480    IRType ty   = szToITy(sz);
   8481    IRTemp tmps = newTemp(ty);
   8482    IRTemp tmpd = newTemp(ty);
   8483 
   8484    if (epartIsReg(rm)) {
   8485       assign( tmps, getIRegE(sz, pfx, rm) );
   8486       assign( tmpd, getIRegG(sz, pfx, rm) );
   8487 
   8488       putIRegG( sz, pfx, rm,
   8489                 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
   8490                             mkexpr(tmps),
   8491                             mkexpr(tmpd) )
   8492               );
   8493       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8494                             nameIRegE(sz,pfx,rm),
   8495                             nameIRegG(sz,pfx,rm));
   8496       return 1+delta0;
   8497    }
   8498 
   8499    /* E refers to memory */
   8500    {
   8501       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8502       assign( tmps, loadLE(ty, mkexpr(addr)) );
   8503       assign( tmpd, getIRegG(sz, pfx, rm) );
   8504 
   8505       putIRegG( sz, pfx, rm,
   8506                 IRExpr_ITE( mk_amd64g_calculate_condition(cond),
   8507                             mkexpr(tmps),
   8508                             mkexpr(tmpd) )
   8509               );
   8510 
   8511       DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
   8512                             dis_buf,
   8513                             nameIRegG(sz,pfx,rm));
   8514       return len+delta0;
   8515    }
   8516 }
   8517 
   8518 
   8519 static
   8520 ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
   8521                      const VexAbiInfo* vbi,
   8522                      Prefix pfx, Int sz, Long delta0 )
   8523 {
   8524    Int   len;
   8525    UChar rm = getUChar(delta0);
   8526    HChar dis_buf[50];
   8527 
   8528    IRType ty    = szToITy(sz);
   8529    IRTemp tmpd  = newTemp(ty);
   8530    IRTemp tmpt0 = newTemp(ty);
   8531    IRTemp tmpt1 = newTemp(ty);
   8532 
   8533    /* There are 3 cases to consider:
   8534 
   8535       reg-reg: ignore any lock prefix,
   8536                generate 'naive' (non-atomic) sequence
   8537 
   8538       reg-mem, not locked: ignore any lock prefix, generate 'naive'
   8539                            (non-atomic) sequence
   8540 
   8541       reg-mem, locked: use IRCAS
   8542    */
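           /* In all three cases the operation itself is (sketch):

                 tmp = E + G;
                 G   = E;      // old destination value goes to G
                 E   = tmp;

              with rflags set as for the ADD. */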
   8543 
   8544    if (epartIsReg(rm)) {
   8545       /* case 1 */
   8546       assign( tmpd, getIRegE(sz, pfx, rm) );
   8547       assign( tmpt0, getIRegG(sz, pfx, rm) );
   8548       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   8549                            mkexpr(tmpd), mkexpr(tmpt0)) );
   8550       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   8551       putIRegG(sz, pfx, rm, mkexpr(tmpd));
   8552       putIRegE(sz, pfx, rm, mkexpr(tmpt1));
   8553       DIP("xadd%c %s, %s\n",
   8554           nameISize(sz), nameIRegG(sz,pfx,rm), nameIRegE(sz,pfx,rm));
   8555       *decode_ok = True;
   8556       return 1+delta0;
   8557    }
   8558    else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
   8559       /* case 2 */
   8560       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8561       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   8562       assign( tmpt0, getIRegG(sz, pfx, rm) );
   8563       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   8564                            mkexpr(tmpd), mkexpr(tmpt0)) );
   8565       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   8566       storeLE( mkexpr(addr), mkexpr(tmpt1) );
   8567       putIRegG(sz, pfx, rm, mkexpr(tmpd));
   8568       DIP("xadd%c %s, %s\n",
   8569           nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
   8570       *decode_ok = True;
   8571       return len+delta0;
   8572    }
   8573    else if (!epartIsReg(rm) && haveLOCK(pfx)) {
   8574       /* case 3 */
   8575       IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
   8576       assign( tmpd,  loadLE(ty, mkexpr(addr)) );
   8577       assign( tmpt0, getIRegG(sz, pfx, rm) );
   8578       assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
   8579                            mkexpr(tmpd), mkexpr(tmpt0)) );
   8580       casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
   8581                            mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
   8582       setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
   8583       putIRegG(sz, pfx, rm, mkexpr(tmpd));
   8584       DIP("xadd%c %s, %s\n",
   8585           nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
   8586       *decode_ok = True;
   8587       return len+delta0;
   8588    }
   8589    /*UNREACHED*/
   8590    vassert(0);
   8591 }
   8592 
   8593 //.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
   8594 //..
   8595 //.. static
   8596 //.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
   8597 //.. {
   8598 //..    Int    len;
   8599 //..    IRTemp addr;
   8600 //..    UChar  rm  = getUChar(delta0);
   8601 //..    HChar  dis_buf[50];
   8602 //..
   8603 //..    if (epartIsReg(rm)) {
   8604 //..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
   8605 //..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
   8606 //..       return 1+delta0;
   8607 //..    } else {
   8608 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8609 //..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
   8610 //..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
   8611 //..       return len+delta0;
   8612 //..    }
   8613 //.. }
   8614 //..
   8615 //.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
   8616 //..    dst is ireg and sz==4, zero out top half of it.  */
   8617 //..
   8618 //.. static
   8619 //.. UInt dis_mov_Sw_Ew ( UChar sorb,
   8620 //..                      Int   sz,
   8621 //..                      UInt  delta0 )
   8622 //.. {
   8623 //..    Int    len;
   8624 //..    IRTemp addr;
   8625 //..    UChar  rm  = getUChar(delta0);
   8626 //..    HChar  dis_buf[50];
   8627 //..
   8628 //..    vassert(sz == 2 || sz == 4);
   8629 //..
   8630 //..    if (epartIsReg(rm)) {
   8631 //..       if (sz == 4)
   8632 //..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
   8633 //..       else
   8634 //..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
   8635 //..
   8636 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
   8637 //..       return 1+delta0;
   8638 //..    } else {
   8639 //..       addr = disAMode ( &len, sorb, delta0, dis_buf );
   8640 //..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
   8641 //..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
   8642 //..       return len+delta0;
   8643 //..    }
   8644 //.. }
   8645 
   8646 /* Handle move instructions of the form
   8647       mov S, E  meaning
   8648       mov sreg, reg-or-mem
   8649    Is passed a ptr to the modRM byte, and the data size.  Returns
   8650    the delta advanced completely over this instruction.
   8651 
   8652    VEX does not currently simulate segment registers on AMD64, which means
   8653    that instead of moving the value of a segment register, zero is moved to the
   8654    destination.  The zero value represents a null (unused) selector.  This is
   8655    not correct (especially for the %cs, %fs and %gs registers) but it seems to
   8656    provide a sufficient simulation for currently seen programs that use this
   8657    instruction.  If some program actually decides to use the obtained segment
   8658    selector for something meaningful then the zero value should be a clear
   8659    indicator that there is some problem.
   8660 
   8661    S(src) is sreg.
   8662    E(dst) is reg-or-mem
   8663 
   8664    If E is reg, -->    PUT $0, %E
   8665 
   8666    If E is mem, -->    (getAddr E) -> tmpa
   8667                        ST $0, (tmpa)
   8668 */
   8669 static
   8670 ULong dis_mov_S_E ( const VexAbiInfo* vbi,
   8671                     Prefix      pfx,
   8672                     Int         size,
   8673                     Long        delta0 )
   8674 {
   8675    Int   len;
   8676    UChar rm = getUChar(delta0);
   8677    HChar dis_buf[50];
   8678 
   8679    if (epartIsReg(rm)) {
   8680       putIRegE(size, pfx, rm, mkU(szToITy(size), 0));
   8681       DIP("mov %s,%s\n", nameSReg(gregOfRexRM(pfx, rm)),
   8682                          nameIRegE(size, pfx, rm));
   8683       return 1+delta0;
   8684    }
   8685 
   8686    /* E refers to memory */
   8687    {
   8688       IRTemp addr = disAMode(&len, vbi, pfx, delta0, dis_buf, 0);
   8689       storeLE(mkexpr(addr), mkU16(0));
   8690       DIP("mov %s,%s\n", nameSReg(gregOfRexRM(pfx, rm)),
   8691                          dis_buf);
   8692       return len+delta0;
   8693    }
   8694 }
   8695 
   8696 //.. static
   8697 //.. void dis_push_segreg ( UInt sreg, Int sz )
   8698 //.. {
   8699 //..     IRTemp t1 = newTemp(Ity_I16);
   8700 //..     IRTemp ta = newTemp(Ity_I32);
   8701 //..     vassert(sz == 2 || sz == 4);
   8702 //..
   8703 //..     assign( t1, getSReg(sreg) );
   8704 //..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
   8705 //..     putIReg(4, R_ESP, mkexpr(ta));
   8706 //..     storeLE( mkexpr(ta), mkexpr(t1) );
   8707 //..
   8708 //..     DIP("pushw %s\n", nameSReg(sreg));
   8709 //.. }
   8710 //..
   8711 //.. static
   8712 //.. void dis_pop_segreg ( UInt sreg, Int sz )
   8713 //.. {
   8714 //..     IRTemp t1 = newTemp(Ity_I16);
   8715 //..     IRTemp ta = newTemp(Ity_I32);
   8716 //..     vassert(sz == 2 || sz == 4);
   8717 //..
   8718 //..     assign( ta, getIReg(4, R_ESP) );
   8719 //..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
   8720 //..
   8721 //..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
   8722 //..     putSReg( sreg, mkexpr(t1) );
   8723 //..     DIP("pop %s\n", nameSReg(sreg));
   8724 //.. }
   8725 
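        /* Generate IR for RET / RET imm16: pop the return address,
           release a further d64 bytes of stack, and jump to the popped
           address.  Worked example: "ret $0x10" with RSP == 0x1000 loads
           the target from 0x1000 and leaves RSP == 0x1018, that is, 8
           bytes for the return address plus 0x10 for the callee-popped
           arguments. */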
   8726 static
   8727 void dis_ret ( /*MOD*/DisResult* dres, const VexAbiInfo* vbi, ULong d64 )
   8728 {
   8729    IRTemp t1 = newTemp(Ity_I64);
   8730    IRTemp t2 = newTemp(Ity_I64);
   8731    IRTemp t3 = newTemp(Ity_I64);
   8732    assign(t1, getIReg64(R_RSP));
   8733    assign(t2, loadLE(Ity_I64,mkexpr(t1)));
   8734    assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
   8735    putIReg64(R_RSP, mkexpr(t3));
   8736    make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
   8737    jmp_treg(dres, Ijk_Ret, t2);
   8738    vassert(dres->whatNext == Dis_StopHere);
   8739 }
   8740 
   8741 
   8742 /*------------------------------------------------------------*/
   8743 /*--- SSE/SSE2/SSE3 helpers                                ---*/
   8744 /*------------------------------------------------------------*/
   8745 
   8746 /* Indicates whether the op requires a rounding-mode argument.  Note
   8747    that this covers only vector floating point arithmetic ops, and
   8748    omits the scalar ones that need rounding modes.  Note also that
   8749    inconsistencies here will get picked up later by the IR sanity
   8750    checker, so this isn't correctness-critical. */
   8751 static Bool requiresRMode ( IROp op )
   8752 {
   8753    switch (op) {
   8754       /* 128 bit ops */
   8755       case Iop_Add32Fx4: case Iop_Sub32Fx4:
   8756       case Iop_Mul32Fx4: case Iop_Div32Fx4:
   8757       case Iop_Add64Fx2: case Iop_Sub64Fx2:
   8758       case Iop_Mul64Fx2: case Iop_Div64Fx2:
   8759       /* 256 bit ops */
   8760       case Iop_Add32Fx8: case Iop_Sub32Fx8:
   8761       case Iop_Mul32Fx8: case Iop_Div32Fx8:
   8762       case Iop_Add64Fx4: case Iop_Sub64Fx4:
   8763       case Iop_Mul64Fx4: case Iop_Div64Fx4:
   8764          return True;
   8765       default:
   8766          break;
   8767    }
   8768    return False;
   8769 }
   8770 
   8771 
   8772 /* Worker function; do not call directly.
   8773    Handles full width G = G `op` E   and   G = (not G) `op` E.
   8774 */
   8775 
   8776 static ULong dis_SSE_E_to_G_all_wrk (
   8777                 const VexAbiInfo* vbi,
   8778                 Prefix pfx, Long delta,
   8779                 const HChar* opname, IROp op,
   8780                 Bool   invertG
   8781              )
   8782 {
   8783    HChar   dis_buf[50];
   8784    Int     alen;
   8785    IRTemp  addr;
   8786    UChar   rm = getUChar(delta);
   8787    Bool    needsRMode = requiresRMode(op);
   8788    IRExpr* gpart
   8789       = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
   8790                 : getXMMReg(gregOfRexRM(pfx,rm));
   8791    if (epartIsReg(rm)) {
   8792       putXMMReg(
   8793          gregOfRexRM(pfx,rm),
   8794          needsRMode
   8795             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   8796                         gpart,
   8797                         getXMMReg(eregOfRexRM(pfx,rm)))
   8798             : binop(op, gpart,
   8799                         getXMMReg(eregOfRexRM(pfx,rm)))
   8800       );
   8801       DIP("%s %s,%s\n", opname,
   8802                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8803                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8804       return delta+1;
   8805    } else {
   8806       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8807       putXMMReg(
   8808          gregOfRexRM(pfx,rm),
   8809          needsRMode
   8810             ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
   8811                         gpart,
   8812                         loadLE(Ity_V128, mkexpr(addr)))
   8813             : binop(op, gpart,
   8814                         loadLE(Ity_V128, mkexpr(addr)))
   8815       );
   8816       DIP("%s %s,%s\n", opname,
   8817                         dis_buf,
   8818                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8819       return delta+alen;
   8820    }
   8821 }
   8822 
   8823 
   8824 /* All lanes SSE binary operation, G = G `op` E. */
   8825 
   8826 static
   8827 ULong dis_SSE_E_to_G_all ( const VexAbiInfo* vbi,
   8828                            Prefix pfx, Long delta,
   8829                            const HChar* opname, IROp op )
   8830 {
   8831    return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
   8832 }
   8833 
   8834 /* All lanes SSE binary operation, G = (not G) `op` E. */
   8835 
   8836 static
   8837 ULong dis_SSE_E_to_G_all_invG ( const VexAbiInfo* vbi,
   8838                                 Prefix pfx, Long delta,
   8839                                 const HChar* opname, IROp op )
   8840 {
   8841    return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
   8842 }
   8843 
   8844 
   8845 /* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
   8846 
   8847 static ULong dis_SSE_E_to_G_lo32 ( const VexAbiInfo* vbi,
   8848                                    Prefix pfx, Long delta,
   8849                                    const HChar* opname, IROp op )
   8850 {
   8851    HChar   dis_buf[50];
   8852    Int     alen;
   8853    IRTemp  addr;
   8854    UChar   rm = getUChar(delta);
   8855    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8856    if (epartIsReg(rm)) {
   8857       putXMMReg( gregOfRexRM(pfx,rm),
   8858                  binop(op, gpart,
   8859                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8860       DIP("%s %s,%s\n", opname,
   8861                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8862                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8863       return delta+1;
   8864    } else {
   8865       /* We can only do a 32-bit memory read, so the upper 3/4 of the
   8866          E operand needs to be made simply of zeroes. */
   8867       IRTemp epart = newTemp(Ity_V128);
   8868       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8869       assign( epart, unop( Iop_32UtoV128,
   8870                            loadLE(Ity_I32, mkexpr(addr))) );
   8871       putXMMReg( gregOfRexRM(pfx,rm),
   8872                  binop(op, gpart, mkexpr(epart)) );
   8873       DIP("%s %s,%s\n", opname,
   8874                         dis_buf,
   8875                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8876       return delta+alen;
   8877    }
   8878 }
   8879 
   8880 
   8881 /* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
   8882 
   8883 static ULong dis_SSE_E_to_G_lo64 ( const VexAbiInfo* vbi,
   8884                                    Prefix pfx, Long delta,
   8885                                    const HChar* opname, IROp op )
   8886 {
   8887    HChar   dis_buf[50];
   8888    Int     alen;
   8889    IRTemp  addr;
   8890    UChar   rm = getUChar(delta);
   8891    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   8892    if (epartIsReg(rm)) {
   8893       putXMMReg( gregOfRexRM(pfx,rm),
   8894                  binop(op, gpart,
   8895                            getXMMReg(eregOfRexRM(pfx,rm))) );
   8896       DIP("%s %s,%s\n", opname,
   8897                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8898                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8899       return delta+1;
   8900    } else {
   8901       /* We can only do a 64-bit memory read, so the upper half of the
   8902          E operand needs to be made simply of zeroes. */
   8903       IRTemp epart = newTemp(Ity_V128);
   8904       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8905       assign( epart, unop( Iop_64UtoV128,
   8906                            loadLE(Ity_I64, mkexpr(addr))) );
   8907       putXMMReg( gregOfRexRM(pfx,rm),
   8908                  binop(op, gpart, mkexpr(epart)) );
   8909       DIP("%s %s,%s\n", opname,
   8910                         dis_buf,
   8911                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8912       return delta+alen;
   8913    }
   8914 }
   8915 
   8916 
   8917 /* All lanes unary SSE operation, G = op(E). */
   8918 
   8919 static ULong dis_SSE_E_to_G_unary_all (
   8920                 const VexAbiInfo* vbi,
   8921                 Prefix pfx, Long delta,
   8922                 const HChar* opname, IROp op
   8923              )
   8924 {
   8925    HChar   dis_buf[50];
   8926    Int     alen;
   8927    IRTemp  addr;
   8928    UChar   rm = getUChar(delta);
   8929    // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
   8930    // up in the usual way.
   8931    Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
   8932    if (epartIsReg(rm)) {
   8933       IRExpr* src = getXMMReg(eregOfRexRM(pfx,rm));
   8934       /* XXXROUNDINGFIXME */
   8935       IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
   8936                               : unop(op, src);
   8937       putXMMReg( gregOfRexRM(pfx,rm), res );
   8938       DIP("%s %s,%s\n", opname,
   8939                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8940                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8941       return delta+1;
   8942    } else {
   8943       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8944       IRExpr* src = loadLE(Ity_V128, mkexpr(addr));
   8945       /* XXXROUNDINGFIXME */
   8946       IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
   8947                               : unop(op, src);
   8948       putXMMReg( gregOfRexRM(pfx,rm), res );
   8949       DIP("%s %s,%s\n", opname,
   8950                         dis_buf,
   8951                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8952       return delta+alen;
   8953    }
   8954 }
   8955 
   8956 
   8957 /* Lowest 32-bit lane only unary SSE operation, G = op(E). */
   8958 
   8959 static ULong dis_SSE_E_to_G_unary_lo32 (
   8960                 const VexAbiInfo* vbi,
   8961                 Prefix pfx, Long delta,
   8962                 const HChar* opname, IROp op
   8963              )
   8964 {
   8965    /* First we need to get the old G value and patch the low 32 bits
   8966       of the E operand into it.  Then apply op and write back to G. */
   8967    HChar   dis_buf[50];
   8968    Int     alen;
   8969    IRTemp  addr;
   8970    UChar   rm = getUChar(delta);
   8971    IRTemp  oldG0 = newTemp(Ity_V128);
   8972    IRTemp  oldG1 = newTemp(Ity_V128);
   8973 
   8974    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   8975 
   8976    if (epartIsReg(rm)) {
   8977       assign( oldG1,
   8978               binop( Iop_SetV128lo32,
   8979                      mkexpr(oldG0),
   8980                      getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
   8981       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8982       DIP("%s %s,%s\n", opname,
   8983                         nameXMMReg(eregOfRexRM(pfx,rm)),
   8984                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8985       return delta+1;
   8986    } else {
   8987       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   8988       assign( oldG1,
   8989               binop( Iop_SetV128lo32,
   8990                      mkexpr(oldG0),
   8991                      loadLE(Ity_I32, mkexpr(addr)) ));
   8992       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   8993       DIP("%s %s,%s\n", opname,
   8994                         dis_buf,
   8995                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   8996       return delta+alen;
   8997    }
   8998 }
   8999 
   9000 
   9001 /* Lowest 64-bit lane only unary SSE operation, G = op(E). */
   9002 
   9003 static ULong dis_SSE_E_to_G_unary_lo64 (
   9004                 const VexAbiInfo* vbi,
   9005                 Prefix pfx, Long delta,
   9006                 const HChar* opname, IROp op
   9007              )
   9008 {
   9009    /* First we need to get the old G value and patch the low 64 bits
   9010       of the E operand into it.  Then apply op and write back to G. */
   9011    HChar   dis_buf[50];
   9012    Int     alen;
   9013    IRTemp  addr;
   9014    UChar   rm = getUChar(delta);
   9015    IRTemp  oldG0 = newTemp(Ity_V128);
   9016    IRTemp  oldG1 = newTemp(Ity_V128);
   9017 
   9018    assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
   9019 
   9020    if (epartIsReg(rm)) {
   9021       assign( oldG1,
   9022               binop( Iop_SetV128lo64,
   9023                      mkexpr(oldG0),
   9024                      getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
   9025       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   9026       DIP("%s %s,%s\n", opname,
   9027                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9028                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9029       return delta+1;
   9030    } else {
   9031       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9032       assign( oldG1,
   9033               binop( Iop_SetV128lo64,
   9034                      mkexpr(oldG0),
   9035                      loadLE(Ity_I64, mkexpr(addr)) ));
   9036       putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
   9037       DIP("%s %s,%s\n", opname,
   9038                         dis_buf,
   9039                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9040       return delta+alen;
   9041    }
   9042 }
   9043 
   9044 
   9045 /* SSE integer binary operation:
   9046       G = G `op` E   (eLeft == False)
   9047       G = E `op` G   (eLeft == True)
   9048 */
   9049 static ULong dis_SSEint_E_to_G(
   9050                 const VexAbiInfo* vbi,
   9051                 Prefix pfx, Long delta,
   9052                 const HChar* opname, IROp op,
   9053                 Bool   eLeft
   9054              )
   9055 {
   9056    HChar   dis_buf[50];
   9057    Int     alen;
   9058    IRTemp  addr;
   9059    UChar   rm = getUChar(delta);
   9060    IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   9061    IRExpr* epart = NULL;
   9062    if (epartIsReg(rm)) {
   9063       epart = getXMMReg(eregOfRexRM(pfx,rm));
   9064       DIP("%s %s,%s\n", opname,
   9065                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9066                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9067       delta += 1;
   9068    } else {
   9069       addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9070       epart = loadLE(Ity_V128, mkexpr(addr));
   9071       DIP("%s %s,%s\n", opname,
   9072                         dis_buf,
   9073                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9074       delta += alen;
   9075    }
   9076    putXMMReg( gregOfRexRM(pfx,rm),
   9077               eLeft ? binop(op, epart, gpart)
   9078                     : binop(op, gpart, epart) );
   9079    return delta;
   9080 }
   9081 
   9082 
   9083 /* Helper for doing SSE FP comparisons.  False return ==> unhandled.
   9084    This is all a bit of a kludge in that it ignores the subtleties of
   9085    ordered-vs-unordered and signalling-vs-nonsignalling in the Intel
   9086    spec. */
   9087 static Bool findSSECmpOp ( /*OUT*/Bool* preSwapP,
   9088                            /*OUT*/IROp* opP,
   9089                            /*OUT*/Bool* postNotP,
   9090                            UInt imm8, Bool all_lanes, Int sz )
   9091 {
   9092    if (imm8 >= 32) return False;
   9093 
   9094    /* First, compute a (preSwap, op, postNot) triple from
   9095       the supplied imm8. */
   9096    Bool pre = False;
   9097    IROp op  = Iop_INVALID;
   9098    Bool not = False;
   9099 
   9100 #  define XXX(_pre, _op, _not) { pre = _pre; op = _op; not = _not; }
   9101    // If you add a case here, add a corresponding test for both VCMPSD_128
   9102    // and VCMPSS_128 in avx-1.c.
   9103    // Cases 0x8 and above are
   9104    //    "Enhanced Comparison Predicate[s] for VEX-Encoded [insns]"
   9105    switch (imm8) {
   9106       // "O" = ordered, "U" = unordered
   9107       // "Q" = non-signalling (quiet), "S" = signalling
   9108       //
   9109       //             swap operands?
   9110       //             |
   9111       //             |      cmp op          invert after?
   9112       //             |      |               |
   9113       //             v      v               v
   9114       case 0x0:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OQ
   9115       case 0x8:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_UQ
   9116       case 0x10: XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OS
   9117       case 0x18: XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_US
   9118       //
   9119       case 0x1:  XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OS
   9120       case 0x11: XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OQ
   9121       //
   9122       case 0x2:  XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OS
   9123       case 0x12: XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OQ
   9124       //
   9125       case 0x3:  XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_Q
   9126       case 0x13: XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_S
   9127       //
   9128       // 0xC: this isn't really right because it returns all-1s when
   9129       // either operand is a NaN, and it should return all-0s.
   9130       case 0x4:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_UQ
   9131       case 0xC:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OQ
   9132       case 0x14: XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_US
   9133       case 0x1C: XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OS
   9134       //
   9135       case 0x5:  XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_US
   9136       case 0x15: XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_UQ
   9137       //
   9138       case 0x6:  XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_US
   9139       case 0x16: XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_UQ
   9140       //
   9141       case 0x7:  XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_Q
   9142       case 0x17: XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_S
   9143       //
   9144       case 0x9:  XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_US
   9145       case 0x19: XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_UQ
   9146       //
   9147       case 0xA:  XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_US
   9148       case 0x1A: XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_UQ
   9149       //
   9150       case 0xD:  XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OS
   9151       case 0x1D: XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OQ
   9152       //
   9153       case 0xE:  XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OS
   9154       case 0x1E: XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OQ
   9155       // Unhandled:
   9156       // 0xB  FALSE_OQ
   9157       // 0xF  TRUE_UQ
   9158       // 0x1B  FALSE_OS
   9159       // 0x1F  TRUE_US
   9160       /* Don't forget to add test cases to VCMPSS_128_<imm8> in
   9161          avx-1.c if new cases turn up. */
   9162       default: break;
   9163    }
   9164 #  undef XXX
   9165    if (op == Iop_INVALID) return False;
   9166 
   9167    /* Now convert the op into one with the same arithmetic but that is
   9168       correct for the width and laneage requirements. */
   9169 
   9170    /**/ if (sz == 4 && all_lanes) {
   9171       switch (op) {
   9172          case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32Fx4; break;
   9173          case Iop_CmpLT32Fx4: op = Iop_CmpLT32Fx4; break;
   9174          case Iop_CmpLE32Fx4: op = Iop_CmpLE32Fx4; break;
   9175          case Iop_CmpUN32Fx4: op = Iop_CmpUN32Fx4; break;
   9176          default: vassert(0);
   9177       }
   9178    }
   9179    else if (sz == 4 && !all_lanes) {
   9180       switch (op) {
   9181          case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32F0x4; break;
   9182          case Iop_CmpLT32Fx4: op = Iop_CmpLT32F0x4; break;
   9183          case Iop_CmpLE32Fx4: op = Iop_CmpLE32F0x4; break;
   9184          case Iop_CmpUN32Fx4: op = Iop_CmpUN32F0x4; break;
   9185          default: vassert(0);
   9186       }
   9187    }
   9188    else if (sz == 8 && all_lanes) {
   9189       switch (op) {
   9190          case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64Fx2; break;
   9191          case Iop_CmpLT32Fx4: op = Iop_CmpLT64Fx2; break;
   9192          case Iop_CmpLE32Fx4: op = Iop_CmpLE64Fx2; break;
   9193          case Iop_CmpUN32Fx4: op = Iop_CmpUN64Fx2; break;
   9194          default: vassert(0);
   9195       }
   9196    }
   9197    else if (sz == 8 && !all_lanes) {
   9198       switch (op) {
   9199          case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64F0x2; break;
   9200          case Iop_CmpLT32Fx4: op = Iop_CmpLT64F0x2; break;
   9201          case Iop_CmpLE32Fx4: op = Iop_CmpLE64F0x2; break;
   9202          case Iop_CmpUN32Fx4: op = Iop_CmpUN64F0x2; break;
   9203          default: vassert(0);
   9204       }
   9205    }
   9206    else {
   9207       vpanic("findSSECmpOp(amd64,guest)");
   9208    }
   9209 
   9210    *preSwapP = pre; *opP = op; *postNotP = not;
   9211    return True;
   9212 }
   9213 
   9214 
   9215 /* Handles SSE 32F/64F comparisons.  It can fail, in which case it
   9216    returns the original delta to indicate failure. */
   9217 
   9218 static Long dis_SSE_cmp_E_to_G ( const VexAbiInfo* vbi,
   9219                                  Prefix pfx, Long delta,
   9220                                  const HChar* opname, Bool all_lanes, Int sz )
   9221 {
   9222    Long    delta0 = delta;
   9223    HChar   dis_buf[50];
   9224    Int     alen;
   9225    UInt    imm8;
   9226    IRTemp  addr;
   9227    Bool    preSwap = False;
   9228    IROp    op      = Iop_INVALID;
   9229    Bool    postNot = False;
   9230    IRTemp  plain   = newTemp(Ity_V128);
   9231    UChar   rm      = getUChar(delta);
   9232    UShort  mask    = 0;
   9233    vassert(sz == 4 || sz == 8);
   9234    if (epartIsReg(rm)) {
   9235       imm8 = getUChar(delta+1);
   9236       if (imm8 >= 8) return delta0; /* FAIL */
   9237       Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
   9238       if (!ok) return delta0; /* FAIL */
   9239       vassert(!preSwap); /* never needed for imm8 < 8 */
   9240       assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
   9241                                getXMMReg(eregOfRexRM(pfx,rm))) );
   9242       delta += 2;
   9243       DIP("%s $%u,%s,%s\n", opname,
   9244                             imm8,
   9245                             nameXMMReg(eregOfRexRM(pfx,rm)),
   9246                             nameXMMReg(gregOfRexRM(pfx,rm)) );
   9247    } else {
   9248       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   9249       imm8 = getUChar(delta+alen);
   9250       if (imm8 >= 8) return delta0; /* FAIL */
   9251       Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
   9252       if (!ok) return delta0; /* FAIL */
   9253       vassert(!preSwap); /* never needed for imm8 < 8 */
   9254       assign( plain,
   9255               binop(
   9256                  op,
   9257                  getXMMReg(gregOfRexRM(pfx,rm)),
   9258                    all_lanes
   9259                       ? loadLE(Ity_V128, mkexpr(addr))
   9260                    : sz == 8
   9261                       ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
   9262                    : /*sz==4*/
   9263                       unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
   9264               )
   9265       );
   9266       delta += alen+1;
   9267       DIP("%s $%u,%s,%s\n", opname,
   9268                             imm8,
   9269                             dis_buf,
   9270                             nameXMMReg(gregOfRexRM(pfx,rm)) );
   9271    }
   9272 
   9273    if (postNot && all_lanes) {
   9274       putXMMReg( gregOfRexRM(pfx,rm),
   9275                  unop(Iop_NotV128, mkexpr(plain)) );
   9276    }
   9277    else
   9278    if (postNot && !all_lanes) {
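              /* Reminder: mkV128 takes a 16-bit mask with one bit per
                 byte lane (bit i set => byte i is 0xFF), so 0x000F covers
                 the low 32 bits and 0x00FF the low 64.  XORing with it
                 inverts just the result lane while leaving the
                 pass-through upper lanes of 'plain' untouched. */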
   9279       mask = toUShort(sz==4 ? 0x000F : 0x00FF);
   9280       putXMMReg( gregOfRexRM(pfx,rm),
   9281                  binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   9282    }
   9283    else {
   9284       putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
   9285    }
   9286 
   9287    return delta;
   9288 }
   9289 
   9290 
   9291 /* Vector by scalar shift of G by the amount specified at the bottom
   9292    of E. */
   9293 
   9294 static ULong dis_SSE_shiftG_byE ( const VexAbiInfo* vbi,
   9295                                   Prefix pfx, Long delta,
   9296                                   const HChar* opname, IROp op )
   9297 {
   9298    HChar   dis_buf[50];
   9299    Int     alen, size;
   9300    IRTemp  addr;
   9301    Bool    shl, shr, sar;
   9302    UChar   rm   = getUChar(delta);
   9303    IRTemp  g0   = newTemp(Ity_V128);
   9304    IRTemp  g1   = newTemp(Ity_V128);
   9305    IRTemp  amt  = newTemp(Ity_I64);
   9306    IRTemp  amt8 = newTemp(Ity_I8);
   9307    if (epartIsReg(rm)) {
   9308       assign( amt, getXMMRegLane64(eregOfRexRM(pfx,rm), 0) );
   9309       DIP("%s %s,%s\n", opname,
   9310                         nameXMMReg(eregOfRexRM(pfx,rm)),
   9311                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9312       delta++;
   9313    } else {
   9314       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   9315       assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
   9316       DIP("%s %s,%s\n", opname,
   9317                         dis_buf,
   9318                         nameXMMReg(gregOfRexRM(pfx,rm)) );
   9319       delta += alen;
   9320    }
   9321    assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
   9322    assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
   9323 
   9324    shl = shr = sar = False;
   9325    size = 0;
   9326    switch (op) {
   9327       case Iop_ShlN16x8: shl = True; size = 16; break;
   9328       case Iop_ShlN32x4: shl = True; size = 32; break;
   9329       case Iop_ShlN64x2: shl = True; size = 64; break;
   9330       case Iop_SarN16x8: sar = True; size = 16; break;
   9331       case Iop_SarN32x4: sar = True; size = 32; break;
   9332       case Iop_ShrN16x8: shr = True; size = 16; break;
   9333       case Iop_ShrN32x4: shr = True; size = 32; break;
   9334       case Iop_ShrN64x2: shr = True; size = 64; break;
   9335       default: vassert(0);
   9336    }
   9337 
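           /* x86 vector shifts do not mask the count: any amount >= the
              lane width zeroes the lanes for logical shifts, while an
              arithmetic right shift then behaves like a shift by
              (width-1), filling each lane with copies of its sign bit.
              The IR shift ops are defined only for in-range counts,
              hence the explicit guards below. */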
   9338    if (shl || shr) {
   9339      assign(
   9340         g1,
   9341         IRExpr_ITE(
   9342            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   9343            binop(op, mkexpr(g0), mkexpr(amt8)),
   9344            mkV128(0x0000)
   9345         )
   9346      );
   9347    } else
   9348    if (sar) {
   9349      assign(
   9350         g1,
   9351         IRExpr_ITE(
   9352            binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
   9353            binop(op, mkexpr(g0), mkexpr(amt8)),
   9354            binop(op, mkexpr(g0), mkU8(size-1))
   9355         )
   9356      );
   9357    } else {
   9358       vassert(0);
   9359    }
   9360 
   9361    putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
   9362    return delta;
   9363 }
   9364 
   9365 
   9366 /* Vector by scalar shift of E by an immediate byte. */
   9367 
   9368 static
   9369 ULong dis_SSE_shiftE_imm ( Prefix pfx,
   9370                            Long delta, const HChar* opname, IROp op )
   9371 {
   9372    Bool    shl, shr, sar;
   9373    UChar   rm   = getUChar(delta);
   9374    IRTemp  e0   = newTemp(Ity_V128);
   9375    IRTemp  e1   = newTemp(Ity_V128);
   9376    UChar   amt, size;
   9377    vassert(epartIsReg(rm));
   9378    vassert(gregLO3ofRM(rm) == 2
   9379            || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   9380    amt = getUChar(delta+1);
   9381    delta += 2;
   9382    DIP("%s $%d,%s\n", opname,
   9383                       (Int)amt,
   9384                       nameXMMReg(eregOfRexRM(pfx,rm)) );
   9385    assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
   9386 
   9387    shl = shr = sar = False;
   9388    size = 0;
   9389    switch (op) {
   9390       case Iop_ShlN16x8: shl = True; size = 16; break;
   9391       case Iop_ShlN32x4: shl = True; size = 32; break;
   9392       case Iop_ShlN64x2: shl = True; size = 64; break;
   9393       case Iop_SarN16x8: sar = True; size = 16; break;
   9394       case Iop_SarN32x4: sar = True; size = 32; break;
   9395       case Iop_ShrN16x8: shr = True; size = 16; break;
   9396       case Iop_ShrN32x4: shr = True; size = 32; break;
   9397       case Iop_ShrN64x2: shr = True; size = 64; break;
   9398       default: vassert(0);
   9399    }
   9400 
   9401    if (shl || shr) {
   9402      assign( e1, amt >= size
   9403                     ? mkV128(0x0000)
   9404                     : binop(op, mkexpr(e0), mkU8(amt))
   9405      );
   9406    } else
   9407    if (sar) {
   9408      assign( e1, amt >= size
   9409                     ? binop(op, mkexpr(e0), mkU8(size-1))
   9410                     : binop(op, mkexpr(e0), mkU8(amt))
   9411      );
   9412    } else {
   9413       vassert(0);
   9414    }
   9415 
   9416    putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
   9417    return delta;
   9418 }
   9419 
   9420 
   9421 /* Get the current SSE rounding mode. */
   9422 
   9423 static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
   9424 {
   9425    return
   9426       unop( Iop_64to32,
   9427             binop( Iop_And64,
   9428                    IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
   9429                    mkU64(3) ));
   9430 }
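        /* Conveniently, the low two bits happen to use the same encoding
           as the MXCSR.RC field and as IRRoundingMode: 0 = to nearest
           (even), 1 = towards -inf, 2 = towards +inf, 3 = towards zero.
           Hence no translation is needed when moving the value between
           guest state and IR. */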
   9431 
   9432 static void put_sse_roundingmode ( IRExpr* sseround )
   9433 {
   9434    vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   9435    stmt( IRStmt_Put( OFFB_SSEROUND,
   9436                      unop(Iop_32Uto64,sseround) ) );
   9437 }
   9438 
   9439 /* Break a V128-bit value up into four 32-bit ints. */
   9440 
   9441 static void breakupV128to32s ( IRTemp t128,
   9442                                /*OUTs*/
   9443                                IRTemp* t3, IRTemp* t2,
   9444                                IRTemp* t1, IRTemp* t0 )
   9445 {
   9446    IRTemp hi64 = newTemp(Ity_I64);
   9447    IRTemp lo64 = newTemp(Ity_I64);
   9448    assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   9449    assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
   9450 
   9451    vassert(t0 && *t0 == IRTemp_INVALID);
   9452    vassert(t1 && *t1 == IRTemp_INVALID);
   9453    vassert(t2 && *t2 == IRTemp_INVALID);
   9454    vassert(t3 && *t3 == IRTemp_INVALID);
   9455 
   9456    *t0 = newTemp(Ity_I32);
   9457    *t1 = newTemp(Ity_I32);
   9458    *t2 = newTemp(Ity_I32);
   9459    *t3 = newTemp(Ity_I32);
   9460    assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   9461    assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   9462    assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   9463    assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
   9464 }
   9465 
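        /* In all of these break-up/reassemble helpers, t0 names the
           least significant lane.  Example: breaking up
           0xDDDDDDDD_CCCCCCCC_BBBBBBBB_AAAAAAAA gives t0 = 0xAAAAAAAA ..
           t3 = 0xDDDDDDDD, and mkV128from32s(t3,t2,t1,t0) below
           reassembles the original value. */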
   9466 /* Construct a V128-bit value from four 32-bit ints. */
   9467 
   9468 static IRExpr* mkV128from32s ( IRTemp t3, IRTemp t2,
   9469                                IRTemp t1, IRTemp t0 )
   9470 {
   9471    return
   9472       binop( Iop_64HLtoV128,
   9473              binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9474              binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   9475    );
   9476 }
   9477 
   9478 /* Break a 64-bit value up into four 16-bit ints. */
   9479 
   9480 static void breakup64to16s ( IRTemp t64,
   9481                              /*OUTs*/
   9482                              IRTemp* t3, IRTemp* t2,
   9483                              IRTemp* t1, IRTemp* t0 )
   9484 {
   9485    IRTemp hi32 = newTemp(Ity_I32);
   9486    IRTemp lo32 = newTemp(Ity_I32);
   9487    assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   9488    assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
   9489 
   9490    vassert(t0 && *t0 == IRTemp_INVALID);
   9491    vassert(t1 && *t1 == IRTemp_INVALID);
   9492    vassert(t2 && *t2 == IRTemp_INVALID);
   9493    vassert(t3 && *t3 == IRTemp_INVALID);
   9494 
   9495    *t0 = newTemp(Ity_I16);
   9496    *t1 = newTemp(Ity_I16);
   9497    *t2 = newTemp(Ity_I16);
   9498    *t3 = newTemp(Ity_I16);
   9499    assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   9500    assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   9501    assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   9502    assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
   9503 }
   9504 
   9505 /* Construct a 64-bit value from four 16-bit ints. */
   9506 
   9507 static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
   9508                              IRTemp t1, IRTemp t0 )
   9509 {
   9510    return
   9511       binop( Iop_32HLto64,
   9512              binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
   9513              binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   9514    );
   9515 }
   9516 
   9517 /* Break a V256-bit value up into four 64-bit ints. */
   9518 
   9519 static void breakupV256to64s ( IRTemp t256,
   9520                                /*OUTs*/
   9521                                IRTemp* t3, IRTemp* t2,
   9522                                IRTemp* t1, IRTemp* t0 )
   9523 {
   9524    vassert(t0 && *t0 == IRTemp_INVALID);
   9525    vassert(t1 && *t1 == IRTemp_INVALID);
   9526    vassert(t2 && *t2 == IRTemp_INVALID);
   9527    vassert(t3 && *t3 == IRTemp_INVALID);
   9528    *t0 = newTemp(Ity_I64);
   9529    *t1 = newTemp(Ity_I64);
   9530    *t2 = newTemp(Ity_I64);
   9531    *t3 = newTemp(Ity_I64);
   9532    assign( *t0, unop(Iop_V256to64_0, mkexpr(t256)) );
   9533    assign( *t1, unop(Iop_V256to64_1, mkexpr(t256)) );
   9534    assign( *t2, unop(Iop_V256to64_2, mkexpr(t256)) );
   9535    assign( *t3, unop(Iop_V256to64_3, mkexpr(t256)) );
   9536 }
   9537 
   9538 /* Break a V256-bit value up into two V128s. */
   9539 
   9540 static void breakupV256toV128s ( IRTemp t256,
   9541                                  /*OUTs*/
   9542                                  IRTemp* t1, IRTemp* t0 )
   9543 {
   9544    vassert(t0 && *t0 == IRTemp_INVALID);
   9545    vassert(t1 && *t1 == IRTemp_INVALID);
   9546    *t0 = newTemp(Ity_V128);
   9547    *t1 = newTemp(Ity_V128);
   9548    assign(*t1, unop(Iop_V256toV128_1, mkexpr(t256)));
   9549    assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
   9550 }
   9551 
   9552 /* Break a V256-bit value up into eight 32-bit ints.  */
   9553 
   9554 static void breakupV256to32s ( IRTemp t256,
   9555                                /*OUTs*/
   9556                                IRTemp* t7, IRTemp* t6,
   9557                                IRTemp* t5, IRTemp* t4,
   9558                                IRTemp* t3, IRTemp* t2,
   9559                                IRTemp* t1, IRTemp* t0 )
   9560 {
   9561    IRTemp t128_1 = IRTemp_INVALID;
   9562    IRTemp t128_0 = IRTemp_INVALID;
   9563    breakupV256toV128s( t256, &t128_1, &t128_0 );
   9564    breakupV128to32s( t128_1, t7, t6, t5, t4 );
   9565    breakupV128to32s( t128_0, t3, t2, t1, t0 );
   9566 }
   9567 
   9568 /* Break a V128-bit value up into two 64-bit ints. */
   9569 
   9570 static void breakupV128to64s ( IRTemp t128,
   9571                                /*OUTs*/
   9572                                IRTemp* t1, IRTemp* t0 )
   9573 {
   9574    vassert(t0 && *t0 == IRTemp_INVALID);
   9575    vassert(t1 && *t1 == IRTemp_INVALID);
   9576    *t0 = newTemp(Ity_I64);
   9577    *t1 = newTemp(Ity_I64);
   9578    assign( *t0, unop(Iop_V128to64,   mkexpr(t128)) );
   9579    assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
   9580 }
   9581 
   9582 /* Construct a V256-bit value from eight 32-bit ints. */
   9583 
   9584 static IRExpr* mkV256from32s ( IRTemp t7, IRTemp t6,
   9585                                IRTemp t5, IRTemp t4,
   9586                                IRTemp t3, IRTemp t2,
   9587                                IRTemp t1, IRTemp t0 )
   9588 {
   9589    return
   9590       binop( Iop_V128HLtoV256,
   9591              binop( Iop_64HLtoV128,
   9592                     binop(Iop_32HLto64, mkexpr(t7), mkexpr(t6)),
   9593                     binop(Iop_32HLto64, mkexpr(t5), mkexpr(t4)) ),
   9594              binop( Iop_64HLtoV128,
   9595                     binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
   9596                     binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0)) )
   9597    );
   9598 }
   9599 
   9600 /* Construct a V256-bit value from four 64-bit ints. */
   9601 
   9602 static IRExpr* mkV256from64s ( IRTemp t3, IRTemp t2,
   9603                                IRTemp t1, IRTemp t0 )
   9604 {
   9605    return
   9606       binop( Iop_V128HLtoV256,
   9607              binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)),
   9608              binop(Iop_64HLtoV128, mkexpr(t1), mkexpr(t0))
   9609    );
   9610 }
   9611 
   9612 /* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   9613    values (aa,bb), computes, for each of the 4 16-bit lanes:
   9614 
   9615    (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
   9616 */
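/* Worked example (illustrative): with aa_lane = bb_lane = 0x4000
   (0.5 in Q15), the 32-bit product is 0x10000000; >>u 14 gives
   0x4000; +1 then >>u 1 gives 0x2000, ie. 0.25 in Q15.  The one
   overflow case is 0x8000 *s 0x8000, whose low 16 result bits are
   again 0x8000. */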
   9617 static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
   9618 {
   9619    IRTemp aa      = newTemp(Ity_I64);
   9620    IRTemp bb      = newTemp(Ity_I64);
   9621    IRTemp aahi32s = newTemp(Ity_I64);
   9622    IRTemp aalo32s = newTemp(Ity_I64);
   9623    IRTemp bbhi32s = newTemp(Ity_I64);
   9624    IRTemp bblo32s = newTemp(Ity_I64);
   9625    IRTemp rHi     = newTemp(Ity_I64);
   9626    IRTemp rLo     = newTemp(Ity_I64);
   9627    IRTemp one32x2 = newTemp(Ity_I64);
   9628    assign(aa, aax);
   9629    assign(bb, bbx);
   9630    assign( aahi32s,
   9631            binop(Iop_SarN32x2,
   9632                  binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
   9633                  mkU8(16) ));
   9634    assign( aalo32s,
   9635            binop(Iop_SarN32x2,
   9636                  binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
   9637                  mkU8(16) ));
   9638    assign( bbhi32s,
   9639            binop(Iop_SarN32x2,
   9640                  binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
   9641                  mkU8(16) ));
   9642    assign( bblo32s,
   9643            binop(Iop_SarN32x2,
   9644                  binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
   9645                  mkU8(16) ));
   9646    assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   9647    assign(
   9648       rHi,
   9649       binop(
   9650          Iop_ShrN32x2,
   9651          binop(
   9652             Iop_Add32x2,
   9653             binop(
   9654                Iop_ShrN32x2,
   9655                binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
   9656                mkU8(14)
   9657             ),
   9658             mkexpr(one32x2)
   9659          ),
   9660          mkU8(1)
   9661       )
   9662    );
   9663    assign(
   9664       rLo,
   9665       binop(
   9666          Iop_ShrN32x2,
   9667          binop(
   9668             Iop_Add32x2,
   9669             binop(
   9670                Iop_ShrN32x2,
   9671                binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
   9672                mkU8(14)
   9673             ),
   9674             mkexpr(one32x2)
   9675          ),
   9676          mkU8(1)
   9677       )
   9678    );
   9679    return
   9680       binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
   9681 }
   9682 
   9683 /* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   9684    values (aa,bb), computes, for each lane:
   9685 
   9686           if aa_lane < 0 then - bb_lane
   9687      else if aa_lane > 0 then bb_lane
   9688      else 0
   9689 */
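/* Worked example (illustrative), for laneszB == 2: with aa lanes
   (-3, 0, 5, 7) and bb lanes (10, 11, 12, 13), the result lanes are
   (-10, 0, 12, 13).  The implementation below builds this from
   compare-generated all-ones/all-zeroes masks rather than from
   branches. */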
   9690 static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
   9691 {
   9692    IRTemp aa       = newTemp(Ity_I64);
   9693    IRTemp bb       = newTemp(Ity_I64);
   9694    IRTemp zero     = newTemp(Ity_I64);
   9695    IRTemp bbNeg    = newTemp(Ity_I64);
   9696    IRTemp negMask  = newTemp(Ity_I64);
   9697    IRTemp posMask  = newTemp(Ity_I64);
   9698    IROp   opSub    = Iop_INVALID;
   9699    IROp   opCmpGTS = Iop_INVALID;
   9700 
   9701    switch (laneszB) {
   9702       case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
   9703       case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
   9704       case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
   9705       default: vassert(0);
   9706    }
   9707 
   9708    assign( aa,      aax );
   9709    assign( bb,      bbx );
   9710    assign( zero,    mkU64(0) );
   9711    assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   9712    assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   9713    assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
   9714 
   9715    return
   9716       binop(Iop_Or64,
   9717             binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
   9718             binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
   9719 
   9720 }
   9721 
   9722 
   9723 /* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   9724    value aa, computes, for each lane
   9725 
   9726    if aa < 0 then -aa else aa
   9727 
   9728    Note that the result is interpreted as unsigned, so that the
   9729    absolute value of the most negative signed input can be
   9730    represented.
   9731 */
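/* Worked example (illustrative), for laneszB == 1: an input lane of
   0x80 (-128) produces 0x80, which reads as +128 unsigned.  The
   implementation below derives negMask by arithmetic right-shifting
   each lane by (lane width - 1) bits, giving all-ones for negative
   lanes and all-zeroes otherwise. */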
   9732 static IRTemp math_PABS_MMX ( IRTemp aa, Int laneszB )
   9733 {
   9734    IRTemp res     = newTemp(Ity_I64);
   9735    IRTemp zero    = newTemp(Ity_I64);
   9736    IRTemp aaNeg   = newTemp(Ity_I64);
   9737    IRTemp negMask = newTemp(Ity_I64);
   9738    IRTemp posMask = newTemp(Ity_I64);
   9739    IROp   opSub   = Iop_INVALID;
   9740    IROp   opSarN  = Iop_INVALID;
   9741 
   9742    switch (laneszB) {
   9743       case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
   9744       case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
   9745       case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
   9746       default: vassert(0);
   9747    }
   9748 
   9749    assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   9750    assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   9751    assign( zero,    mkU64(0) );
   9752    assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   9753    assign( res,
   9754            binop(Iop_Or64,
   9755                  binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
   9756                  binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ));
   9757    return res;
   9758 }
   9759 
   9760 /* XMM version of math_PABS_MMX. */
   9761 static IRTemp math_PABS_XMM ( IRTemp aa, Int laneszB )
   9762 {
   9763    IRTemp res  = newTemp(Ity_V128);
   9764    IRTemp aaHi = newTemp(Ity_I64);
   9765    IRTemp aaLo = newTemp(Ity_I64);
   9766    assign(aaHi, unop(Iop_V128HIto64, mkexpr(aa)));
   9767    assign(aaLo, unop(Iop_V128to64, mkexpr(aa)));
   9768    assign(res, binop(Iop_64HLtoV128,
   9769                      mkexpr(math_PABS_MMX(aaHi, laneszB)),
   9770                      mkexpr(math_PABS_MMX(aaLo, laneszB))));
   9771    return res;
   9772 }
   9773 
   9774 /* Specialisations of math_PABS_XMM, since there's no easy way to do
   9775    partial applications in C :-( */
   9776 static IRTemp math_PABS_XMM_pap4 ( IRTemp aa ) {
   9777    return math_PABS_XMM(aa, 4);
   9778 }
   9779 
   9780 static IRTemp math_PABS_XMM_pap2 ( IRTemp aa ) {
   9781    return math_PABS_XMM(aa, 2);
   9782 }
   9783 
   9784 static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) {
   9785    return math_PABS_XMM(aa, 1);
   9786 }
   9787 
   9788 /* YMM version of math_PABS_XMM. */
   9789 static IRTemp math_PABS_YMM ( IRTemp aa, Int laneszB )
   9790 {
   9791    IRTemp res  = newTemp(Ity_V256);
   9792    IRTemp aaHi = IRTemp_INVALID;
   9793    IRTemp aaLo = IRTemp_INVALID;
   9794    breakupV256toV128s(aa, &aaHi, &aaLo);
   9795    assign(res, binop(Iop_V128HLtoV256,
   9796                      mkexpr(math_PABS_XMM(aaHi, laneszB)),
   9797                      mkexpr(math_PABS_XMM(aaLo, laneszB))));
   9798    return res;
   9799 }
   9800 
   9801 static IRTemp math_PABS_YMM_pap4 ( IRTemp aa ) {
   9802    return math_PABS_YMM(aa, 4);
   9803 }
   9804 
   9805 static IRTemp math_PABS_YMM_pap2 ( IRTemp aa ) {
   9806    return math_PABS_YMM(aa, 2);
   9807 }
   9808 
   9809 static IRTemp math_PABS_YMM_pap1 ( IRTemp aa ) {
   9810    return math_PABS_YMM(aa, 1);
   9811 }
   9812 
   9813 static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
   9814                                         IRTemp lo64, Long byteShift )
   9815 {
   9816    vassert(byteShift >= 1 && byteShift <= 7);
   9817    return
   9818       binop(Iop_Or64,
   9819             binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
   9820             binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
   9821       );
   9822 }
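/* Illustrative check of the helper above: with hi64 =
   0x1122334455667788, lo64 = 0x99AABBCCDDEEFF00 and byteShift = 2,
   (hi64 << 48) | (lo64 >> 16) = 0x778899AABBCCDDEE, ie. the middle
   eight bytes of the 16-byte concatenation hi64:lo64. */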
   9823 
   9824 static IRTemp math_PALIGNR_XMM ( IRTemp sV, IRTemp dV, UInt imm8 )
   9825 {
   9826    IRTemp res = newTemp(Ity_V128);
   9827    IRTemp sHi = newTemp(Ity_I64);
   9828    IRTemp sLo = newTemp(Ity_I64);
   9829    IRTemp dHi = newTemp(Ity_I64);
   9830    IRTemp dLo = newTemp(Ity_I64);
   9831    IRTemp rHi = newTemp(Ity_I64);
   9832    IRTemp rLo = newTemp(Ity_I64);
   9833 
   9834    assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   9835    assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   9836    assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   9837    assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   9838 
   9839    if (imm8 == 0) {
   9840       assign( rHi, mkexpr(sHi) );
   9841       assign( rLo, mkexpr(sLo) );
   9842    }
   9843    else if (imm8 >= 1 && imm8 <= 7) {
   9844       assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, imm8) );
   9845       assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, imm8) );
   9846    }
   9847    else if (imm8 == 8) {
   9848       assign( rHi, mkexpr(dLo) );
   9849       assign( rLo, mkexpr(sHi) );
   9850    }
   9851    else if (imm8 >= 9 && imm8 <= 15) {
   9852       assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-8) );
   9853       assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, imm8-8) );
   9854    }
   9855    else if (imm8 == 16) {
   9856       assign( rHi, mkexpr(dHi) );
   9857       assign( rLo, mkexpr(dLo) );
   9858    }
   9859    else if (imm8 >= 17 && imm8 <= 23) {
   9860       assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-16))) );
   9861       assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-16) );
   9862    }
   9863    else if (imm8 == 24) {
   9864       assign( rHi, mkU64(0) );
   9865       assign( rLo, mkexpr(dHi) );
   9866    }
   9867    else if (imm8 >= 25 && imm8 <= 31) {
   9868       assign( rHi, mkU64(0) );
   9869       assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-24))) );
   9870    }
   9871    else if (imm8 >= 32 && imm8 <= 255) {
   9872       assign( rHi, mkU64(0) );
   9873       assign( rLo, mkU64(0) );
   9874    }
   9875    else
   9876       vassert(0);
   9877 
   9878    assign( res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   9879    return res;
   9880 }
   9881 
   9882 
    9883 /* Generate a SIGSEGV, followed by a restart of the current insn,
    9884    if (effective_addr & mask) is nonzero -- mask being the required
    9885    alignment minus 1.  Required for some SSE3 instructions and all
    9886    128-bit SSSE3 instructions.  Assumes guest_RIP_curr_instr is set! */
   9887 static
   9888 void gen_SEGV_if_not_XX_aligned ( IRTemp effective_addr, ULong mask )
   9889 {
   9890    stmt(
   9891       IRStmt_Exit(
   9892          binop(Iop_CmpNE64,
   9893                binop(Iop_And64,mkexpr(effective_addr),mkU64(mask)),
   9894                mkU64(0)),
   9895          Ijk_SigSEGV,
   9896          IRConst_U64(guest_RIP_curr_instr),
   9897          OFFB_RIP
   9898       )
   9899    );
   9900 }
   9901 
   9902 static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) {
   9903    gen_SEGV_if_not_XX_aligned(effective_addr, 16-1);
   9904 }
   9905 
   9906 static void gen_SEGV_if_not_32_aligned ( IRTemp effective_addr ) {
   9907    gen_SEGV_if_not_XX_aligned(effective_addr, 32-1);
   9908 }
   9909 
   9910 static void gen_SEGV_if_not_64_aligned ( IRTemp effective_addr ) {
   9911    gen_SEGV_if_not_XX_aligned(effective_addr, 64-1);
   9912 }
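/* Usage sketch (hypothetical caller; the required alignment depends on
   the insn being decoded):

      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      gen_SEGV_if_not_16_aligned( addr );

   With mask 16-1 = 0xF, any effective address with a nonzero low
   nibble takes the side exit and delivers SIGSEGV with the guest RIP
   still pointing at the current instruction. */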
   9913 
   9914 /* Helper for deciding whether a given insn (starting at the opcode
   9915    byte) may validly be used with a LOCK prefix.  The following insns
   9916    may be used with LOCK when their destination operand is in memory.
   9917    AFAICS this is exactly the same for both 32-bit and 64-bit mode.
   9918 
   9919    ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   9920    OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   9921    ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
    9922    SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
   9923    AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   9924    SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   9925    XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
   9926 
   9927    DEC        FE /1,  FF /1
   9928    INC        FE /0,  FF /0
   9929 
   9930    NEG        F6 /3,  F7 /3
   9931    NOT        F6 /2,  F7 /2
   9932 
   9933    XCHG       86, 87
   9934 
   9935    BTC        0F BB,  0F BA /7
   9936    BTR        0F B3,  0F BA /6
   9937    BTS        0F AB,  0F BA /5
   9938 
   9939    CMPXCHG    0F B0,  0F B1
   9940    CMPXCHG8B  0F C7 /1
   9941 
   9942    XADD       0F C0,  0F C1
   9943 
   9944    ------------------------------
   9945 
   9946    80 /0  =  addb $imm8,  rm8
   9947    81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   9948    82 /0  =  addb $imm8,  rm8
   9949    83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
   9950 
   9951    00     =  addb r8,  rm8
   9952    01     =  addl r32, rm32  and  addw r16, rm16
   9953 
    9954    Same scheme for OR ADC SBB AND SUB XOR
   9955 
   9956    FE /1  = dec rm8
   9957    FF /1  = dec rm32  and  dec rm16
   9958 
   9959    FE /0  = inc rm8
   9960    FF /0  = inc rm32  and  inc rm16
   9961 
   9962    F6 /3  = neg rm8
   9963    F7 /3  = neg rm32  and  neg rm16
   9964 
   9965    F6 /2  = not rm8
   9966    F7 /2  = not rm32  and  not rm16
   9967 
   9968    0F BB     = btcw r16, rm16    and  btcl r32, rm32
    9969    0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
   9970 
   9971    Same for BTS, BTR
   9972 */
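/* Example (illustrative): "lock incl (%rax)" encodes as F0 FF 00;
   with opc pointing at the FF byte, gregLO3ofRM(0x00) == 0 (INC) and
   the modrm denotes memory, so the function below returns True.  For
   "lock inc %eax" (F0 FF C0), epartIsReg(0xC0) holds, so it returns
   False and the insn is treated as invalid. */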
   9973 static Bool can_be_used_with_LOCK_prefix ( const UChar* opc )
   9974 {
   9975    switch (opc[0]) {
   9976       case 0x00: case 0x01: case 0x08: case 0x09:
   9977       case 0x10: case 0x11: case 0x18: case 0x19:
   9978       case 0x20: case 0x21: case 0x28: case 0x29:
   9979       case 0x30: case 0x31:
   9980          if (!epartIsReg(opc[1]))
   9981             return True;
   9982          break;
   9983 
   9984       case 0x80: case 0x81: case 0x82: case 0x83:
   9985          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
   9986              && !epartIsReg(opc[1]))
   9987             return True;
   9988          break;
   9989 
   9990       case 0xFE: case 0xFF:
   9991          if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
   9992              && !epartIsReg(opc[1]))
   9993             return True;
   9994          break;
   9995 
   9996       case 0xF6: case 0xF7:
   9997          if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
   9998              && !epartIsReg(opc[1]))
   9999             return True;
   10000          break;
   10001 
   10002       case 0x86: case 0x87:
   10003          if (!epartIsReg(opc[1]))
   10004             return True;
   10005          break;
   10006 
   10007       case 0x0F: {
   10008          switch (opc[1]) {
   10009             case 0xBB: case 0xB3: case 0xAB:
   10010                if (!epartIsReg(opc[2]))
   10011                   return True;
   10012                break;
   10013             case 0xBA:
   10014                if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
   10015                    && !epartIsReg(opc[2]))
   10016                   return True;
   10017                break;
   10018             case 0xB0: case 0xB1:
   10019                if (!epartIsReg(opc[2]))
   10020                   return True;
   10021                break;
   10022             case 0xC7:
   10023                if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]))
   10024                   return True;
   10025                break;
   10026             case 0xC0: case 0xC1:
   10027                if (!epartIsReg(opc[2]))
   10028                   return True;
   10029                break;
   10030             default:
   10031                break;
   10032          } /* switch (opc[1]) */
   10033          break;
   10034       }
   10035 
   10036       default:
   10037          break;
   10038    } /* switch (opc[0]) */
   10039 
   10040    return False;
   10041 }
   10042 
   10043 
   10044 /*------------------------------------------------------------*/
   10045 /*---                                                      ---*/
   10046 /*--- Top-level SSE/SSE2: dis_ESC_0F__SSE2                 ---*/
   10047 /*---                                                      ---*/
   10048 /*------------------------------------------------------------*/
   10049 
   10050 static Long dis_COMISD ( const VexAbiInfo* vbi, Prefix pfx,
   10051                          Long delta, Bool isAvx, UChar opc )
   10052 {
   10053    vassert(opc == 0x2F/*COMISD*/ || opc == 0x2E/*UCOMISD*/);
   10054    Int    alen  = 0;
   10055    HChar  dis_buf[50];
   10056    IRTemp argL  = newTemp(Ity_F64);
   10057    IRTemp argR  = newTemp(Ity_F64);
   10058    UChar  modrm = getUChar(delta);
   10059    IRTemp addr  = IRTemp_INVALID;
   10060    if (epartIsReg(modrm)) {
   10061       assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
   10062                                       0/*lowest lane*/ ) );
   10063       delta += 1;
   10064       DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
   10065                                 opc==0x2E ? "u" : "",
   10066                                 nameXMMReg(eregOfRexRM(pfx,modrm)),
   10067                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
   10068    } else {
   10069       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10070       assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
   10071       delta += alen;
   10072       DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
   10073                                 opc==0x2E ? "u" : "",
   10074                                 dis_buf,
   10075                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
   10076    }
   10077    assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
   10078                                    0/*lowest lane*/ ) );
   10079 
   10080    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   10081    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   10082    stmt( IRStmt_Put(
   10083             OFFB_CC_DEP1,
   10084             binop( Iop_And64,
   10085                    unop( Iop_32Uto64,
   10086                          binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
   10087                    mkU64(0x45)
   10088        )));
   10089    return delta;
   10090 }
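/* Note on the 0x45 mask above: Iop_CmpF64 yields 0x00 (GT), 0x01
   (LT), 0x40 (EQ) or 0x45 (UNordered).  ANDing with 0x45 keeps
   exactly the CF (bit 0), PF (bit 2) and ZF (bit 6) positions, which
   is what [U]COMISD is architected to set; OF/SF/AF read as zero. */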
   10091 
   10092 
   10093 static Long dis_COMISS ( const VexAbiInfo* vbi, Prefix pfx,
   10094                          Long delta, Bool isAvx, UChar opc )
   10095 {
   10096    vassert(opc == 0x2F/*COMISS*/ || opc == 0x2E/*UCOMISS*/);
   10097    Int    alen  = 0;
   10098    HChar  dis_buf[50];
   10099    IRTemp argL  = newTemp(Ity_F32);
   10100    IRTemp argR  = newTemp(Ity_F32);
   10101    UChar  modrm = getUChar(delta);
   10102    IRTemp addr  = IRTemp_INVALID;
   10103    if (epartIsReg(modrm)) {
   10104       assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
   10105                                       0/*lowest lane*/ ) );
   10106       delta += 1;
   10107       DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
   10108                                 opc==0x2E ? "u" : "",
   10109                                 nameXMMReg(eregOfRexRM(pfx,modrm)),
   10110                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
   10111    } else {
   10112       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10113       assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
   10114       delta += alen;
   10115       DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
   10116                                 opc==0x2E ? "u" : "",
   10117                                 dis_buf,
   10118                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
   10119    }
   10120    assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
   10121                                    0/*lowest lane*/ ) );
   10122 
   10123    stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   10124    stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   10125    stmt( IRStmt_Put(
   10126             OFFB_CC_DEP1,
   10127             binop( Iop_And64,
   10128                    unop( Iop_32Uto64,
   10129                          binop(Iop_CmpF64,
   10130                                unop(Iop_F32toF64,mkexpr(argL)),
   10131                                unop(Iop_F32toF64,mkexpr(argR)))),
   10132                    mkU64(0x45)
   10133        )));
   10134    return delta;
   10135 }
   10136 
   10137 
   10138 static Long dis_PSHUFD_32x4 ( const VexAbiInfo* vbi, Prefix pfx,
   10139                               Long delta, Bool writesYmm )
   10140 {
   10141    Int    order;
   10142    Int    alen  = 0;
   10143    HChar  dis_buf[50];
   10144    IRTemp sV    = newTemp(Ity_V128);
   10145    UChar  modrm = getUChar(delta);
   10146    const HChar* strV  = writesYmm ? "v" : "";
   10147    IRTemp addr  = IRTemp_INVALID;
   10148    if (epartIsReg(modrm)) {
   10149       assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   10150       order = (Int)getUChar(delta+1);
   10151       delta += 1+1;
   10152       DIP("%spshufd $%d,%s,%s\n", strV, order,
   10153                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
   10154                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   10155    } else {
   10156       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   10157                         1/*byte after the amode*/ );
   10158       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   10159       order = (Int)getUChar(delta+alen);
   10160       delta += alen+1;
   10161       DIP("%spshufd $%d,%s,%s\n", strV, order,
   10162                                  dis_buf,
   10163                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   10164    }
   10165 
   10166    IRTemp s3, s2, s1, s0;
   10167    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   10168    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10169 
   10170 #  define SEL(n)  ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   10171    IRTemp dV = newTemp(Ity_V128);
   10172    assign(dV,
   10173           mkV128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
   10174                          SEL((order>>2)&3), SEL((order>>0)&3) )
   10175    );
   10176 #  undef SEL
   10177 
   10178    (writesYmm ? putYMMRegLoAndZU : putXMMReg)
   10179       (gregOfRexRM(pfx,modrm), mkexpr(dV));
   10180    return delta;
   10181 }
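/* Worked example (illustrative): order == 0x1B is the bit pattern
   00 01 10 11, so the put above selects (s0, s1, s2, s3) from top
   lane to bottom -- ie. it reverses the four 32-bit lanes. */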
   10182 
   10183 
   10184 static Long dis_PSHUFD_32x8 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
   10185 {
   10186    Int    order;
   10187    Int    alen  = 0;
   10188    HChar  dis_buf[50];
   10189    IRTemp sV    = newTemp(Ity_V256);
   10190    UChar  modrm = getUChar(delta);
   10191    IRTemp addr  = IRTemp_INVALID;
   10192    UInt   rG    = gregOfRexRM(pfx,modrm);
   10193    if (epartIsReg(modrm)) {
   10194       UInt rE = eregOfRexRM(pfx,modrm);
   10195       assign( sV, getYMMReg(rE) );
   10196       order = (Int)getUChar(delta+1);
   10197       delta += 1+1;
   10198       DIP("vpshufd $%d,%s,%s\n", order, nameYMMReg(rE), nameYMMReg(rG));
   10199    } else {
   10200       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   10201                         1/*byte after the amode*/ );
   10202       assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   10203       order = (Int)getUChar(delta+alen);
   10204       delta += alen+1;
   10205       DIP("vpshufd $%d,%s,%s\n", order,  dis_buf, nameYMMReg(rG));
   10206    }
   10207 
   10208    IRTemp s[8];
   10209    s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   10210    breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
   10211                          &s[3], &s[2], &s[1], &s[0] );
   10212 
   10213    putYMMReg( rG, mkV256from32s( s[4 + ((order>>6)&3)],
   10214                                  s[4 + ((order>>4)&3)],
   10215                                  s[4 + ((order>>2)&3)],
   10216                                  s[4 + ((order>>0)&3)],
   10217                                  s[0 + ((order>>6)&3)],
   10218                                  s[0 + ((order>>4)&3)],
   10219                                  s[0 + ((order>>2)&3)],
   10220                                  s[0 + ((order>>0)&3)] ) );
   10221    return delta;
   10222 }
   10223 
   10224 
   10225 static IRTemp math_PSRLDQ ( IRTemp sV, Int imm )
   10226 {
   10227    IRTemp dV    = newTemp(Ity_V128);
   10228    IRTemp hi64  = newTemp(Ity_I64);
   10229    IRTemp lo64  = newTemp(Ity_I64);
   10230    IRTemp hi64r = newTemp(Ity_I64);
   10231    IRTemp lo64r = newTemp(Ity_I64);
   10232 
   10233    vassert(imm >= 0 && imm <= 255);
   10234    if (imm >= 16) {
   10235       assign(dV, mkV128(0x0000));
   10236       return dV;
   10237    }
   10238 
   10239    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   10240    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   10241 
   10242    if (imm == 0) {
   10243       assign( lo64r, mkexpr(lo64) );
   10244       assign( hi64r, mkexpr(hi64) );
   10245    }
   10246    else
   10247    if (imm == 8) {
   10248       assign( hi64r, mkU64(0) );
   10249       assign( lo64r, mkexpr(hi64) );
   10250    }
   10251    else
   10252    if (imm > 8) {
   10253       assign( hi64r, mkU64(0) );
   10254       assign( lo64r, binop( Iop_Shr64, mkexpr(hi64), mkU8( 8*(imm-8) ) ));
   10255    } else {
   10256       assign( hi64r, binop( Iop_Shr64, mkexpr(hi64), mkU8(8 * imm) ));
   10257       assign( lo64r,
   10258               binop( Iop_Or64,
   10259                      binop(Iop_Shr64, mkexpr(lo64),
   10260                            mkU8(8 * imm)),
   10261                      binop(Iop_Shl64, mkexpr(hi64),
   10262                            mkU8(8 * (8 - imm)) )
   10263                      )
   10264               );
   10265    }
   10266 
   10267    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   10268    return dV;
   10269 }
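/* Worked example (illustrative) for the imm < 8 path: with sV holding
   bytes 0F..00 (byte i has value i) and imm == 3, the result is
   hi64r:lo64r = 0x0000000F0E0D0C0B : 0x0A09080706050403 -- a
   whole-vector shift right by three bytes with zero fill.
   math_PSLLDQ below is the mirror image, shifting left. */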
   10270 
   10271 
   10272 static IRTemp math_PSLLDQ ( IRTemp sV, Int imm )
   10273 {
   10274    IRTemp       dV    = newTemp(Ity_V128);
   10275    IRTemp       hi64  = newTemp(Ity_I64);
   10276    IRTemp       lo64  = newTemp(Ity_I64);
   10277    IRTemp       hi64r = newTemp(Ity_I64);
   10278    IRTemp       lo64r = newTemp(Ity_I64);
   10279 
   10280    vassert(imm >= 0 && imm <= 255);
   10281    if (imm >= 16) {
   10282       assign(dV, mkV128(0x0000));
   10283       return dV;
   10284    }
   10285 
   10286    assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   10287    assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
   10288 
   10289    if (imm == 0) {
   10290       assign( lo64r, mkexpr(lo64) );
   10291       assign( hi64r, mkexpr(hi64) );
   10292    }
   10293    else
   10294    if (imm == 8) {
   10295       assign( lo64r, mkU64(0) );
   10296       assign( hi64r, mkexpr(lo64) );
   10297    }
   10298    else
   10299    if (imm > 8) {
   10300       assign( lo64r, mkU64(0) );
   10301       assign( hi64r, binop( Iop_Shl64, mkexpr(lo64), mkU8( 8*(imm-8) ) ));
   10302    } else {
   10303       assign( lo64r, binop( Iop_Shl64, mkexpr(lo64), mkU8(8 * imm) ));
   10304       assign( hi64r,
   10305               binop( Iop_Or64,
   10306                      binop(Iop_Shl64, mkexpr(hi64),
   10307                            mkU8(8 * imm)),
   10308                      binop(Iop_Shr64, mkexpr(lo64),
   10309                            mkU8(8 * (8 - imm)) )
   10310                      )
   10311               );
   10312    }
   10313 
   10314    assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   10315    return dV;
   10316 }
   10317 
   10318 
   10319 static Long dis_CVTxSD2SI ( const VexAbiInfo* vbi, Prefix pfx,
   10320                             Long delta, Bool isAvx, UChar opc, Int sz )
   10321 {
   10322    vassert(opc == 0x2D/*CVTSD2SI*/ || opc == 0x2C/*CVTTSD2SI*/);
   10323    HChar  dis_buf[50];
   10324    Int    alen   = 0;
   10325    UChar  modrm  = getUChar(delta);
   10326    IRTemp addr   = IRTemp_INVALID;
   10327    IRTemp rmode  = newTemp(Ity_I32);
   10328    IRTemp f64lo  = newTemp(Ity_F64);
   10329    Bool   r2zero = toBool(opc == 0x2C);
   10330 
   10331    if (epartIsReg(modrm)) {
   10332       delta += 1;
   10333       assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   10334       DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
   10335                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
   10336                                   nameIReg(sz, gregOfRexRM(pfx,modrm),
   10337                                            False));
   10338    } else {
   10339       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10340       assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   10341       delta += alen;
   10342       DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
   10343                                   dis_buf,
   10344                                   nameIReg(sz, gregOfRexRM(pfx,modrm),
   10345                                            False));
   10346    }
   10347 
   10348    if (r2zero) {
   10349       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10350    } else {
   10351       assign( rmode, get_sse_roundingmode() );
   10352    }
   10353 
   10354    if (sz == 4) {
   10355       putIReg32( gregOfRexRM(pfx,modrm),
   10356                  binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   10357    } else {
   10358       vassert(sz == 8);
   10359       putIReg64( gregOfRexRM(pfx,modrm),
   10360                  binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
   10361    }
   10362 
   10363    return delta;
   10364 }
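/* Illustrative difference between the two forms: for an input of
   -1.7, CVTTSD2SI (r2zero, Irrm_ZERO) yields -1, whereas CVTSD2SI
   under the default round-to-nearest SSE rounding mode yields -2. */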
   10365 
   10366 
   10367 static Long dis_CVTxSS2SI ( const VexAbiInfo* vbi, Prefix pfx,
   10368                             Long delta, Bool isAvx, UChar opc, Int sz )
   10369 {
   10370    vassert(opc == 0x2D/*CVTSS2SI*/ || opc == 0x2C/*CVTTSS2SI*/);
   10371    HChar  dis_buf[50];
   10372    Int    alen   = 0;
   10373    UChar  modrm  = getUChar(delta);
   10374    IRTemp addr   = IRTemp_INVALID;
   10375    IRTemp rmode  = newTemp(Ity_I32);
   10376    IRTemp f32lo  = newTemp(Ity_F32);
   10377    Bool   r2zero = toBool(opc == 0x2C);
   10378 
   10379    if (epartIsReg(modrm)) {
   10380       delta += 1;
   10381       assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
   10382       DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
   10383                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
   10384                                   nameIReg(sz, gregOfRexRM(pfx,modrm),
   10385                                            False));
   10386    } else {
   10387       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10388       assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   10389       delta += alen;
   10390       DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
   10391                                   dis_buf,
   10392                                   nameIReg(sz, gregOfRexRM(pfx,modrm),
   10393                                            False));
   10394    }
   10395 
   10396    if (r2zero) {
   10397       assign( rmode, mkU32((UInt)Irrm_ZERO) );
   10398    } else {
   10399       assign( rmode, get_sse_roundingmode() );
   10400    }
   10401 
   10402    if (sz == 4) {
   10403       putIReg32( gregOfRexRM(pfx,modrm),
   10404                  binop( Iop_F64toI32S,
   10405                         mkexpr(rmode),
   10406                         unop(Iop_F32toF64, mkexpr(f32lo))) );
   10407    } else {
   10408       vassert(sz == 8);
   10409       putIReg64( gregOfRexRM(pfx,modrm),
   10410                  binop( Iop_F64toI64S,
   10411                         mkexpr(rmode),
   10412                         unop(Iop_F32toF64, mkexpr(f32lo))) );
   10413    }
   10414 
   10415    return delta;
   10416 }
   10417 
   10418 
   10419 static Long dis_CVTPS2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10420                                Long delta, Bool isAvx )
   10421 {
   10422    IRTemp addr  = IRTemp_INVALID;
   10423    Int    alen  = 0;
   10424    HChar  dis_buf[50];
   10425    IRTemp f32lo = newTemp(Ity_F32);
   10426    IRTemp f32hi = newTemp(Ity_F32);
   10427    UChar  modrm = getUChar(delta);
   10428    UInt   rG    = gregOfRexRM(pfx,modrm);
   10429    if (epartIsReg(modrm)) {
   10430       UInt rE = eregOfRexRM(pfx,modrm);
   10431       assign( f32lo, getXMMRegLane32F(rE, 0) );
   10432       assign( f32hi, getXMMRegLane32F(rE, 1) );
   10433       delta += 1;
   10434       DIP("%scvtps2pd %s,%s\n",
   10435           isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   10436    } else {
   10437       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10438       assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
   10439       assign( f32hi, loadLE(Ity_F32,
   10440                             binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
   10441       delta += alen;
   10442       DIP("%scvtps2pd %s,%s\n",
   10443           isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   10444    }
   10445 
   10446    putXMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32hi)) );
   10447    putXMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32lo)) );
   10448    if (isAvx)
   10449       putYMMRegLane128( rG, 1, mkV128(0));
   10450    return delta;
   10451 }
   10452 
   10453 
   10454 static Long dis_CVTPS2PD_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10455                                Long delta )
   10456 {
   10457    IRTemp addr  = IRTemp_INVALID;
   10458    Int    alen  = 0;
   10459    HChar  dis_buf[50];
   10460    IRTemp f32_0 = newTemp(Ity_F32);
   10461    IRTemp f32_1 = newTemp(Ity_F32);
   10462    IRTemp f32_2 = newTemp(Ity_F32);
   10463    IRTemp f32_3 = newTemp(Ity_F32);
   10464    UChar  modrm = getUChar(delta);
   10465    UInt   rG    = gregOfRexRM(pfx,modrm);
   10466    if (epartIsReg(modrm)) {
   10467       UInt rE = eregOfRexRM(pfx,modrm);
   10468       assign( f32_0, getXMMRegLane32F(rE, 0) );
   10469       assign( f32_1, getXMMRegLane32F(rE, 1) );
   10470       assign( f32_2, getXMMRegLane32F(rE, 2) );
   10471       assign( f32_3, getXMMRegLane32F(rE, 3) );
   10472       delta += 1;
   10473       DIP("vcvtps2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   10474    } else {
   10475       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10476       assign( f32_0, loadLE(Ity_F32, mkexpr(addr)) );
   10477       assign( f32_1, loadLE(Ity_F32,
   10478                             binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
   10479       assign( f32_2, loadLE(Ity_F32,
   10480                             binop(Iop_Add64,mkexpr(addr),mkU64(8))) );
   10481       assign( f32_3, loadLE(Ity_F32,
   10482                             binop(Iop_Add64,mkexpr(addr),mkU64(12))) );
   10483       delta += alen;
   10484       DIP("vcvtps2pd %s,%s\n", dis_buf, nameYMMReg(rG));
   10485    }
   10486 
   10487    putYMMRegLane64F( rG, 3, unop(Iop_F32toF64, mkexpr(f32_3)) );
   10488    putYMMRegLane64F( rG, 2, unop(Iop_F32toF64, mkexpr(f32_2)) );
   10489    putYMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32_1)) );
   10490    putYMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32_0)) );
   10491    return delta;
   10492 }
   10493 
   10494 
   10495 static Long dis_CVTPD2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10496                                Long delta, Bool isAvx )
   10497 {
   10498    IRTemp addr  = IRTemp_INVALID;
   10499    Int    alen  = 0;
   10500    HChar  dis_buf[50];
   10501    UChar  modrm = getUChar(delta);
   10502    UInt   rG    = gregOfRexRM(pfx,modrm);
   10503    IRTemp argV  = newTemp(Ity_V128);
   10504    IRTemp rmode = newTemp(Ity_I32);
   10505    if (epartIsReg(modrm)) {
   10506       UInt rE = eregOfRexRM(pfx,modrm);
   10507       assign( argV, getXMMReg(rE) );
   10508       delta += 1;
   10509       DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
   10510           nameXMMReg(rE), nameXMMReg(rG));
   10511    } else {
   10512       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10513       assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10514       delta += alen;
   10515       DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
   10516           dis_buf, nameXMMReg(rG) );
   10517    }
   10518 
   10519    assign( rmode, get_sse_roundingmode() );
   10520    IRTemp t0 = newTemp(Ity_F64);
   10521    IRTemp t1 = newTemp(Ity_F64);
   10522    assign( t0, unop(Iop_ReinterpI64asF64,
   10523                     unop(Iop_V128to64, mkexpr(argV))) );
   10524    assign( t1, unop(Iop_ReinterpI64asF64,
   10525                     unop(Iop_V128HIto64, mkexpr(argV))) );
   10526 
   10527 #  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), mkexpr(_t) )
   10528    putXMMRegLane32(  rG, 3, mkU32(0) );
   10529    putXMMRegLane32(  rG, 2, mkU32(0) );
   10530    putXMMRegLane32F( rG, 1, CVT(t1) );
   10531    putXMMRegLane32F( rG, 0, CVT(t0) );
   10532 #  undef CVT
   10533    if (isAvx)
   10534       putYMMRegLane128( rG, 1, mkV128(0) );
   10535 
   10536    return delta;
   10537 }
   10538 
   10539 
   10540 static Long dis_CVTxPS2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10541                                 Long delta, Bool isAvx, Bool r2zero )
   10542 {
   10543    IRTemp addr  = IRTemp_INVALID;
   10544    Int    alen  = 0;
   10545    HChar  dis_buf[50];
   10546    UChar  modrm = getUChar(delta);
   10547    IRTemp argV  = newTemp(Ity_V128);
   10548    IRTemp rmode = newTemp(Ity_I32);
   10549    UInt   rG    = gregOfRexRM(pfx,modrm);
   10550    IRTemp t0, t1, t2, t3;
   10551 
   10552    if (epartIsReg(modrm)) {
   10553       UInt rE = eregOfRexRM(pfx,modrm);
   10554       assign( argV, getXMMReg(rE) );
   10555       delta += 1;
   10556       DIP("%scvt%sps2dq %s,%s\n",
   10557           isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   10558    } else {
   10559       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10560       assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10561       delta += alen;
   10562       DIP("%scvt%sps2dq %s,%s\n",
   10563           isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   10564    }
   10565 
   10566    assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
   10567                          : get_sse_roundingmode() );
   10568    t0 = t1 = t2 = t3 = IRTemp_INVALID;
   10569    breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
   10570    /* This is less than ideal.  If it turns out to be a performance
   10571       bottleneck it can be improved. */
   10572 #  define CVT(_t)                             \
   10573       binop( Iop_F64toI32S,                   \
   10574              mkexpr(rmode),                   \
   10575              unop( Iop_F32toF64,              \
   10576                    unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   10577 
   10578    putXMMRegLane32( rG, 3, CVT(t3) );
   10579    putXMMRegLane32( rG, 2, CVT(t2) );
   10580    putXMMRegLane32( rG, 1, CVT(t1) );
   10581    putXMMRegLane32( rG, 0, CVT(t0) );
   10582 #  undef CVT
   10583    if (isAvx)
   10584       putYMMRegLane128( rG, 1, mkV128(0) );
   10585 
   10586    return delta;
   10587 }
   10588 
   10589 
   10590 static Long dis_CVTxPS2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10591                                 Long delta, Bool r2zero )
   10592 {
   10593    IRTemp addr  = IRTemp_INVALID;
   10594    Int    alen  = 0;
   10595    HChar  dis_buf[50];
   10596    UChar  modrm = getUChar(delta);
   10597    IRTemp argV  = newTemp(Ity_V256);
   10598    IRTemp rmode = newTemp(Ity_I32);
   10599    UInt   rG    = gregOfRexRM(pfx,modrm);
   10600    IRTemp t0, t1, t2, t3, t4, t5, t6, t7;
   10601 
   10602    if (epartIsReg(modrm)) {
   10603       UInt rE = eregOfRexRM(pfx,modrm);
   10604       assign( argV, getYMMReg(rE) );
   10605       delta += 1;
   10606       DIP("vcvt%sps2dq %s,%s\n",
   10607           r2zero ? "t" : "", nameYMMReg(rE), nameYMMReg(rG));
   10608    } else {
   10609       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10610       assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
   10611       delta += alen;
   10612       DIP("vcvt%sps2dq %s,%s\n",
   10613           r2zero ? "t" : "", dis_buf, nameYMMReg(rG) );
   10614    }
   10615 
   10616    assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
   10617                          : get_sse_roundingmode() );
   10618    t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID;
   10619    breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
   10620    /* This is less than ideal.  If it turns out to be a performance
   10621       bottleneck it can be improved. */
   10622 #  define CVT(_t)                             \
   10623       binop( Iop_F64toI32S,                   \
   10624              mkexpr(rmode),                   \
   10625              unop( Iop_F32toF64,              \
   10626                    unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
   10627 
   10628    putYMMRegLane32( rG, 7, CVT(t7) );
   10629    putYMMRegLane32( rG, 6, CVT(t6) );
   10630    putYMMRegLane32( rG, 5, CVT(t5) );
   10631    putYMMRegLane32( rG, 4, CVT(t4) );
   10632    putYMMRegLane32( rG, 3, CVT(t3) );
   10633    putYMMRegLane32( rG, 2, CVT(t2) );
   10634    putYMMRegLane32( rG, 1, CVT(t1) );
   10635    putYMMRegLane32( rG, 0, CVT(t0) );
   10636 #  undef CVT
   10637 
   10638    return delta;
   10639 }
   10640 
   10641 
   10642 static Long dis_CVTxPD2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10643                                 Long delta, Bool isAvx, Bool r2zero )
   10644 {
   10645    IRTemp addr  = IRTemp_INVALID;
   10646    Int    alen  = 0;
   10647    HChar  dis_buf[50];
   10648    UChar  modrm = getUChar(delta);
   10649    IRTemp argV  = newTemp(Ity_V128);
   10650    IRTemp rmode = newTemp(Ity_I32);
   10651    UInt   rG    = gregOfRexRM(pfx,modrm);
   10652    IRTemp t0, t1;
   10653 
   10654    if (epartIsReg(modrm)) {
   10655       UInt rE = eregOfRexRM(pfx,modrm);
   10656       assign( argV, getXMMReg(rE) );
   10657       delta += 1;
   10658       DIP("%scvt%spd2dq %s,%s\n",
   10659           isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   10660    } else {
   10661       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10662       assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10663       delta += alen;
   10664       DIP("%scvt%spd2dqx %s,%s\n",
   10665           isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   10666    }
   10667 
   10668    if (r2zero) {
   10669       assign(rmode, mkU32((UInt)Irrm_ZERO) );
   10670    } else {
   10671       assign( rmode, get_sse_roundingmode() );
   10672    }
   10673 
   10674    t0 = newTemp(Ity_F64);
   10675    t1 = newTemp(Ity_F64);
   10676    assign( t0, unop(Iop_ReinterpI64asF64,
   10677                     unop(Iop_V128to64, mkexpr(argV))) );
   10678    assign( t1, unop(Iop_ReinterpI64asF64,
   10679                     unop(Iop_V128HIto64, mkexpr(argV))) );
   10680 
   10681 #  define CVT(_t)  binop( Iop_F64toI32S,                   \
   10682                           mkexpr(rmode),                   \
   10683                           mkexpr(_t) )
   10684 
   10685    putXMMRegLane32( rG, 3, mkU32(0) );
   10686    putXMMRegLane32( rG, 2, mkU32(0) );
   10687    putXMMRegLane32( rG, 1, CVT(t1) );
   10688    putXMMRegLane32( rG, 0, CVT(t0) );
   10689 #  undef CVT
   10690    if (isAvx)
   10691       putYMMRegLane128( rG, 1, mkV128(0) );
   10692 
   10693    return delta;
   10694 }
   10695 
   10696 
   10697 static Long dis_CVTxPD2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10698                                 Long delta, Bool r2zero )
   10699 {
   10700    IRTemp addr  = IRTemp_INVALID;
   10701    Int    alen  = 0;
   10702    HChar  dis_buf[50];
   10703    UChar  modrm = getUChar(delta);
   10704    IRTemp argV  = newTemp(Ity_V256);
   10705    IRTemp rmode = newTemp(Ity_I32);
   10706    UInt   rG    = gregOfRexRM(pfx,modrm);
   10707    IRTemp t0, t1, t2, t3;
   10708 
   10709    if (epartIsReg(modrm)) {
   10710       UInt rE = eregOfRexRM(pfx,modrm);
   10711       assign( argV, getYMMReg(rE) );
   10712       delta += 1;
   10713       DIP("vcvt%spd2dq %s,%s\n",
   10714           r2zero ? "t" : "", nameYMMReg(rE), nameXMMReg(rG));
   10715    } else {
   10716       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10717       assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
   10718       delta += alen;
   10719       DIP("vcvt%spd2dqy %s,%s\n",
   10720           r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   10721    }
   10722 
   10723    if (r2zero) {
   10724       assign(rmode, mkU32((UInt)Irrm_ZERO) );
   10725    } else {
   10726       assign( rmode, get_sse_roundingmode() );
   10727    }
   10728 
   10729    t0 = IRTemp_INVALID;
   10730    t1 = IRTemp_INVALID;
   10731    t2 = IRTemp_INVALID;
   10732    t3 = IRTemp_INVALID;
   10733    breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
   10734 
   10735 #  define CVT(_t)  binop( Iop_F64toI32S,                   \
   10736                           mkexpr(rmode),                   \
   10737                           unop( Iop_ReinterpI64asF64,      \
   10738                                 mkexpr(_t) ) )
   10739 
   10740    putXMMRegLane32( rG, 3, CVT(t3) );
   10741    putXMMRegLane32( rG, 2, CVT(t2) );
   10742    putXMMRegLane32( rG, 1, CVT(t1) );
   10743    putXMMRegLane32( rG, 0, CVT(t0) );
   10744 #  undef CVT
   10745    putYMMRegLane128( rG, 1, mkV128(0) );
   10746 
   10747    return delta;
   10748 }
   10749 
   10750 
   10751 static Long dis_CVTDQ2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10752                                Long delta, Bool isAvx )
   10753 {
   10754    IRTemp addr  = IRTemp_INVALID;
   10755    Int    alen  = 0;
   10756    HChar  dis_buf[50];
   10757    UChar  modrm = getUChar(delta);
   10758    IRTemp argV  = newTemp(Ity_V128);
   10759    IRTemp rmode = newTemp(Ity_I32);
   10760    UInt   rG    = gregOfRexRM(pfx,modrm);
   10761    IRTemp t0, t1, t2, t3;
   10762 
   10763    if (epartIsReg(modrm)) {
   10764       UInt rE = eregOfRexRM(pfx,modrm);
   10765       assign( argV, getXMMReg(rE) );
   10766       delta += 1;
   10767       DIP("%scvtdq2ps %s,%s\n",
   10768           isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   10769    } else {
   10770       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10771       assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
   10772       delta += alen;
   10773       DIP("%scvtdq2ps %s,%s\n",
   10774           isAvx ? "v" : "", dis_buf, nameXMMReg(rG) );
   10775    }
   10776 
   10777    assign( rmode, get_sse_roundingmode() );
   10778    t0 = IRTemp_INVALID;
   10779    t1 = IRTemp_INVALID;
   10780    t2 = IRTemp_INVALID;
   10781    t3 = IRTemp_INVALID;
   10782    breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
   10783 
   10784 #  define CVT(_t)  binop( Iop_F64toF32,                    \
   10785                           mkexpr(rmode),                   \
   10786                           unop(Iop_I32StoF64,mkexpr(_t)))
   10787 
   10788    putXMMRegLane32F( rG, 3, CVT(t3) );
   10789    putXMMRegLane32F( rG, 2, CVT(t2) );
   10790    putXMMRegLane32F( rG, 1, CVT(t1) );
   10791    putXMMRegLane32F( rG, 0, CVT(t0) );
   10792 #  undef CVT
   10793    if (isAvx)
   10794       putYMMRegLane128( rG, 1, mkV128(0) );
   10795 
   10796    return delta;
   10797 }
   10798 
   10799 static Long dis_CVTDQ2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10800                                Long delta )
   10801 {
   10802    IRTemp addr   = IRTemp_INVALID;
   10803    Int    alen   = 0;
   10804    HChar  dis_buf[50];
   10805    UChar  modrm  = getUChar(delta);
   10806    IRTemp argV   = newTemp(Ity_V256);
   10807    IRTemp rmode  = newTemp(Ity_I32);
   10808    UInt   rG     = gregOfRexRM(pfx,modrm);
   10809    IRTemp t0, t1, t2, t3, t4, t5, t6, t7;
   10810 
   10811    if (epartIsReg(modrm)) {
   10812       UInt rE = eregOfRexRM(pfx,modrm);
   10813       assign( argV, getYMMReg(rE) );
   10814       delta += 1;
   10815       DIP("vcvtdq2ps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   10816    } else {
   10817       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   10818       assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
   10819       delta += alen;
   10820       DIP("vcvtdq2ps %s,%s\n", dis_buf, nameYMMReg(rG) );
   10821    }
   10822 
   10823    assign( rmode, get_sse_roundingmode() );
   10824    t0 = IRTemp_INVALID;
   10825    t1 = IRTemp_INVALID;
   10826    t2 = IRTemp_INVALID;
   10827    t3 = IRTemp_INVALID;
   10828    t4 = IRTemp_INVALID;
   10829    t5 = IRTemp_INVALID;
   10830    t6 = IRTemp_INVALID;
   10831    t7 = IRTemp_INVALID;
   10832    breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
   10833 
   10834 #  define CVT(_t)  binop( Iop_F64toF32,                    \
   10835                           mkexpr(rmode),                   \
   10836                           unop(Iop_I32StoF64,mkexpr(_t)))
   10837 
   10838    putYMMRegLane32F( rG, 7, CVT(t7) );
   10839    putYMMRegLane32F( rG, 6, CVT(t6) );
   10840    putYMMRegLane32F( rG, 5, CVT(t5) );
   10841    putYMMRegLane32F( rG, 4, CVT(t4) );
   10842    putYMMRegLane32F( rG, 3, CVT(t3) );
   10843    putYMMRegLane32F( rG, 2, CVT(t2) );
   10844    putYMMRegLane32F( rG, 1, CVT(t1) );
   10845    putYMMRegLane32F( rG, 0, CVT(t0) );
   10846 #  undef CVT
   10847 
   10848    return delta;
   10849 }
   10850 
   10851 
   10852 static Long dis_PMOVMSKB_128 ( const VexAbiInfo* vbi, Prefix pfx,
   10853                                Long delta, Bool isAvx )
   10854 {
   10855    UChar modrm = getUChar(delta);
   10856    vassert(epartIsReg(modrm)); /* ensured by caller */
   10857    UInt   rE = eregOfRexRM(pfx,modrm);
   10858    UInt   rG = gregOfRexRM(pfx,modrm);
   10859    IRTemp t0 = newTemp(Ity_V128);
   10860    IRTemp t1 = newTemp(Ity_I32);
   10861    assign(t0, getXMMReg(rE));
   10862    assign(t1, unop(Iop_16Uto32, unop(Iop_GetMSBs8x16, mkexpr(t0))));
   10863    putIReg32(rG, mkexpr(t1));
   10864    DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
   10865        nameIReg32(rG));
   10866    delta += 1;
   10867    return delta;
   10868 }
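/* Illustrative: Iop_GetMSBs8x16 gathers the top bit of each of the
   16 bytes into a 16-bit mask, so an XMM register whose bytes are
   all >= 0x80 produces 0xFFFF in the destination's low 16 bits. */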
   10869 
   10870 
   10871 static Long dis_PMOVMSKB_256 ( const VexAbiInfo* vbi, Prefix pfx,
   10872                                Long delta  )
   10873 {
   10874    UChar modrm = getUChar(delta);
   10875    vassert(epartIsReg(modrm)); /* ensured by caller */
   10876    UInt   rE = eregOfRexRM(pfx,modrm);
   10877    UInt   rG = gregOfRexRM(pfx,modrm);
   10878    IRTemp t0 = newTemp(Ity_V128);
   10879    IRTemp t1 = newTemp(Ity_V128);
   10880    IRTemp t2 = newTemp(Ity_I16);
   10881    IRTemp t3 = newTemp(Ity_I16);
   10882    assign(t0, getYMMRegLane128(rE, 0));
   10883    assign(t1, getYMMRegLane128(rE, 1));
   10884    assign(t2, unop(Iop_GetMSBs8x16, mkexpr(t0)));
   10885    assign(t3, unop(Iop_GetMSBs8x16, mkexpr(t1)));
   10886    putIReg32(rG, binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)));
   10887    DIP("vpmovmskb %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   10888    delta += 1;
   10889    return delta;
   10890 }
   10891 
   10892 
   10893 /* FIXME: why not just use InterleaveLO / InterleaveHI?  I think the
   10894    relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
   10895 /* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
   10896 static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10897 {
   10898    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10899    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10900    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   10901    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10902    IRTemp res = newTemp(Ity_V128);
   10903    assign(res,  xIsH ? mkV128from32s( s3, d3, s2, d2 )
   10904                      : mkV128from32s( s1, d1, s0, d0 ));
   10905    return res;
   10906 }
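/* Worked example (illustrative): with dV lanes (d3..d0) = (40,30,20,10)
   and sV lanes (s3..s0) = (4,3,2,1), UNPCKLPS (xIsH == False) gives
   (2,20,1,10) and UNPCKHPS (xIsH == True) gives (4,40,3,30). */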
   10907 
   10908 
   10909 /* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
   10910 /* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
   10911 static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10912 {
   10913    IRTemp s1 = newTemp(Ity_I64);
   10914    IRTemp s0 = newTemp(Ity_I64);
   10915    IRTemp d1 = newTemp(Ity_I64);
   10916    IRTemp d0 = newTemp(Ity_I64);
   10917    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   10918    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   10919    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   10920    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   10921    IRTemp res = newTemp(Ity_V128);
   10922    assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
   10923                     : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
   10924    return res;
   10925 }
   10926 
   10927 
   10928 /* Does the maths for 256 bit versions of UNPCKLPD and UNPCKHPD.
   10929    Doesn't seem like this fits in either of the Iop_Interleave{LO,HI}
   10930    or the Iop_Cat{Odd,Even}Lanes idioms, hence just do it the stupid
   10931    way. */
   10932 static IRTemp math_UNPCKxPD_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10933 {
   10934    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10935    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10936    breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
   10937    breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
   10938    IRTemp res = newTemp(Ity_V256);
   10939    assign(res, xIsH
   10940                ? IRExpr_Qop(Iop_64x4toV256, mkexpr(s3), mkexpr(d3),
   10941                                             mkexpr(s1), mkexpr(d1))
   10942                : IRExpr_Qop(Iop_64x4toV256, mkexpr(s2), mkexpr(d2),
   10943                                             mkexpr(s0), mkexpr(d0)));
   10944    return res;
   10945 }
   10946 
   10947 
   10948 /* FIXME: this is really bad.  Surely can do something better here?
   10949    One observation is that the steering in the upper and lower 128 bit
   10950    halves is the same as with math_UNPCKxPS_128, so we simply split
   10951    into two halves, and use that.  Consequently any improvement in
   10952    math_UNPCKxPS_128 (probably, to use interleave-style primops)
   10953    benefits this too. */
   10954 static IRTemp math_UNPCKxPS_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
   10955 {
   10956    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10957    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10958    breakupV256toV128s( sV, &sVhi, &sVlo );
   10959    breakupV256toV128s( dV, &dVhi, &dVlo );
   10960    IRTemp rVhi = math_UNPCKxPS_128(sVhi, dVhi, xIsH);
   10961    IRTemp rVlo = math_UNPCKxPS_128(sVlo, dVlo, xIsH);
   10962    IRTemp rV   = newTemp(Ity_V256);
   10963    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   10964    return rV;
   10965 }
   10966 
   10967 
   10968 static IRTemp math_SHUFPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10969 {
   10970    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   10971    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   10972    vassert(imm8 < 256);
   10973 
   10974    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   10975    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   10976 
   10977 #  define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
   10978 #  define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   10979    IRTemp res = newTemp(Ity_V128);
   10980    assign(res,
   10981           mkV128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
   10982                          SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
   10983 #  undef SELD
   10984 #  undef SELS
   10985    return res;
   10986 }
   10987 
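/* Worked example for the above: with imm8 = 0x1B = 0b00011011 the
   selectors are (imm8>>6)&3 = 0, (imm8>>4)&3 = 1, (imm8>>2)&3 = 2 and
   (imm8>>0)&3 = 3, so the result, high lane first, is [ s0, s1, d2, d3 ]:
   the top two lanes always come from the source and the bottom two from
   the destination, per the SHUFPS definition. */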
   10988 
   10989 /* 256-bit SHUFPS appears to steer each of the 128-bit halves
   10990    identically.  Hence do the clueless thing and use math_SHUFPS_128
   10991    twice. */
   10992 static IRTemp math_SHUFPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   10993 {
   10994    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   10995    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   10996    breakupV256toV128s( sV, &sVhi, &sVlo );
   10997    breakupV256toV128s( dV, &dVhi, &dVlo );
   10998    IRTemp rVhi = math_SHUFPS_128(sVhi, dVhi, imm8);
   10999    IRTemp rVlo = math_SHUFPS_128(sVlo, dVlo, imm8);
   11000    IRTemp rV   = newTemp(Ity_V256);
   11001    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11002    return rV;
   11003 }
   11004 
   11005 
   11006 static IRTemp math_SHUFPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11007 {
   11008    IRTemp s1 = newTemp(Ity_I64);
   11009    IRTemp s0 = newTemp(Ity_I64);
   11010    IRTemp d1 = newTemp(Ity_I64);
   11011    IRTemp d0 = newTemp(Ity_I64);
   11012 
   11013    assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
   11014    assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
   11015    assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
   11016    assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
   11017 
   11018 #  define SELD(n) mkexpr((n)==0 ? d0 : d1)
   11019 #  define SELS(n) mkexpr((n)==0 ? s0 : s1)
   11020 
   11021    IRTemp res = newTemp(Ity_V128);
   11022    assign(res, binop( Iop_64HLtoV128,
   11023                       SELS((imm8>>1)&1), SELD((imm8>>0)&1) ) );
   11024 
   11025 #  undef SELD
   11026 #  undef SELS
   11027    return res;
   11028 }
   11029 
   11030 
   11031 static IRTemp math_SHUFPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11032 {
   11033    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11034    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11035    breakupV256toV128s( sV, &sVhi, &sVlo );
   11036    breakupV256toV128s( dV, &dVhi, &dVlo );
   11037    IRTemp rVhi = math_SHUFPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   11038    IRTemp rVlo = math_SHUFPD_128(sVlo, dVlo, imm8 & 3);
   11039    IRTemp rV   = newTemp(Ity_V256);
   11040    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11041    return rV;
   11042 }
   11043 
   11044 
   11045 static IRTemp math_BLENDPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11046 {
   11047    UShort imm8_mask_16;
   11048    IRTemp imm8_mask = newTemp(Ity_V128);
   11049 
   11050    switch( imm8 & 3 ) {
   11051       case 0:  imm8_mask_16 = 0x0000; break;
   11052       case 1:  imm8_mask_16 = 0x00FF; break;
   11053       case 2:  imm8_mask_16 = 0xFF00; break;
   11054       case 3:  imm8_mask_16 = 0xFFFF; break;
   11055       default: vassert(0);            break;
   11056    }
   11057    assign( imm8_mask, mkV128( imm8_mask_16 ) );
   11058 
   11059    IRTemp res = newTemp(Ity_V128);
   11060    assign ( res, binop( Iop_OrV128,
   11061                         binop( Iop_AndV128, mkexpr(sV),
   11062                                             mkexpr(imm8_mask) ),
   11063                         binop( Iop_AndV128, mkexpr(dV),
   11064                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   11065    return res;
   11066 }
   11067 
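/* Note, for the mask constants above: mkV128 expands each bit of its
   16-bit argument to one byte of the V128 constant, so 0x00FF selects
   the low 8 bytes (the low double) from sV, 0xFF00 the high double,
   and 0xFFFF both. */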
   11068 
   11069 static IRTemp math_BLENDPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11070 {
   11071    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11072    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11073    breakupV256toV128s( sV, &sVhi, &sVlo );
   11074    breakupV256toV128s( dV, &dVhi, &dVlo );
   11075    IRTemp rVhi = math_BLENDPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
   11076    IRTemp rVlo = math_BLENDPD_128(sVlo, dVlo, imm8 & 3);
   11077    IRTemp rV   = newTemp(Ity_V256);
   11078    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11079    return rV;
   11080 }
   11081 
   11082 
   11083 static IRTemp math_BLENDPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11084 {
   11085    UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
   11086                              0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
   11087                              0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
   11088                              0xFFFF };
   11089    IRTemp imm8_mask = newTemp(Ity_V128);
   11090    assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
   11091 
   11092    IRTemp res = newTemp(Ity_V128);
   11093    assign ( res, binop( Iop_OrV128,
   11094                         binop( Iop_AndV128, mkexpr(sV),
   11095                                             mkexpr(imm8_mask) ),
   11096                         binop( Iop_AndV128, mkexpr(dV),
   11097                                unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
   11098    return res;
   11099 }
   11100 
   11101 
   11102 static IRTemp math_BLENDPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11103 {
   11104    IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
   11105    IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
   11106    breakupV256toV128s( sV, &sVhi, &sVlo );
   11107    breakupV256toV128s( dV, &dVhi, &dVlo );
   11108    IRTemp rVhi = math_BLENDPS_128(sVhi, dVhi, (imm8 >> 4) & 15);
   11109    IRTemp rVlo = math_BLENDPS_128(sVlo, dVlo, imm8 & 15);
   11110    IRTemp rV   = newTemp(Ity_V256);
   11111    assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
   11112    return rV;
   11113 }
   11114 
   11115 
   11116 static IRTemp math_PBLENDW_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
   11117 {
   11118    /* Make imm16 be a 16-bit version of imm8, formed by duplicating
   11119       each bit in imm8. */
   11120    Int i;
   11121    UShort imm16 = 0;
   11122    for (i = 0; i < 8; i++) {
   11123       if (imm8 & (1 << i))
   11124          imm16 |= (3 << (2*i));
   11125    }
   11126    IRTemp imm16_mask = newTemp(Ity_V128);
   11127    assign( imm16_mask, mkV128( imm16 ));
   11128 
   11129    IRTemp res = newTemp(Ity_V128);
   11130    assign ( res, binop( Iop_OrV128,
   11131                         binop( Iop_AndV128, mkexpr(sV),
   11132                                             mkexpr(imm16_mask) ),
   11133                         binop( Iop_AndV128, mkexpr(dV),
   11134                                unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
   11135    return res;
   11136 }
   11137 
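/* Worked example for the above: imm8 = 0xA5 = 0b10100101 yields
   imm16 = 0xCC33 = 0b1100110000110011, so word lanes 0, 2, 5 and 7 are
   taken from sV and the remaining lanes from dV. */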
   11138 
   11139 static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV )
   11140 {
   11141    /* This is a really poor translation -- could be improved if
   11142       performance critical */
   11143    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11144    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11145    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11146    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11147    IRTemp res = newTemp(Ity_V128);
   11148    assign(res, binop(Iop_64HLtoV128,
   11149                      binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)),
   11150                      binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ));
   11151    return res;
   11152 }
   11153 
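/* In scalar terms the above computes
      result = { (ULong)d2 * (ULong)s2, (ULong)d0 * (ULong)s0 }
   -- PMULUDQ uses only the even-numbered 32-bit lanes, and Iop_MullU32
   is the widening unsigned 32x32->64 multiply. */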
   11154 
   11155 static IRTemp math_PMULUDQ_256 ( IRTemp sV, IRTemp dV )
   11156 {
   11157    /* This is a really poor translation -- could be improved if
   11158       performance critical */
   11159    IRTemp sHi, sLo, dHi, dLo;
   11160    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11161    breakupV256toV128s( dV, &dHi, &dLo);
   11162    breakupV256toV128s( sV, &sHi, &sLo);
   11163    IRTemp res = newTemp(Ity_V256);
   11164    assign(res, binop(Iop_V128HLtoV256,
   11165                      mkexpr(math_PMULUDQ_128(sHi, dHi)),
   11166                      mkexpr(math_PMULUDQ_128(sLo, dLo))));
   11167    return res;
   11168 }
   11169 
   11170 
   11171 static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV )
   11172 {
   11173    /* This is a really poor translation -- could be improved if
   11174       performance critical */
   11175    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   11176    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   11177    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   11178    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11179    IRTemp res = newTemp(Ity_V128);
   11180    assign(res, binop(Iop_64HLtoV128,
   11181                      binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)),
   11182                      binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) ));
   11183    return res;
   11184 }
   11185 
   11186 
   11187 static IRTemp math_PMULDQ_256 ( IRTemp sV, IRTemp dV )
   11188 {
   11189    /* This is a really poor translation -- could be improved if
   11190       performance critical */
   11191    IRTemp sHi, sLo, dHi, dLo;
   11192    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11193    breakupV256toV128s( dV, &dHi, &dLo);
   11194    breakupV256toV128s( sV, &sHi, &sLo);
   11195    IRTemp res = newTemp(Ity_V256);
   11196    assign(res, binop(Iop_V128HLtoV256,
   11197                      mkexpr(math_PMULDQ_128(sHi, dHi)),
   11198                      mkexpr(math_PMULDQ_128(sLo, dLo))));
   11199    return res;
   11200 }
   11201 
   11202 
   11203 static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV )
   11204 {
   11205    IRTemp sVhi, sVlo, dVhi, dVlo;
   11206    IRTemp resHi = newTemp(Ity_I64);
   11207    IRTemp resLo = newTemp(Ity_I64);
   11208    sVhi = sVlo = dVhi = dVlo = IRTemp_INVALID;
   11209    breakupV128to64s( sV, &sVhi, &sVlo );
   11210    breakupV128to64s( dV, &dVhi, &dVlo );
   11211    assign( resHi, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   11212                                 "amd64g_calculate_mmx_pmaddwd",
   11213                                 &amd64g_calculate_mmx_pmaddwd,
   11214                                 mkIRExprVec_2( mkexpr(sVhi), mkexpr(dVhi))));
   11215    assign( resLo, mkIRExprCCall(Ity_I64, 0/*regparms*/,
   11216                                 "amd64g_calculate_mmx_pmaddwd",
   11217                                 &amd64g_calculate_mmx_pmaddwd,
   11218                                 mkIRExprVec_2( mkexpr(sVlo), mkexpr(dVlo))));
   11219    IRTemp res = newTemp(Ity_V128);
   11220    assign( res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)));
   11221    return res;
   11222 }
   11223 
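/* The helper is assumed to implement the usual PMADDWD semantics on
   each 64-bit half: multiply corresponding signed 16-bit lanes and add
   adjacent pairs of the 32-bit products, e.g.
      res32[0] = s16[0]*d16[0] + s16[1]*d16[1]. */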
   11224 
   11225 static IRTemp math_PMADDWD_256 ( IRTemp dV, IRTemp sV )
   11226 {
   11227    IRTemp sHi, sLo, dHi, dLo;
   11228    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   11229    breakupV256toV128s( dV, &dHi, &dLo);
   11230    breakupV256toV128s( sV, &sHi, &sLo);
   11231    IRTemp res = newTemp(Ity_V256);
   11232    assign(res, binop(Iop_V128HLtoV256,
   11233                      mkexpr(math_PMADDWD_128(dHi, sHi)),
   11234                      mkexpr(math_PMADDWD_128(dLo, sLo))));
   11235    return res;
   11236 }
   11237 
   11238 
   11239 static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV )
   11240 {
   11241    IRTemp addV = newTemp(Ity_V128);
   11242    IRTemp subV = newTemp(Ity_V128);
   11243    IRTemp a1   = newTemp(Ity_I64);
   11244    IRTemp s0   = newTemp(Ity_I64);
   11245    IRTemp rm   = newTemp(Ity_I32);
   11246 
   11247    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11248    assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11249    assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11250 
   11251    assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
   11252    assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
   11253 
   11254    IRTemp res = newTemp(Ity_V128);
   11255    assign( res, binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
   11256    return res;
   11257 }
   11258 
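/* That is, result = { d1 + s1, d0 - s0 }: ADDSUBPD subtracts in the
   low (even) lane and adds in the high (odd) lane. */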
   11259 
   11260 static IRTemp math_ADDSUBPD_256 ( IRTemp dV, IRTemp sV )
   11261 {
   11262    IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11263    IRTemp addV = newTemp(Ity_V256);
   11264    IRTemp subV = newTemp(Ity_V256);
   11265    IRTemp rm   = newTemp(Ity_I32);
   11266    a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11267 
   11268    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11269    assign( addV, triop(Iop_Add64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11270    assign( subV, triop(Iop_Sub64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11271 
   11272    breakupV256to64s( addV, &a3, &a2, &a1, &a0 );
   11273    breakupV256to64s( subV, &s3, &s2, &s1, &s0 );
   11274 
   11275    IRTemp res = newTemp(Ity_V256);
   11276    assign( res, mkV256from64s( a3, s2, a1, s0 ) );
   11277    return res;
   11278 }
   11279 
   11280 
   11281 static IRTemp math_ADDSUBPS_128 ( IRTemp dV, IRTemp sV )
   11282 {
   11283    IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
   11284    IRTemp addV = newTemp(Ity_V128);
   11285    IRTemp subV = newTemp(Ity_V128);
   11286    IRTemp rm   = newTemp(Ity_I32);
   11287    a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11288 
   11289    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11290    assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11291    assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11292 
   11293    breakupV128to32s( addV, &a3, &a2, &a1, &a0 );
   11294    breakupV128to32s( subV, &s3, &s2, &s1, &s0 );
   11295 
   11296    IRTemp res = newTemp(Ity_V128);
   11297    assign( res, mkV128from32s( a3, s2, a1, s0 ) );
   11298    return res;
   11299 }
   11300 
   11301 
   11302 static IRTemp math_ADDSUBPS_256 ( IRTemp dV, IRTemp sV )
   11303 {
   11304    IRTemp a7, a6, a5, a4, a3, a2, a1, a0;
   11305    IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   11306    IRTemp addV = newTemp(Ity_V256);
   11307    IRTemp subV = newTemp(Ity_V256);
   11308    IRTemp rm   = newTemp(Ity_I32);
   11309    a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID;
   11310    s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11311 
   11312    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   11313    assign( addV, triop(Iop_Add32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11314    assign( subV, triop(Iop_Sub32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
   11315 
   11316    breakupV256to32s( addV, &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0 );
   11317    breakupV256to32s( subV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   11318 
   11319    IRTemp res = newTemp(Ity_V256);
   11320    assign( res, mkV256from32s( a7, s6, a5, s4, a3, s2, a1, s0 ) );
   11321    return res;
   11322 }
   11323 
   11324 
   11325 /* Handle 128 bit PSHUFLW and PSHUFHW. */
   11326 static Long dis_PSHUFxW_128 ( const VexAbiInfo* vbi, Prefix pfx,
   11327                               Long delta, Bool isAvx, Bool xIsH )
   11328 {
   11329    IRTemp addr  = IRTemp_INVALID;
   11330    Int    alen  = 0;
   11331    HChar  dis_buf[50];
   11332    UChar  modrm = getUChar(delta);
   11333    UInt   rG = gregOfRexRM(pfx,modrm);
   11334    UInt   imm8;
   11335    IRTemp sVmut, dVmut, sVcon, sV, dV, s3, s2, s1, s0;
   11336    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11337    sV    = newTemp(Ity_V128);
   11338    dV    = newTemp(Ity_V128);
   11339    sVmut = newTemp(Ity_I64);
   11340    dVmut = newTemp(Ity_I64);
   11341    sVcon = newTemp(Ity_I64);
   11342    if (epartIsReg(modrm)) {
   11343       UInt rE = eregOfRexRM(pfx,modrm);
   11344       assign( sV, getXMMReg(rE) );
   11345       imm8 = (UInt)getUChar(delta+1);
   11346       delta += 1+1;
   11347       DIP("%spshuf%cw $%u,%s,%s\n",
   11348           isAvx ? "v" : "", xIsH ? 'h' : 'l',
   11349           imm8, nameXMMReg(rE), nameXMMReg(rG));
   11350    } else {
   11351       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   11352       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   11353       imm8 = (UInt)getUChar(delta+alen);
   11354       delta += alen+1;
   11355       DIP("%spshuf%cw $%u,%s,%s\n",
   11356           isAvx ? "v" : "", xIsH ? 'h' : 'l',
   11357           imm8, dis_buf, nameXMMReg(rG));
   11358    }
   11359 
   11360    /* Get the to-be-changed (mut) and unchanging (con) bits of the
   11361       source. */
   11362    assign( sVmut, unop(xIsH ? Iop_V128HIto64 : Iop_V128to64,   mkexpr(sV)) );
   11363    assign( sVcon, unop(xIsH ? Iop_V128to64   : Iop_V128HIto64, mkexpr(sV)) );
   11364 
   11365    breakup64to16s( sVmut, &s3, &s2, &s1, &s0 );
   11366 #  define SEL(n) \
   11367              ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   11368    assign(dVmut, mk64from16s( SEL((imm8>>6)&3), SEL((imm8>>4)&3),
   11369                               SEL((imm8>>2)&3), SEL((imm8>>0)&3) ));
   11370 #  undef SEL
   11371 
   11372    assign(dV, xIsH ? binop(Iop_64HLtoV128, mkexpr(dVmut), mkexpr(sVcon))
   11373                    : binop(Iop_64HLtoV128, mkexpr(sVcon), mkexpr(dVmut)) );
   11374 
   11375    (isAvx ? putYMMRegLoAndZU : putXMMReg)(rG, mkexpr(dV));
   11376    return delta;
   11377 }
   11378 
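/* Worked example: pshuflw $0x1B,... decodes the selectors as 0,1,2,3
   (highest word first), giving dVmut = [ s0, s1, s2, s3 ] -- that is,
   the four low 16-bit words reversed -- while the high 64 bits pass
   through unchanged. */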
   11379 
   11380 /* Handle 256 bit PSHUFLW and PSHUFHW. */
   11381 static Long dis_PSHUFxW_256 ( const VexAbiInfo* vbi, Prefix pfx,
   11382                               Long delta, Bool xIsH )
   11383 {
   11384    IRTemp addr  = IRTemp_INVALID;
   11385    Int    alen  = 0;
   11386    HChar  dis_buf[50];
   11387    UChar  modrm = getUChar(delta);
   11388    UInt   rG = gregOfRexRM(pfx,modrm);
   11389    UInt   imm8;
   11390    IRTemp sV, s[8], sV64[4], dVhi, dVlo;
   11391    sV64[3] = sV64[2] = sV64[1] = sV64[0] = IRTemp_INVALID;
   11392    s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   11393    sV    = newTemp(Ity_V256);
   11394    dVhi  = newTemp(Ity_I64);
   11395    dVlo  = newTemp(Ity_I64);
   11396    if (epartIsReg(modrm)) {
   11397       UInt rE = eregOfRexRM(pfx,modrm);
   11398       assign( sV, getYMMReg(rE) );
   11399       imm8 = (UInt)getUChar(delta+1);
   11400       delta += 1+1;
   11401       DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
   11402           imm8, nameYMMReg(rE), nameYMMReg(rG));
   11403    } else {
   11404       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   11405       assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   11406       imm8 = (UInt)getUChar(delta+alen);
   11407       delta += alen+1;
   11408       DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
   11409           imm8, dis_buf, nameYMMReg(rG));
   11410    }
   11411 
   11412    breakupV256to64s( sV, &sV64[3], &sV64[2], &sV64[1], &sV64[0] );
   11413    breakup64to16s( sV64[xIsH ? 3 : 2], &s[7], &s[6], &s[5], &s[4] );
   11414    breakup64to16s( sV64[xIsH ? 1 : 0], &s[3], &s[2], &s[1], &s[0] );
   11415 
   11416    assign( dVhi, mk64from16s( s[4 + ((imm8>>6)&3)], s[4 + ((imm8>>4)&3)],
   11417                               s[4 + ((imm8>>2)&3)], s[4 + ((imm8>>0)&3)] ) );
   11418    assign( dVlo, mk64from16s( s[0 + ((imm8>>6)&3)], s[0 + ((imm8>>4)&3)],
   11419                               s[0 + ((imm8>>2)&3)], s[0 + ((imm8>>0)&3)] ) );
   11420    putYMMReg( rG, mkV256from64s( xIsH ? dVhi : sV64[3],
   11421                                  xIsH ? sV64[2] : dVhi,
   11422                                  xIsH ? dVlo : sV64[1],
   11423                                  xIsH ? sV64[0] : dVlo ) );
   11424    return delta;
   11425 }
   11426 
   11427 
   11428 static Long dis_PEXTRW_128_EregOnly_toG ( const VexAbiInfo* vbi, Prefix pfx,
   11429                                           Long delta, Bool isAvx )
   11430 {
   11431    Long   deltaIN = delta;
   11432    UChar  modrm   = getUChar(delta);
   11433    UInt   rG      = gregOfRexRM(pfx,modrm);
   11434    IRTemp sV      = newTemp(Ity_V128);
   11435    IRTemp d16     = newTemp(Ity_I16);
   11436    UInt   imm8;
   11437    IRTemp s0, s1, s2, s3;
   11438    if (epartIsReg(modrm)) {
   11439       UInt rE = eregOfRexRM(pfx,modrm);
   11440       assign(sV, getXMMReg(rE));
   11441       imm8 = getUChar(delta+1) & 7;
   11442       delta += 1+1;
   11443       DIP("%spextrw $%u,%s,%s\n", isAvx ? "v" : "",
   11444           imm8, nameXMMReg(rE), nameIReg32(rG));
   11445    } else {
   11446       /* The memory case is disallowed, apparently. */
   11447       return deltaIN; /* FAIL */
   11448    }
   11449    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   11450    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   11451    switch (imm8) {
   11452       case 0:  assign(d16, unop(Iop_32to16,   mkexpr(s0))); break;
   11453       case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(s0))); break;
   11454       case 2:  assign(d16, unop(Iop_32to16,   mkexpr(s1))); break;
   11455       case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(s1))); break;
   11456       case 4:  assign(d16, unop(Iop_32to16,   mkexpr(s2))); break;
   11457       case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(s2))); break;
   11458       case 6:  assign(d16, unop(Iop_32to16,   mkexpr(s3))); break;
   11459       case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(s3))); break;
   11460       default: vassert(0);
   11461    }
   11462    putIReg32(rG, unop(Iop_16Uto32, mkexpr(d16)));
   11463    return delta;
   11464 }
   11465 
   11466 
   11467 static Long dis_CVTDQ2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   11468                                Long delta, Bool isAvx )
   11469 {
   11470    IRTemp addr  = IRTemp_INVALID;
   11471    Int    alen  = 0;
   11472    HChar  dis_buf[50];
   11473    UChar  modrm = getUChar(delta);
   11474    IRTemp arg64 = newTemp(Ity_I64);
   11475    UInt   rG    = gregOfRexRM(pfx,modrm);
   11476    const HChar* mbV   = isAvx ? "v" : "";
   11477    if (epartIsReg(modrm)) {
   11478       UInt rE = eregOfRexRM(pfx,modrm);
   11479       assign( arg64, getXMMRegLane64(rE, 0) );
   11480       delta += 1;
   11481       DIP("%scvtdq2pd %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   11482    } else {
   11483       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11484       assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
   11485       delta += alen;
   11486       DIP("%scvtdq2pd %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   11487    }
   11488    putXMMRegLane64F(
   11489       rG, 0,
   11490       unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
   11491    );
   11492    putXMMRegLane64F(
   11493       rG, 1,
   11494       unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
   11495    );
   11496    if (isAvx)
   11497       putYMMRegLane128(rG, 1, mkV128(0));
   11498    return delta;
   11499 }
   11500 
   11501 
   11502 static Long dis_STMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
   11503                           Long delta, Bool isAvx )
   11504 {
   11505    IRTemp addr  = IRTemp_INVALID;
   11506    Int    alen  = 0;
   11507    HChar  dis_buf[50];
   11508    UChar  modrm = getUChar(delta);
   11509    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11510    vassert(gregOfRexRM(pfx,modrm) == 3); /* ditto */
   11511 
   11512    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11513    delta += alen;
   11514 
   11515    /* Fake up a native SSE mxcsr word.  The only thing it depends on
   11516       is SSEROUND[1:0], so call a clean helper to cook it up.
   11517    */
   11518    /* ULong amd64g_create_mxcsr ( ULong sseround ) */
   11519    DIP("%sstmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
   11520    storeLE(
   11521       mkexpr(addr),
   11522       unop(Iop_64to32,
   11523            mkIRExprCCall(
   11524               Ity_I64, 0/*regp*/,
   11525               "amd64g_create_mxcsr", &amd64g_create_mxcsr,
   11526               mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
   11527            )
   11528       )
   11529    );
   11530    return delta;
   11531 }
   11532 
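/* Presumably the helper yields the power-on mxcsr image 0x1F80 (all
   exceptions masked) with SSEROUND placed in the RC field, bits 14:13;
   that is an assumption about amd64g_create_mxcsr, which is defined
   elsewhere. */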
   11533 
   11534 static Long dis_LDMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
   11535                           Long delta, Bool isAvx )
   11536 {
   11537    IRTemp addr  = IRTemp_INVALID;
   11538    Int    alen  = 0;
   11539    HChar  dis_buf[50];
   11540    UChar  modrm = getUChar(delta);
   11541    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11542    vassert(gregOfRexRM(pfx,modrm) == 2); /* ditto */
   11543 
   11544    IRTemp t64 = newTemp(Ity_I64);
   11545    IRTemp ew  = newTemp(Ity_I32);
   11546 
   11547    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11548    delta += alen;
   11549    DIP("%sldmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
   11550 
   11551    /* The only thing we observe in %mxcsr is the rounding mode.
   11552       Therefore, pass the 32-bit value (SSE native-format control
   11553       word) to a clean helper, getting back a 64-bit value, the
   11554       lower half of which is the SSEROUND value to store, and the
   11555       upper half of which is the emulation-warning token which may
   11556       be generated.
   11557    */
   11558    /* ULong amd64g_check_ldmxcsr ( ULong ); */
   11559    assign( t64, mkIRExprCCall(
   11560                    Ity_I64, 0/*regparms*/,
   11561                    "amd64g_check_ldmxcsr",
   11562                    &amd64g_check_ldmxcsr,
   11563                    mkIRExprVec_1(
   11564                       unop(Iop_32Uto64,
   11565                            loadLE(Ity_I32, mkexpr(addr))
   11566                       )
   11567                    )
   11568                 )
   11569          );
   11570 
   11571    put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
   11572    assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
   11573    put_emwarn( mkexpr(ew) );
   11574    /* Finally, if an emulation warning was reported, side-exit to
   11575       the next insn, reporting the warning, so that Valgrind's
   11576       dispatcher sees the warning. */
   11577    stmt(
   11578       IRStmt_Exit(
   11579          binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
   11580          Ijk_EmWarn,
   11581          IRConst_U64(guest_RIP_bbstart+delta),
   11582          OFFB_RIP
   11583       )
   11584    );
   11585    return delta;
   11586 }
   11587 
   11588 
   11589 static void gen_XSAVE_SEQUENCE ( IRTemp addr, IRTemp rfbm )
   11590 {
   11591    /* ------ rfbm[0] gates the x87 state ------ */
   11592 
   11593    /* Uses dirty helper:
   11594          void amd64g_do_XSAVE_COMPONENT_0 ( VexGuestAMD64State*, ULong )
   11595    */
   11596    IRDirty* d0 = unsafeIRDirty_0_N (
   11597                     0/*regparms*/,
   11598                     "amd64g_dirtyhelper_XSAVE_COMPONENT_0",
   11599                     &amd64g_dirtyhelper_XSAVE_COMPONENT_0,
   11600                     mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   11601                  );
   11602    d0->guard = binop(Iop_CmpEQ64, binop(Iop_And64, mkexpr(rfbm), mkU64(1)),
   11603                      mkU64(1));
   11604 
   11605    /* Declare we're writing memory.  Really, bytes 24 through 31
   11606       (MXCSR and MXCSR_MASK) aren't written, but we can't express more
   11607       than 1 memory area here, so just mark the whole thing as
   11608       written. */
   11609    d0->mFx   = Ifx_Write;
   11610    d0->mAddr = mkexpr(addr);
   11611    d0->mSize = 160;
   11612 
   11613    /* declare we're reading guest state */
   11614    d0->nFxState = 5;
   11615    vex_bzero(&d0->fxState, sizeof(d0->fxState));
   11616 
   11617    d0->fxState[0].fx     = Ifx_Read;
   11618    d0->fxState[0].offset = OFFB_FTOP;
   11619    d0->fxState[0].size   = sizeof(UInt);
   11620 
   11621    d0->fxState[1].fx     = Ifx_Read;
   11622    d0->fxState[1].offset = OFFB_FPREGS;
   11623    d0->fxState[1].size   = 8 * sizeof(ULong);
   11624 
   11625    d0->fxState[2].fx     = Ifx_Read;
   11626    d0->fxState[2].offset = OFFB_FPTAGS;
   11627    d0->fxState[2].size   = 8 * sizeof(UChar);
   11628 
   11629    d0->fxState[3].fx     = Ifx_Read;
   11630    d0->fxState[3].offset = OFFB_FPROUND;
   11631    d0->fxState[3].size   = sizeof(ULong);
   11632 
   11633    d0->fxState[4].fx     = Ifx_Read;
   11634    d0->fxState[4].offset = OFFB_FC3210;
   11635    d0->fxState[4].size   = sizeof(ULong);
   11636 
   11637    stmt( IRStmt_Dirty(d0) );
   11638 
   11639    /* ------ rfbm[1] gates the SSE state ------ */
   11640 
   11641    IRTemp rfbm_1    = newTemp(Ity_I64);
   11642    IRTemp rfbm_1or2 = newTemp(Ity_I64);
   11643    assign(rfbm_1,    binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
   11644    assign(rfbm_1or2, binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
   11645 
   11646    IRExpr* guard_1    = binop(Iop_CmpEQ64, mkexpr(rfbm_1),    mkU64(2));
   11647    IRExpr* guard_1or2 = binop(Iop_CmpNE64, mkexpr(rfbm_1or2), mkU64(0));
   11648 
   11649    /* Uses dirty helper:
   11650          void amd64g_do_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
   11651                  ( VexGuestAMD64State*, ULong )
   11652       This creates only MXCSR and MXCSR_MASK.  We need to do this if
   11653       either components 1 (SSE) or 2 (AVX) are requested.  Hence the
   11654       guard condition is a bit more complex.
   11655    */
   11656    IRDirty* d1 = unsafeIRDirty_0_N (
   11657                     0/*regparms*/,
   11658                     "amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS",
   11659                     &amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS,
   11660                     mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   11661                  );
   11662    d1->guard = guard_1or2;
   11663 
   11664    /* Declare we're writing memory: MXCSR and MXCSR_MASK.  Note that
   11665       the code for rfbm[0] just above claims a write of 0 .. 159, so
   11666       this duplicates it.  But it at least correctly connects 24 .. 31 to
   11667       the MXCSR guest state representation (SSEROUND field). */
   11668    d1->mFx   = Ifx_Write;
   11669    d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
   11670    d1->mSize = 8;
   11671 
   11672    /* declare we're reading guest state */
   11673    d1->nFxState = 1;
   11674    vex_bzero(&d1->fxState, sizeof(d1->fxState));
   11675 
   11676    d1->fxState[0].fx     = Ifx_Read;
   11677    d1->fxState[0].offset = OFFB_SSEROUND;
   11678    d1->fxState[0].size   = sizeof(ULong);
   11679 
   11680    /* Call the helper.  This creates MXCSR and MXCSR_MASK but nothing
   11681       else.  We do the actual register array, XMM[0..15], separately,
   11682       in order that any undefinedness in the XMM registers is tracked
   11683       separately by Memcheck and does not "infect" the in-memory
   11684       shadow for the other parts of the image. */
   11685    stmt( IRStmt_Dirty(d1) );
   11686 
   11687    /* And now the XMMs themselves. */
   11688    UInt reg;
   11689    for (reg = 0; reg < 16; reg++) {
   11690       stmt( IRStmt_StoreG(
   11691                Iend_LE,
   11692                binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16)),
   11693                getXMMReg(reg),
   11694                guard_1
   11695       ));
   11696    }
   11697 
   11698    /* ------ rfbm[2] gates the AVX state ------ */
   11699    /* Component 2 is just a bunch of register saves, so we'll do it
   11700       inline, just to be simple and to be Memcheck friendly. */
   11701 
   11702    IRTemp rfbm_2 = newTemp(Ity_I64);
   11703    assign(rfbm_2, binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
   11704 
   11705    IRExpr* guard_2 = binop(Iop_CmpEQ64, mkexpr(rfbm_2), mkU64(4));
   11706 
   11707    for (reg = 0; reg < 16; reg++) {
   11708       stmt( IRStmt_StoreG(
   11709                Iend_LE,
   11710                binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16)),
   11711                getYMMRegLane128(reg,1),
   11712                guard_2
   11713       ));
   11714    }
   11715 }
   11716 
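/* For reference, the offsets used above follow the standard
   (non-compacted) XSAVE layout:
        0 ..  23   x87 control/status fields
       24 ..  31   MXCSR and MXCSR_MASK
       32 .. 159   x87/MMX register area (8 regs x 16 bytes)
      160 .. 415   XMM0..XMM15 (16 regs x 16 bytes)
      512 .. 575   XSAVE header (XSTATE_BV at 512, XCOMP_BV at 520)
      576 .. 831   upper 128-bit halves of YMM0..YMM15 */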
   11717 
   11718 static Long dis_XSAVE ( const VexAbiInfo* vbi,
   11719                         Prefix pfx, Long delta, Int sz )
   11720 {
   11721    /* Note that the presence or absence of REX.W (indicated here by
   11722       |sz|) slightly affects the written format: whether the saved FPU
   11723       IP and DP pointers are 64 or 32 bits.  But the helper function
   11724       we call simply writes zero bits in the relevant fields, which
   11725       are 64 bits regardless of what REX.W is, and so it's good enough
   11726       (iow, equally broken) in both cases. */
   11727    IRTemp addr  = IRTemp_INVALID;
   11728    Int    alen  = 0;
   11729    HChar  dis_buf[50];
   11730    UChar  modrm = getUChar(delta);
   11731    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11732    vassert(sz == 4 || sz == 8); /* ditto */
   11733 
   11734    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11735    delta += alen;
   11736    gen_SEGV_if_not_64_aligned(addr);
   11737 
   11738    DIP("%sxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
   11739 
   11740    /* VEX's caller is assumed to have checked this. */
   11741    const ULong aSSUMED_XCR0_VALUE = 7;
   11742 
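   /* rfbm ("requested-feature bitmap") = (EDX:EAX) & XCR0. */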
   11743    IRTemp rfbm = newTemp(Ity_I64);
   11744    assign(rfbm,
   11745           binop(Iop_And64,
   11746                 binop(Iop_Or64,
   11747                       binop(Iop_Shl64,
   11748                             unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
   11749                       unop(Iop_32Uto64, getIRegRAX(4))),
   11750                 mkU64(aSSUMED_XCR0_VALUE)));
   11751 
   11752    gen_XSAVE_SEQUENCE(addr, rfbm);
   11753 
   11754    /* Finally, we need to update XSTATE_BV in the XSAVE header area, by
   11755       OR-ing the RFBM value into it. */
   11756    IRTemp addr_plus_512 = newTemp(Ity_I64);
   11757    assign(addr_plus_512, binop(Iop_Add64, mkexpr(addr), mkU64(512)));
   11758    storeLE( mkexpr(addr_plus_512),
   11759             binop(Iop_Or8,
   11760                   unop(Iop_64to8, mkexpr(rfbm)),
   11761                   loadLE(Ity_I8, mkexpr(addr_plus_512))) );
   11762 
   11763    return delta;
   11764 }
   11765 
   11766 
   11767 static Long dis_FXSAVE ( const VexAbiInfo* vbi,
   11768                          Prefix pfx, Long delta, Int sz )
   11769 {
   11770    /* See comment in dis_XSAVE about the significance of REX.W. */
   11771    IRTemp addr  = IRTemp_INVALID;
   11772    Int    alen  = 0;
   11773    HChar  dis_buf[50];
   11774    UChar  modrm = getUChar(delta);
   11775    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11776    vassert(sz == 4 || sz == 8); /* ditto */
   11777 
   11778    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11779    delta += alen;
   11780    gen_SEGV_if_not_16_aligned(addr);
   11781 
   11782    DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
   11783 
   11784    /* FXSAVE is just XSAVE with components 0 and 1 selected.  Set rfbm
   11785       to 0b011, generate the XSAVE sequence accordingly, and let iropt
   11786       fold out the unused (AVX) parts accordingly. */
   11787    IRTemp rfbm = newTemp(Ity_I64);
   11788    assign(rfbm, mkU64(3));
   11789    gen_XSAVE_SEQUENCE(addr, rfbm);
   11790 
   11791    return delta;
   11792 }
   11793 
   11794 
   11795 static void gen_XRSTOR_SEQUENCE ( IRTemp addr, IRTemp xstate_bv, IRTemp rfbm )
   11796 {
   11797    /* ------ rfbm[0] gates the x87 state ------ */
   11798 
   11799    /* If rfbm[0] == 1, we have to write the x87 state.  If
   11800       xstate_bv[0] == 1, we will read it from the memory image, else
   11801       we'll set it to initial values.  Doing this with a helper
   11802       function and getting the definedness flow annotations correct is
   11803       too difficult, so generate stupid but simple code: first set the
   11804       registers to initial values, regardless of xstate_bv[0].  Then,
   11805       conditionally restore from the memory image. */
   11806 
   11807    IRTemp rfbm_0       = newTemp(Ity_I64);
   11808    IRTemp xstate_bv_0  = newTemp(Ity_I64);
   11809    IRTemp restore_0    = newTemp(Ity_I64);
   11810    assign(rfbm_0,      binop(Iop_And64, mkexpr(rfbm), mkU64(1)));
   11811    assign(xstate_bv_0, binop(Iop_And64, mkexpr(xstate_bv), mkU64(1)));
   11812    assign(restore_0,   binop(Iop_And64, mkexpr(rfbm_0), mkexpr(xstate_bv_0)));
   11813 
   11814    gen_FINIT_SEQUENCE( binop(Iop_CmpNE64, mkexpr(rfbm_0), mkU64(0)) );
   11815 
   11816    /* Uses dirty helper:
   11817          void amd64g_do_XRSTOR_COMPONENT_0 ( VexGuestAMD64State*, ULong )
   11818    */
   11819    IRDirty* d0 = unsafeIRDirty_0_N (
   11820                     0/*regparms*/,
   11821                     "amd64g_dirtyhelper_XRSTOR_COMPONENT_0",
   11822                     &amd64g_dirtyhelper_XRSTOR_COMPONENT_0,
   11823                     mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   11824                  );
   11825    d0->guard = binop(Iop_CmpNE64, mkexpr(restore_0), mkU64(0));
   11826 
   11827    /* Declare we're reading memory.  Really, bytes 24 through 31
   11828       (MXCSR and MXCSR_MASK) aren't read, but we can't express more
   11829       than 1 memory area here, so just mark the whole thing as
   11830       read. */
   11831    d0->mFx   = Ifx_Read;
   11832    d0->mAddr = mkexpr(addr);
   11833    d0->mSize = 160;
   11834 
   11835    /* declare we're writing guest state */
   11836    d0->nFxState = 5;
   11837    vex_bzero(&d0->fxState, sizeof(d0->fxState));
   11838 
   11839    d0->fxState[0].fx     = Ifx_Write;
   11840    d0->fxState[0].offset = OFFB_FTOP;
   11841    d0->fxState[0].size   = sizeof(UInt);
   11842 
   11843    d0->fxState[1].fx     = Ifx_Write;
   11844    d0->fxState[1].offset = OFFB_FPREGS;
   11845    d0->fxState[1].size   = 8 * sizeof(ULong);
   11846 
   11847    d0->fxState[2].fx     = Ifx_Write;
   11848    d0->fxState[2].offset = OFFB_FPTAGS;
   11849    d0->fxState[2].size   = 8 * sizeof(UChar);
   11850 
   11851    d0->fxState[3].fx     = Ifx_Write;
   11852    d0->fxState[3].offset = OFFB_FPROUND;
   11853    d0->fxState[3].size   = sizeof(ULong);
   11854 
   11855    d0->fxState[4].fx     = Ifx_Write;
   11856    d0->fxState[4].offset = OFFB_FC3210;
   11857    d0->fxState[4].size   = sizeof(ULong);
   11858 
   11859    stmt( IRStmt_Dirty(d0) );
   11860 
   11861    /* ------ rfbm[1] gates the SSE state ------ */
   11862 
   11863    /* Same scheme as component 0: first zero it out, and then possibly
   11864       restore from the memory area. */
   11865    IRTemp rfbm_1       = newTemp(Ity_I64);
   11866    IRTemp xstate_bv_1  = newTemp(Ity_I64);
   11867    IRTemp restore_1    = newTemp(Ity_I64);
   11868    assign(rfbm_1,      binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
   11869    assign(xstate_bv_1, binop(Iop_And64, mkexpr(xstate_bv), mkU64(2)));
   11870    assign(restore_1,   binop(Iop_And64, mkexpr(rfbm_1), mkexpr(xstate_bv_1)));
   11871    IRExpr* rfbm_1e     = binop(Iop_CmpNE64, mkexpr(rfbm_1),    mkU64(0));
   11872    IRExpr* restore_1e  = binop(Iop_CmpNE64, mkexpr(restore_1), mkU64(0));
   11873 
   11874    IRTemp rfbm_1or2       = newTemp(Ity_I64);
   11875    IRTemp xstate_bv_1or2  = newTemp(Ity_I64);
   11876    IRTemp restore_1or2    = newTemp(Ity_I64);
   11877    assign(rfbm_1or2,      binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
   11878    assign(xstate_bv_1or2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(6)));
   11879    assign(restore_1or2,   binop(Iop_And64, mkexpr(rfbm_1or2),
   11880                                            mkexpr(xstate_bv_1or2)));
   11881    IRExpr* rfbm_1or2e     = binop(Iop_CmpNE64, mkexpr(rfbm_1or2),    mkU64(0));
   11882    IRExpr* restore_1or2e  = binop(Iop_CmpNE64, mkexpr(restore_1or2), mkU64(0));
   11883 
   11884    /* The areas in question are: SSEROUND, and the XMM register array. */
   11885    putGuarded(OFFB_SSEROUND, rfbm_1or2e, mkU64(Irrm_NEAREST));
   11886 
   11887    UInt reg;
   11888    for (reg = 0; reg < 16; reg++) {
   11889       putGuarded(xmmGuestRegOffset(reg), rfbm_1e, mkV128(0));
   11890    }
   11891 
   11892    /* And now possibly restore from MXCSR/MXCSR_MASK */
   11893    /* Uses dirty helper:
   11894          void amd64g_do_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
   11895                  ( VexGuestAMD64State*, ULong )
   11896       This restores from only MXCSR and MXCSR_MASK.  We need to do
   11897       this if either components 1 (SSE) or 2 (AVX) are requested.
   11898       Hence the guard condition is a bit more complex.
   11899    */
   11900    IRDirty* d1 = unsafeIRDirty_0_N (
   11901                     0/*regparms*/,
   11902                     "amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS",
   11903                     &amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS,
   11904                     mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
   11905                  );
   11906    d1->guard = restore_1or2e;
   11907 
   11908    /* Declare we're reading memory: MXCSR and MXCSR_MASK.  Note that
   11909       the code for rfbm[0] just above claims a read of 0 .. 159, so
   11910       this duplicates it.  But it at least correctly connects 24 .. 31 to
   11911       the MXCSR guest state representation (SSEROUND field). */
   11912    d1->mFx   = Ifx_Read;
   11913    d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
   11914    d1->mSize = 8;
   11915 
   11916    /* declare we're writing guest state */
   11917    d1->nFxState = 1;
   11918    vex_bzero(&d1->fxState, sizeof(d1->fxState));
   11919 
   11920    d1->fxState[0].fx     = Ifx_Write;
   11921    d1->fxState[0].offset = OFFB_SSEROUND;
   11922    d1->fxState[0].size   = sizeof(ULong);
   11923 
   11924    /* Call the helper.  This creates SSEROUND but nothing
   11925       else.  We do the actual register array, XMM[0..15], separately,
   11926       in order that any undefinedness in the XMM registers is tracked
   11927       separately by Memcheck and is not "infected" by the in-memory
   11928       shadow for the other parts of the image. */
   11929    stmt( IRStmt_Dirty(d1) );
   11930 
   11931    /* And now the XMMs themselves.  For each register, we PUT either
   11932       its old value, or the value loaded from memory.  One convenient
   11933       way to do that is with a conditional load whose default value is
   11934       the old value of the register. */
   11935    for (reg = 0; reg < 16; reg++) {
   11936       IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16));
   11937       IRExpr* alt = getXMMReg(reg);
   11938       IRTemp  loadedValue = newTemp(Ity_V128);
   11939       stmt( IRStmt_LoadG(Iend_LE,
   11940                          ILGop_IdentV128,
   11941                          loadedValue, ea, alt, restore_1e) );
   11942       putXMMReg(reg, mkexpr(loadedValue));
   11943    }
   11944 
   11945    /* ------ rfbm[2] gates the AVX state ------ */
   11946    /* Component 2 is just a bunch of register loads, so we'll do it
   11947       inline, just to be simple and to be Memcheck friendly. */
   11948 
   11949    /* Same scheme as component 0: first zero it out, and then possibly
   11950       restore from the memory area. */
   11951    IRTemp rfbm_2      = newTemp(Ity_I64);
   11952    IRTemp xstate_bv_2 = newTemp(Ity_I64);
   11953    IRTemp restore_2   = newTemp(Ity_I64);
   11954    assign(rfbm_2,      binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
   11955    assign(xstate_bv_2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(4)));
   11956    assign(restore_2,   binop(Iop_And64, mkexpr(rfbm_2), mkexpr(xstate_bv_2)));
   11957 
   11958    IRExpr* rfbm_2e    = binop(Iop_CmpNE64, mkexpr(rfbm_2),    mkU64(0));
   11959    IRExpr* restore_2e = binop(Iop_CmpNE64, mkexpr(restore_2), mkU64(0));
   11960 
   11961    for (reg = 0; reg < 16; reg++) {
   11962       putGuarded(ymmGuestRegLane128offset(reg, 1), rfbm_2e, mkV128(0));
   11963    }
   11964 
   11965    for (reg = 0; reg < 16; reg++) {
   11966       IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16));
   11967       IRExpr* alt = getYMMRegLane128(reg, 1);
   11968       IRTemp  loadedValue = newTemp(Ity_V128);
   11969       stmt( IRStmt_LoadG(Iend_LE,
   11970                          ILGop_IdentV128,
   11971                          loadedValue, ea, alt, restore_2e) );
   11972       putYMMRegLane128(reg, 1, mkexpr(loadedValue));
   11973    }
   11974 }
   11975 
   11976 
   11977 static Long dis_XRSTOR ( const VexAbiInfo* vbi,
   11978                          Prefix pfx, Long delta, Int sz )
   11979 {
   11980    /* As with XSAVE above, we ignore the value of REX.W since we're
   11981       not bothering with the FPU DP and IP fields. */
   11982    IRTemp addr  = IRTemp_INVALID;
   11983    Int    alen  = 0;
   11984    HChar  dis_buf[50];
   11985    UChar  modrm = getUChar(delta);
   11986    vassert(!epartIsReg(modrm)); /* ensured by caller */
   11987    vassert(sz == 4 || sz == 8); /* ditto */
   11988 
   11989    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   11990    delta += alen;
   11991    gen_SEGV_if_not_64_aligned(addr);
   11992 
   11993    DIP("%sxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
   11994 
   11995    /* VEX's caller is assumed to have checked this. */
   11996    const ULong aSSUMED_XCR0_VALUE = 7;
   11997 
   11998    IRTemp rfbm = newTemp(Ity_I64);
   11999    assign(rfbm,
   12000           binop(Iop_And64,
   12001                 binop(Iop_Or64,
   12002                       binop(Iop_Shl64,
   12003                             unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
   12004                       unop(Iop_32Uto64, getIRegRAX(4))),
   12005                 mkU64(aSSUMED_XCR0_VALUE)));
   12006 
   12007    IRTemp xstate_bv = newTemp(Ity_I64);
   12008    assign(xstate_bv, loadLE(Ity_I64,
   12009                             binop(Iop_Add64, mkexpr(addr), mkU64(512+0))));
   12010 
   12011    IRTemp xcomp_bv = newTemp(Ity_I64);
   12012    assign(xcomp_bv, loadLE(Ity_I64,
   12013                            binop(Iop_Add64, mkexpr(addr), mkU64(512+8))));
   12014 
   12015    IRTemp xsavehdr_23_16 = newTemp(Ity_I64);
   12016    assign( xsavehdr_23_16,
   12017            loadLE(Ity_I64,
   12018                   binop(Iop_Add64, mkexpr(addr), mkU64(512+16))));
   12019 
   12020    /* We must fault if
   12021       * xcomp_bv[63] == 1, since this simulated CPU does not support
   12022         the compaction extension.
   12023       * xstate_bv sets a bit outside of XCR0 (which we assume to be 7).
   12024       * any of the xsave header bytes 23 .. 8 are nonzero.  This seems to
   12025         imply that xcomp_bv must be zero.
   12026       xcomp_bv is header bytes 15 .. 8 and xstate_bv is header bytes 7 .. 0
   12027    */
   12028    IRTemp fault_if_nonzero = newTemp(Ity_I64);
   12029    assign(fault_if_nonzero,
   12030           binop(Iop_Or64,
   12031                 binop(Iop_And64, mkexpr(xstate_bv), mkU64(~aSSUMED_XCR0_VALUE)),
   12032                 binop(Iop_Or64, mkexpr(xcomp_bv), mkexpr(xsavehdr_23_16))));
   12033    stmt( IRStmt_Exit(binop(Iop_CmpNE64, mkexpr(fault_if_nonzero), mkU64(0)),
   12034                      Ijk_SigSEGV,
   12035                      IRConst_U64(guest_RIP_curr_instr),
   12036                      OFFB_RIP
   12037    ));
   12038 
   12039    /* We are guaranteed now that both xstate_bv and rfbm are in the
   12040       range 0 .. 7.  Generate the restore sequence proper. */
   12041    gen_XRSTOR_SEQUENCE(addr, xstate_bv, rfbm);
   12042 
   12043    return delta;
   12044 }
   12045 
   12046 
   12047 static Long dis_FXRSTOR ( const VexAbiInfo* vbi,
   12048                           Prefix pfx, Long delta, Int sz )
   12049 {
   12050    /* As with FXSAVE above we ignore the value of REX.W since we're
   12051       not bothering with the FPU DP and IP fields. */
   12052    IRTemp addr  = IRTemp_INVALID;
   12053    Int    alen  = 0;
   12054    HChar  dis_buf[50];
   12055    UChar  modrm = getUChar(delta);
   12056    vassert(!epartIsReg(modrm)); /* ensured by caller */
   12057    vassert(sz == 4 || sz == 8); /* ditto */
   12058 
   12059    addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   12060    delta += alen;
   12061    gen_SEGV_if_not_16_aligned(addr);
   12062 
   12063    DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
   12064 
   12065    /* FXRSTOR is just XRSTOR with components 0 and 1 selected and also
   12066       as if components 0 and 1 are set as present in XSTATE_BV in the
   12067       XSAVE header.  Set both rfbm and xstate_bv to 0b011 therefore,
   12068       generate the XRSTOR sequence accordingly, and let iropt fold out
   12069       the unused (AVX) parts accordingly. */
   12070    IRTemp three = newTemp(Ity_I64);
   12071    assign(three, mkU64(3));
   12072    gen_XRSTOR_SEQUENCE(addr, three/*xstate_bv*/, three/*rfbm*/);
   12073 
   12074    return delta;
   12075 }
   12076 
   12077 
   12078 static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
   12079 {
   12080    vassert(imm8 <= 7);  /* imm8 is unsigned, so >= 0 is implicit */
   12081 
   12082    // Create a V128 value which has the selected word in the
   12083    // specified lane, and zeroes everywhere else.
   12084    IRTemp tmp128    = newTemp(Ity_V128);
   12085    IRTemp halfshift = newTemp(Ity_I64);
   12086    assign(halfshift, binop(Iop_Shl64,
   12087                            unop(Iop_16Uto64, mkexpr(u16)),
   12088                            mkU8(16 * (imm8 & 3))));
   12089    if (imm8 < 4) {
   12090       assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   12091    } else {
   12092       assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   12093    }
   12094 
   12095    UShort mask = ~(3 << (imm8 * 2));
   12096    IRTemp res  = newTemp(Ity_V128);
   12097    assign( res, binop(Iop_OrV128,
   12098                       mkexpr(tmp128),
   12099                       binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   12100    return res;
   12101 }
   12102 
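/* Worked example for the above: with imm8 = 5, halfshift holds u16 at
   bits 16..31 and lands in the upper 64-bit half (bits 80..95 of the
   vector), while mask = ~(3 << 10) = 0xF3FF clears bytes 10 and 11 of
   v128 (mkV128 expands one mask bit per byte) -- exactly word lane 5. */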
   12103 
   12104 static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV )
   12105 {
   12106    IRTemp s1, s0, d1, d0;
   12107    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   12108 
   12109    breakupV128to64s( sV, &s1, &s0 );
   12110    breakupV128to64s( dV, &d1, &d0 );
   12111 
   12112    IRTemp res = newTemp(Ity_V128);
   12113    assign( res,
   12114            binop(Iop_64HLtoV128,
   12115                  mkIRExprCCall(Ity_I64, 0/*regparms*/,
   12116                                "amd64g_calculate_mmx_psadbw",
   12117                                &amd64g_calculate_mmx_psadbw,
   12118                                mkIRExprVec_2( mkexpr(s1), mkexpr(d1))),
   12119                  mkIRExprCCall(Ity_I64, 0/*regparms*/,
   12120                                "amd64g_calculate_mmx_psadbw",
   12121                                &amd64g_calculate_mmx_psadbw,
   12122                                mkIRExprVec_2( mkexpr(s0), mkexpr(d0)))) );
   12123    return res;
   12124 }
   12125 
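/* The helper is assumed to implement PSADBW per 64-bit half: the sum
   of absolute differences of the eight unsigned byte pairs, zero
   extended into the 64-bit result. */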
   12126 
   12127 static IRTemp math_PSADBW_256 ( IRTemp dV, IRTemp sV )
   12128 {
   12129    IRTemp sHi, sLo, dHi, dLo;
   12130    sHi = sLo = dHi = dLo = IRTemp_INVALID;
   12131    breakupV256toV128s( dV, &dHi, &dLo);
   12132    breakupV256toV128s( sV, &sHi, &sLo);
   12133    IRTemp res = newTemp(Ity_V256);
   12134    assign(res, binop(Iop_V128HLtoV256,
   12135                      mkexpr(math_PSADBW_128(dHi, sHi)),
   12136                      mkexpr(math_PSADBW_128(dLo, sLo))));
   12137    return res;
   12138 }
   12139 
   12140 
   12141 static Long dis_MASKMOVDQU ( const VexAbiInfo* vbi, Prefix pfx,
   12142                              Long delta, Bool isAvx )
   12143 {
   12144    IRTemp regD    = newTemp(Ity_V128);
   12145    IRTemp mask    = newTemp(Ity_V128);
   12146    IRTemp olddata = newTemp(Ity_V128);
   12147    IRTemp newdata = newTemp(Ity_V128);
   12148    IRTemp addr    = newTemp(Ity_I64);
   12149    UChar  modrm   = getUChar(delta);
   12150    UInt   rG      = gregOfRexRM(pfx,modrm);
   12151    UInt   rE      = eregOfRexRM(pfx,modrm);
   12152 
   12153    assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
   12154    assign( regD, getXMMReg( rG ));
   12155 
   12156    /* Unfortunately can't do the obvious thing with SarN8x16
   12157       here since that can't be re-emitted as SSE2 code -- no such
   12158       insn. */
   12159    assign( mask,
   12160            binop(Iop_64HLtoV128,
   12161                  binop(Iop_SarN8x8,
   12162                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
   12163                        mkU8(7) ),
   12164                  binop(Iop_SarN8x8,
   12165                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
   12166                        mkU8(7) ) ));
   12167    assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
   12168    assign( newdata, binop(Iop_OrV128,
   12169                           binop(Iop_AndV128,
   12170                                 mkexpr(regD),
   12171                                 mkexpr(mask) ),
   12172                           binop(Iop_AndV128,
   12173                                 mkexpr(olddata),
   12174                                 unop(Iop_NotV128, mkexpr(mask)))) );
   12175    storeLE( mkexpr(addr), mkexpr(newdata) );
   12176 
   12177    delta += 1;
   12178    DIP("%smaskmovdqu %s,%s\n", isAvx ? "v" : "",
   12179        nameXMMReg(rE), nameXMMReg(rG) );
   12180    return delta;
   12181 }
   12182 
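/* Note, re the mask above: the arithmetic shift right by 7 broadcasts
   each byte's sign bit, turning e.g. 0x80 into 0xFF and 0x7F into
   0x00; the AND/OR pair then merges regD into memory under that byte
   mask. */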
   12183 
   12184 static Long dis_MOVMSKPS_128 ( const VexAbiInfo* vbi, Prefix pfx,
   12185                                Long delta, Bool isAvx )
   12186 {
   12187    UChar modrm = getUChar(delta);
   12188    UInt   rG   = gregOfRexRM(pfx,modrm);
   12189    UInt   rE   = eregOfRexRM(pfx,modrm);
   12190    IRTemp t0   = newTemp(Ity_I32);
   12191    IRTemp t1   = newTemp(Ity_I32);
   12192    IRTemp t2   = newTemp(Ity_I32);
   12193    IRTemp t3   = newTemp(Ity_I32);
   12194    delta += 1;
   12195    assign( t0, binop( Iop_And32,
   12196                       binop(Iop_Shr32, getXMMRegLane32(rE,0), mkU8(31)),
   12197                       mkU32(1) ));
   12198    assign( t1, binop( Iop_And32,
   12199                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(30)),
   12200                       mkU32(2) ));
   12201    assign( t2, binop( Iop_And32,
   12202                       binop(Iop_Shr32, getXMMRegLane32(rE,2), mkU8(29)),
   12203                       mkU32(4) ));
   12204    assign( t3, binop( Iop_And32,
   12205                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(28)),
   12206                       mkU32(8) ));
   12207    putIReg32( rG, binop(Iop_Or32,
   12208                         binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   12209                         binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   12210    DIP("%smovmskps %s,%s\n", isAvx ? "v" : "",
   12211        nameXMMReg(rE), nameIReg32(rG));
   12212    return delta;
   12213 }
   12214 
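/* Note, re the bit assembly above: lane i's sign bit is moved to bit i
   of the result by shifting right by (31 - i) and masking with
   (1 << i); ORing the four pieces gives the movmskps value. */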
   12215 
   12216 static Long dis_MOVMSKPS_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
   12217 {
   12218    UChar modrm = getUChar(delta);
   12219    UInt   rG   = gregOfRexRM(pfx,modrm);
   12220    UInt   rE   = eregOfRexRM(pfx,modrm);
   12221    IRTemp t0   = newTemp(Ity_I32);
   12222    IRTemp t1   = newTemp(Ity_I32);
   12223    IRTemp t2   = newTemp(Ity_I32);
   12224    IRTemp t3   = newTemp(Ity_I32);
   12225    IRTemp t4   = newTemp(Ity_I32);
   12226    IRTemp t5   = newTemp(Ity_I32);
   12227    IRTemp t6   = newTemp(Ity_I32);
   12228    IRTemp t7   = newTemp(Ity_I32);
   12229    delta += 1;
   12230    assign( t0, binop( Iop_And32,
   12231                       binop(Iop_Shr32, getYMMRegLane32(rE,0), mkU8(31)),
   12232                       mkU32(1) ));
   12233    assign( t1, binop( Iop_And32,
   12234                       binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(30)),
   12235                       mkU32(2) ));
   12236    assign( t2, binop( Iop_And32,
   12237                       binop(Iop_Shr32, getYMMRegLane32(rE,2), mkU8(29)),
   12238                       mkU32(4) ));
   12239    assign( t3, binop( Iop_And32,
   12240                       binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(28)),
   12241                       mkU32(8) ));
   12242    assign( t4, binop( Iop_And32,
   12243                       binop(Iop_Shr32, getYMMRegLane32(rE,4), mkU8(27)),
   12244                       mkU32(16) ));
   12245    assign( t5, binop( Iop_And32,
   12246                       binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(26)),
   12247                       mkU32(32) ));
   12248    assign( t6, binop( Iop_And32,
   12249                       binop(Iop_Shr32, getYMMRegLane32(rE,6), mkU8(25)),
   12250                       mkU32(64) ));
   12251    assign( t7, binop( Iop_And32,
   12252                       binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(24)),
   12253                       mkU32(128) ));
   12254    putIReg32( rG, binop(Iop_Or32,
   12255                         binop(Iop_Or32,
   12256                               binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
   12257                               binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ),
   12258                         binop(Iop_Or32,
   12259                               binop(Iop_Or32, mkexpr(t4), mkexpr(t5)),
   12260                               binop(Iop_Or32, mkexpr(t6), mkexpr(t7)) ) ) );
   12261    DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   12262    return delta;
   12263 }
   12264 
   12265 
   12266 static Long dis_MOVMSKPD_128 ( const VexAbiInfo* vbi, Prefix pfx,
   12267                                Long delta, Bool isAvx )
   12268 {
   12269    UChar modrm = getUChar(delta);
   12270    UInt   rG   = gregOfRexRM(pfx,modrm);
   12271    UInt   rE   = eregOfRexRM(pfx,modrm);
   12272    IRTemp t0   = newTemp(Ity_I32);
   12273    IRTemp t1   = newTemp(Ity_I32);
   12274    delta += 1;
   12275    assign( t0, binop( Iop_And32,
   12276                       binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(31)),
   12277                       mkU32(1) ));
   12278    assign( t1, binop( Iop_And32,
   12279                       binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(30)),
   12280                       mkU32(2) ));
   12281    putIReg32( rG, binop(Iop_Or32, mkexpr(t0), mkexpr(t1) ) );
   12282    DIP("%smovmskpd %s,%s\n", isAvx ? "v" : "",
   12283        nameXMMReg(rE), nameIReg32(rG));
   12284    return delta;
   12285 }
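
/* Illustrative note: an F64's sign bit is bit 63, which is bit 31 of
   the odd-numbered 32-bit lane -- hence the lane indices 1 and 3 in
   the getXMMRegLane32 calls above.  A host-side reference model (a
   sketch, not used by the decoder):

      static UInt movmskpd_ref ( const ULong lane[2] )
      {
         return (UInt)( ((lane[0] >> 63) & 1)
                        | (((lane[1] >> 63) & 1) << 1) );
      }
*/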


static Long dis_MOVMSKPD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
{
   UChar modrm = getUChar(delta);
   UInt   rG   = gregOfRexRM(pfx,modrm);
   UInt   rE   = eregOfRexRM(pfx,modrm);
   IRTemp t0   = newTemp(Ity_I32);
   IRTemp t1   = newTemp(Ity_I32);
   IRTemp t2   = newTemp(Ity_I32);
   IRTemp t3   = newTemp(Ity_I32);
   delta += 1;
   assign( t0, binop( Iop_And32,
                      binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(31)),
                      mkU32(1) ));
   assign( t1, binop( Iop_And32,
                      binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(30)),
                      mkU32(2) ));
   assign( t2, binop( Iop_And32,
                      binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(29)),
                      mkU32(4) ));
   assign( t3, binop( Iop_And32,
                      binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(28)),
                      mkU32(8) ));
   putIReg32( rG, binop(Iop_Or32,
                        binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
                        binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
   DIP("vmovmskpd %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
   return delta;
}


/* Note, this also handles SSE(1) insns. */
__attribute__((noinline))
static
Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
                        const VexArchInfo* archinfo,
                        const VexAbiInfo* vbi,
                        Prefix pfx, Int sz, Long deltaIN,
                        DisResult* dres )
{
   IRTemp addr  = IRTemp_INVALID;
   IRTemp t0    = IRTemp_INVALID;
   IRTemp t1    = IRTemp_INVALID;
   IRTemp t2    = IRTemp_INVALID;
   IRTemp t3    = IRTemp_INVALID;
   IRTemp t4    = IRTemp_INVALID;
   IRTemp t5    = IRTemp_INVALID;
   IRTemp t6    = IRTemp_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   *decode_OK = False;

   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   switch (opc) {

   case 0x10:
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movupd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movupd %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
         G (lo half xmm).  If E is mem, upper half of G is zeroed out.
         If E is reg, upper half of G is unchanged. */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8) ) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
            DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
                             loadLE(Ity_I64, mkexpr(addr)) );
            DIP("movsd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
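      /* Illustrative example of the asymmetry handled above
         (hypothetical operands):

            movsd (%rax),%xmm0   ; xmm0[63:0] = m64, xmm0[127:64] = 0
            movsd %xmm1,%xmm0    ; xmm0[63:0] = xmm1[63:0], upper kept

         which is why the memory case, and only the memory case, first
         writes mkV128(0) to G. */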
      /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
         (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
      if (haveF3no66noF2(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
                             getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
            DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
            putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
                             loadLE(Ity_I32, mkexpr(addr)) );
            DIP("movss %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movups %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movups %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      break;

   case 0x11:
      /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
         or lo half xmm). */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
                             getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
            DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                 nameXMMReg(eregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr),
                     getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
            DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                 dis_buf);
            delta += alen;
         }
         goto decode_success;
      }
      /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
         or lo 1/4 xmm). */
      if (haveF3no66noF2(pfx) && sz == 4) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* fall through, we don't yet have a test case */
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr),
                     getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
            DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                 dis_buf);
            delta += alen;
            goto decode_success;
         }
      }
      /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg( gregOfRexRM(pfx,modrm) ) );
            DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
            delta += alen;
         }
         goto decode_success;
      }
      /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* fall through; awaiting test case */
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movups %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
            delta += alen;
            goto decode_success;
         }
      }
      break;

   case 0x12:
      /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
      /* Identical to MOVLPS ? */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* fall through; apparently reg-reg is not possible */
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putXMMRegLane64( gregOfRexRM(pfx,modrm),
                             0/*lower lane*/,
                             loadLE(Ity_I64, mkexpr(addr)) );
            DIP("movlpd %s, %s\n",
                dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
            goto decode_success;
         }
      }
      /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
      /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            putXMMRegLane64( gregOfRexRM(pfx,modrm),
                             0/*lower lane*/,
                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
            DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0/*lower lane*/,
                             loadLE(Ity_I64, mkexpr(addr)) );
            DIP("movlps %s, %s\n",
                dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
         }
         goto decode_success;
      }
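      /* Lane picture for the two forms above (illustrative):

            movhlps %xmm1,%xmm0   ; xmm0[63:0] = xmm1[127:64]
            movlps (%rax),%xmm0   ; xmm0[63:0] = m64

         In both cases xmm0[127:64] is left unchanged, so only the
         lower 64-bit lane of G is written. */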
      break;

   case 0x13:
      /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            storeLE( mkexpr(addr),
                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
                                      0/*lower lane*/ ) );
            DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
                                   dis_buf);
            goto decode_success;
         }
         /* else fall through */
      }
      /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
      /* Identical to MOVLPS ? */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            storeLE( mkexpr(addr),
                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
                                      0/*lower lane*/ ) );
            DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
                                   dis_buf);
            goto decode_success;
         }
         /* else fall through */
      }
      break;

   case 0x14:
   case 0x15:
      /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
      /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
      /* These just appear to be special cases of SHUFPS */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         Bool   hi = toBool(opc == 0x15);
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);
         modrm = getUChar(delta);
         UInt   rG = gregOfRexRM(pfx,modrm);
         assign( dV, getXMMReg(rG) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            delta += 1;
            DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
                nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
                dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_UNPCKxPS_128( sV, dV, hi );
         putXMMReg( rG, mkexpr(res) );
         goto decode_success;
      }
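      /* Interleave picture (illustrative), writing lanes high-to-low
         with d = {d3,d2,d1,d0} and s = {s3,s2,s1,s0}:

            unpcklps: G = { s1, d1, s0, d0 }
            unpckhps: G = { s3, d3, s2, d2 }

         math_UNPCKxPS_128 is assumed to implement exactly this
         selection. */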
      /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
      /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
      /* These just appear to be special cases of SHUFPS */
      if (have66noF2noF3(pfx)
          && sz == 2 /* could be 8 if rex also present */) {
         Bool   hi = toBool(opc == 0x15);
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);
         modrm = getUChar(delta);
         UInt   rG = gregOfRexRM(pfx,modrm);
         assign( dV, getXMMReg(rG) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            delta += 1;
            DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
                nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
                dis_buf, nameXMMReg(rG));
         }
         IRTemp res = math_UNPCKxPD_128( sV, dV, hi );
         putXMMReg( rG, mkexpr(res) );
         goto decode_success;
      }
      break;

   case 0x16:
      /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
      /* This seems identical to MOVHPS.  This instruction encoding is
         completely crazy. */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* fall through; apparently reg-reg is not possible */
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
                             loadLE(Ity_I64, mkexpr(addr)) );
            DIP("movhpd %s,%s\n", dis_buf,
                                  nameXMMReg( gregOfRexRM(pfx,modrm) ));
            goto decode_success;
         }
      }
      /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
      /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
            DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
                             loadLE(Ity_I64, mkexpr(addr)) );
            DIP("movhps %s,%s\n", dis_buf,
                                  nameXMMReg( gregOfRexRM(pfx,modrm) ));
         }
         goto decode_success;
      }
      break;

   case 0x17:
      /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            storeLE( mkexpr(addr),
                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
                                      1/*upper lane*/ ) );
            DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
                                  dis_buf);
            goto decode_success;
         }
         /* else fall through */
      }
      /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
      /* Again, this seems identical to MOVHPS. */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            storeLE( mkexpr(addr),
                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
                                      1/*upper lane*/ ) );
            DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
                                  dis_buf);
            goto decode_success;
         }
         /* else fall through */
      }
      break;

   case 0x18:
      /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
      /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
      /* 0F 18 /2 = PREFETCHT1 */
      /* 0F 18 /3 = PREFETCHT2 */
      if (haveNo66noF2noF3(pfx)
          && !epartIsReg(getUChar(delta))
          && gregLO3ofRM(getUChar(delta)) >= 0
          && gregLO3ofRM(getUChar(delta)) <= 3) {
         const HChar* hintstr = "??";

         modrm = getUChar(delta);
         vassert(!epartIsReg(modrm));

         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;

         switch (gregLO3ofRM(modrm)) {
            case 0: hintstr = "nta"; break;
            case 1: hintstr = "t0"; break;
            case 2: hintstr = "t1"; break;
            case 3: hintstr = "t2"; break;
            default: vassert(0);
         }

         DIP("prefetch%s %s\n", hintstr, dis_buf);
         goto decode_success;
      }
      break;

   case 0x28:
      /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movapd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movapd %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( gregOfRexRM(pfx,modrm),
                       getXMMReg( eregOfRexRM(pfx,modrm) ));
            DIP("movaps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movaps %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
         }
         goto decode_success;
      }
      break;

   case 0x29:
      /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
      if (haveNo66noF2noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg( gregOfRexRM(pfx,modrm) ));
            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
            delta += alen;
         }
         goto decode_success;
      }
      /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            putXMMReg( eregOfRexRM(pfx,modrm),
                       getXMMReg( gregOfRexRM(pfx,modrm) ) );
            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
            delta += alen;
         }
         goto decode_success;
      }
      break;

   case 0x2A:
      /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
         half xmm */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp arg64 = newTemp(Ity_I64);
         IRTemp rmode = newTemp(Ity_I32);

         modrm = getUChar(delta);
         do_MMX_preamble();
         if (epartIsReg(modrm)) {
            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("cvtpi2ps %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }

         assign( rmode, get_sse_roundingmode() );

         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64,
                       unop(Iop_64to32, mkexpr(arg64)) )) );

         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 1,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64,
                       unop(Iop_64HIto32, mkexpr(arg64)) )) );

         goto decode_success;
      }
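      /* Host-side model of the datapath above (a sketch, not the IR):

            dst.f32[0] = (float)(Int)( arg64 & 0xFFFFFFFFULL );  // Iop_64to32
            dst.f32[1] = (float)(Int)( arg64 >> 32 );            // Iop_64HIto32

         with both conversions rounded per the current SSE rounding
         mode; lanes 2 and 3 of the destination are left untouched. */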
      /* F3 0F 2A = CVTSI2SS
         -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
         -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
         IRTemp rmode = newTemp(Ity_I32);
         assign( rmode, get_sse_roundingmode() );
         modrm = getUChar(delta);
         if (sz == 4) {
            IRTemp arg32 = newTemp(Ity_I32);
            if (epartIsReg(modrm)) {
               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2ss %s,%s\n", dis_buf,
                                       nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            putXMMRegLane32F(
               gregOfRexRM(pfx,modrm), 0,
               binop(Iop_F64toF32,
                     mkexpr(rmode),
                     unop(Iop_I32StoF64, mkexpr(arg32)) ) );
         } else {
            /* sz == 8 */
            IRTemp arg64 = newTemp(Ity_I64);
            if (epartIsReg(modrm)) {
               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2ssq %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            putXMMRegLane32F(
               gregOfRexRM(pfx,modrm), 0,
               binop(Iop_F64toF32,
                     mkexpr(rmode),
                     binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
         }
         goto decode_success;
      }
      /* F2 0F 2A = CVTSI2SD
         when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
         when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
      */
      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
         modrm = getUChar(delta);
         if (sz == 4) {
            IRTemp arg32 = newTemp(Ity_I32);
            if (epartIsReg(modrm)) {
               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2sdl %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2sdl %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                              unop(Iop_I32StoF64, mkexpr(arg32))
            );
         } else {
            /* sz == 8 */
            IRTemp arg64 = newTemp(Ity_I64);
            if (epartIsReg(modrm)) {
               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
               delta += 1;
               DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
            } else {
               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
               delta += alen;
               DIP("cvtsi2sdq %s,%s\n", dis_buf,
                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
            }
            putXMMRegLane64F(
               gregOfRexRM(pfx,modrm),
               0,
               binop( Iop_I64StoF64,
                      get_sse_roundingmode(),
                      mkexpr(arg64)
               )
            );
         }
         goto decode_success;
      }
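      /* Note (illustrative reasoning): the I32->F64 paths above take no
         rounding mode because every 32-bit integer is exactly
         representable as an IEEE754 double, whereas I64->F64 can lose
         precision -- which is why Iop_I64StoF64 carries the current SSE
         rounding mode as its first argument. */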
      /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
         xmm(G) */
      if (have66noF2noF3(pfx) && sz == 2) {
         IRTemp arg64 = newTemp(Ity_I64);

         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            /* Only switch to MMX mode if the source is a MMX register.
               This is inconsistent with all other instructions which
               convert between XMM and (M64 or MMX), which always switch
               to MMX mode even if 64-bit operand is M64 and not MMX.  At
               least, that's what the Intel docs seem to me to say.
               Fixes #210264. */
            do_MMX_preamble();
            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
            delta += 1;
            DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += alen;
            DIP("cvtpi2pd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }

         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm), 0,
            unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
         );

         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm), 1,
            unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
         );

         goto decode_success;
      }
      break;

   case 0x2B:
      /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
      /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
      if ( (haveNo66noF2noF3(pfx) && sz == 4)
           || (have66noF2noF3(pfx) && sz == 2) ) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
            DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
                                    nameXMMReg(gregOfRexRM(pfx,modrm)),
                                    dis_buf);
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      break;

   case 0x2C:
   case 0x2D:
      /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
         I32 in mmx, according to prevailing SSE rounding mode */
      /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
         I32 in mmx, rounding towards zero */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         IRTemp dst64  = newTemp(Ity_I64);
         IRTemp rmode  = newTemp(Ity_I32);
         IRTemp f32lo  = newTemp(Ity_F32);
         IRTemp f32hi  = newTemp(Ity_F32);
         Bool   r2zero = toBool(opc == 0x2C);

         do_MMX_preamble();
         modrm = getUChar(delta);

         if (epartIsReg(modrm)) {
            delta += 1;
            assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
            assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
            DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
                                      nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
            assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
                                                 mkexpr(addr),
                                                 mkU64(4) )));
            delta += alen;
            DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
                                      dis_buf,
                                      nameMMXReg(gregLO3ofRM(modrm)));
         }

         if (r2zero) {
            assign(rmode, mkU32((UInt)Irrm_ZERO) );
         } else {
            assign( rmode, get_sse_roundingmode() );
         }

         assign(
            dst64,
            binop( Iop_32HLto64,
                   binop( Iop_F64toI32S,
                          mkexpr(rmode),
                          unop( Iop_F32toF64, mkexpr(f32hi) ) ),
                   binop( Iop_F64toI32S,
                          mkexpr(rmode),
                          unop( Iop_F32toF64, mkexpr(f32lo) ) )
                 )
         );

         putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
         goto decode_success;
      }
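      /* Rounding model (illustrative): cvttps2pi always chops towards
         zero, while cvtps2pi honours the MXCSR rounding field.  Per
         lane, in C, assuming the host fenv has been set to match
         MXCSR.RC:

            Int lane = r2zero ? (Int)f         // truncate
                              : (Int)rintf(f); // round per current mode
      */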
      /* F3 0F 2D = CVTSS2SI
         when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
                       according to prevailing SSE rounding mode
         when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
                       according to prevailing SSE rounding mode
      */
      /* F3 0F 2C = CVTTSS2SI
         when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
                       truncating towards zero
         when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
                       truncating towards zero
      */
      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
         delta = dis_CVTxSS2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
         goto decode_success;
      }
      /* F2 0F 2D = CVTSD2SI
         when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
                       according to prevailing SSE rounding mode
         when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
                       according to prevailing SSE rounding mode
      */
      /* F2 0F 2C = CVTTSD2SI
         when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
                       truncating towards zero
         when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
                       truncating towards zero
      */
      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
         delta = dis_CVTxSD2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
         goto decode_success;
      }
      /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
         I32 in mmx, according to prevailing SSE rounding mode */
      /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
         I32 in mmx, rounding towards zero */
      if (have66noF2noF3(pfx) && sz == 2) {
         IRTemp dst64  = newTemp(Ity_I64);
         IRTemp rmode  = newTemp(Ity_I32);
         IRTemp f64lo  = newTemp(Ity_F64);
         IRTemp f64hi  = newTemp(Ity_F64);
         Bool   r2zero = toBool(opc == 0x2C);

         do_MMX_preamble();
         modrm = getUChar(delta);

         if (epartIsReg(modrm)) {
            delta += 1;
            assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
            assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
            DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
                                      nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
            assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
                                                 mkexpr(addr),
                                                 mkU64(8) )));
            delta += alen;
            DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
                                      dis_buf,
                                      nameMMXReg(gregLO3ofRM(modrm)));
         }

         if (r2zero) {
            assign(rmode, mkU32((UInt)Irrm_ZERO) );
         } else {
            assign( rmode, get_sse_roundingmode() );
         }

         assign(
            dst64,
            binop( Iop_32HLto64,
                   binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
                   binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
                 )
         );

         putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
         goto decode_success;
      }
      break;

   case 0x2E:
   case 0x2F:
      /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
      /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_COMISD( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
      /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_COMISS( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      break;

   case 0x50:
      /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
         to 4 lowest bits of ireg(G) */
      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
          && epartIsReg(getUChar(delta))) {
         /* sz == 8 is a kludge to handle insns with REX.W redundantly
            set to 1, which has been known to happen:

            4c 0f 50 d9             rex64X movmskps %xmm1,%r11d

            20071106: Intel docs say that REX.W isn't redundant: when
            present, a 64-bit register is written; when not present, only
            the 32-bit half is written.  However, testing on a Core2
            machine suggests the entire 64 bit register is written
            irrespective of the status of REX.W.  That could be because
            of the default rule that says "if the lower half of a 32-bit
            register is written, the upper half is zeroed".  By using
            putIReg32 here we inadvertently produce the same behaviour as
            the Core2, for the same reason -- putIReg32 implements said
            rule.

            AMD docs give no indication that REX.W is even valid for this
            insn. */
         delta = dis_MOVMSKPS_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
         2 lowest bits of ireg(G) */
      if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
         /* sz == 8 is a kludge to handle insns with REX.W redundantly
            set to 1, which has been known to happen:
            66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
            20071106: see further comments on MOVMSKPS implementation above.
         */
         delta = dis_MOVMSKPD_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x51:
      /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "sqrtss", Iop_Sqrt32F0x4 );
         goto decode_success;
      }
      /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "sqrtps", Iop_Sqrt32Fx4 );
         goto decode_success;
      }
      /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta,
                                            "sqrtsd", Iop_Sqrt64F0x2 );
         goto decode_success;
      }
      /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "sqrtpd", Iop_Sqrt64Fx2 );
         goto decode_success;
      }
      break;

   case 0x52:
      /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "rsqrtss", Iop_RSqrtEst32F0x4 );
         goto decode_success;
      }
      /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "rsqrtps", Iop_RSqrtEst32Fx4 );
         goto decode_success;
      }
      break;

   case 0x53:
      /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
                                            "rcpss", Iop_RecipEst32F0x4 );
         goto decode_success;
      }
      /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
                                           "rcpps", Iop_RecipEst32Fx4 );
         goto decode_success;
      }
      break;

   case 0x54:
      /* 0F 54 = ANDPS -- G = G and E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andps", Iop_AndV128 );
         goto decode_success;
      }
      /* 66 0F 54 = ANDPD -- G = G and E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andpd", Iop_AndV128 );
         goto decode_success;
      }
      break;

   case 0x55:
      /* 0F 55 = ANDNPS -- G = (not G) and E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnps",
                                                           Iop_AndV128 );
         goto decode_success;
      }
      /* 66 0F 55 = ANDNPD -- G = (not G) and E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnpd",
                                                           Iop_AndV128 );
         goto decode_success;
      }
      break;

   case 0x56:
      /* 0F 56 = ORPS -- G = G or E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orps", Iop_OrV128 );
         goto decode_success;
      }
      /* 66 0F 56 = ORPD -- G = G or E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orpd", Iop_OrV128 );
         goto decode_success;
      }
      break;

   case 0x57:
      /* 66 0F 57 = XORPD -- G = G xor E */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorpd", Iop_XorV128 );
         goto decode_success;
      }
      /* 0F 57 = XORPS -- G = G xor E */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorps", Iop_XorV128 );
         goto decode_success;
      }
      break;

   case 0x58:
      /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
      if (haveNo66noF2noF3(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addps", Iop_Add32Fx4 );
         goto decode_success;
      }
      /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
      if (haveF3no66noF2(pfx) && sz == 4) {
         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "addss", Iop_Add32F0x4 );
         goto decode_success;
      }
      /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
      if (haveF2no66noF3(pfx)
          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "addsd", Iop_Add64F0x2 );
         goto decode_success;
      }
      /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
   13364       if (have66noF2noF3(pfx)
   13365           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   13366          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addpd", Iop_Add64Fx2 );
   13367          goto decode_success;
   13368       }
   13369       break;
   13370 
   13371    case 0x59:
   13372       /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   13373       if (haveF2no66noF3(pfx)
   13374           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13375          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "mulsd", Iop_Mul64F0x2 );
   13376          goto decode_success;
   13377       }
   13378       /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
   13379       if (haveF3no66noF2(pfx) && sz == 4) {
   13380          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "mulss", Iop_Mul32F0x4 );
   13381          goto decode_success;
   13382       }
   13383       /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   13384       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13385          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulps", Iop_Mul32Fx4 );
   13386          goto decode_success;
   13387       }
   13388       /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   13389       if (have66noF2noF3(pfx)
   13390           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   13391          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulpd", Iop_Mul64Fx2 );
   13392          goto decode_success;
   13393       }
   13394       break;
   13395 
   13396    case 0x5A:
   13397       /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
   13398          F64 in xmm(G). */
   13399       if (haveNo66noF2noF3(pfx)
   13400           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13401          delta = dis_CVTPS2PD_128( vbi, pfx, delta, False/*!isAvx*/ );
   13402          goto decode_success;
   13403       }
   13404       /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
   13405          low half xmm(G) */
   13406       if (haveF3no66noF2(pfx) && sz == 4) {
   13407          IRTemp f32lo = newTemp(Ity_F32);
   13408 
   13409          modrm = getUChar(delta);
   13410          if (epartIsReg(modrm)) {
   13411             delta += 1;
   13412             assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
   13413             DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13414                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13415          } else {
   13416             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13417             assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
   13418             delta += alen;
   13419             DIP("cvtss2sd %s,%s\n", dis_buf,
   13420                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13421          }
   13422 
   13423          putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
   13424                            unop( Iop_F32toF64, mkexpr(f32lo) ) );
   13425 
   13426          goto decode_success;
   13427       }
   13428       /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
   13429          low 1/4 xmm(G), according to prevailing SSE rounding mode */
   13430       if (haveF2no66noF3(pfx) && sz == 4) {
   13431          IRTemp rmode = newTemp(Ity_I32);
   13432          IRTemp f64lo = newTemp(Ity_F64);
   13433 
   13434          modrm = getUChar(delta);
   13435          if (epartIsReg(modrm)) {
   13436             delta += 1;
   13437             assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
   13438             DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13439                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13440          } else {
   13441             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13442             assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
   13443             delta += alen;
   13444             DIP("cvtsd2ss %s,%s\n", dis_buf,
   13445                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13446          }
   13447 
   13448          assign( rmode, get_sse_roundingmode() );
   13449          putXMMRegLane32F(
   13450             gregOfRexRM(pfx,modrm), 0,
   13451             binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
   13452          );
   13453 
   13454          goto decode_success;
   13455       }
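      /* Contrast the two scalar conversions above: widening F32->F64
         is always exact, so Iop_F32toF64 is a unop with no rounding
         mode, whereas narrowing F64->F32 can be inexact, so
         Iop_F64toF32 is a binop whose first argument is the rounding
         mode fetched from MXCSR:

            unop ( Iop_F32toF64, f32 )          -- always exact
            binop( Iop_F64toF32, rmode, f64 )   -- rounds per rmode
      */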
   13456       /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
   13457          lo half xmm(G), rounding according to prevailing SSE rounding
   13458          mode, and zero upper half */
      /* Note, this is practically identical to CVTPD2DQ.  It would
         be nice to merge them. */
   13461       if (have66noF2noF3(pfx) && sz == 2) {
   13462          delta = dis_CVTPD2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
   13463          goto decode_success;
   13464       }
   13465       break;
   13466 
   13467    case 0x5B:
   13468       /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   13469          xmm(G), rounding towards zero */
   13470       /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
   13471          xmm(G), as per the prevailing rounding mode */
   13472       if ( (have66noF2noF3(pfx) && sz == 2)
   13473            || (haveF3no66noF2(pfx) && sz == 4) ) {
         Bool r2zero = toBool(sz == 4); /* i.e. the F3 (truncating)
                                           form; the guard above makes
                                           this test reliable */
   13475          delta = dis_CVTxPS2DQ_128( vbi, pfx, delta, False/*!isAvx*/, r2zero );
   13476          goto decode_success;
   13477       }
   13478       /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
   13479          xmm(G) */
   13480       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13481          delta = dis_CVTDQ2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
   13482          goto decode_success;
   13483       }
   13484       break;
   13485 
   13486    case 0x5C:
   13487       /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   13488       if (haveF3no66noF2(pfx) && sz == 4) {
   13489          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "subss", Iop_Sub32F0x4 );
   13490          goto decode_success;
   13491       }
   13492       /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
   13493       if (haveF2no66noF3(pfx)
   13494           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13495          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "subsd", Iop_Sub64F0x2 );
   13496          goto decode_success;
   13497       }
   13498       /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   13499       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13500          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subps", Iop_Sub32Fx4 );
   13501          goto decode_success;
   13502       }
   13503       /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
   13504       if (have66noF2noF3(pfx) && sz == 2) {
   13505          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subpd", Iop_Sub64Fx2 );
   13506          goto decode_success;
   13507       }
   13508       break;
   13509 
   13510    case 0x5D:
   13511       /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   13512       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13513          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minps", Iop_Min32Fx4 );
   13514          goto decode_success;
   13515       }
   13516       /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
   13517       if (haveF3no66noF2(pfx) && sz == 4) {
   13518          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "minss", Iop_Min32F0x4 );
   13519          goto decode_success;
   13520       }
   13521       /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
   13522       if (haveF2no66noF3(pfx) && sz == 4) {
   13523          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "minsd", Iop_Min64F0x2 );
   13524          goto decode_success;
   13525       }
   13526       /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
   13527       if (have66noF2noF3(pfx) && sz == 2) {
   13528          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minpd", Iop_Min64Fx2 );
   13529          goto decode_success;
   13530       }
   13531       break;
   13532 
   13533    case 0x5E:
   13534       /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
   13535       if (haveF2no66noF3(pfx) && sz == 4) {
   13536          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "divsd", Iop_Div64F0x2 );
   13537          goto decode_success;
   13538       }
   13539       /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   13540       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13541          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divps", Iop_Div32Fx4 );
   13542          goto decode_success;
   13543       }
   13544       /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
   13545       if (haveF3no66noF2(pfx) && sz == 4) {
   13546          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "divss", Iop_Div32F0x4 );
   13547          goto decode_success;
   13548       }
   13549       /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
   13550       if (have66noF2noF3(pfx) && sz == 2) {
   13551          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divpd", Iop_Div64Fx2 );
   13552          goto decode_success;
   13553       }
   13554       break;
   13555 
   13556    case 0x5F:
   13557       /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   13558       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13559          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxps", Iop_Max32Fx4 );
   13560          goto decode_success;
   13561       }
   13562       /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   13563       if (haveF3no66noF2(pfx) && sz == 4) {
   13564          delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "maxss", Iop_Max32F0x4 );
   13565          goto decode_success;
   13566       }
   13567       /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   13568       if (haveF2no66noF3(pfx) && sz == 4) {
   13569          delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "maxsd", Iop_Max64F0x2 );
   13570          goto decode_success;
   13571       }
   13572       /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   13573       if (have66noF2noF3(pfx) && sz == 2) {
   13574          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxpd", Iop_Max64Fx2 );
   13575          goto decode_success;
   13576       }
   13577       break;
   13578 
   13579    case 0x60:
   13580       /* 66 0F 60 = PUNPCKLBW */
   13581       if (have66noF2noF3(pfx) && sz == 2) {
   13582          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13583                                     "punpcklbw",
   13584                                     Iop_InterleaveLO8x16, True );
   13585          goto decode_success;
   13586       }
   13587       break;
   13588 
   13589    case 0x61:
   13590       /* 66 0F 61 = PUNPCKLWD */
   13591       if (have66noF2noF3(pfx) && sz == 2) {
   13592          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13593                                     "punpcklwd",
   13594                                     Iop_InterleaveLO16x8, True );
   13595          goto decode_success;
   13596       }
   13597       break;
   13598 
   13599    case 0x62:
   13600       /* 66 0F 62 = PUNPCKLDQ */
   13601       if (have66noF2noF3(pfx) && sz == 2) {
   13602          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13603                                     "punpckldq",
   13604                                     Iop_InterleaveLO32x4, True );
   13605          goto decode_success;
   13606       }
   13607       break;
   13608 
   13609    case 0x63:
   13610       /* 66 0F 63 = PACKSSWB */
   13611       if (have66noF2noF3(pfx) && sz == 2) {
   13612          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13613                                     "packsswb",
   13614                                     Iop_QNarrowBin16Sto8Sx16, True );
   13615          goto decode_success;
   13616       }
   13617       break;
   13618 
   13619    case 0x64:
   13620       /* 66 0F 64 = PCMPGTB */
   13621       if (have66noF2noF3(pfx) && sz == 2) {
   13622          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13623                                     "pcmpgtb", Iop_CmpGT8Sx16, False );
   13624          goto decode_success;
   13625       }
   13626       break;
   13627 
   13628    case 0x65:
   13629       /* 66 0F 65 = PCMPGTW */
   13630       if (have66noF2noF3(pfx) && sz == 2) {
   13631          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13632                                     "pcmpgtw", Iop_CmpGT16Sx8, False );
   13633          goto decode_success;
   13634       }
   13635       break;
   13636 
   13637    case 0x66:
   13638       /* 66 0F 66 = PCMPGTD */
   13639       if (have66noF2noF3(pfx) && sz == 2) {
   13640          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13641                                     "pcmpgtd", Iop_CmpGT32Sx4, False );
   13642          goto decode_success;
   13643       }
   13644       break;
   13645 
   13646    case 0x67:
   13647       /* 66 0F 67 = PACKUSWB */
   13648       if (have66noF2noF3(pfx) && sz == 2) {
   13649          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13650                                     "packuswb",
   13651                                     Iop_QNarrowBin16Sto8Ux16, True );
   13652          goto decode_success;
   13653       }
   13654       break;
   13655 
   13656    case 0x68:
   13657       /* 66 0F 68 = PUNPCKHBW */
   13658       if (have66noF2noF3(pfx) && sz == 2) {
   13659          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13660                                     "punpckhbw",
   13661                                     Iop_InterleaveHI8x16, True );
   13662          goto decode_success;
   13663       }
   13664       break;
   13665 
   13666    case 0x69:
   13667       /* 66 0F 69 = PUNPCKHWD */
   13668       if (have66noF2noF3(pfx) && sz == 2) {
   13669          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13670                                     "punpckhwd",
   13671                                     Iop_InterleaveHI16x8, True );
   13672          goto decode_success;
   13673       }
   13674       break;
   13675 
   13676    case 0x6A:
   13677       /* 66 0F 6A = PUNPCKHDQ */
   13678       if (have66noF2noF3(pfx) && sz == 2) {
   13679          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13680                                     "punpckhdq",
   13681                                     Iop_InterleaveHI32x4, True );
   13682          goto decode_success;
   13683       }
   13684       break;
   13685 
   13686    case 0x6B:
   13687       /* 66 0F 6B = PACKSSDW */
   13688       if (have66noF2noF3(pfx) && sz == 2) {
   13689          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13690                                     "packssdw",
   13691                                     Iop_QNarrowBin32Sto16Sx8, True );
   13692          goto decode_success;
   13693       }
   13694       break;
   13695 
   13696    case 0x6C:
   13697       /* 66 0F 6C = PUNPCKLQDQ */
   13698       if (have66noF2noF3(pfx) && sz == 2) {
   13699          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13700                                     "punpcklqdq",
   13701                                     Iop_InterleaveLO64x2, True );
   13702          goto decode_success;
   13703       }
   13704       break;
   13705 
   13706    case 0x6D:
   13707       /* 66 0F 6D = PUNPCKHQDQ */
   13708       if (have66noF2noF3(pfx) && sz == 2) {
   13709          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13710                                     "punpckhqdq",
   13711                                     Iop_InterleaveHI64x2, True );
   13712          goto decode_success;
   13713       }
   13714       break;
   13715 
   13716    case 0x6E:
      /* 66 0F 6E = MOVD -- move from ireg32/m32 to xmm lo 1/4,
         zeroing the high 3/4 of xmm; or, with REX.W, from ireg64/m64
         to xmm lo 1/2, zeroing the high 1/2 of xmm. */
   13721       if (have66noF2noF3(pfx)) {
   13722          vassert(sz == 2 || sz == 8);
   13723          if (sz == 2) sz = 4;
   13724          modrm = getUChar(delta);
   13725          if (epartIsReg(modrm)) {
   13726             delta += 1;
   13727             if (sz == 4) {
   13728                putXMMReg(
   13729                   gregOfRexRM(pfx,modrm),
   13730                   unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
   13731                );
   13732                DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
   13733                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13734             } else {
   13735                putXMMReg(
   13736                   gregOfRexRM(pfx,modrm),
   13737                   unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
   13738                );
   13739                DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
   13740                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
   13741             }
   13742          } else {
   13743             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   13744             delta += alen;
   13745             putXMMReg(
   13746                gregOfRexRM(pfx,modrm),
   13747                sz == 4
   13748                   ?  unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
   13749                   :  unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
   13750             );
   13751             DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
   13752                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13753          }
   13754          goto decode_success;
   13755       }
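      /* Worked example for the register case above: "movd %eax,%xmm3"
         uses Iop_32UtoV128, placing %eax in bits 31:0 of %xmm3 and
         zeroing bits 127:32; with REX.W ("movq %rax,%xmm3"),
         Iop_64UtoV128 fills bits 63:0 and zeroes bits 127:64. */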
   13756       break;
   13757 
   13758    case 0x6F:
   13759       if (have66noF2noF3(pfx)
   13760           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   13761          /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
   13762          modrm = getUChar(delta);
   13763          if (epartIsReg(modrm)) {
   13764             putXMMReg( gregOfRexRM(pfx,modrm),
   13765                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   13766             DIP("movdqa %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13767                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13768             delta += 1;
   13769          } else {
   13770             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13771             gen_SEGV_if_not_16_aligned( addr );
   13772             putXMMReg( gregOfRexRM(pfx,modrm),
   13773                        loadLE(Ity_V128, mkexpr(addr)) );
   13774             DIP("movdqa %s,%s\n", dis_buf,
   13775                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13776             delta += alen;
   13777          }
   13778          goto decode_success;
   13779       }
   13780       if (haveF3no66noF2(pfx) && sz == 4) {
   13781          /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
   13782          modrm = getUChar(delta);
   13783          if (epartIsReg(modrm)) {
   13784             putXMMReg( gregOfRexRM(pfx,modrm),
   13785                        getXMMReg( eregOfRexRM(pfx,modrm) ));
   13786             DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   13787                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13788             delta += 1;
   13789          } else {
   13790             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13791             putXMMReg( gregOfRexRM(pfx,modrm),
   13792                        loadLE(Ity_V128, mkexpr(addr)) );
   13793             DIP("movdqu %s,%s\n", dis_buf,
   13794                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
   13795             delta += alen;
   13796          }
   13797          goto decode_success;
   13798       }
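      /* The two forms above differ only in that MOVDQA's memory case
         checks alignment.  A sketch of what
         gen_SEGV_if_not_16_aligned emits (see its definition
         elsewhere in this file for the real thing): a guarded side
         exit raising SIGSEGV when any of the low 4 address bits are
         set, roughly

            stmt( IRStmt_Exit(
                     binop(Iop_CmpNE64,
                           binop(Iop_And64, mkexpr(addr), mkU64(0xF)),
                           mkU64(0)),
                     Ijk_SigSEGV, ... ) );
      */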
   13799       break;
   13800 
   13801    case 0x70:
   13802       /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
   13803       if (have66noF2noF3(pfx) && sz == 2) {
   13804          delta = dis_PSHUFD_32x4( vbi, pfx, delta, False/*!writesYmm*/);
   13805          goto decode_success;
   13806       }
   13807       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   13808       /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
   13809       if (haveNo66noF2noF3(pfx) && sz == 4) {
   13810          Int order;
   13811          IRTemp sV, dV, s3, s2, s1, s0;
   13812          s3 = s2 = s1 = s0 = IRTemp_INVALID;
   13813          sV = newTemp(Ity_I64);
   13814          dV = newTemp(Ity_I64);
   13815          do_MMX_preamble();
   13816          modrm = getUChar(delta);
   13817          if (epartIsReg(modrm)) {
   13818             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   13819             order = (Int)getUChar(delta+1);
   13820             delta += 1+1;
   13821             DIP("pshufw $%d,%s,%s\n", order,
   13822                                       nameMMXReg(eregLO3ofRM(modrm)),
   13823                                       nameMMXReg(gregLO3ofRM(modrm)));
   13824          } else {
   13825             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   13826                               1/*extra byte after amode*/ );
   13827             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   13828             order = (Int)getUChar(delta+alen);
   13829             delta += 1+alen;
   13830             DIP("pshufw $%d,%s,%s\n", order,
   13831                                       dis_buf,
   13832                                       nameMMXReg(gregLO3ofRM(modrm)));
   13833          }
   13834          breakup64to16s( sV, &s3, &s2, &s1, &s0 );
   13835 #        define SEL(n) \
   13836                    ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   13837          assign(dV,
   13838                 mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
   13839                              SEL((order>>2)&3), SEL((order>>0)&3) )
   13840          );
   13841          putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
   13842 #        undef SEL
   13843          goto decode_success;
   13844       }
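      /* Worked example of the selection above: order == 0x1B
         (0b00011011) gives dV = (s0,s1,s2,s3), i.e.
         "pshufw $0x1b,%mm1,%mm0" reverses the four 16-bit lanes,
         while order == 0xE4 (0b11100100) is the identity
         permutation. */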
   13845       /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
   13846          mem) to G(xmm), and copy upper half */
   13847       if (haveF2no66noF3(pfx) && sz == 4) {
   13848          delta = dis_PSHUFxW_128( vbi, pfx, delta,
   13849                                   False/*!isAvx*/, False/*!xIsH*/ );
   13850          goto decode_success;
   13851       }
   13852       /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
   13853          mem) to G(xmm), and copy lower half */
   13854       if (haveF3no66noF2(pfx) && sz == 4) {
   13855          delta = dis_PSHUFxW_128( vbi, pfx, delta,
   13856                                   False/*!isAvx*/, True/*xIsH*/ );
   13857          goto decode_success;
   13858       }
   13859       break;
   13860 
   13861    case 0x71:
   13862       /* 66 0F 71 /2 ib = PSRLW by immediate */
   13863       if (have66noF2noF3(pfx) && sz == 2
   13864           && epartIsReg(getUChar(delta))
   13865           && gregLO3ofRM(getUChar(delta)) == 2) {
   13866          delta = dis_SSE_shiftE_imm( pfx, delta, "psrlw", Iop_ShrN16x8 );
   13867          goto decode_success;
   13868       }
   13869       /* 66 0F 71 /4 ib = PSRAW by immediate */
   13870       if (have66noF2noF3(pfx) && sz == 2
   13871           && epartIsReg(getUChar(delta))
   13872           && gregLO3ofRM(getUChar(delta)) == 4) {
   13873          delta = dis_SSE_shiftE_imm( pfx, delta, "psraw", Iop_SarN16x8 );
   13874          goto decode_success;
   13875       }
   13876       /* 66 0F 71 /6 ib = PSLLW by immediate */
   13877       if (have66noF2noF3(pfx) && sz == 2
   13878           && epartIsReg(getUChar(delta))
   13879           && gregLO3ofRM(getUChar(delta)) == 6) {
   13880          delta = dis_SSE_shiftE_imm( pfx, delta, "psllw", Iop_ShlN16x8 );
   13881          goto decode_success;
   13882       }
   13883       break;
   13884 
   13885    case 0x72:
   13886       /* 66 0F 72 /2 ib = PSRLD by immediate */
   13887       if (have66noF2noF3(pfx) && sz == 2
   13888           && epartIsReg(getUChar(delta))
   13889           && gregLO3ofRM(getUChar(delta)) == 2) {
   13890          delta = dis_SSE_shiftE_imm( pfx, delta, "psrld", Iop_ShrN32x4 );
   13891          goto decode_success;
   13892       }
   13893       /* 66 0F 72 /4 ib = PSRAD by immediate */
   13894       if (have66noF2noF3(pfx) && sz == 2
   13895           && epartIsReg(getUChar(delta))
   13896           && gregLO3ofRM(getUChar(delta)) == 4) {
   13897          delta = dis_SSE_shiftE_imm( pfx, delta, "psrad", Iop_SarN32x4 );
   13898          goto decode_success;
   13899       }
   13900       /* 66 0F 72 /6 ib = PSLLD by immediate */
   13901       if (have66noF2noF3(pfx) && sz == 2
   13902           && epartIsReg(getUChar(delta))
   13903           && gregLO3ofRM(getUChar(delta)) == 6) {
   13904          delta = dis_SSE_shiftE_imm( pfx, delta, "pslld", Iop_ShlN32x4 );
   13905          goto decode_success;
   13906       }
   13907       break;
   13908 
   13909    case 0x73:
   13910       /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   13911       /* note, if mem case ever filled in, 1 byte after amode */
   13912       if (have66noF2noF3(pfx) && sz == 2
   13913           && epartIsReg(getUChar(delta))
   13914           && gregLO3ofRM(getUChar(delta)) == 3) {
   13915          Int imm = (Int)getUChar(delta+1);
   13916          Int reg = eregOfRexRM(pfx,getUChar(delta));
   13917          DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
   13918          delta += 2;
   13919          IRTemp sV = newTemp(Ity_V128);
   13920          assign( sV, getXMMReg(reg) );
   13921          putXMMReg(reg, mkexpr(math_PSRLDQ( sV, imm )));
   13922          goto decode_success;
   13923       }
   13924       /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   13925       /* note, if mem case ever filled in, 1 byte after amode */
   13926       if (have66noF2noF3(pfx) && sz == 2
   13927           && epartIsReg(getUChar(delta))
   13928           && gregLO3ofRM(getUChar(delta)) == 7) {
   13929          Int imm = (Int)getUChar(delta+1);
   13930          Int reg = eregOfRexRM(pfx,getUChar(delta));
   13931          DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
   13932          vassert(imm >= 0 && imm <= 255);
   13933          delta += 2;
   13934          IRTemp sV = newTemp(Ity_V128);
   13935          assign( sV, getXMMReg(reg) );
   13936          putXMMReg(reg, mkexpr(math_PSLLDQ( sV, imm )));
   13937          goto decode_success;
   13938       }
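      /* Semantics sketch for the two byte-granularity shifts above
         (this is the architectural rule the math_PSRLDQ/math_PSLLDQ
         helpers implement): the immediate counts bytes, not bits, so
         for psrldq, result byte i = source byte i+imm when
         i+imm <= 15, else 0; hence any imm >= 16 clears the whole
         register.  pslldq is the mirror image. */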
   13939       /* 66 0F 73 /2 ib = PSRLQ by immediate */
   13940       if (have66noF2noF3(pfx) && sz == 2
   13941           && epartIsReg(getUChar(delta))
   13942           && gregLO3ofRM(getUChar(delta)) == 2) {
   13943          delta = dis_SSE_shiftE_imm( pfx, delta, "psrlq", Iop_ShrN64x2 );
   13944          goto decode_success;
   13945       }
   13946       /* 66 0F 73 /6 ib = PSLLQ by immediate */
   13947       if (have66noF2noF3(pfx) && sz == 2
   13948           && epartIsReg(getUChar(delta))
   13949           && gregLO3ofRM(getUChar(delta)) == 6) {
   13950          delta = dis_SSE_shiftE_imm( pfx, delta, "psllq", Iop_ShlN64x2 );
   13951          goto decode_success;
   13952       }
   13953       break;
   13954 
   13955    case 0x74:
   13956       /* 66 0F 74 = PCMPEQB */
   13957       if (have66noF2noF3(pfx) && sz == 2) {
   13958          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13959                                     "pcmpeqb", Iop_CmpEQ8x16, False );
   13960          goto decode_success;
   13961       }
   13962       break;
   13963 
   13964    case 0x75:
   13965       /* 66 0F 75 = PCMPEQW */
   13966       if (have66noF2noF3(pfx) && sz == 2) {
   13967          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13968                                     "pcmpeqw", Iop_CmpEQ16x8, False );
   13969          goto decode_success;
   13970       }
   13971       break;
   13972 
   13973    case 0x76:
   13974       /* 66 0F 76 = PCMPEQD */
   13975       if (have66noF2noF3(pfx) && sz == 2) {
   13976          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   13977                                     "pcmpeqd", Iop_CmpEQ32x4, False );
   13978          goto decode_success;
   13979       }
   13980       break;
   13981 
   13982    case 0x7E:
   13983       /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
   13984          G (lo half xmm).  Upper half of G is zeroed out. */
   13985       if (haveF3no66noF2(pfx)
   13986           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   13987          modrm = getUChar(delta);
   13988          if (epartIsReg(modrm)) {
   13989             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   13990                              getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
            /* zero bits 127:64 */
            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
            DIP("movq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
   13995             delta += 1;
   13996          } else {
   13997             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   13998             putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
   13999             putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
   14000                              loadLE(Ity_I64, mkexpr(addr)) );
            DIP("movq %s,%s\n", dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
   14003             delta += alen;
   14004          }
   14005          goto decode_success;
   14006       }
      /* 66 0F 7E = MOVD -- move from xmm low 1/4 to ireg32 or m32;
         or, with REX.W, from xmm low 1/2 to ireg64 or m64. */
      if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
   14010          if (sz == 2) sz = 4;
   14011          modrm = getUChar(delta);
   14012          if (epartIsReg(modrm)) {
   14013             delta += 1;
   14014             if (sz == 4) {
   14015                putIReg32( eregOfRexRM(pfx,modrm),
   14016                           getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
   14017                DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   14018                                     nameIReg32(eregOfRexRM(pfx,modrm)));
   14019             } else {
   14020                putIReg64( eregOfRexRM(pfx,modrm),
   14021                           getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
   14022                DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   14023                                     nameIReg64(eregOfRexRM(pfx,modrm)));
   14024             }
   14025          } else {
   14026             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   14027             delta += alen;
   14028             storeLE( mkexpr(addr),
   14029                      sz == 4
   14030                         ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
   14031                         : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
   14032             DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
   14033                                   nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   14034          }
   14035          goto decode_success;
   14036       }
   14037       break;
   14038 
   14039    case 0x7F:
   14040       /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   14041       if (haveF3no66noF2(pfx) && sz == 4) {
   14042          modrm = getUChar(delta);
   14043          if (epartIsReg(modrm)) {
   14044             goto decode_failure; /* awaiting test case */
   14045             delta += 1;
   14046             putXMMReg( eregOfRexRM(pfx,modrm),
   14047                        getXMMReg(gregOfRexRM(pfx,modrm)) );
   14048             DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   14049                                    nameXMMReg(eregOfRexRM(pfx,modrm)));
   14050          } else {
   14051             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   14052             delta += alen;
   14053             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   14054             DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   14055          }
   14056          goto decode_success;
   14057       }
   14058       /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
   14059       if (have66noF2noF3(pfx) && sz == 2) {
   14060          modrm = getUChar(delta);
   14061          if (epartIsReg(modrm)) {
   14062             delta += 1;
   14063             putXMMReg( eregOfRexRM(pfx,modrm),
   14064                        getXMMReg(gregOfRexRM(pfx,modrm)) );
   14065             DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
   14066                                    nameXMMReg(eregOfRexRM(pfx,modrm)));
   14067          } else {
   14068             addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
   14069             gen_SEGV_if_not_16_aligned( addr );
   14070             delta += alen;
   14071             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   14072             DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
   14073          }
   14074          goto decode_success;
   14075       }
   14076       break;
   14077 
   14078    case 0xAE:
   14079       /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   14080       if (haveNo66noF2noF3(pfx)
   14081           && epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
   14082           && sz == 4) {
   14083          delta += 1;
   14084          /* Insert a memory fence.  It's sometimes important that these
   14085             are carried through to the generated code. */
   14086          stmt( IRStmt_MBE(Imbe_Fence) );
   14087          DIP("sfence\n");
   14088          goto decode_success;
   14089       }
   14090       /* mindless duplication follows .. */
   14091       /* 0F AE /5 = LFENCE -- flush pending operations to memory */
   14092       /* 0F AE /6 = MFENCE -- flush pending operations to memory */
   14093       if (haveNo66noF2noF3(pfx)
   14094           && epartIsReg(getUChar(delta))
   14095           && (gregLO3ofRM(getUChar(delta)) == 5
   14096               || gregLO3ofRM(getUChar(delta)) == 6)
   14097           && sz == 4) {
   14098          delta += 1;
   14099          /* Insert a memory fence.  It's sometimes important that these
   14100             are carried through to the generated code. */
   14101          stmt( IRStmt_MBE(Imbe_Fence) );
   14102          DIP("%sfence\n", gregLO3ofRM(getUChar(delta-1))==5 ? "l" : "m");
   14103          goto decode_success;
   14104       }
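      /* Note that sfence, lfence and mfence are all modelled by the
         same IR primitive, a full barrier:

            stmt( IRStmt_MBE(Imbe_Fence) );

         The IR does not distinguish store-only from load-only
         fences, so this is a (safe) over-approximation. */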
   14105 
   14106       /* 0F AE /7 = CLFLUSH -- flush cache line */
   14107       if (haveNo66noF2noF3(pfx)
   14108           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
   14109           && sz == 4) {
   14110 
   14111          /* This is something of a hack.  We need to know the size of
   14112             the cache line containing addr.  Since we don't (easily),
   14113             assume 256 on the basis that no real cache would have a
   14114             line that big.  It's safe to invalidate more stuff than we
   14115             need, just inefficient. */
   14116          ULong lineszB = 256ULL;
   14117 
   14118          addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14119          delta += alen;
   14120 
   14121          /* Round addr down to the start of the containing block. */
   14122          stmt( IRStmt_Put(
   14123                   OFFB_CMSTART,
   14124                   binop( Iop_And64,
   14125                          mkexpr(addr),
   14126                          mkU64( ~(lineszB-1) ))) );
   14127 
   14128          stmt( IRStmt_Put(OFFB_CMLEN, mkU64(lineszB) ) );
   14129 
   14130          jmp_lit(dres, Ijk_InvalICache, (Addr64)(guest_RIP_bbstart+delta));
   14131 
   14132          DIP("clflush %s\n", dis_buf);
   14133          goto decode_success;
   14134       }
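      /* Worked example of the rounding above: with lineszB == 256,
         ~(lineszB-1) == ~0xFF, so an addr of, say, 0x5000ABCD is
         rounded down to CMSTART == 0x5000AB00, and the invalidated
         range [CMSTART, CMSTART+256) covers the line wherever addr
         falls within it. */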
   14135 
   14136       /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   14137       if (haveNo66noF2noF3(pfx)
   14138           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
   14139           && sz == 4) {
   14140          delta = dis_STMXCSR(vbi, pfx, delta, False/*!isAvx*/);
   14141          goto decode_success;
   14142       }
   14143       /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
   14144       if (haveNo66noF2noF3(pfx)
   14145           && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
   14146           && sz == 4) {
   14147          delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
   14148          goto decode_success;
   14149       }
   14150       /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
   14151       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   14152           && !epartIsReg(getUChar(delta))
   14153           && gregOfRexRM(pfx,getUChar(delta)) == 0) {
   14154          delta = dis_FXSAVE(vbi, pfx, delta, sz);
   14155          goto decode_success;
   14156       }
   14157       /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
   14158       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   14159           && !epartIsReg(getUChar(delta))
   14160           && gregOfRexRM(pfx,getUChar(delta)) == 1) {
   14161          delta = dis_FXRSTOR(vbi, pfx, delta, sz);
   14162          goto decode_success;
   14163       }
   14164       /* 0F AE /4 = XSAVE mem -- write x87, SSE, AVX state to memory */
   14165       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   14166           && !epartIsReg(getUChar(delta))
   14167           && gregOfRexRM(pfx,getUChar(delta)) == 4
   14168           && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   14169          delta = dis_XSAVE(vbi, pfx, delta, sz);
   14170          goto decode_success;
   14171       }
   14172       /* 0F AE /5 = XRSTOR mem -- read x87, SSE, AVX state from memory */
   14173       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
   14174           && !epartIsReg(getUChar(delta))
   14175           && gregOfRexRM(pfx,getUChar(delta)) == 5
   14176           && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
   14177          delta = dis_XRSTOR(vbi, pfx, delta, sz);
   14178          goto decode_success;
   14179       }
   14180       break;
   14181 
   14182    case 0xC2:
   14183       /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
   14184       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14185          Long delta0 = delta;
   14186          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 );
   14187          if (delta > delta0) goto decode_success;
   14188       }
   14189       /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
   14190       if (haveF3no66noF2(pfx) && sz == 4) {
   14191          Long delta0 = delta;
   14192          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 );
   14193          if (delta > delta0) goto decode_success;
   14194       }
   14195       /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
   14196       if (haveF2no66noF3(pfx) && sz == 4) {
   14197          Long delta0 = delta;
   14198          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 );
   14199          if (delta > delta0) goto decode_success;
   14200       }
   14201       /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
   14202       if (have66noF2noF3(pfx) && sz == 2) {
   14203          Long delta0 = delta;
   14204          delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 );
   14205          if (delta > delta0) goto decode_success;
   14206       }
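      /* The delta0/delta comparisons above implement the local
         failure protocol: dis_SSE_cmp_E_to_G appears to return its
         delta argument unchanged when it cannot decode the case
         (an unhandled imm8 predicate, presumably), so
         "delta > delta0" doubles as the success flag and we
         otherwise fall through to the decode-failure path. */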
   14207       break;
   14208 
   14209    case 0xC3:
   14210       /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
   14211       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
   14212          modrm = getUChar(delta);
   14213          if (!epartIsReg(modrm)) {
   14214             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14215             storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
   14216             DIP("movnti %s,%s\n", dis_buf,
   14217                                   nameIRegG(sz, pfx, modrm));
   14218             delta += alen;
   14219             goto decode_success;
   14220          }
   14221          /* else fall through */
   14222       }
   14223       break;
   14224 
   14225    case 0xC4:
   14226       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14227       /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   14228          put it into the specified lane of mmx(G). */
   14229       if (haveNo66noF2noF3(pfx)
   14230           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   14231          /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
   14232             mmx reg.  t4 is the new lane value.  t5 is the original
   14233             mmx value. t6 is the new mmx value. */
   14234          Int lane;
   14235          t4 = newTemp(Ity_I16);
   14236          t5 = newTemp(Ity_I64);
   14237          t6 = newTemp(Ity_I64);
   14238          modrm = getUChar(delta);
   14239          do_MMX_preamble();
   14240 
   14241          assign(t5, getMMXReg(gregLO3ofRM(modrm)));
   14242          breakup64to16s( t5, &t3, &t2, &t1, &t0 );
   14243 
   14244          if (epartIsReg(modrm)) {
   14245             assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
   14246             delta += 1+1;
   14247             lane = getUChar(delta-1);
   14248             DIP("pinsrw $%d,%s,%s\n", lane,
   14249                                       nameIReg16(eregOfRexRM(pfx,modrm)),
   14250                                       nameMMXReg(gregLO3ofRM(modrm)));
   14251          } else {
   14252             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   14253             delta += 1+alen;
   14254             lane = getUChar(delta-1);
   14255             assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   14256             DIP("pinsrw $%d,%s,%s\n", lane,
   14257                                       dis_buf,
   14258                                       nameMMXReg(gregLO3ofRM(modrm)));
   14259          }
   14260 
   14261          switch (lane & 3) {
   14262             case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
   14263             case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
   14264             case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
   14265             case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
   14266             default: vassert(0);
   14267          }
   14268          putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
   14269          goto decode_success;
   14270       }
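      /* Worked example of the lane switch above: breakup64to16s
         yields t3..t0 with t0 holding bits 15:0, and mk64from16s
         takes its arguments high lane first.  So for lane == 1 the
         result mk64from16s(t3,t2,t4,t0) keeps lanes 3, 2 and 0 and
         places the new value t4 in bits 31:16, exactly pinsrw's
         behaviour for that lane. */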
   14271       /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
   14272          put it into the specified lane of xmm(G). */
   14273       if (have66noF2noF3(pfx)
   14274           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   14275          Int lane;
   14276          t4 = newTemp(Ity_I16);
   14277          modrm = getUChar(delta);
   14278          UInt rG = gregOfRexRM(pfx,modrm);
   14279          if (epartIsReg(modrm)) {
   14280             UInt rE = eregOfRexRM(pfx,modrm);
   14281             assign(t4, getIReg16(rE));
   14282             delta += 1+1;
   14283             lane = getUChar(delta-1);
   14284             DIP("pinsrw $%d,%s,%s\n",
   14285                 lane, nameIReg16(rE), nameXMMReg(rG));
   14286          } else {
   14287             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
   14288                               1/*byte after the amode*/ );
   14289             delta += 1+alen;
   14290             lane = getUChar(delta-1);
   14291             assign(t4, loadLE(Ity_I16, mkexpr(addr)));
   14292             DIP("pinsrw $%d,%s,%s\n",
   14293                 lane, dis_buf, nameXMMReg(rG));
   14294          }
   14295          IRTemp src_vec = newTemp(Ity_V128);
   14296          assign(src_vec, getXMMReg(rG));
   14297          IRTemp res_vec = math_PINSRW_128( src_vec, t4, lane & 7);
   14298          putXMMReg(rG, mkexpr(res_vec));
   14299          goto decode_success;
   14300       }
   14301       break;
   14302 
   14303    case 0xC5:
   14304       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14305       /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
   14306          zero-extend of it in ireg(G). */
   14307       if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
   14308          modrm = getUChar(delta);
   14309          if (epartIsReg(modrm)) {
   14310             IRTemp sV = newTemp(Ity_I64);
   14311             t5 = newTemp(Ity_I16);
   14312             do_MMX_preamble();
   14313             assign(sV, getMMXReg(eregLO3ofRM(modrm)));
   14314             breakup64to16s( sV, &t3, &t2, &t1, &t0 );
   14315             switch (getUChar(delta+1) & 3) {
   14316                case 0:  assign(t5, mkexpr(t0)); break;
   14317                case 1:  assign(t5, mkexpr(t1)); break;
   14318                case 2:  assign(t5, mkexpr(t2)); break;
   14319                case 3:  assign(t5, mkexpr(t3)); break;
   14320                default: vassert(0);
   14321             }
   14322             if (sz == 8)
   14323                putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
   14324             else
   14325                putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
   14326             DIP("pextrw $%d,%s,%s\n",
   14327                 (Int)getUChar(delta+1),
   14328                 nameMMXReg(eregLO3ofRM(modrm)),
   14329                 sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
   14330                       : nameIReg32(gregOfRexRM(pfx,modrm))
   14331             );
   14332             delta += 2;
   14333             goto decode_success;
   14334          }
   14335          /* else fall through */
   14336          /* note, for anyone filling in the mem case: this insn has one
   14337             byte after the amode and therefore you must pass 1 as the
   14338             last arg to disAMode */
   14339       }
   14340       /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
   14341          zero-extend of it in ireg(G). */
   14342       if (have66noF2noF3(pfx)
   14343           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   14344          Long delta0 = delta;
   14345          delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
   14346                                               False/*!isAvx*/ );
   14347          if (delta > delta0) goto decode_success;
   14348          /* else fall through -- decoding has failed */
   14349       }
   14350       break;
   14351 
   14352    case 0xC6:
   14353       /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   14354       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14355          Int    imm8 = 0;
   14356          IRTemp sV   = newTemp(Ity_V128);
   14357          IRTemp dV   = newTemp(Ity_V128);
   14358          modrm = getUChar(delta);
   14359          UInt rG = gregOfRexRM(pfx,modrm);
   14360          assign( dV, getXMMReg(rG) );
   14361          if (epartIsReg(modrm)) {
   14362             UInt rE = eregOfRexRM(pfx,modrm);
   14363             assign( sV, getXMMReg(rE) );
   14364             imm8 = (Int)getUChar(delta+1);
   14365             delta += 1+1;
   14366             DIP("shufps $%d,%s,%s\n", imm8, nameXMMReg(rE), nameXMMReg(rG));
   14367          } else {
   14368             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   14369             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14370             imm8 = (Int)getUChar(delta+alen);
   14371             delta += 1+alen;
   14372             DIP("shufps $%d,%s,%s\n", imm8, dis_buf, nameXMMReg(rG));
   14373          }
   14374          IRTemp res = math_SHUFPS_128( sV, dV, imm8 );
   14375          putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   14376          goto decode_success;
   14377       }
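      /* Semantics sketch for math_SHUFPS_128 (the architectural
         rule, not the helper's code): writing the result lanes as
         r3..r0,

            r0 = dV[ imm8       & 3]
            r1 = dV[(imm8 >> 2) & 3]
            r2 = sV[(imm8 >> 4) & 3]
            r3 = sV[(imm8 >> 6) & 3]

         so "shufps $0xe4,%xmm0,%xmm0" is a no-op and
         "shufps $0x1b,%xmm0,%xmm0" reverses the four F32 lanes. */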
   14378       /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   14379       if (have66noF2noF3(pfx) && sz == 2) {
   14380          Int    select;
   14381          IRTemp sV = newTemp(Ity_V128);
   14382          IRTemp dV = newTemp(Ity_V128);
   14383 
   14384          modrm = getUChar(delta);
   14385          assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
   14386 
   14387          if (epartIsReg(modrm)) {
   14388             assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
   14389             select = (Int)getUChar(delta+1);
   14390             delta += 1+1;
   14391             DIP("shufpd $%d,%s,%s\n", select,
   14392                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
   14393                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
   14394          } else {
   14395             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
   14396             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            select = (Int)getUChar(delta+alen);
   14398             delta += 1+alen;
   14399             DIP("shufpd $%d,%s,%s\n", select,
   14400                                       dis_buf,
   14401                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
   14402          }
   14403 
   14404          IRTemp res = math_SHUFPD_128( sV, dV, select );
   14405          putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
   14406          goto decode_success;
   14407       }
   14408       break;
   14409 
   14410    case 0xD1:
   14411       /* 66 0F D1 = PSRLW by E */
   14412       if (have66noF2noF3(pfx) && sz == 2) {
   14413          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlw", Iop_ShrN16x8 );
   14414          goto decode_success;
   14415       }
   14416       break;
   14417 
   14418    case 0xD2:
   14419       /* 66 0F D2 = PSRLD by E */
   14420       if (have66noF2noF3(pfx) && sz == 2) {
   14421          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrld", Iop_ShrN32x4 );
   14422          goto decode_success;
   14423       }
   14424       break;
   14425 
   14426    case 0xD3:
   14427       /* 66 0F D3 = PSRLQ by E */
   14428       if (have66noF2noF3(pfx) && sz == 2) {
   14429          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlq", Iop_ShrN64x2 );
   14430          goto decode_success;
   14431       }
   14432       break;
   14433 
   14434    case 0xD4:
   14435       /* 66 0F D4 = PADDQ */
   14436       if (have66noF2noF3(pfx) && sz == 2) {
   14437          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14438                                     "paddq", Iop_Add64x2, False );
   14439          goto decode_success;
   14440       }
   14441       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   14442       /* 0F D4 = PADDQ -- add 64x1 */
   14443       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14444          do_MMX_preamble();
   14445          delta = dis_MMXop_regmem_to_reg (
   14446                    vbi, pfx, delta, opc, "paddq", False );
   14447          goto decode_success;
   14448       }
   14449       break;
   14450 
   14451    case 0xD5:
   14452       /* 66 0F D5 = PMULLW -- 16x8 multiply */
   14453       if (have66noF2noF3(pfx) && sz == 2) {
   14454          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14455                                     "pmullw", Iop_Mul16x8, False );
   14456          goto decode_success;
   14457       }
   14458       break;
   14459 
   14460    case 0xD6:
   14461       /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
   14462          hi half). */
   14463       if (haveF3no66noF2(pfx) && sz == 4) {
   14464          modrm = getUChar(delta);
   14465          if (epartIsReg(modrm)) {
   14466             do_MMX_preamble();
   14467             putXMMReg( gregOfRexRM(pfx,modrm),
   14468                        unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
   14469             DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   14470                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   14471             delta += 1;
   14472             goto decode_success;
   14473          }
   14474          /* apparently no mem case for this insn */
   14475       }
   14476       /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
   14477          or lo half xmm).  */
   14478       if (have66noF2noF3(pfx)
   14479           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
   14480          modrm = getUChar(delta);
   14481          if (epartIsReg(modrm)) {
   14482             /* fall through, awaiting test case */
   14483             /* dst: lo half copied, hi half zeroed */
   14484          } else {
   14485             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14486             storeLE( mkexpr(addr),
   14487                      getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
   14488             DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
   14489             delta += alen;
   14490             goto decode_success;
   14491          }
   14492       }
   14493       /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
   14494       if (haveF2no66noF3(pfx) && sz == 4) {
   14495          modrm = getUChar(delta);
   14496          if (epartIsReg(modrm)) {
   14497             do_MMX_preamble();
   14498             putMMXReg( gregLO3ofRM(modrm),
   14499                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
   14500             DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
   14501                                    nameMMXReg(gregLO3ofRM(modrm)));
   14502             delta += 1;
   14503             goto decode_success;
   14504          }
   14505          /* apparently no mem case for this insn */
   14506       }
   14507       break;
   14508 
   14509    case 0xD7:
      /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16
         lanes in xmm(E), turn them into a 16-bit value, and put
         zero-extend of it in ireg(G).  Doing this directly is just
         too cumbersome; give up therefore and call a helper. */
   14514       if (have66noF2noF3(pfx)
   14515           && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
   14516           && epartIsReg(getUChar(delta))) { /* no memory case, it seems */
   14517          delta = dis_PMOVMSKB_128( vbi, pfx, delta, False/*!isAvx*/ );
   14518          goto decode_success;
   14519       }
   14520       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14521       /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
   14522          mmx(E), turn them into a byte, and put zero-extend of it in
   14523          ireg(G). */
   14524       if (haveNo66noF2noF3(pfx)
   14525           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   14526          modrm = getUChar(delta);
   14527          if (epartIsReg(modrm)) {
   14528             do_MMX_preamble();
   14529             t0 = newTemp(Ity_I64);
   14530             t1 = newTemp(Ity_I32);
   14531             assign(t0, getMMXReg(eregLO3ofRM(modrm)));
   14532             assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
   14533             putIReg32(gregOfRexRM(pfx,modrm), mkexpr(t1));
   14534             DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   14535                                     nameIReg32(gregOfRexRM(pfx,modrm)));
   14536             delta += 1;
   14537             goto decode_success;
   14538          }
   14539          /* else fall through */
   14540       }
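      /* The MMX case above is simple enough to do inline:
         Iop_GetMSBs8x8 gathers the sign bit of each of the 8 bytes
         of t0 into an I8 (bit i = the MSB of byte i), which
         Iop_8Uto32 then zero-widens.  E.g. a source whose bytes are
         all negative yields 0xFF, hence 0x000000FF in ireg(G). */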
   14541       break;
   14542 
   14543    case 0xD8:
   14544       /* 66 0F D8 = PSUBUSB */
   14545       if (have66noF2noF3(pfx) && sz == 2) {
   14546          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14547                                     "psubusb", Iop_QSub8Ux16, False );
   14548          goto decode_success;
   14549       }
   14550       break;
   14551 
   14552    case 0xD9:
   14553       /* 66 0F D9 = PSUBUSW */
   14554       if (have66noF2noF3(pfx) && sz == 2) {
   14555          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14556                                     "psubusw", Iop_QSub16Ux8, False );
   14557          goto decode_success;
   14558       }
   14559       break;
   14560 
   14561    case 0xDA:
   14562       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14563       /* 0F DA = PMINUB -- 8x8 unsigned min */
   14564       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14565          do_MMX_preamble();
   14566          delta = dis_MMXop_regmem_to_reg (
   14567                     vbi, pfx, delta, opc, "pminub", False );
   14568          goto decode_success;
   14569       }
   14570       /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   14571       if (have66noF2noF3(pfx) && sz == 2) {
   14572          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14573                                     "pminub", Iop_Min8Ux16, False );
   14574          goto decode_success;
   14575       }
   14576       break;
   14577 
   14578    case 0xDB:
   14579       /* 66 0F DB = PAND */
   14580       if (have66noF2noF3(pfx) && sz == 2) {
   14581          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pand", Iop_AndV128 );
   14582          goto decode_success;
   14583       }
   14584       break;
   14585 
   14586    case 0xDC:
   14587       /* 66 0F DC = PADDUSB */
   14588       if (have66noF2noF3(pfx) && sz == 2) {
   14589          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14590                                     "paddusb", Iop_QAdd8Ux16, False );
   14591          goto decode_success;
   14592       }
   14593       break;
   14594 
   14595    case 0xDD:
   14596       /* 66 0F DD = PADDUSW */
   14597       if (have66noF2noF3(pfx) && sz == 2) {
   14598          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14599                                     "paddusw", Iop_QAdd16Ux8, False );
   14600          goto decode_success;
   14601       }
   14602       break;
   14603 
   14604    case 0xDE:
   14605       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14606       /* 0F DE = PMAXUB -- 8x8 unsigned max */
   14607       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14608          do_MMX_preamble();
   14609          delta = dis_MMXop_regmem_to_reg (
   14610                     vbi, pfx, delta, opc, "pmaxub", False );
   14611          goto decode_success;
   14612       }
   14613       /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   14614       if (have66noF2noF3(pfx) && sz == 2) {
   14615          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14616                                     "pmaxub", Iop_Max8Ux16, False );
   14617          goto decode_success;
   14618       }
   14619       break;
   14620 
   14621    case 0xDF:
   14622       /* 66 0F DF = PANDN */
   14623       if (have66noF2noF3(pfx) && sz == 2) {
   14624          delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "pandn", Iop_AndV128 );
   14625          goto decode_success;
   14626       }
   14627       break;
   14628 
   14629    case 0xE0:
   14630       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14631       /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   14632       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14633          do_MMX_preamble();
   14634          delta = dis_MMXop_regmem_to_reg (
   14635                     vbi, pfx, delta, opc, "pavgb", False );
   14636          goto decode_success;
   14637       }
   14638       /* 66 0F E0 = PAVGB */
   14639       if (have66noF2noF3(pfx) && sz == 2) {
   14640          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14641                                     "pavgb", Iop_Avg8Ux16, False );
   14642          goto decode_success;
   14643       }
   14644       break;
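               /* Worked example: the averaging ops here compute the unsigned
                  rounding average (x + y + 1) >> 1 per lane, so pavgb of 254
                  and 255 gives 255, where a truncating average would give
                  254. */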
   14645 
   14646    case 0xE1:
   14647       /* 66 0F E1 = PSRAW by E */
   14648       if (have66noF2noF3(pfx) && sz == 2) {
   14649          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psraw", Iop_SarN16x8 );
   14650          goto decode_success;
   14651       }
   14652       break;
   14653 
   14654    case 0xE2:
   14655       /* 66 0F E2 = PSRAD by E */
   14656       if (have66noF2noF3(pfx) && sz == 2) {
   14657          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrad", Iop_SarN32x4 );
   14658          goto decode_success;
   14659       }
   14660       break;
   14661 
   14662    case 0xE3:
   14663       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14664       /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   14665       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14666          do_MMX_preamble();
   14667          delta = dis_MMXop_regmem_to_reg (
   14668                     vbi, pfx, delta, opc, "pavgw", False );
   14669          goto decode_success;
   14670       }
   14671       /* 66 0F E3 = PAVGW */
   14672       if (have66noF2noF3(pfx) && sz == 2) {
   14673          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14674                                     "pavgw", Iop_Avg16Ux8, False );
   14675          goto decode_success;
   14676       }
   14677       break;
   14678 
   14679    case 0xE4:
   14680       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14681       /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
   14682       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14683          do_MMX_preamble();
   14684          delta = dis_MMXop_regmem_to_reg (
   14685                     vbi, pfx, delta, opc, "pmulhuw", False );
   14686          goto decode_success;
   14687       }
   14688       /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   14689       if (have66noF2noF3(pfx) && sz == 2) {
   14690          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14691                                     "pmulhuw", Iop_MulHi16Ux8, False );
   14692          goto decode_success;
   14693       }
   14694       break;
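               /* Worked example: PMULHUW keeps the high 16 bits of each
                  32-bit unsigned product, e.g. 0xFFFF * 0xFFFF = 0xFFFE0001,
                  so that lane's result is 0xFFFE. */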
   14695 
   14696    case 0xE5:
   14697       /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   14698       if (have66noF2noF3(pfx) && sz == 2) {
   14699          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14700                                     "pmulhw", Iop_MulHi16Sx8, False );
   14701          goto decode_success;
   14702       }
   14703       break;
   14704 
   14705    case 0xE6:
   14706       /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   14707          lo half xmm(G), and zero upper half, rounding towards zero */
   14708       /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
   14709          lo half xmm(G), according to prevailing rounding mode, and zero
   14710          upper half */
   14711       if ( (haveF2no66noF3(pfx) && sz == 4)
   14712            || (have66noF2noF3(pfx) && sz == 2) ) {
   14713          delta = dis_CVTxPD2DQ_128( vbi, pfx, delta, False/*!isAvx*/,
   14714                                     toBool(sz == 2)/*r2zero*/);
   14715          goto decode_success;
   14716       }
   14717       /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
   14718          F64 in xmm(G) */
   14719       if (haveF3no66noF2(pfx) && sz == 4) {
   14720          delta = dis_CVTDQ2PD_128(vbi, pfx, delta, False/*!isAvx*/);
   14721          goto decode_success;
   14722       }
   14723       break;
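               /* Worked example of the two rounding behaviours: for a source
                  lane holding 2.7, CVTTPD2DQ (truncating) yields 2, while
                  CVTPD2DQ under the default round-to-nearest mode yields 3;
                  for -2.7 they yield -2 and -3 respectively. */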
   14724 
   14725    case 0xE7:
   14726       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14727       /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
   14728          Intel manual does not say anything about the usual business of
   14729          the FP reg tags getting trashed whenever an MMX insn happens.
   14730          So we just leave them alone.
   14731       */
   14732       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14733          modrm = getUChar(delta);
   14734          if (!epartIsReg(modrm)) {
   14735             /* do_MMX_preamble(); Intel docs don't specify this */
   14736             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14737             storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
   14738             DIP("movntq %s,%s\n", dis_buf,
   14739                                   nameMMXReg(gregLO3ofRM(modrm)));
   14740             delta += alen;
   14741             goto decode_success;
   14742          }
   14743          /* else fall through */
   14744       }
   14745       /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
   14746       if (have66noF2noF3(pfx) && sz == 2) {
   14747          modrm = getUChar(delta);
   14748          if (!epartIsReg(modrm)) {
   14749             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14750             gen_SEGV_if_not_16_aligned( addr );
   14751             storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
   14752             DIP("movntdq %s,%s\n", dis_buf,
   14753                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
   14754             delta += alen;
   14755             goto decode_success;
   14756          }
   14757          /* else fall through */
   14758       }
   14759       break;
   14760 
   14761    case 0xE8:
   14762       /* 66 0F E8 = PSUBSB */
   14763       if (have66noF2noF3(pfx) && sz == 2) {
   14764          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14765                                     "psubsb", Iop_QSub8Sx16, False );
   14766          goto decode_success;
   14767       }
   14768       break;
   14769 
   14770    case 0xE9:
   14771       /* 66 0F E9 = PSUBSW */
   14772       if (have66noF2noF3(pfx) && sz == 2) {
   14773          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14774                                     "psubsw", Iop_QSub16Sx8, False );
   14775          goto decode_success;
   14776       }
   14777       break;
   14778 
   14779    case 0xEA:
   14780       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14781       /* 0F EA = PMINSW -- 16x4 signed min */
   14782       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14783          do_MMX_preamble();
   14784          delta = dis_MMXop_regmem_to_reg (
   14785                     vbi, pfx, delta, opc, "pminsw", False );
   14786          goto decode_success;
   14787       }
   14788       /* 66 0F EA = PMINSW -- 16x8 signed min */
   14789       if (have66noF2noF3(pfx) && sz == 2) {
   14790          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14791                                     "pminsw", Iop_Min16Sx8, False );
   14792          goto decode_success;
   14793       }
   14794       break;
   14795 
   14796    case 0xEB:
   14797       /* 66 0F EB = POR */
   14798       if (have66noF2noF3(pfx) && sz == 2) {
   14799          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "por", Iop_OrV128 );
   14800          goto decode_success;
   14801       }
   14802       break;
   14803 
   14804    case 0xEC:
   14805       /* 66 0F EC = PADDSB */
   14806       if (have66noF2noF3(pfx) && sz == 2) {
   14807          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14808                                     "paddsb", Iop_QAdd8Sx16, False );
   14809          goto decode_success;
   14810       }
   14811       break;
   14812 
   14813    case 0xED:
   14814       /* 66 0F ED = PADDSW */
   14815       if (have66noF2noF3(pfx) && sz == 2) {
   14816          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14817                                     "paddsw", Iop_QAdd16Sx8, False );
   14818          goto decode_success;
   14819       }
   14820       break;
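               /* Worked example for the signed saturating forms: QAdd16Sx8
                  clamps to [-32768, 32767], so paddsw of 30000 + 30000 gives
                  32767 rather than wrapping to -5536. */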
   14821 
   14822    case 0xEE:
   14823       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14824       /* 0F EE = PMAXSW -- 16x4 signed max */
   14825       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14826          do_MMX_preamble();
   14827          delta = dis_MMXop_regmem_to_reg (
   14828                     vbi, pfx, delta, opc, "pmaxsw", False );
   14829          goto decode_success;
   14830       }
   14831       /* 66 0F EE = PMAXSW -- 16x8 signed max */
   14832       if (have66noF2noF3(pfx) && sz == 2) {
   14833          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   14834                                     "pmaxsw", Iop_Max16Sx8, False );
   14835          goto decode_success;
   14836       }
   14837       break;
   14838 
   14839    case 0xEF:
   14840       /* 66 0F EF = PXOR */
   14841       if (have66noF2noF3(pfx) && sz == 2) {
   14842          delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pxor", Iop_XorV128 );
   14843          goto decode_success;
   14844       }
   14845       break;
   14846 
   14847    case 0xF1:
   14848       /* 66 0F F1 = PSLLW by E */
   14849       if (have66noF2noF3(pfx) && sz == 2) {
   14850          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllw", Iop_ShlN16x8 );
   14851          goto decode_success;
   14852       }
   14853       break;
   14854 
   14855    case 0xF2:
   14856       /* 66 0F F2 = PSLLD by E */
   14857       if (have66noF2noF3(pfx) && sz == 2) {
   14858          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "pslld", Iop_ShlN32x4 );
   14859          goto decode_success;
   14860       }
   14861       break;
   14862 
   14863    case 0xF3:
   14864       /* 66 0F F3 = PSLLQ by E */
   14865       if (have66noF2noF3(pfx) && sz == 2) {
   14866          delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllq", Iop_ShlN64x2 );
   14867          goto decode_success;
   14868       }
   14869       break;
   14870 
   14871    case 0xF4:
   14872       /* 66 0F F4 = PMULUDQ -- unsigned widening multiply: 32-bit
   14873          lane 0 of E times lane 0 of G forms the lower 64-bit half,
   14874          and lane 2 times lane 2 forms the upper 64-bit half */
   14875       if (have66noF2noF3(pfx) && sz == 2) {
   14876          IRTemp sV = newTemp(Ity_V128);
   14877          IRTemp dV = newTemp(Ity_V128);
   14878          modrm = getUChar(delta);
   14879          UInt rG = gregOfRexRM(pfx,modrm);
   14880          assign( dV, getXMMReg(rG) );
   14881          if (epartIsReg(modrm)) {
   14882             UInt rE = eregOfRexRM(pfx,modrm);
   14883             assign( sV, getXMMReg(rE) );
   14884             delta += 1;
   14885             DIP("pmuludq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14886          } else {
   14887             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14888             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14889             delta += alen;
   14890             DIP("pmuludq %s,%s\n", dis_buf, nameXMMReg(rG));
   14891          }
   14892          putXMMReg( rG, mkexpr(math_PMULUDQ_128( sV, dV )) );
   14893          goto decode_success;
   14894       }
   14895       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   14896       /* 0F F4 = PMULUDQ -- unsigned widening multiply: 32-bit lane
   14897          0 of E times lane 0 of G forms the 64-bit result */
   14898       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14899          IRTemp sV = newTemp(Ity_I64);
   14900          IRTemp dV = newTemp(Ity_I64);
   14901          t1 = newTemp(Ity_I32);
   14902          t0 = newTemp(Ity_I32);
   14903          modrm = getUChar(delta);
   14904 
   14905          do_MMX_preamble();
   14906          assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
   14907 
   14908          if (epartIsReg(modrm)) {
   14909             assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
   14910             delta += 1;
   14911             DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
   14912                                    nameMMXReg(gregLO3ofRM(modrm)));
   14913          } else {
   14914             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14915             assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
   14916             delta += alen;
   14917             DIP("pmuludq %s,%s\n", dis_buf,
   14918                                    nameMMXReg(gregLO3ofRM(modrm)));
   14919          }
   14920 
   14921          assign( t0, unop(Iop_64to32, mkexpr(dV)) );
   14922          assign( t1, unop(Iop_64to32, mkexpr(sV)) );
   14923          putMMXReg( gregLO3ofRM(modrm),
   14924                     binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
   14925          goto decode_success;
   14926       }
   14927       break;
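               /* Worked example: with E = (e3:e2:e1:e0) and G = (g3:g2:g1:g0)
                  as 32-bit lanes, the XMM form yields the two 64-bit products
                  (e2*g2 : e0*g0) and the MMX form yields just e0*g0; lanes 1
                  and 3 are ignored.  E.g. e0 = g0 = 0xFFFFFFFF gives
                  0xFFFFFFFE00000001. */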
   14928 
   14929    case 0xF5:
   14930       /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
   14931          E(xmm or mem) to G(xmm) */
   14932       if (have66noF2noF3(pfx) && sz == 2) {
   14933          IRTemp sV = newTemp(Ity_V128);
   14934          IRTemp dV = newTemp(Ity_V128);
   14935          modrm     = getUChar(delta);
   14936          UInt   rG = gregOfRexRM(pfx,modrm);
   14937          if (epartIsReg(modrm)) {
   14938             UInt rE = eregOfRexRM(pfx,modrm);
   14939             assign( sV, getXMMReg(rE) );
   14940             delta += 1;
   14941             DIP("pmaddwd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14942          } else {
   14943             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14944             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14945             delta += alen;
   14946             DIP("pmaddwd %s,%s\n", dis_buf, nameXMMReg(rG));
   14947          }
   14948          assign( dV, getXMMReg(rG) );
   14949          putXMMReg( rG, mkexpr(math_PMADDWD_128(dV, sV)) );
   14950          goto decode_success;
   14951       }
   14952       break;
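               /* Worked example: PMADDWD forms each 32-bit result lane i as
                  the signed sum e[2i]*g[2i] + e[2i+1]*g[2i+1] of 16-bit
                  lanes.  E.g. the word pair (3, -2) in E against (10, 10)
                  in G gives 3*10 + (-2)*10 = 10 in that lane. */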
   14953 
   14954    case 0xF6:
   14955       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14956       /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   14957       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14958          do_MMX_preamble();
   14959          delta = dis_MMXop_regmem_to_reg (
   14960                     vbi, pfx, delta, opc, "psadbw", False );
   14961          goto decode_success;
   14962       }
   14963       /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
   14964          from E(xmm or mem) to G(xmm) */
   14965       if (have66noF2noF3(pfx) && sz == 2) {
   14966          IRTemp sV  = newTemp(Ity_V128);
   14967          IRTemp dV  = newTemp(Ity_V128);
   14968          modrm = getUChar(delta);
   14969          UInt   rG   = gregOfRexRM(pfx,modrm);
   14970          if (epartIsReg(modrm)) {
   14971             UInt rE = eregOfRexRM(pfx,modrm);
   14972             assign( sV, getXMMReg(rE) );
   14973             delta += 1;
   14974             DIP("psadbw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   14975          } else {
   14976             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   14977             assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   14978             delta += alen;
   14979             DIP("psadbw %s,%s\n", dis_buf, nameXMMReg(rG));
   14980          }
   14981          assign( dV, getXMMReg(rG) );
   14982          putXMMReg( rG, mkexpr( math_PSADBW_128 ( dV, sV ) ) );
   14983 
   14984          goto decode_success;
   14985       }
   14986       break;
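               /* Worked example: for each 64-bit half, PSADBW sums the
                  absolute differences of the eight byte pairs and puts the
                  16-bit total in the low bits of that half, zeroing the
                  upper 48.  E.g. bytes 10 and 20 in E against 13 and 15 in G
                  contribute |10-13| + |20-15| = 8 to that half's sum. */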
   14987 
   14988    case 0xF7:
   14989       /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   14990       /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   14991       if (haveNo66noF2noF3(pfx) && sz == 4) {
   14992          Bool ok = False;
   14993          delta = dis_MMX( &ok, vbi, pfx, sz, delta-1 );
   14994          if (ok) goto decode_success;
   14995       }
   14996       /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   14997       if (have66noF2noF3(pfx) && sz == 2 && epartIsReg(getUChar(delta))) {
   14998          delta = dis_MASKMOVDQU( vbi, pfx, delta, False/*!isAvx*/ );
   14999          goto decode_success;
   15000       }
   15001       break;
   15002 
   15003    case 0xF8:
   15004       /* 66 0F F8 = PSUBB */
   15005       if (have66noF2noF3(pfx) && sz == 2) {
   15006          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15007                                     "psubb", Iop_Sub8x16, False );
   15008          goto decode_success;
   15009       }
   15010       break;
   15011 
   15012    case 0xF9:
   15013       /* 66 0F F9 = PSUBW */
   15014       if (have66noF2noF3(pfx) && sz == 2) {
   15015          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15016                                     "psubw", Iop_Sub16x8, False );
   15017          goto decode_success;
   15018       }
   15019       break;
   15020 
   15021    case 0xFA:
   15022       /* 66 0F FA = PSUBD */
   15023       if (have66noF2noF3(pfx) && sz == 2) {
   15024          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15025                                     "psubd", Iop_Sub32x4, False );
   15026          goto decode_success;
   15027       }
   15028       break;
   15029 
   15030    case 0xFB:
   15031       /* 66 0F FB = PSUBQ */
   15032       if (have66noF2noF3(pfx) && sz == 2) {
   15033          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15034                                     "psubq", Iop_Sub64x2, False );
   15035          goto decode_success;
   15036       }
   15037       /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   15038       /* 0F FB = PSUBQ -- sub 64x1 */
   15039       if (haveNo66noF2noF3(pfx) && sz == 4) {
   15040          do_MMX_preamble();
   15041          delta = dis_MMXop_regmem_to_reg (
   15042                    vbi, pfx, delta, opc, "psubq", False );
   15043          goto decode_success;
   15044       }
   15045       break;
   15046 
   15047    case 0xFC:
   15048       /* 66 0F FC = PADDB */
   15049       if (have66noF2noF3(pfx) && sz == 2) {
   15050          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15051                                     "paddb", Iop_Add8x16, False );
   15052          goto decode_success;
   15053       }
   15054       break;
   15055 
   15056    case 0xFD:
   15057       /* 66 0F FD = PADDW */
   15058       if (have66noF2noF3(pfx) && sz == 2) {
   15059          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15060                                     "paddw", Iop_Add16x8, False );
   15061          goto decode_success;
   15062       }
   15063       break;
   15064 
   15065    case 0xFE:
   15066       /* 66 0F FE = PADDD */
   15067       if (have66noF2noF3(pfx) && sz == 2) {
   15068          delta = dis_SSEint_E_to_G( vbi, pfx, delta,
   15069                                     "paddd", Iop_Add32x4, False );
   15070          goto decode_success;
   15071       }
   15072       break;
   15073 
   15074    default:
   15075       goto decode_failure;
   15076 
   15077    }
   15078 
   15079   decode_failure:
   15080    *decode_OK = False;
   15081    return deltaIN;
   15082 
   15083   decode_success:
   15084    *decode_OK = True;
   15085    return delta;
   15086 }
   15087 
   15088 
   15089 /*------------------------------------------------------------*/
   15090 /*---                                                      ---*/
   15091 /*--- Top-level SSE3 (not SupSSE3): dis_ESC_0F__SSE3       ---*/
   15092 /*---                                                      ---*/
   15093 /*------------------------------------------------------------*/
   15094 
   15095 static Long dis_MOVDDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
   15096                               Long delta, Bool isAvx )
   15097 {
   15098    IRTemp addr   = IRTemp_INVALID;
   15099    Int    alen   = 0;
   15100    HChar  dis_buf[50];
   15101    IRTemp sV    = newTemp(Ity_V128);
   15102    IRTemp d0    = newTemp(Ity_I64);
   15103    UChar  modrm = getUChar(delta);
   15104    UInt   rG    = gregOfRexRM(pfx,modrm);
   15105    if (epartIsReg(modrm)) {
   15106       UInt rE = eregOfRexRM(pfx,modrm);
   15107       assign( sV, getXMMReg(rE) );
   15108       DIP("%smovddup %s,%s\n",
   15109           isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   15110       delta += 1;
   15111       assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
   15112    } else {
   15113       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15114       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   15115       DIP("%smovddup %s,%s\n",
   15116           isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   15117       delta += alen;
   15118    }
   15119    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15120       ( rG, binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
   15121    return delta;
   15122 }
   15123 
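         /* A scalar reference model of the 128-bit MOVDDUP handled above
            (a documentation-only sketch; the helper name is made up and the
            block is deliberately kept out of the build): one 64-bit value is
            read and written to both halves of the destination. */
         #if 0
         static void ref_MOVDDUP_128 ( ULong lo64, /*OUT*/ULong res[2] )
         {
            res[0] = lo64;   /* low lane */
            res[1] = lo64;   /* high lane */
         }
         #endif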
   15124 
   15125 static Long dis_MOVDDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
   15126                               Long delta )
   15127 {
   15128    IRTemp addr   = IRTemp_INVALID;
   15129    Int    alen   = 0;
   15130    HChar  dis_buf[50];
   15131    IRTemp d0    = newTemp(Ity_I64);
   15132    IRTemp d1    = newTemp(Ity_I64);
   15133    UChar  modrm = getUChar(delta);
   15134    UInt   rG    = gregOfRexRM(pfx,modrm);
   15135    if (epartIsReg(modrm)) {
   15136       UInt rE = eregOfRexRM(pfx,modrm);
   15137       DIP("vmovddup %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   15138       delta += 1;
   15139       assign ( d0, getYMMRegLane64(rE, 0) );
   15140       assign ( d1, getYMMRegLane64(rE, 2) );
   15141    } else {
   15142       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15143       assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
   15144       assign( d1, loadLE(Ity_I64, binop(Iop_Add64,
   15145                                         mkexpr(addr), mkU64(16))) );
   15146       DIP("vmovddup %s,%s\n", dis_buf, nameYMMReg(rG));
   15147       delta += alen;
   15148    }
   15149    putYMMRegLane64( rG, 0, mkexpr(d0) );
   15150    putYMMRegLane64( rG, 1, mkexpr(d0) );
   15151    putYMMRegLane64( rG, 2, mkexpr(d1) );
   15152    putYMMRegLane64( rG, 3, mkexpr(d1) );
   15153    return delta;
   15154 }
   15155 
   15156 
   15157 static Long dis_MOVSxDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
   15158                                Long delta, Bool isAvx, Bool isL )
   15159 {
   15160    IRTemp addr  = IRTemp_INVALID;
   15161    Int    alen  = 0;
   15162    HChar  dis_buf[50];
   15163    IRTemp sV    = newTemp(Ity_V128);
   15164    UChar  modrm = getUChar(delta);
   15165    UInt   rG    = gregOfRexRM(pfx,modrm);
   15166    IRTemp s3, s2, s1, s0;
   15167    s3 = s2 = s1 = s0 = IRTemp_INVALID;
   15168    if (epartIsReg(modrm)) {
   15169       UInt rE = eregOfRexRM(pfx,modrm);
   15170       assign( sV, getXMMReg(rE) );
   15171       DIP("%smovs%cdup %s,%s\n",
   15172           isAvx ? "v" : "", isL ? 'l' : 'h', nameXMMReg(rE), nameXMMReg(rG));
   15173       delta += 1;
   15174    } else {
   15175       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15176       if (!isAvx)
   15177          gen_SEGV_if_not_16_aligned( addr );
   15178       assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
   15179       DIP("%smovs%cdup %s,%s\n",
   15180           isAvx ? "v" : "", isL ? 'l' : 'h', dis_buf, nameXMMReg(rG));
   15181       delta += alen;
   15182    }
   15183    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   15184    (isAvx ? putYMMRegLoAndZU : putXMMReg)
   15185       ( rG, isL ? mkV128from32s( s2, s2, s0, s0 )
   15186                 : mkV128from32s( s3, s3, s1, s1 ) );
   15187    return delta;
   15188 }
   15189 
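         /* A scalar reference model of the shuffle performed above (a
            documentation-only sketch; the helper name is made up and the
            block is deliberately kept out of the build): MOVSLDUP duplicates
            the even 32-bit lanes, MOVSHDUP the odd ones. */
         #if 0
         static void ref_MOVSxDUP_128 ( const UInt s[4], Bool isL,
                                        /*OUT*/UInt res[4] )
         {
            res[0] = res[1] = isL ? s[0] : s[1];
            res[2] = res[3] = isL ? s[2] : s[3];
         }
         #endif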
   15190 
   15191 static Long dis_MOVSxDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
   15192                                Long delta, Bool isL )
   15193 {
   15194    IRTemp addr  = IRTemp_INVALID;
   15195    Int    alen  = 0;
   15196    HChar  dis_buf[50];
   15197    IRTemp sV    = newTemp(Ity_V256);
   15198    UChar  modrm = getUChar(delta);
   15199    UInt   rG    = gregOfRexRM(pfx,modrm);
   15200    IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
   15201    s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
   15202    if (epartIsReg(modrm)) {
   15203       UInt rE = eregOfRexRM(pfx,modrm);
   15204       assign( sV, getYMMReg(rE) );
   15205       DIP("vmovs%cdup %s,%s\n",
   15206           isL ? 'l' : 'h', nameYMMReg(rE), nameYMMReg(rG));
   15207       delta += 1;
   15208    } else {
   15209       addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15210       assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
   15211       DIP("vmovs%cdup %s,%s\n",
   15212           isL ? 'l' : 'h', dis_buf, nameYMMReg(rG));
   15213       delta += alen;
   15214    }
   15215    breakupV256to32s( sV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
   15216    putYMMRegLane128( rG, 1, isL ? mkV128from32s( s6, s6, s4, s4 )
   15217                                 : mkV128from32s( s7, s7, s5, s5 ) );
   15218    putYMMRegLane128( rG, 0, isL ? mkV128from32s( s2, s2, s0, s0 )
   15219                                 : mkV128from32s( s3, s3, s1, s1 ) );
   15220    return delta;
   15221 }
   15222 
   15223 
   15224 static IRTemp math_HADDPS_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   15225 {
   15226    IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
   15227    IRTemp leftV  = newTemp(Ity_V128);
   15228    IRTemp rightV = newTemp(Ity_V128);
   15229    IRTemp rm     = newTemp(Ity_I32);
   15230    s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
   15231 
   15232    breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
   15233    breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
   15234 
   15235    assign( leftV,  mkV128from32s( s2, s0, d2, d0 ) );
   15236    assign( rightV, mkV128from32s( s3, s1, d3, d1 ) );
   15237 
   15238    IRTemp res = newTemp(Ity_V128);
   15239    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   15240    assign( res, triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
   15241                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   15242    return res;
   15243 }
   15244 
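         /* Worked equation for the above, with d and s numbering the 32-bit
            lanes of dV and sV from 3 (high) down to 0 (low):
               HADDPS: ( s3+s2 : s1+s0 : d3+d2 : d1+d0 )
               HSUBPS: ( s2-s3 : s0-s1 : d2-d3 : d0-d1 ) */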
   15245 
   15246 static IRTemp math_HADDPD_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
   15247 {
   15248    IRTemp s1, s0, d1, d0;
   15249    IRTemp leftV  = newTemp(Ity_V128);
   15250    IRTemp rightV = newTemp(Ity_V128);
   15251    IRTemp rm     = newTemp(Ity_I32);
   15252    s1 = s0 = d1 = d0 = IRTemp_INVALID;
   15253 
   15254    breakupV128to64s( sV, &s1, &s0 );
   15255    breakupV128to64s( dV, &d1, &d0 );
   15256 
   15257    assign( leftV,  binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
   15258    assign( rightV, binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
   15259 
   15260    IRTemp res = newTemp(Ity_V128);
   15261    assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   15262    assign( res, triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
   15263                       mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
   15264    return res;
   15265 }
   15266 
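         /* Likewise for the 64-bit variant, lane 1 high:
               HADDPD: ( s1+s0 : d1+d0 )
               HSUBPD: ( s0-s1 : d0-d1 ) */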
   15267 
   15268 __attribute__((noinline))
   15269 static
   15270 Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
   15271                         const VexAbiInfo* vbi,
   15272                         Prefix pfx, Int sz, Long deltaIN )
   15273 {
   15274    IRTemp addr  = IRTemp_INVALID;
   15275    UChar  modrm = 0;
   15276    Int    alen  = 0;
   15277    HChar  dis_buf[50];
   15278 
   15279    *decode_OK = False;
   15280 
   15281    Long   delta = deltaIN;
   15282    UChar  opc   = getUChar(delta);
   15283    delta++;
   15284    switch (opc) {
   15285 
   15286    case 0x12:
   15287       /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
   15288          duplicating some lanes (2:2:0:0). */
   15289       if (haveF3no66noF2(pfx) && sz == 4) {
   15290          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   15291                                    True/*isL*/ );
   15292          goto decode_success;
   15293       }
   15294       /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
   15295          duplicating some lanes (1:0:1:0). */
   15296       if (haveF2no66noF3(pfx)
   15297           && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
   15298          delta = dis_MOVDDUP_128( vbi, pfx, delta, False/*!isAvx*/ );
   15299          goto decode_success;
   15300       }
   15301       break;
   15302 
   15303    case 0x16:
   15304       /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
   15305          duplicating some lanes (3:3:1:1). */
   15306       if (haveF3no66noF2(pfx) && sz == 4) {
   15307          delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
   15308                                    False/*!isL*/ );
   15309          goto decode_success;
   15310       }
   15311       break;
   15312 
   15313    case 0x7C:
   15314    case 0x7D:
   15315       /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   15316       /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   15317       if (haveF2no66noF3(pfx) && sz == 4) {
   15318          IRTemp eV     = newTemp(Ity_V128);
   15319          IRTemp gV     = newTemp(Ity_V128);
   15320          Bool   isAdd  = opc == 0x7C;
   15321          const HChar* str = isAdd ? "add" : "sub";
   15322          modrm         = getUChar(delta);
   15323          UInt   rG     = gregOfRexRM(pfx,modrm);
   15324          if (epartIsReg(modrm)) {
   15325             UInt rE = eregOfRexRM(pfx,modrm);
   15326             assign( eV, getXMMReg(rE) );
   15327             DIP("h%sps %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
   15328             delta += 1;
   15329          } else {
   15330             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15331             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   15332             DIP("h%sps %s,%s\n", str, dis_buf, nameXMMReg(rG));
   15333             delta += alen;
   15334          }
   15335 
   15336          assign( gV, getXMMReg(rG) );
   15337          putXMMReg( rG, mkexpr( math_HADDPS_128 ( gV, eV, isAdd ) ) );
   15338          goto decode_success;
   15339       }
   15340       /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
   15341       /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
   15342       if (have66noF2noF3(pfx) && sz == 2) {
   15343          IRTemp eV     = newTemp(Ity_V128);
   15344          IRTemp gV     = newTemp(Ity_V128);
   15345          Bool   isAdd  = opc == 0x7C;
   15346          const HChar* str = isAdd ? "add" : "sub";
   15347          modrm         = getUChar(delta);
   15348          UInt   rG     = gregOfRexRM(pfx,modrm);
   15349          if (epartIsReg(modrm)) {
   15350             UInt rE = eregOfRexRM(pfx,modrm);
   15351             assign( eV, getXMMReg(rE) );
   15352             DIP("h%spd %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
   15353             delta += 1;
   15354          } else {
   15355             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15356             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   15357             DIP("h%spd %s,%s\n", str, dis_buf, nameXMMReg(rG));
   15358             delta += alen;
   15359          }
   15360 
   15361          assign( gV, getXMMReg(rG) );
   15362          putXMMReg( rG, mkexpr( math_HADDPD_128 ( gV, eV, isAdd ) ) );
   15363          goto decode_success;
   15364       }
   15365       break;
   15366 
   15367    case 0xD0:
   15368       /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
   15369       if (have66noF2noF3(pfx) && sz == 2) {
   15370          IRTemp eV   = newTemp(Ity_V128);
   15371          IRTemp gV   = newTemp(Ity_V128);
   15372          modrm       = getUChar(delta);
   15373          UInt   rG   = gregOfRexRM(pfx,modrm);
   15374          if (epartIsReg(modrm)) {
   15375             UInt rE = eregOfRexRM(pfx,modrm);
   15376             assign( eV, getXMMReg(rE) );
   15377             DIP("addsubpd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15378             delta += 1;
   15379          } else {
   15380             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15381             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   15382             DIP("addsubpd %s,%s\n", dis_buf, nameXMMReg(rG));
   15383             delta += alen;
   15384          }
   15385 
   15386          assign( gV, getXMMReg(rG) );
   15387          putXMMReg( rG, mkexpr( math_ADDSUBPD_128 ( gV, eV ) ) );
   15388          goto decode_success;
   15389       }
   15390       /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   15391       if (haveF2no66noF3(pfx) && sz == 4) {
   15392          IRTemp eV   = newTemp(Ity_V128);
   15393          IRTemp gV   = newTemp(Ity_V128);
   15394          modrm       = getUChar(delta);
   15395          UInt   rG   = gregOfRexRM(pfx,modrm);
   15396 
   15398          if (epartIsReg(modrm)) {
   15399             UInt rE = eregOfRexRM(pfx,modrm);
   15400             assign( eV, getXMMReg(rE) );
   15401             DIP("addsubps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
   15402             delta += 1;
   15403          } else {
   15404             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15405             assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
   15406             DIP("addsubps %s,%s\n", dis_buf, nameXMMReg(rG));
   15407             delta += alen;
   15408          }
   15409 
   15410          assign( gV, getXMMReg(rG) );
   15411          putXMMReg( rG, mkexpr( math_ADDSUBPS_128 ( gV, eV ) ) );
   15412          goto decode_success;
   15413       }
   15414       break;
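               /* Worked example: with 64-bit lanes, ADDSUBPD computes
                  (g1+e1 : g0-e0); ADDSUBPS alternates the same way over
                  32-bit lanes, giving (g3+e3 : g2-e2 : g1+e1 : g0-e0). */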
   15415 
   15416    case 0xF0:
   15417       /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
   15418       if (haveF2no66noF3(pfx) && sz == 4) {
   15419          modrm = getUChar(delta);
   15420          if (epartIsReg(modrm)) {
   15421             goto decode_failure;
   15422          } else {
   15423             addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   15424             putXMMReg( gregOfRexRM(pfx,modrm),
   15425                        loadLE(Ity_V128, mkexpr(addr)) );
   15426             DIP("lddqu %s,%s\n",